sglang 0.4.6.tar.gz → 0.4.6.post1.tar.gz

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
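Most of the additions in this release are new fused-MoE Triton tuning tables (items 12–23 in the list below), whose filenames encode the tuning key as comma-separated key=value pairs. As a rough illustration only: the parser below is a hypothetical helper, not part of sglang, and the reading of E as the expert count and N as the intermediate size follows the usual fused-MoE tuning convention rather than anything stated in this diff.

import ast

# Hypothetical helper for illustration; not part of the sglang package.
def parse_moe_config_name(filename: str) -> dict:
    """Split a tuning-config filename into its key=value fields."""
    stem = filename.removesuffix(".json")  # requires Python 3.9+
    # Re-join comma-separated chunks that lack an "=", so the comma inside
    # "block_shape=[128, 128]" is not treated as a field separator.
    segments = []
    for chunk in stem.split(","):
        if "=" in chunk:
            segments.append(chunk)
        else:
            segments[-1] += "," + chunk
    fields = {}
    for segment in segments:
        key, _, value = segment.partition("=")
        try:
            fields[key] = ast.literal_eval(value)  # ints and lists
        except (ValueError, SyntaxError):
            fields[key] = value  # plain strings like device names
    return fields

print(parse_moe_config_name(
    "E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json"
))
# {'E': 128, 'N': 384, 'device_name': 'NVIDIA_H20',
#  'dtype': 'fp8_w8a8', 'block_shape': [128, 128]}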
Files changed (630)
  1. {sglang-0.4.6/sglang.egg-info → sglang-0.4.6.post1}/PKG-INFO +2 -2
  2. {sglang-0.4.6 → sglang-0.4.6.post1}/pyproject.toml +2 -2
  3. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/disaggregation/decode.py +8 -2
  4. sglang-0.4.6.post1/sglang/srt/disaggregation/fake/__init__.py +1 -0
  5. sglang-0.4.6.post1/sglang/srt/disaggregation/fake/conn.py +88 -0
  6. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/disaggregation/prefill.py +12 -3
  7. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/disaggregation/utils.py +16 -2
  8. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/entrypoints/engine.py +9 -0
  9. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/entrypoints/http_server.py +27 -2
  10. sglang-0.4.6.post1/sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
  11. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/utils.py +1 -1
  12. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  13. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  14. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  15. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  16. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  17. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  18. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  19. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  20. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  21. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  22. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  23. sglang-0.4.6.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  24. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -2
  25. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
  26. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/fp8.py +20 -22
  27. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/fp8_utils.py +2 -2
  28. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/schedule_batch.py +9 -0
  29. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/scheduler.py +10 -8
  30. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/scheduler_output_processor_mixin.py +25 -9
  31. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/tp_worker.py +3 -3
  32. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +9 -4
  33. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/model_executor/model_runner.py +8 -1
  34. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/openai_api/adapter.py +32 -3
  35. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/openai_api/protocol.py +2 -0
  36. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/reasoning_parser.py +25 -1
  37. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/server_args.py +16 -2
  38. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/utils.py +3 -0
  39. sglang-0.4.6.post1/sglang/test/send_one.py +144 -0
  40. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/test_utils.py +38 -0
  41. sglang-0.4.6.post1/sglang/version.py +1 -0
  42. {sglang-0.4.6 → sglang-0.4.6.post1/sglang.egg-info}/PKG-INFO +2 -2
  43. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang.egg-info/SOURCES.txt +15 -0
  44. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang.egg-info/requires.txt +1 -1
  45. sglang-0.4.6/sglang/test/send_one.py +0 -88
  46. sglang-0.4.6/sglang/version.py +0 -1
  47. {sglang-0.4.6 → sglang-0.4.6.post1}/LICENSE +0 -0
  48. {sglang-0.4.6 → sglang-0.4.6.post1}/README.md +0 -0
  49. {sglang-0.4.6 → sglang-0.4.6.post1}/setup.cfg +0 -0
  50. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/__init__.py +0 -0
  51. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/api.py +0 -0
  52. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/bench_offline_throughput.py +0 -0
  53. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/bench_one_batch.py +0 -0
  54. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/bench_one_batch_server.py +0 -0
  55. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/bench_serving.py +0 -0
  56. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/check_env.py +0 -0
  57. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/compile_deep_gemm.py +0 -0
  58. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/global_config.py +0 -0
  59. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/lang/backend/__init__.py +0 -0
  60. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/lang/backend/anthropic.py +0 -0
  61. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/lang/backend/base_backend.py +0 -0
  62. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/lang/backend/litellm.py +0 -0
  63. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/lang/backend/openai.py +0 -0
  64. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  65. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/lang/backend/vertexai.py +0 -0
  66. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/lang/chat_template.py +0 -0
  67. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/lang/choices.py +0 -0
  68. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/lang/compiler.py +0 -0
  69. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/lang/interpreter.py +0 -0
  70. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/lang/ir.py +0 -0
  71. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/lang/tracer.py +0 -0
  72. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/launch_server.py +0 -0
  73. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/llama3_eval.py +0 -0
  74. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/_custom_ops.py +0 -0
  75. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/aio_rwlock.py +0 -0
  76. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/code_completion_parser.py +0 -0
  77. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/configs/__init__.py +0 -0
  78. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/configs/chatglm.py +0 -0
  79. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/configs/dbrx.py +0 -0
  80. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/configs/deepseekvl2.py +0 -0
  81. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/configs/device_config.py +0 -0
  82. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/configs/exaone.py +0 -0
  83. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/configs/janus_pro.py +0 -0
  84. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/configs/load_config.py +0 -0
  85. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/configs/model_config.py +0 -0
  86. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/configs/utils.py +0 -0
  87. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/connector/__init__.py +0 -0
  88. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/connector/base_connector.py +0 -0
  89. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/connector/redis.py +0 -0
  90. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/connector/s3.py +0 -0
  91. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/connector/serde/__init__.py +0 -0
  92. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/connector/serde/safe_serde.py +0 -0
  93. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/connector/serde/serde.py +0 -0
  94. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/connector/utils.py +0 -0
  95. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  96. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/constrained/llguidance_backend.py +0 -0
  97. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/constrained/outlines_backend.py +0 -0
  98. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  99. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
  100. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
  101. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  102. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/conversation.py +0 -0
  103. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/custom_op.py +0 -0
  104. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/disaggregation/base/__init__.py +0 -0
  105. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/disaggregation/base/conn.py +0 -0
  106. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/disaggregation/mini_lb.py +0 -0
  107. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
  108. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/disaggregation/mooncake/conn.py +0 -0
  109. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/disaggregation/mooncake/transfer_engine.py +0 -0
  110. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
  111. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/disaggregation/nixl/conn.py +0 -0
  112. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/distributed/__init__.py +0 -0
  113. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/distributed/communication_op.py +0 -0
  114. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  115. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  116. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  117. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  118. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  119. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  120. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  121. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  122. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/distributed/parallel_state.py +0 -0
  123. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/distributed/utils.py +0 -0
  124. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/entrypoints/EngineBase.py +0 -0
  125. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/entrypoints/http_server_engine.py +0 -0
  126. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/entrypoints/verl_engine.py +0 -0
  127. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/function_call_parser.py +0 -0
  128. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/hf_transformers_utils.py +0 -0
  129. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/activation.py +0 -0
  130. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
  131. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  132. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/flashattention_backend.py +0 -0
  133. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
  134. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
  135. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
  136. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  137. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
  138. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  139. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  140. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  141. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  142. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
  143. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/attention/vision.py +0 -0
  144. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/dp_attention.py +0 -0
  145. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/elementwise.py +0 -0
  146. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/layernorm.py +0 -0
  147. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/linear.py +0 -0
  148. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/logits_processor.py +0 -0
  149. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  150. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  151. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
  152. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +0 -0
  153. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  154. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  155. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  156. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  157. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  158. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  159. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  160. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  161. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  162. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  163. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  164. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  165. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  166. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  167. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  168. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  169. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
  170. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  171. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  172. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  173. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  174. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  175. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  176. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  177. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  178. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  179. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  180. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  181. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  182. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  183. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  184. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  185. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  186. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  187. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  188. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  189. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  190. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  191. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  192. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  193. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  194. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  195. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  196. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  197. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  198. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
  199. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  200. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  201. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  202. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  203. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  204. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  205. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  206. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  207. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  208. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
  209. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
  210. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  211. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  212. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  213. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  214. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  215. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  216. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  217. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  218. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  219. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  220. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  221. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  222. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  223. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  224. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  225. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  226. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  227. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  228. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  229. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  230. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  231. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  232. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  233. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  234. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  235. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  236. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  237. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  238. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  239. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  240. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  241. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  242. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  243. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  244. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  245. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  246. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  247. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  248. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  249. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  250. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  251. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  252. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  253. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  254. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  255. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  256. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  257. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  258. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  259. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  260. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  261. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  262. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  263. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  264. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  265. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  266. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  267. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  268. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  269. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  270. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  271. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  272. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  273. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  274. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  275. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  276. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  277. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  278. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  279. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  280. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  281. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  282. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  283. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  284. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  285. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  286. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  287. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  288. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  289. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  290. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/router.py +0 -0
  291. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/moe/topk.py +0 -0
  292. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/parameter.py +0 -0
  293. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/pooler.py +0 -0
  294. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
  295. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/awq.py +0 -0
  296. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  297. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
  298. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  299. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
  300. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
  301. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
  302. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
  303. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +0 -0
  304. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
  305. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
  306. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  307. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  308. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  309. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  310. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  311. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  312. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  313. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  314. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  315. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  316. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  317. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  318. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  319. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  320. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  321. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  322. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  323. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  324. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  325. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  326. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  327. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  328. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  329. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  330. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  331. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  332. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  333. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  334. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  335. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  336. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  337. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  338. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  339. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  340. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  341. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  342. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  343. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  344. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  345. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  346. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  347. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  348. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  349. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  350. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  351. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  352. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  353. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  354. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  355. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  356. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  357. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  358. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  359. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  360. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  361. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  362. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  363. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  364. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  365. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  366. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  367. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  368. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  369. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  370. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  371. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  372. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  373. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  374. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  375. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  376. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  377. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  378. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  379. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  380. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  381. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  382. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  383. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  384. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  385. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  386. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  387. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  388. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  389. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  390. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  391. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  392. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  393. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  394. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  395. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  396. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  397. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  398. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  399. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  400. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  401. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  402. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  403. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  404. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  405. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  406. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  407. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  408. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  409. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  410. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  411. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  412. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  413. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  414. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  415. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  416. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  417. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  418. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  420. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  421. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  422. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  423. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  424. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  425. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  426. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  427. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  428. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  429. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  430. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  431. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  432. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  433. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  434. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  435. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  436. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  437. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  438. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  439. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  440. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  441. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  442. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  443. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  444. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  445. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  446. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  447. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  448. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  449. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  450. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  451. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  452. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  453. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  454. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  455. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  456. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  457. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  458. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/deep_gemm.py +0 -0
  459. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
  460. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/gptq.py +0 -0
  461. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  462. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  463. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/kv_cache.py +0 -0
  464. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
  465. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/moe_wna16.py +0 -0
  466. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/utils.py +0 -0
  467. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
  468. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
  469. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/radix_attention.py +0 -0
  470. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/rotary_embedding.py +0 -0
  471. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/sampler.py +0 -0
  472. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/torchao_utils.py +0 -0
  473. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  474. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/backend/base_backend.py +0 -0
  475. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
  476. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/backend/triton_backend.py +0 -0
  477. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/layers.py +0 -0
  478. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/lora.py +0 -0
  479. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/lora_config.py +0 -0
  480. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/lora_manager.py +0 -0
  481. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/mem_pool.py +0 -0
  482. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  483. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
  484. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
  485. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
  486. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
  487. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/lora/utils.py +0 -0
  488. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/cache_controller.py +0 -0
  489. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/configure_logging.py +0 -0
  490. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  491. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  492. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/expert_distribution.py +0 -0
  493. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/io_struct.py +0 -0
  494. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/mm_utils.py +0 -0
  495. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processor.py +0 -0
  496. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/base_processor.py +0 -0
  497. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/clip.py +0 -0
  498. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +0 -0
  499. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/gemma3.py +0 -0
  500. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/janus_pro.py +0 -0
  501. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/llava.py +0 -0
  502. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/minicpm.py +0 -0
  503. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/mlama.py +0 -0
  504. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/mllama4.py +0 -0
  505. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/multimodal_processors/qwen_vl.py +0 -0
  506. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/schedule_policy.py +0 -0
  507. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/session_controller.py +0 -0
  508. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/tokenizer_manager.py +0 -0
  509. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/managers/utils.py +0 -0
  510. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  511. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  512. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  513. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/mem_cache/hiradix_cache.py +0 -0
  514. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
  515. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/mem_cache/paged_allocator.py +0 -0
  516. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  517. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/metrics/collector.py +0 -0
  518. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/metrics/func_timer.py +0 -0
  519. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/mm_utils.py +0 -0
  520. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
  521. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  522. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/model_loader/__init__.py +0 -0
  523. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/model_loader/loader.py +0 -0
  524. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/model_loader/utils.py +0 -0
  525. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/model_loader/weight_utils.py +0 -0
  526. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/model_parallel.py +0 -0
  527. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/baichuan.py +0 -0
  528. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/bert.py +0 -0
  529. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/chatglm.py +0 -0
  530. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/clip.py +0 -0
  531. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/commandr.py +0 -0
  532. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/dbrx.py +0 -0
  533. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/deepseek.py +0 -0
  534. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/deepseek_janus_pro.py +0 -0
  535. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/deepseek_nextn.py +0 -0
  536. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/deepseek_v2.py +0 -0
  537. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/deepseek_vl2.py +0 -0
  538. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/exaone.py +0 -0
  539. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/gemma.py +0 -0
  540. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/gemma2.py +0 -0
  541. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/gemma2_reward.py +0 -0
  542. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/gemma3_causal.py +0 -0
  543. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/gemma3_mm.py +0 -0
  544. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/gpt2.py +0 -0
  545. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
  546. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/granite.py +0 -0
  547. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/grok.py +0 -0
  548. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/internlm2.py +0 -0
  549. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/internlm2_reward.py +0 -0
  550. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/llama.py +0 -0
  551. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/llama4.py +0 -0
  552. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/llama_classification.py +0 -0
  553. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/llama_eagle.py +0 -0
  554. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/llama_eagle3.py +0 -0
  555. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/llama_embedding.py +0 -0
  556. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/llama_reward.py +0 -0
  557. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/llava.py +0 -0
  558. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/llavavid.py +0 -0
  559. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/minicpm.py +0 -0
  560. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/minicpm3.py +0 -0
  561. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/minicpmo.py +0 -0
  562. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/minicpmv.py +0 -0
  563. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/mistral.py +0 -0
  564. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/mixtral.py +0 -0
  565. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/mixtral_quant.py +0 -0
  566. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/mllama.py +0 -0
  567. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/mllama4.py +0 -0
  568. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/olmo.py +0 -0
  569. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/olmo2.py +0 -0
  570. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/olmoe.py +0 -0
  571. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/phi3_small.py +0 -0
  572. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/qwen.py +0 -0
  573. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/qwen2.py +0 -0
  574. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_5_vl.py +0 -0
  575. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_classification.py +0 -0
  576. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_eagle.py +0 -0
  577. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_moe.py +0 -0
  578. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_rm.py +0 -0
  579. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/qwen2_vl.py +0 -0
  580. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/qwen3.py +0 -0
  581. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/qwen3_moe.py +0 -0
  582. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/registry.py +0 -0
  583. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/roberta.py +0 -0
  584. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/stablelm.py +0 -0
  585. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/torch_native_llama.py +0 -0
  586. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/xverse.py +0 -0
  587. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/xverse_moe.py +0 -0
  588. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/models/yivl.py +0 -0
  589. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/patch_torch.py +0 -0
  590. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/platforms/interface.py +0 -0
  591. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  592. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  593. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
  594. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  595. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  596. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
  597. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  598. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/sampling/sampling_params.py +0 -0
  599. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  600. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
  601. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/speculative/eagle_utils.py +0 -0
  602. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/speculative/eagle_worker.py +0 -0
  603. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/speculative/spec_info.py +0 -0
  604. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  605. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/srt/warmup.py +0 -0
  606. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/__init__.py +0 -0
  607. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/attention/__init__.py +0 -0
  608. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/attention/test_flashattn_backend.py +0 -0
  609. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
  610. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
  611. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  612. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  613. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/run_eval.py +0 -0
  614. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/runners.py +0 -0
  615. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/simple_eval_common.py +0 -0
  616. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  617. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  618. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/simple_eval_math.py +0 -0
  619. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  620. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  621. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/test_activation.py +0 -0
  622. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/test_block_fp8.py +0 -0
  623. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/test_block_fp8_ep.py +0 -0
  624. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/test_custom_ops.py +0 -0
  625. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/test_dynamic_grad_mode.py +0 -0
  626. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/test_layernorm.py +0 -0
  627. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/test/test_programs.py +0 -0
  628. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang/utils.py +0 -0
  629. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang.egg-info/dependency_links.txt +0 -0
  630. {sglang-0.4.6 → sglang-0.4.6.post1}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sglang
- Version: 0.4.6
+ Version: 0.4.6.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -245,7 +245,7 @@ Requires-Dist: uvloop; extra == "runtime-common"
  Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
- Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
+ Requires-Dist: sgl-kernel==0.1.0; extra == "srt"
  Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
  Requires-Dist: torch==2.6.0; extra == "srt"
  Requires-Dist: torchvision==0.21.0; extra == "srt"
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name = "sglang"
- version = "0.4.6"
+ version = "0.4.6.post1"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -46,7 +46,7 @@ runtime_common = [
 
  srt = [
      "sglang[runtime_common]",
-     "sgl-kernel==0.0.9.post2",
+     "sgl-kernel==0.1.0",
      "flashinfer_python==0.2.3",
      "torch==2.6.0",
      "torchvision==0.21.0",
@@ -32,6 +32,7 @@ from torch.distributed import ProcessGroup
  from sglang.srt.disaggregation.base import BaseKVManager, BaseKVReceiver, KVArgs, KVPoll
  from sglang.srt.disaggregation.utils import (
      DisaggregationMode,
+     FakeBootstrapHost,
      KVClassType,
      ReqToMetadataIdxAllocator,
      TransferBackend,
@@ -133,8 +134,13 @@ class DecodePreallocQueue:
 
      def add(self, req: Req) -> None:
          """Add a request to the pending queue."""
-
-         kv_receiver_class = get_kv_class(self.transfer_backend, KVClassType.RECEIVER)
+         if req.bootstrap_host == FakeBootstrapHost:
+             # Fake transfer for warmup reqs
+             kv_receiver_class = get_kv_class(TransferBackend.FAKE, KVClassType.RECEIVER)
+         else:
+             kv_receiver_class = get_kv_class(
+                 self.transfer_backend, KVClassType.RECEIVER
+             )
          kv_receiver = kv_receiver_class(
              mgr=self.kv_manager,
              bootstrap_addr=f"{req.bootstrap_host}:{req.bootstrap_port}",
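The hunk above routes warmup traffic away from real KV transfer. As a minimal sketch of that dispatch (the pick_receiver_class helper is hypothetical; FakeBootstrapHost, TransferBackend, KVClassType, and get_kv_class are the names the diff itself introduces in sglang.srt.disaggregation.utils):

    # Hypothetical sketch of the sentinel dispatch used by DecodePreallocQueue.add.
    from sglang.srt.disaggregation.utils import (
        FakeBootstrapHost,
        KVClassType,
        TransferBackend,
        get_kv_class,
    )

    def pick_receiver_class(req, configured_backend: TransferBackend):
        # Warmup requests are tagged with the fake bootstrap host, so they get
        # the no-op receiver; everything else uses the configured backend.
        if req.bootstrap_host == FakeBootstrapHost:
            return get_kv_class(TransferBackend.FAKE, KVClassType.RECEIVER)
        return get_kv_class(configured_backend, KVClassType.RECEIVER)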
@@ -0,0 +1 @@
+ from .conn import FakeKVReceiver, FakeKVSender
@@ -0,0 +1,88 @@
+ import logging
+ from typing import Dict, List, Optional, Tuple, Union
+
+ import numpy as np
+ import numpy.typing as npt
+
+ from sglang.srt.disaggregation.base.conn import (
+     BaseKVManager,
+     BaseKVReceiver,
+     BaseKVSender,
+     KVArgs,
+     KVPoll,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ # For warmup reqs, we don't kv transfer, we use the fake sender and receiver
+ class FakeKVSender(BaseKVSender):
+     def __init__(self, mgr: BaseKVManager, bootstrap_addr: str, bootstrap_room: int):
+         self.has_sent = False
+
+     def poll(self) -> KVPoll:
+         if self.has_sent is False:
+             # Assume handshake completed instantly
+             return KVPoll.WaitingForInput
+         else:
+             # Assume transfer completed instantly
+             logger.info("FakeKVSender poll success")
+             return KVPoll.Success
+
+     def init(
+         self,
+         kv_indices: list[int],
+         aux_index: Optional[int] = None,
+         dest_ranks: Optional[list[int]] = None,
+     ):
+         logger.info(
+             f"FakeKVSender init with kv_indices: {kv_indices}, aux_index: {aux_index}, dest_ranks: {dest_ranks}"
+         )
+         pass
+
+     def send(
+         self,
+         kv_indices: npt.NDArray[np.int64],
+         index_slice: slice,
+         is_last: bool,
+     ):
+         logger.info(
+             f"FakeKVSender send with kv_indices: {kv_indices}, index_slice: {index_slice}, is_last: {is_last}"
+         )
+         if is_last:
+             self.has_sent = True
+             logger.info(f"FakeKVSender send success")
+         else:
+             self.has_sent = False
+             logger.info(f"FakeKVSender send fake transfering")
+
+     def failure_exception(self):
+         raise Exception("Fake KVSender Exception")
+
+
+ class FakeKVReceiver(BaseKVReceiver):
+     def __init__(
+         self,
+         mgr: BaseKVManager,
+         bootstrap_addr: str,
+         bootstrap_room: Optional[int] = None,
+     ):
+         self.has_init = False
+
+     def poll(self) -> KVPoll:
+         if self.has_init is False:
+             # Assume handshake completed instantly
+             return KVPoll.WaitingForInput
+         else:
+             # Assume transfer completed instantly
+             logger.info("FakeKVReceiver poll success")
+             return KVPoll.Success
+
+     def init(self, kv_indices: list[int], aux_index: Optional[int] = None):
+         self.has_init = True
+         logger.info(
+             f"FakeKVReceiver init with kv_indices: {kv_indices}, aux_index: {aux_index}"
+         )
+
+     def failure_exception(self):
+         raise Exception("Fake KVReceiver Exception")
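As a usage illustration (not part of the diff), the fake sender can be driven through the same poll/init/send lifecycle the scheduler uses: it reports WaitingForInput until the chunk marked is_last arrives, then Success. The mgr argument is unused by the fake class, and the port in bootstrap_addr below is arbitrary:

    # Hypothetical driver for the FakeKVSender state machine above.
    import numpy as np
    from sglang.srt.disaggregation.base.conn import KVPoll
    from sglang.srt.disaggregation.fake import FakeKVSender

    sender = FakeKVSender(mgr=None, bootstrap_addr="2.2.2.2:8998", bootstrap_room=0)
    assert sender.poll() == KVPoll.WaitingForInput  # nothing sent yet

    kv_indices = np.arange(4, dtype=np.int64)
    sender.init(kv_indices.tolist(), aux_index=0)
    sender.send(kv_indices[:2], slice(0, 2), is_last=False)
    assert sender.poll() == KVPoll.WaitingForInput  # mid-"transfer"
    sender.send(kv_indices[2:], slice(2, 4), is_last=True)
    assert sender.poll() == KVPoll.Success  # completes instantly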
@@ -20,6 +20,7 @@ Life cycle of a request in the prefill server
  from __future__ import annotations
 
  import logging
+ import threading
  from collections import deque
  from typing import TYPE_CHECKING, List, Optional
 
@@ -28,6 +29,7 @@ import torch
  from sglang.srt.disaggregation.base import BaseKVManager, KVArgs, KVPoll
  from sglang.srt.disaggregation.utils import (
      DisaggregationMode,
+     FakeBootstrapHost,
      KVClassType,
      ReqToMetadataIdxAllocator,
      TransferBackend,
@@ -115,7 +117,11 @@ class PrefillBootstrapQueue:
          return kv_manager
 
      def add(self, req: Req) -> None:
-         kv_sender_class = get_kv_class(self.transfer_backend, KVClassType.SENDER)
+         if req.bootstrap_host == FakeBootstrapHost:
+             # Fake transfer for warmup reqs
+             kv_sender_class = get_kv_class(TransferBackend.FAKE, KVClassType.SENDER)
+         else:
+             kv_sender_class = get_kv_class(self.transfer_backend, KVClassType.SENDER)
          req.disagg_kv_sender = kv_sender_class(
              mgr=self.kv_manager,
              bootstrap_addr=f"{req.bootstrap_host}:{self.bootstrap_port}",
@@ -256,7 +262,10 @@ class SchedulerDisaggregationPrefillMixin:
          self.running_batch.batch_is_full = False
 
      def process_batch_result_disagg_prefill(
-         self: Scheduler, batch: ScheduleBatch, result: GenerationBatchResult
+         self: Scheduler,
+         batch: ScheduleBatch,
+         result: GenerationBatchResult,
+         launch_done: Optional[threading.Event] = None,
      ) -> None:
          """
          Transfer kv for prefill completed requests and add it into disagg_prefill_inflight_queue
@@ -280,7 +289,7 @@ class SchedulerDisaggregationPrefillMixin:
          # Transfer kv for prefill completed requests and add it into disagg_prefill_infight_queue
          if self.enable_overlap:
              # wait
-             _, next_token_ids = self.tp_worker.resolve_batch_result(bid)
+             _, next_token_ids = self.tp_worker.resolve_last_batch_result(launch_done)
          else:
              next_token_ids = result.next_token_ids.tolist()
 
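The new launch_done parameter gives the overlap scheduler a way to wait until the TP worker has actually launched the batch before resolving its tokens. A hedged sketch of that handshake, assuming only that the worker sets a threading.Event after launching (the WorkerSketch class and its result plumbing are illustrative, not sglang's tp_worker API):

    # Illustrative overlap handshake; only threading.Event mirrors the real code.
    import threading
    from typing import Optional

    class WorkerSketch:
        def __init__(self):
            self._last_result = None

        def launch_batch(self, batch, launch_done: threading.Event):
            # ... asynchronously enqueue the forward pass here ...
            self._last_result = (None, [42])  # (logits placeholder, next_token_ids)
            launch_done.set()  # signal that the launch finished

        def resolve_last_batch_result(self, launch_done: Optional[threading.Event]):
            if launch_done is not None:
                launch_done.wait()  # block until the launch completed
            return self._last_result

    worker = WorkerSketch()
    done = threading.Event()
    worker.launch_batch(batch=None, launch_done=done)
    _, next_token_ids = worker.resolve_last_batch_result(done)  # -> [42]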
@@ -15,6 +15,9 @@ class DisaggregationMode(Enum):
      DECODE = "decode"
 
 
+ FakeBootstrapHost = "2.2.2.2"
+
+
  def poll_and_all_reduce(pollers, gloo_group):
      polls = [int(poller.poll()) for poller in pollers]
      tensor_to_reduce = torch.tensor(polls, dtype=torch.uint8, device="cpu")
@@ -59,6 +62,8 @@ class KVClassType(Enum):
 
 
  def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
+     from sglang.srt.disaggregation.fake import FakeKVReceiver, FakeKVSender
+
      if transfer_backend == TransferBackend.MOONCAKE:
          from sglang.srt.disaggregation.mooncake import (
              MooncakeKVBootstrapServer,
@@ -70,7 +75,7 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
          class_mapping = {
              KVClassType.MANAGER: MooncakeKVManager,
              KVClassType.SENDER: MooncakeKVSender,
-             KVClassType.RECEIVER: MooncakeKVReceiver,
+             KVClassType.RECEIVER: (MooncakeKVReceiver),
              KVClassType.BOOTSTRAP_SERVER: MooncakeKVBootstrapServer,
          }
          return class_mapping.get(class_type)
@@ -85,10 +90,19 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
          class_mapping = {
              KVClassType.MANAGER: NixlKVManager,
              KVClassType.SENDER: NixlKVSender,
-             KVClassType.RECEIVER: NixlKVReceiver,
+             KVClassType.RECEIVER: (NixlKVReceiver),
              KVClassType.BOOTSTRAP_SERVER: NixlKVBootstrapServer,
          }
          return class_mapping.get(class_type)
+     if transfer_backend == TransferBackend.FAKE:
+         from sglang.srt.disaggregation.fake import FakeKVReceiver, FakeKVSender
+
+         class_mapping = {
+             KVClassType.SENDER: FakeKVSender,
+             KVClassType.RECEIVER: (FakeKVReceiver),
+         }
+         return class_mapping.get(class_type)
+
      raise ValueError(f"Unsupported transfer backend: {transfer_backend}")
 
 
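A quick illustration of what the FAKE branch returns, grounded in the mapping above: the factory hands back the no-op classes for senders and receivers, and None for class types the fake backend does not define, since the lookup uses dict.get:

    from sglang.srt.disaggregation.fake import FakeKVReceiver, FakeKVSender
    from sglang.srt.disaggregation.utils import (
        KVClassType,
        TransferBackend,
        get_kv_class,
    )

    assert get_kv_class(TransferBackend.FAKE, KVClassType.SENDER) is FakeKVSender
    assert get_kv_class(TransferBackend.FAKE, KVClassType.RECEIVER) is FakeKVReceiver
    # The fake mapping defines no manager or bootstrap server, so .get() -> None.
    assert get_kv_class(TransferBackend.FAKE, KVClassType.BOOTSTRAP_SERVER) is None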
@@ -66,6 +66,7 @@ from sglang.srt.utils import (
      assert_pkg_version,
      configure_logger,
      get_zmq_socket,
+     is_cuda,
      kill_process_tree,
      launch_dummy_health_check_server,
      maybe_set_triton_cache_manager,
@@ -78,6 +79,8 @@ from sglang.version import __version__
  logger = logging.getLogger(__name__)
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
+ _is_cuda = is_cuda()
+
 
  class Engine(EngineBase):
      """
@@ -452,6 +455,12 @@ def _set_envs_and_config(server_args: ServerArgs):
              "reinstall the latest version by following the instructions "
              "at https://docs.flashinfer.ai/installation.html.",
          )
+     if _is_cuda:
+         assert_pkg_version(
+             "sgl-kernel",
+             "0.1.0",
+             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
+         )
 
      def sigchld_handler(signum, frame):
          pid, exitcode = os.waitpid(0, os.WNOHANG)
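For readers unfamiliar with assert_pkg_version, a rough functional equivalent of this guard is sketched below using importlib.metadata and packaging; the real helper lives in sglang.srt.utils, and its exact comparison and error text may differ:

    # Hedged re-implementation of the sgl-kernel version guard; illustrative only.
    from importlib.metadata import PackageNotFoundError, version
    from packaging.version import Version

    def check_sgl_kernel(min_version: str = "0.1.0") -> None:
        try:
            installed = version("sgl-kernel")
        except PackageNotFoundError:
            raise RuntimeError(
                "sgl-kernel is not installed; "
                "`pip install sgl-kernel --force-reinstall` to fix."
            )
        if Version(installed) < Version(min_version):
            raise RuntimeError(
                f"sgl-kernel {installed} is older than the required {min_version}."
            )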
@@ -42,6 +42,7 @@ from fastapi import FastAPI, File, Form, Request, UploadFile
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
+ from sglang.srt.disaggregation.utils import FakeBootstrapHost
  from sglang.srt.entrypoints.engine import _launch_subprocesses
  from sglang.srt.function_call_parser import FunctionCallParser
  from sglang.srt.managers.io_struct import (
@@ -821,8 +822,32 @@ def _wait_and_warmup(
              )
              assert res.status_code == 200, f"{res}"
          else:
-             # Warmup request currently hangs in disaggregation mode, so we skip it.
-             logger.info("Skipping warmup request in disaggregation mode")
+             logger.info(f"Start of prefill warmup ...")
+             json_data = {
+                 "sampling_params": {
+                     "temperature": 0.0,
+                     "max_new_tokens": 8,
+                     "ignore_eos": True,
+                 },
+                 "bootstrap_host": [FakeBootstrapHost] * server_args.dp_size,
+                 # This is a hack to ensure fake transfer is enabled during prefill warmup
+                 # ensure each dp rank has a unique bootstrap_room during prefill warmup
+                 "bootstrap_room": [
+                     i * (2**63 // server_args.dp_size) + (i % server_args.tp_size)
+                     for i in range(server_args.dp_size)
+                 ],
+                 "input_ids": [[0, 1, 2, 3]] * server_args.dp_size,
+             }
+             res = requests.post(
+                 url + request_name,
+                 json=json_data,
+                 headers=headers,
+                 timeout=1800,  # because of deep gemm precache is very long if not precache.
+             )
+             logger.info(
+                 f"End of prefill warmup with status {res.status_code}, resp: {res.json()}"
+             )
+
      except Exception:
          last_traceback = get_exception_traceback()
          if pipe_finish_writer is not None:
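The bootstrap_room list above gives every data-parallel rank a distinct room id: rank i is offset by i * (2**63 // dp_size), with a small tp-dependent term added. A worked example with assumed sizes dp_size=4 and tp_size=2 (the real values come from server_args):

    # Worked example of the warmup bootstrap_room computation.
    dp_size, tp_size = 4, 2  # assumed example values
    rooms = [i * (2**63 // dp_size) + (i % tp_size) for i in range(dp_size)]
    print(rooms)
    # [0, 2305843009213693953, 4611686018427387904, 6917529027641081857]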
@@ -0,0 +1,278 @@
+ from __future__ import annotations
+
+ """
+ Support attention backend for Cutlass MLA.
+
+ """
+
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Optional, Union
+
+ import torch
+ import triton
+
+ from sglang.global_config import global_config
+ from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+ from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
+ from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
+ from sglang.srt.layers.dp_attention import get_attention_tp_size
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+ from sglang.srt.utils import is_cuda
+
+ if TYPE_CHECKING:
+     from sglang.srt.layers.radix_attention import RadixAttention
+     from sglang.srt.model_executor.model_runner import ModelRunner
+     from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+     from sglang.srt.speculative.spec_info import SpecInfo
+
+ _is_cuda = is_cuda()
+ if _is_cuda:
+     from sgl_kernel import cutlass_mla_decode, cutlass_mla_get_workspace_size
+
+
+ # Cutlass MLA only supports pagesize=128
+ PAGE_SIZE = 128
+
+
+ @dataclass
+ class CutlassMLADecodeMetadata:
+     workspace: Optional[torch.Tensor] = None
+     block_kv_indices: Optional[torch.Tensor] = None
+
+     def __init__(
+         self,
+         workspace: Optional[torch.Tensor] = None,
+         block_kv_indices: Optional[torch.Tensor] = None,
+     ):
+         self.workspace = workspace
+         self.block_kv_indices = block_kv_indices
+
+
+ class CutlassMLABackend(FlashInferMLAAttnBackend):
+     """Cutlass attention kernels."""
+
+     def __init__(
+         self,
+         model_runner: ModelRunner,
+         skip_prefill: bool = False,
+         kv_indptr_buf: Optional[torch.Tensor] = None,
+         kv_last_page_len_buf: Optional[torch.Tensor] = None,
+     ):
+         super().__init__(
+             model_runner, skip_prefill, kv_indptr_buf, kv_last_page_len_buf
+         )
+
+         self.num_q_heads = (
+             model_runner.model_config.num_attention_heads // get_attention_tp_size()
+         )
+         self.num_kv_heads = model_runner.model_config.get_num_kv_heads(
+             get_attention_tp_size()
+         )
+         self.req_to_token = model_runner.req_to_token_pool.req_to_token
+         self.num_local_heads = (
+             model_runner.model_config.num_attention_heads // get_attention_tp_size()
+         )
+         self.forward_metadata: Union[CutlassMLADecodeMetadata] = None
+         self.kv_lora_rank = model_runner.model_config.kv_lora_rank
+         self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
+         self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+         self.v_head_dim = model_runner.model_config.v_head_dim
+         self.scaling = model_runner.model_config.scaling
+         self.data_type = model_runner.kv_cache_dtype
+         self.q_data_type = model_runner.dtype
+         self.kv_cache_dim = self.kv_lora_rank + self.qk_rope_head_dim
+
+     def init_forward_metadata(self, forward_batch: ForwardBatch):
+
+         bs = forward_batch.batch_size
+         spec_info = forward_batch.spec_info
+         if forward_batch.forward_mode.is_decode_or_idle():
+             if spec_info is None:
+                 max_seqlen_pad = triton.cdiv(
+                     forward_batch.seq_lens_cpu.max().item(), PAGE_SIZE
+                 )
+                 block_kv_indices = torch.full(
+                     (bs, max_seqlen_pad),
+                     -1,
+                     dtype=torch.int32,
+                     device=forward_batch.seq_lens.device,
+                 )
+                 create_flashmla_kv_indices_triton[(bs,)](
+                     self.req_to_token,
+                     forward_batch.req_pool_indices,
+                     forward_batch.seq_lens,
+                     None,
+                     block_kv_indices,
+                     self.req_to_token.stride(0),
+                     max_seqlen_pad,
+                     PAGE_SIZE,
+                 )
+                 workspace_size = cutlass_mla_get_workspace_size(
+                     max_seqlen_pad * PAGE_SIZE, bs
+                 )
+                 workspace = torch.empty(
+                     workspace_size, device="cuda", dtype=torch.uint8
+                 )
+                 self.forward_metadata = CutlassMLADecodeMetadata(
+                     workspace,
+                     block_kv_indices,
+                 )
+             else:
+                 super().init_forward_metadata(forward_batch)
+         else:
+             super().init_forward_metadata(forward_batch)
+
+     def init_cuda_graph_state(
+         self,
+         max_bs: int,
+         block_kv_indices: Optional[torch.Tensor] = None,
+     ):
+         if block_kv_indices is None:
+             cuda_graph_kv_indices = torch.full(
+                 (max_bs, (self.max_context_len + PAGE_SIZE) // PAGE_SIZE),
+                 1,
+                 dtype=torch.int32,
+                 device="cuda",
+             )
+         else:
+             cuda_graph_kv_indices = block_kv_indices
+
+         workspace_size = cutlass_mla_get_workspace_size(
+             cuda_graph_kv_indices.shape[1] * PAGE_SIZE, max_bs
+         )
+         self.cuda_graph_mla_workspace = torch.empty(
+             workspace_size, device="cuda", dtype=torch.uint8
+         )
+         self.cuda_graph_kv_indices = cuda_graph_kv_indices
+
+     def init_forward_metadata_capture_cuda_graph(
+         self,
+         bs: int,
+         num_tokens: int,
+         req_pool_indices: torch.Tensor,
+         seq_lens: torch.Tensor,
+         encoder_lens: Optional[torch.Tensor],
+         forward_mode: ForwardMode,
+         spec_info: Optional[SpecInfo],
+     ):
+         if forward_mode.is_decode_or_idle():
+             if spec_info is None:
+                 max_seqlen_pad = triton.cdiv(seq_lens.max().item(), PAGE_SIZE)
+
+                 create_flashmla_kv_indices_triton[(bs,)](
+                     self.req_to_token,
+                     req_pool_indices,
+                     seq_lens,
+                     None,
+                     self.cuda_graph_kv_indices,
+                     self.req_to_token.stride(0),
+                     self.cuda_graph_kv_indices.stride(0),
+                     PAGE_SIZE,
+                 )
+                 workspace_size = cutlass_mla_get_workspace_size(
+                     max_seqlen_pad * PAGE_SIZE, bs
+                 )
+                 self.cuda_graph_mla_workspace = torch.empty(
+                     workspace_size, device="cuda", dtype=torch.uint8
+                 )
+                 self.forward_metadata = CutlassMLADecodeMetadata(
+                     self.cuda_graph_mla_workspace,
+                     self.cuda_graph_kv_indices[:bs, :max_seqlen_pad],
+                 )
+             else:
+                 super().init_forward_metadata_capture_cuda_graph(
+                     bs,
+                     num_tokens,
+                     req_pool_indices,
+                     seq_lens,
+                     encoder_lens,
+                     forward_mode,
+                     spec_info,
+                 )
+
+     def init_forward_metadata_replay_cuda_graph(
+         self,
+         bs: int,
+         req_pool_indices: torch.Tensor,
+         seq_lens: torch.Tensor,
+         seq_lens_sum: int,
+         encoder_lens: Optional[torch.Tensor],
+         forward_mode: ForwardMode,
+         spec_info: Optional[SpecInfo],
+         seq_lens_cpu: Optional[torch.Tensor],
+     ):
+
+         if forward_mode.is_decode_or_idle():
+             assert seq_lens_cpu is not None
+             seq_lens = seq_lens[:bs]
+             seq_lens_cpu = seq_lens_cpu[:bs]
+             max_seqlen_pad = triton.cdiv(seq_lens_cpu.max().item(), PAGE_SIZE)
+             create_flashmla_kv_indices_triton[(bs,)](
+                 self.req_to_token,
+                 req_pool_indices[:bs],
+                 seq_lens,
+                 None,
+                 self.cuda_graph_kv_indices,
+                 self.req_to_token.stride(0),
+                 self.cuda_graph_kv_indices.stride(0),
+                 PAGE_SIZE,
+             )
+             workspace_size = cutlass_mla_get_workspace_size(
+                 max_seqlen_pad * PAGE_SIZE, bs
+             )
+             self.cuda_graph_mla_workspace = torch.empty(
+                 workspace_size, device="cuda", dtype=torch.uint8
+             )
+             self.forward_metadata.workspace = self.cuda_graph_mla_workspace
+             self.forward_metadata.block_kv_indices = self.cuda_graph_kv_indices[
+                 :bs, :max_seqlen_pad
+             ]
+         else:
+             super().init_forward_metadata_replay_cuda_graph(
+                 bs,
+                 req_pool_indices,
+                 seq_lens,
+                 seq_lens_sum,
+                 encoder_lens,
+                 forward_mode,
+                 spec_info,
+                 seq_lens_cpu,
+             )
+
+     def get_cuda_graph_seq_len_fill_value(self):
+         return 1
+
+     def forward_decode(
+         self,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         layer: RadixAttention,
+         forward_batch: ForwardBatch,
+         save_kv_cache: bool = True,
+     ):
+         cache_loc = forward_batch.out_cache_loc
+
+         if k is not None:
+             assert v is not None
+             if save_kv_cache:
+                 forward_batch.token_to_kv_pool.set_kv_buffer(
+                     layer,
+                     cache_loc,
+                     k,
+                     v,
+                 )
+         bs = forward_batch.batch_size
+         k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+
+         reshape_q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+
+         o = cutlass_mla_decode(
+             q_nope_and_q_pe=reshape_q,
+             kv_c_and_k_pe_cache=k_cache.view(-1, PAGE_SIZE, self.kv_cache_dim),
+             seq_lens=forward_batch.seq_lens.to(torch.int32),
+             page_table=self.forward_metadata.block_kv_indices,
+             workspace=self.forward_metadata.workspace,
+         )
+
+         return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
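The decode path above sizes both the block table and the Cutlass workspace from the longest sequence in the batch, rounded up to whole 128-token pages. A pure-Python mirror of that sizing arithmetic (math.ceil stands in for triton.cdiv; block_table_shape is a hypothetical helper name):

    # Pure-Python mirror of the block-table sizing in init_forward_metadata.
    import math

    PAGE_SIZE = 128  # Cutlass MLA only supports this page size

    def block_table_shape(batch_size: int, max_seq_len: int) -> tuple:
        max_seqlen_pad = math.ceil(max_seq_len / PAGE_SIZE)  # == triton.cdiv
        return (batch_size, max_seqlen_pad)

    assert block_table_shape(4, 1000) == (4, 8)  # 1000 tokens -> 8 pages
    assert block_table_shape(4, 1024) == (4, 8)  # exact multiple of 128
    assert block_table_shape(4, 1025) == (4, 9)  # one token over -> 9 pages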
@@ -49,8 +49,8 @@ def create_flashmla_kv_indices_triton(
      kv_indices_ptr,
      req_to_token_ptr_stride: tl.constexpr,
      kv_indices_ptr_stride: tl.constexpr,
+     PAGED_SIZE: tl.constexpr = 64,
  ):
-     PAGED_SIZE: tl.constexpr = 64
      BLOCK_SIZE: tl.constexpr = 4096
      NUM_PAGE_PER_BLOCK: tl.constexpr = 64
      pid = tl.program_id(axis=0)
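Promoting PAGED_SIZE to a tl.constexpr parameter (with the old hard-coded 64 as its default) is what lets the Cutlass backend above call this kernel with PAGE_SIZE=128 while existing call sites stay unchanged. Assuming the kernel fills each row of the block table by sampling every PAGED_SIZE-th entry of req_to_token and converting token slots to page ids (an assumption based on how the outputs are consumed, not a verified reading of the kernel body), a pure-Python reference for one request's row could look like:

    # Hedged pure-Python reference for one block-table row; assumes pages are
    # PAGED_SIZE-aligned and req_to_token_row holds per-token KV slot ids.
    def build_block_table_row(req_to_token_row, seq_len, paged_size=64):
        num_pages = -(-seq_len // paged_size)  # ceil division
        return [
            req_to_token_row[page * paged_size] // paged_size
            for page in range(num_pages)
        ]

    # With 64-token pages, a 130-token request spans 3 pages:
    row = list(range(256, 256 + 130))  # contiguous slots starting at slot 256
    assert build_block_table_row(row, 130) == [4, 5, 6]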