sglang-0.4.9.tar.gz → sglang-0.4.9.post1.tar.gz

This diff represents the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
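For readers who want to verify this listing independently, here is a minimal sketch, using only the Python standard library and the public PyPI JSON API (https://pypi.org/pypi/<project>/<version>/json), that downloads both sdists and reports which files were added, removed, or changed in size. Note the assumption: comparing member sizes is a coarse stand-in for the line-level +/- counts shown below, since a file can change content without changing size.

import io
import json
import tarfile
import urllib.request


def sdist_members(project: str, version: str) -> dict[str, int]:
    """Download the sdist for project==version and map member path -> size."""
    meta_url = f"https://pypi.org/pypi/{project}/{version}/json"
    with urllib.request.urlopen(meta_url) as resp:
        meta = json.load(resp)
    # Pick the source distribution among the release's download URLs.
    sdist = next(u for u in meta["urls"] if u["packagetype"] == "sdist")
    with urllib.request.urlopen(sdist["url"]) as resp:
        data = resp.read()
    with tarfile.open(fileobj=io.BytesIO(data), mode="r:gz") as tar:
        # Strip the leading "sglang-<version>/" component so paths compare
        # across the two archives.
        return {
            m.name.split("/", 1)[1]: m.size
            for m in tar.getmembers()
            if m.isfile() and "/" in m.name
        }


old = sdist_members("sglang", "0.4.9")
new = sdist_members("sglang", "0.4.9.post1")

added = sorted(set(new) - set(old))
removed = sorted(set(old) - set(new))
size_changed = sorted(p for p in set(old) & set(new) if old[p] != new[p])

print(f"added {len(added)}, removed {len(removed)}, size-changed {len(size_changed)}")
for path in added:
    print("A", path)
for path in removed:
    print("R", path)

Running this should reproduce, for example, the new files in 0.4.9.post1 listed below (cutlass_w4a8_moe.py, triton_kernels_moe.py, w4afp8.py, test_cutlass_w4a8_moe.py) and the removal/re-addition of sglang/srt/models/mllama4.py and sglang/version.py.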
Files changed (744)
  1. {sglang-0.4.9/sglang.egg-info → sglang-0.4.9.post1}/PKG-INFO +4 -3
  2. {sglang-0.4.9 → sglang-0.4.9.post1}/README.md +1 -1
  3. {sglang-0.4.9 → sglang-0.4.9.post1}/pyproject.toml +3 -2
  4. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/bench_serving.py +2 -2
  5. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/model_config.py +12 -1
  6. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/conversation.py +35 -1
  7. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/mooncake/conn.py +35 -4
  8. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/http_server_engine.py +1 -1
  9. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/communicator.py +3 -1
  10. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/flashinfer_comm_fusion.py +3 -3
  11. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/layernorm.py +2 -2
  12. sglang-0.4.9.post1/sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
  13. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +58 -0
  14. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/ep_moe/layer.py +140 -2
  15. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -0
  16. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +135 -58
  17. sglang-0.4.9.post1/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
  18. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/__init__.py +2 -0
  19. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/fp8.py +28 -7
  20. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/modelopt_quant.py +244 -1
  21. sglang-0.4.9.post1/sglang/srt/layers/quantization/w4afp8.py +264 -0
  22. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/vocab_parallel_embedding.py +9 -3
  23. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
  24. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
  25. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
  26. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
  27. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/cache_controller.py +41 -195
  28. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/io_struct.py +8 -1
  29. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/mm_utils.py +4 -2
  30. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/schedule_batch.py +1 -1
  31. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/scheduler.py +17 -5
  32. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/mem_cache/hiradix_cache.py +2 -0
  33. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/mem_cache/memory_pool.py +113 -63
  34. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/mem_cache/memory_pool_host.py +6 -109
  35. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/mem_cache/radix_cache.py +8 -4
  36. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/deepseek_v2.py +16 -2
  37. sglang-0.4.9.post1/sglang/srt/models/mllama4.py +540 -0
  38. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/mm_utils.py +2 -2
  39. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/mllama4.py +62 -60
  40. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/server_args.py +15 -0
  41. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/two_batch_overlap.py +3 -0
  42. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/utils.py +37 -17
  43. sglang-0.4.9.post1/sglang/test/test_cutlass_w4a8_moe.py +281 -0
  44. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/utils.py +5 -5
  45. sglang-0.4.9.post1/sglang/version.py +1 -0
  46. {sglang-0.4.9 → sglang-0.4.9.post1/sglang.egg-info}/PKG-INFO +4 -3
  47. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang.egg-info/SOURCES.txt +4 -0
  48. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang.egg-info/requires.txt +2 -1
  49. sglang-0.4.9/sglang/srt/models/mllama4.py +0 -259
  50. sglang-0.4.9/sglang/version.py +0 -1
  51. {sglang-0.4.9 → sglang-0.4.9.post1}/LICENSE +0 -0
  52. {sglang-0.4.9 → sglang-0.4.9.post1}/setup.cfg +0 -0
  53. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/__init__.py +0 -0
  54. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/api.py +0 -0
  55. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/bench_offline_throughput.py +0 -0
  56. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/bench_one_batch.py +0 -0
  57. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/bench_one_batch_server.py +0 -0
  58. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/check_env.py +0 -0
  59. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/compile_deep_gemm.py +0 -0
  60. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/eval/llama3_eval.py +0 -0
  61. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/eval/loogle_eval.py +0 -0
  62. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/global_config.py +0 -0
  63. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/lang/backend/__init__.py +0 -0
  64. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/lang/backend/anthropic.py +0 -0
  65. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/lang/backend/base_backend.py +0 -0
  66. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/lang/backend/litellm.py +0 -0
  67. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/lang/backend/openai.py +0 -0
  68. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  69. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/lang/backend/vertexai.py +0 -0
  70. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/lang/chat_template.py +0 -0
  71. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/lang/choices.py +0 -0
  72. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/lang/compiler.py +0 -0
  73. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/lang/interpreter.py +0 -0
  74. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/lang/ir.py +0 -0
  75. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/lang/tracer.py +0 -0
  76. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/launch_server.py +0 -0
  77. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/profiler.py +0 -0
  78. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/_custom_ops.py +0 -0
  79. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/aio_rwlock.py +0 -0
  80. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/code_completion_parser.py +0 -0
  81. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/__init__.py +0 -0
  82. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/chatglm.py +0 -0
  83. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/dbrx.py +0 -0
  84. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/deepseekvl2.py +0 -0
  85. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/device_config.py +0 -0
  86. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/exaone.py +0 -0
  87. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/internvl.py +0 -0
  88. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/janus_pro.py +0 -0
  89. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/kimi_vl.py +0 -0
  90. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/kimi_vl_moonvit.py +0 -0
  91. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/load_config.py +0 -0
  92. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/update_config.py +0 -0
  93. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/configs/utils.py +0 -0
  94. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/connector/__init__.py +0 -0
  95. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/connector/base_connector.py +0 -0
  96. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/connector/redis.py +0 -0
  97. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/connector/s3.py +0 -0
  98. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/connector/serde/__init__.py +0 -0
  99. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/connector/serde/safe_serde.py +0 -0
  100. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/connector/serde/serde.py +0 -0
  101. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/connector/utils.py +0 -0
  102. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/constants.py +0 -0
  103. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  104. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/constrained/llguidance_backend.py +0 -0
  105. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/constrained/outlines_backend.py +0 -0
  106. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  107. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
  108. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
  109. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  110. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/custom_op.py +0 -0
  111. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/debug_utils.py +0 -0
  112. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/base/__init__.py +0 -0
  113. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/base/conn.py +0 -0
  114. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/common/__init__.py +0 -0
  115. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/common/conn.py +0 -0
  116. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/common/utils.py +0 -0
  117. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/decode.py +0 -0
  118. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/decode_schedule_batch_mixin.py +0 -0
  119. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/fake/__init__.py +0 -0
  120. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/fake/conn.py +0 -0
  121. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/kv_events.py +0 -0
  122. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/launch_lb.py +0 -0
  123. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/mini_lb.py +0 -0
  124. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
  125. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/mooncake/transfer_engine.py +0 -0
  126. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
  127. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/nixl/conn.py +0 -0
  128. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/prefill.py +0 -0
  129. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/disaggregation/utils.py +0 -0
  130. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/__init__.py +0 -0
  131. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/communication_op.py +0 -0
  132. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  133. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  134. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  135. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  136. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/device_communicators/npu_communicator.py +0 -0
  137. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/device_communicators/pymscclpp.py +0 -0
  138. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  139. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  140. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  141. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  142. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/parallel_state.py +0 -0
  143. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/distributed/utils.py +0 -0
  144. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/EngineBase.py +0 -0
  145. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/engine.py +0 -0
  146. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/http_server.py +0 -0
  147. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/openai/__init__.py +0 -0
  148. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/openai/protocol.py +0 -0
  149. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/openai/serving_base.py +0 -0
  150. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/openai/serving_chat.py +0 -0
  151. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/openai/serving_completions.py +0 -0
  152. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/openai/serving_embedding.py +0 -0
  153. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/openai/serving_rerank.py +0 -0
  154. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/openai/serving_score.py +0 -0
  155. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/openai/usage_processor.py +0 -0
  156. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/entrypoints/openai/utils.py +0 -0
  157. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/eplb/__init__.py +0 -0
  158. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/eplb/eplb_algorithms/__init__.py +0 -0
  159. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -0
  160. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py +0 -0
  161. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/eplb/eplb_manager.py +0 -0
  162. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/eplb/eplb_simulator/__init__.py +0 -0
  163. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/eplb/eplb_simulator/reader.py +0 -0
  164. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/eplb/expert_distribution.py +0 -0
  165. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/eplb/expert_location.py +0 -0
  166. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/eplb/expert_location_dispatch.py +0 -0
  167. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/eplb/expert_location_updater.py +0 -0
  168. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/function_call/base_format_detector.py +0 -0
  169. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/function_call/core_types.py +0 -0
  170. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/function_call/deepseekv3_detector.py +0 -0
  171. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/function_call/ebnf_composer.py +0 -0
  172. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/function_call/function_call_parser.py +0 -0
  173. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/function_call/llama32_detector.py +0 -0
  174. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/function_call/mistral_detector.py +0 -0
  175. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/function_call/pythonic_detector.py +0 -0
  176. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/function_call/qwen25_detector.py +0 -0
  177. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/function_call/utils.py +0 -0
  178. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/hf_transformers_utils.py +0 -0
  179. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/jinja_template_utils.py +0 -0
  180. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/activation.py +0 -0
  181. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/amx_utils.py +0 -0
  182. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/aiter_backend.py +0 -0
  183. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/ascend_backend.py +0 -0
  184. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
  185. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/cutlass_mla_backend.py +0 -0
  186. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  187. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/flashattention_backend.py +0 -0
  188. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
  189. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
  190. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
  191. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/intel_amx_backend.py +0 -0
  192. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/merge_state.py +0 -0
  193. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/tbo_backend.py +0 -0
  194. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  195. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
  196. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  197. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  198. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  199. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/triton_ops/merge_state.py +0 -0
  200. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  201. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
  202. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/utils.py +0 -0
  203. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/attention/vision.py +0 -0
  204. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/dp_attention.py +0 -0
  205. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/elementwise.py +0 -0
  206. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/linear.py +0 -0
  207. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/logits_processor.py +0 -0
  208. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/cutlass_moe.py +0 -0
  209. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/cutlass_moe_params.py +0 -0
  210. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  211. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +0 -0
  212. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  213. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  214. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  215. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  216. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  217. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  218. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  219. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  220. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  221. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  222. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  223. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  224. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  225. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  226. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  227. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
  228. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  229. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  230. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  231. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  232. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  233. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  234. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  235. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  236. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  237. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  238. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  239. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  240. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  241. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  242. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  243. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  244. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  245. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  246. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  247. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  248. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  249. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  250. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  251. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  252. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  253. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  254. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  255. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  256. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
  257. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  258. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  259. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  260. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  261. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  262. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  263. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  264. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  265. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  266. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
  267. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
  268. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  269. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  270. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  271. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  272. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  273. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  274. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  275. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  276. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  277. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  278. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  279. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  280. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  281. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  282. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  283. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  284. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  285. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  286. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  287. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  288. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  289. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  290. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  291. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  292. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  293. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  294. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  295. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  296. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  297. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  298. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  299. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  300. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  301. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  302. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  303. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  304. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  305. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  306. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  307. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  308. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  309. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  310. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  311. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  312. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  313. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  314. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  315. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  316. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  317. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  318. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  319. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  320. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  321. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  322. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  323. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  324. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  325. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  326. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  327. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  328. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  329. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  330. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  331. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  332. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  333. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  334. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  335. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  336. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  337. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  338. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  339. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  340. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  341. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  342. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  343. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json +0 -0
  344. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json +0 -0
  345. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  346. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  347. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json +0 -0
  348. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  349. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json +0 -0
  350. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  351. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  352. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  353. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  354. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json +0 -0
  355. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  356. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json +0 -0
  357. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json +0 -0
  358. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  359. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  360. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  361. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  362. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  363. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  364. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  365. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  366. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  367. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  368. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  369. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  370. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  371. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  372. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  373. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  374. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  375. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  376. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/router.py +0 -0
  377. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/moe/topk.py +0 -0
  378. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/multimodal.py +0 -0
  379. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/parameter.py +0 -0
  380. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/pooler.py +0 -0
  381. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/awq.py +0 -0
  382. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  383. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
  384. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  385. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
  386. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
  387. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
  388. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
  389. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +0 -0
  390. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
  391. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
  392. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  393. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  394. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  395. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  396. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  397. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  398. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  399. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  400. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  401. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  402. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  403. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  404. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  405. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  406. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  407. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  408. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  409. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  410. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  411. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  412. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  413. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  414. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  415. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  416. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  417. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  418. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  420. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  421. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  422. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  423. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  424. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  425. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  426. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  427. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  428. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  429. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  430. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  431. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  432. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  433. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  434. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  435. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  436. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  437. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  438. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  439. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  440. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  441. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  442. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  443. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  444. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  445. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  446. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  447. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  448. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  449. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  450. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  451. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  452. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  453. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  454. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  455. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  456. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  457. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  458. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  459. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  460. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  461. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  462. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  463. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  464. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  465. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  466. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  467. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  468. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  469. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  470. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  471. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  472. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  473. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  474. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  475. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  476. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  477. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  478. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  479. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  480. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  481. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  482. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  483. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  484. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  485. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  486. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  487. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  488. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  489. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  490. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  491. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  492. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  493. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  494. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  495. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  496. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  497. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  498. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  499. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  500. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  501. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  502. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  503. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  504. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  505. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  506. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  507. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  508. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  509. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  510. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  511. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  512. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  513. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  514. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  515. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  516. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  517. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  518. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  519. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  520. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  521. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  522. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  523. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  524. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  525. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  526. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  527. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  528. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  529. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  530. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  531. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  532. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  533. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  534. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  535. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  536. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  537. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  538. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  539. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  540. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  541. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  542. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  543. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  544. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +0 -0
  545. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +0 -0
  546. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -0
  547. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +0 -0
  548. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
  549. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
  550. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/gptq.py +0 -0
  551. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  552. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  553. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/kv_cache.py +0 -0
  554. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/moe_wna16.py +0 -0
  555. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/qoq.py +0 -0
  556. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/quant_utils.py +0 -0
  557. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/utils.py +0 -0
  558. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
  559. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
  560. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/radix_attention.py +0 -0
  561. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/rotary_embedding.py +0 -0
  562. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/sampler.py +0 -0
  563. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/torchao_utils.py +0 -0
  564. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/layers/utils.py +0 -0
  565. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/backend/base_backend.py +0 -0
  566. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
  567. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/backend/triton_backend.py +0 -0
  568. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/layers.py +0 -0
  569. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/lora.py +0 -0
  570. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/lora_config.py +0 -0
  571. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/lora_manager.py +0 -0
  572. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/mem_pool.py +0 -0
  573. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  574. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/lora/utils.py +0 -0
  575. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/configure_logging.py +0 -0
  576. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  577. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  578. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/multimodal_processor.py +0 -0
  579. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -0
  580. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/schedule_policy.py +0 -0
  581. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/scheduler_output_processor_mixin.py +0 -0
  582. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/session_controller.py +0 -0
  583. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/template_manager.py +0 -0
  584. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/tokenizer_manager.py +0 -0
  585. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/tp_worker.py +0 -0
  586. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  587. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/managers/utils.py +0 -0
  588. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/mem_cache/allocator.py +0 -0
  589. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  590. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  591. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  592. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/mem_cache/multimodal_cache.py +0 -0
  593. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/metrics/collector.py +0 -0
  594. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/metrics/func_timer.py +0 -0
  595. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
  596. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  597. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/model_executor/model_runner.py +0 -0
  598. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/model_loader/__init__.py +0 -0
  599. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/model_loader/loader.py +0 -0
  600. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/model_loader/utils.py +0 -0
  601. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/model_loader/weight_utils.py +0 -0
  602. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/model_parallel.py +0 -0
  603. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/baichuan.py +0 -0
  604. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/bert.py +0 -0
  605. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/chatglm.py +0 -0
  606. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/clip.py +0 -0
  607. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/commandr.py +0 -0
  608. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/dbrx.py +0 -0
  609. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/deepseek.py +0 -0
  610. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/deepseek_janus_pro.py +0 -0
  611. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/deepseek_nextn.py +0 -0
  612. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/deepseek_vl2.py +0 -0
  613. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/exaone.py +0 -0
  614. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/gemma.py +0 -0
  615. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/gemma2.py +0 -0
  616. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/gemma2_reward.py +0 -0
  617. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/gemma3_causal.py +0 -0
  618. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/gemma3_mm.py +0 -0
  619. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/gemma3n_audio.py +0 -0
  620. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/gemma3n_causal.py +0 -0
  621. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/gemma3n_mm.py +0 -0
  622. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/glm4.py +0 -0
  623. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/gpt2.py +0 -0
  624. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
  625. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/granite.py +0 -0
  626. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/grok.py +0 -0
  627. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/hunyuan.py +0 -0
  628. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/idefics2.py +0 -0
  629. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/internlm2.py +0 -0
  630. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/internlm2_reward.py +0 -0
  631. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/internvl.py +0 -0
  632. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/kimi_vl.py +0 -0
  633. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/kimi_vl_moonvit.py +0 -0
  634. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/llama.py +0 -0
  635. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/llama4.py +0 -0
  636. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/llama_classification.py +0 -0
  637. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/llama_eagle.py +0 -0
  638. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/llama_eagle3.py +0 -0
  639. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/llama_embedding.py +0 -0
  640. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/llama_reward.py +0 -0
  641. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/llava.py +0 -0
  642. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/llavavid.py +0 -0
  643. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/mimo.py +0 -0
  644. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/mimo_mtp.py +0 -0
  645. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/minicpm.py +0 -0
  646. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/minicpm3.py +0 -0
  647. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/minicpmo.py +0 -0
  648. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/minicpmv.py +0 -0
  649. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/mistral.py +0 -0
  650. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/mixtral.py +0 -0
  651. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/mixtral_quant.py +0 -0
  652. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/mllama.py +0 -0
  653. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/olmo.py +0 -0
  654. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/olmo2.py +0 -0
  655. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/olmoe.py +0 -0
  656. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/phi3_small.py +0 -0
  657. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/phi4mm.py +0 -0
  658. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/pixtral.py +0 -0
  659. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/qwen.py +0 -0
  660. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/qwen2.py +0 -0
  661. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/qwen2_5_vl.py +0 -0
  662. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/qwen2_audio.py +0 -0
  663. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/qwen2_classification.py +0 -0
  664. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/qwen2_eagle.py +0 -0
  665. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/qwen2_moe.py +0 -0
  666. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/qwen2_rm.py +0 -0
  667. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/qwen2_vl.py +0 -0
  668. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/qwen3.py +0 -0
  669. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/qwen3_moe.py +0 -0
  670. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/registry.py +0 -0
  671. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/roberta.py +0 -0
  672. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/siglip.py +0 -0
  673. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/stablelm.py +0 -0
  674. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/torch_native_llama.py +0 -0
  675. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/transformers.py +0 -0
  676. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/vila.py +0 -0
  677. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/xverse.py +0 -0
  678. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/xverse_moe.py +0 -0
  679. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/models/yivl.py +0 -0
  680. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/base_processor.py +0 -0
  681. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/clip.py +0 -0
  682. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -0
  683. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/gemma3.py +0 -0
  684. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/gemma3n.py +0 -0
  685. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/internvl.py +0 -0
  686. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/janus_pro.py +0 -0
  687. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/kimi_vl.py +0 -0
  688. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/llava.py +0 -0
  689. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/minicpm.py +0 -0
  690. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/mlama.py +0 -0
  691. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/phi4mm.py +0 -0
  692. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/pixtral.py +0 -0
  693. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/qwen_vl.py +0 -0
  694. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/multimodal/processors/vila.py +0 -0
  695. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/operations.py +0 -0
  696. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/operations_strategy.py +0 -0
  697. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/patch_torch.py +0 -0
  698. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/reasoning_parser.py +0 -0
  699. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  700. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  701. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
  702. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  703. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  704. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
  705. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  706. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/sampling/sampling_params.py +0 -0
  707. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  708. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
  709. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +0 -0
  710. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/speculative/eagle_utils.py +0 -0
  711. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/speculative/eagle_worker.py +0 -0
  712. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/speculative/spec_info.py +0 -0
  713. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  714. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/srt/warmup.py +0 -0
  715. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/__init__.py +0 -0
  716. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/attention/__init__.py +0 -0
  717. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/attention/test_flashattn_backend.py +0 -0
  718. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
  719. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
  720. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  721. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  722. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/run_eval.py +0 -0
  723. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/runners.py +0 -0
  724. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/send_one.py +0 -0
  725. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/simple_eval_common.py +0 -0
  726. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  727. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  728. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/simple_eval_math.py +0 -0
  729. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  730. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  731. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/test_activation.py +0 -0
  732. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/test_block_fp8.py +0 -0
  733. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -0
  734. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/test_block_fp8_ep.py +0 -0
  735. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/test_custom_ops.py +0 -0
  736. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/test_cutlass_moe.py +0 -0
  737. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/test_deepep_utils.py +0 -0
  738. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/test_dynamic_grad_mode.py +0 -0
  739. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/test_fp4_moe.py +0 -0
  740. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/test_layernorm.py +0 -0
  741. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/test_programs.py +0 -0
  742. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang/test/test_utils.py +0 -0
  743. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang.egg-info/dependency_links.txt +0 -0
  744. {sglang-0.4.9 → sglang-0.4.9.post1}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.9
+Version: 0.4.9.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -239,6 +239,7 @@ Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: pynvml; extra == "runtime-common"
+Requires-Dist: pybase64; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
@@ -248,7 +249,7 @@ Requires-Dist: transformers==4.53.0; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.20; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: sgl-kernel==0.2.4; extra == "srt"
@@ -419,7 +420,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
 
 ## Adoption and Sponsorship
-SGLang has been deployed at large scale, generating trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
+SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.
 
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
 
@@ -65,7 +65,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
 
 ## Adoption and Sponsorship
-SGLang has been deployed at large scale, generating trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
+SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.
 
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.9"
+version = "0.4.9.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -38,6 +38,7 @@ runtime_common = [
     "psutil",
     "pydantic",
     "pynvml",
+    "pybase64",
     "python-multipart",
     "pyzmq>=25.1.2",
     "soundfile==0.13.1",
@@ -47,7 +48,7 @@ runtime_common = [
     "timm==1.0.16",
     "uvicorn",
     "uvloop",
-    "xgrammar==0.1.19",
+    "xgrammar==0.1.20",
 ]
 
 srt = [
@@ -814,9 +814,9 @@ def sample_mmmu_requests(
         List of tuples (prompt, prompt_token_len, output_token_len).
     """
     try:
-        import base64
         import io
 
+        import pybase64
         from datasets import load_dataset
     except ImportError:
        raise ImportError("Please install datasets: pip install datasets")
@@ -867,7 +867,7 @@ def sample_mmmu_requests(
             # Encode image to base64
             buffered = io.BytesIO()
             image.save(buffered, format="JPEG")
-            img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+            img_str = pybase64.b64encode(buffered.getvalue()).decode("utf-8")
             image_data = f"data:image/jpeg;base64,{img_str}"
         else:
             continue
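
The hunks above swap the standard-library base64 module for pybase64, a SIMD-accelerated drop-in replacement with the same b64encode/b64decode signatures, so call sites only change the module name. A minimal sketch of the equivalence (the sample payload is illustrative):

    import base64
    import pybase64

    payload = b"example image bytes"  # e.g. a JPEG buffer from io.BytesIO

    # pybase64 mirrors the stdlib API, so the outputs are byte-identical;
    # it is simply faster on large buffers.
    assert pybase64.b64encode(payload) == base64.b64encode(payload)
    img_str = pybase64.b64encode(payload).decode("utf-8")
    image_data = f"data:image/jpeg;base64,{img_str}"
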
@@ -359,7 +359,17 @@ class ModelConfig:
         if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
             quant_cfg = modelopt_quant_config
         elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
-            quant_cfg = modelopt_quant_config
+            quant_config_file = os.path.join(
+                self.model_path, "hf_quant_config.json"
+            )
+            with open(quant_config_file) as f:
+                quant_config_dict = json.load(f)
+            json_quant_configs = quant_config_dict["quantization"]
+            quant_algo = json_quant_configs.get("quant_algo", None)
+            if quant_algo == "MIXED_PRECISION":
+                quant_cfg = {"quant_method": "w4afp8"}
+            else:
+                quant_cfg = modelopt_quant_config
         return quant_cfg
 
     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
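
The new branch above reads only the "quantization" -> "quant_algo" path of a local hf_quant_config.json before falling back to the ModelOpt config. A sketch of a file that would now be routed to the w4afp8 method; the dict shape is hypothetical beyond the one field this code inspects:

    # Hypothetical hf_quant_config.json content for a mixed-precision checkpoint.
    hf_quant_config = {"quantization": {"quant_algo": "MIXED_PRECISION"}}

    # Mirrors the branch above: MIXED_PRECISION selects w4afp8,
    # anything else keeps the ModelOpt configuration.
    quant_algo = hf_quant_config["quantization"].get("quant_algo", None)
    quant_cfg = {"quant_method": "w4afp8"} if quant_algo == "MIXED_PRECISION" else None
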
@@ -389,6 +399,7 @@ class ModelConfig:
             "w8a8_fp8",
             "moe_wna16",
             "qoq",
+            "w4afp8",
         ]
         compatible_quantization_methods = {
             "modelopt_fp4": ["modelopt"],
@@ -921,6 +921,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="mimo-vl",
+        system_message="You are MiMo, an AI assistant developed by Xiaomi.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
+
 
 register_conv_template(
     Conversation(
@@ -935,6 +948,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="llama_4_vision",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA4,
+        sep="",
+        stop_str="<|eot|>",
+        image_token="<|image|>",
+    )
+)
+
 
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
@@ -943,9 +969,11 @@ def match_internvl(model_path: str):
 
 
 @register_conv_template_matching_function
-def match_llama_3_vision(model_path: str):
+def match_llama_vision(model_path: str):
     if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE):
         return "llama_3_vision"
+    if re.search(r"llama.*4.*", model_path, re.IGNORECASE):
+        return "llama_4_vision"
 
 
 @register_conv_template_matching_function
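
The renamed matcher above now covers both Llama generations; a quick check against illustrative model paths (the repository names are examples, and the Llama 4 pattern is deliberately broad):

    import re

    def match_llama_vision(model_path: str):
        if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE):
            return "llama_3_vision"
        if re.search(r"llama.*4.*", model_path, re.IGNORECASE):
            return "llama_4_vision"

    # Illustrative paths, not an exhaustive list of matching checkpoints.
    assert match_llama_vision("meta-llama/Llama-3.2-11B-Vision") == "llama_3_vision"
    assert match_llama_vision("meta-llama/Llama-4-Scout-17B-16E") == "llama_4_vision"
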
@@ -1034,3 +1062,9 @@ def match_phi_4_mm(model_path: str):
 def match_vila(model_path: str):
     if re.search(r"vila", model_path, re.IGNORECASE):
         return "chatml"
+
+
+@register_conv_template_matching_function
+def match_mimo_vl(model_path: str):
+    if re.search(r"mimo.*vl", model_path, re.IGNORECASE):
+        return "mimo-vl"
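
To make the new mimo-vl template concrete, here is a rough hand-assembled rendering of a one-turn prompt from the fields registered above. The authoritative string is produced by the Conversation class, so the ADD_NEW_LINE_SINGLE assembly sketched here is an approximation:

    system_template = "<|im_start|>system\n{system_message}"
    system_message = "You are MiMo, an AI assistant developed by Xiaomi."
    sep = "<|im_end|>\n"

    # Each role header is followed by a newline; each message ends with the separator.
    prompt = (
        system_template.format(system_message=system_message) + sep
        + "<|im_start|>user\n" + "Describe this image." + sep
        + "<|im_start|>assistant\n"
    )
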
@@ -185,9 +185,11 @@ class MooncakeKVManager(BaseKVManager):
             threading.Thread(
                 target=self.transfer_worker, args=(queue, executor), daemon=True
             ).start()
-
-            self.bootstrap_time_out = get_int_env_var(
-                "SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 120
+            # If a timeout happens on the prefill side, it means prefill instances
+            # fail to receive the KV indices from the decode instance of this request.
+            # These timeout requests should be aborted to release the tree cache.
+            self.bootstrap_timeout = get_int_env_var(
+                "SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 300
             )
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
             self.heartbeat_failures = {}
@@ -209,6 +211,12 @@ class MooncakeKVManager(BaseKVManager):
             self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
             self.prefill_tp_size_table: Dict[str, int] = {}
             self.prefill_dp_size_table: Dict[str, int] = {}
+            # If a timeout happens on the decode side, it means decode instances
+            # fail to receive the KV Cache transfer done signal after bootstrapping.
+            # These timeout requests should be aborted to release the tree cache.
+            self.waiting_timeout = get_int_env_var(
+                "SGLANG_DISAGGREGATION_WAITING_TIMEOUT", 300
+            )
         else:
             raise ValueError(
                 f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
@@ -938,7 +946,12 @@ class MooncakeKVSender(BaseKVSender):
         if self.init_time is not None:
             now = time.time()
             elapsed = now - self.init_time
-            if elapsed >= self.kv_mgr.bootstrap_time_out:
+            if elapsed >= self.kv_mgr.bootstrap_timeout:
+                logger.warning_once(
+                    "Some requests timed out when bootstrapping, "
+                    "which means prefill instances fail to receive the KV indices from the decode instance of this request. "
+                    "If a greater mean TTFT is acceptable, you can 'export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600' (10 minutes) to relax the timeout condition. "
+                )
                 self.kv_mgr.record_failure(
                     self.bootstrap_room,
                     f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
@@ -987,6 +1000,7 @@ class MooncakeKVReceiver(BaseKVReceiver):
         self.session_id = self.kv_mgr.get_session_id()
         self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
         self.conclude_state = None
+        self.init_time = None
         self.data_parallel_rank = data_parallel_rank
 
         if self.bootstrap_addr not in self.kv_mgr.prefill_dp_size_table:
@@ -1222,14 +1236,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
                 str(self.required_dst_info_num).encode("ascii"),
             ]
         )
+        self.init_time = time.time()
 
     def poll(self) -> KVPoll:
         if self.conclude_state is None:
             status = self.kv_mgr.check_status(self.bootstrap_room)
             if status in (KVPoll.Success, KVPoll.Failed):
                 self.conclude_state = status
+            elif status == KVPoll.WaitingForInput:
+                if self.init_time is not None:
+                    now = time.time()
+                    elapsed = now - self.init_time
+                    if elapsed >= self.kv_mgr.waiting_timeout:
+                        logger.warning_once(
+                            "Some requests fail to receive KV Cache transfer done signal after bootstrapping. "
+                            "If a greater mean TTFT is acceptable, you can 'export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600' (10 minutes) to relax the timeout condition. "
+                        )
+                        self.kv_mgr.record_failure(
+                            self.bootstrap_room,
+                            f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.WaitingForInput",
+                        )
+                        self.conclude_state = KVPoll.Failed
+                        return KVPoll.Failed
 
             return status
+
         else:
             return self.conclude_state
 
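Both timeouts introduced above are read through get_int_env_var at manager construction, so they can be relaxed without code changes. A minimal sketch, using the 600-second value suggested by the warning messages; the variables must be set in the environment of the serving processes before launch:

    import os

    # Prefill side: how long to wait for KV indices from the decode instance.
    os.environ["SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT"] = "600"
    # Decode side: how long to wait for the KV cache transfer-done signal.
    os.environ["SGLANG_DISAGGREGATION_WAITING_TIMEOUT"] = "600"
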
@@ -1,4 +1,3 @@
-import base64
 import copy
 import dataclasses
 import multiprocessing
@@ -7,6 +6,7 @@ import threading
 import time
 from typing import Any, Dict, List, Optional, Tuple, Union
 
+import pybase64
 import requests
 import torch
 import torch.distributed as dist
@@ -402,12 +402,14 @@ class CommunicateWithAllReduceAndLayerNormFn:
             if hidden_states.shape[0] != 0:
                 hidden_states = layernorm(hidden_states)
         else:
+            # According to the discussion in https://github.com/flashinfer-ai/flashinfer/issues/1223#issuecomment-3047256465
+            # We set the max token num to 128 for allreduce fusion with min-latency case(use_oneshot=True).
             if (
                 _is_sm100_supported
                 and _is_flashinfer_available
                 and hasattr(layernorm, "forward_with_allreduce_fusion")
                 and global_server_args_dict["enable_flashinfer_allreduce_fusion"]
-                and hidden_states.shape[0] <= 1024
+                and hidden_states.shape[0] <= 128
             ):
                 hidden_states, residual = layernorm.forward_with_allreduce_fusion(
                     hidden_states, residual
@@ -92,7 +92,7 @@ _workspace_manager = FlashInferWorkspaceManager()
 
 
 def ensure_workspace_initialized(
-    max_token_num: int = 1024, hidden_dim: int = 4096, use_fp32_lamport: bool = False
+    max_token_num: int = 128, hidden_dim: int = 4096, use_fp32_lamport: bool = False
 ):
     """Ensure workspace is initialized"""
     if not is_flashinfer_available() or _flashinfer_comm is None:
@@ -119,12 +119,12 @@ ensure_workspace_initialized(
     return _workspace_manager.initialized
 
 
-def flashinfer_allreduce_add_rmsnorm(
+def flashinfer_allreduce_residual_rmsnorm(
     input_tensor: torch.Tensor,
     residual: torch.Tensor,
     weight: torch.Tensor,
     eps: float = 1e-6,
-    max_token_num: int = 1024,
+    max_token_num: int = 128,
     use_oneshot: bool = True,
     trigger_completion_at_end: bool = False,
     fp32_acc: bool = False,
@@ -174,11 +174,11 @@ class RMSNorm(CustomOp):
         if residual is not None:
             from sglang.srt.distributed import get_tensor_model_parallel_world_size
             from sglang.srt.layers.flashinfer_comm_fusion import (
-                flashinfer_allreduce_add_rmsnorm,
+                flashinfer_allreduce_residual_rmsnorm,
             )
 
             if get_tensor_model_parallel_world_size() > 1:
-                fused_result = flashinfer_allreduce_add_rmsnorm(
+                fused_result = flashinfer_allreduce_residual_rmsnorm(
                     input_tensor=x,
                     residual=residual,
                     weight=self.weight,
@@ -0,0 +1,215 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Cutlass W4A8 MoE kernel."""
3
+ from typing import Optional
4
+
5
+ import torch
6
+ from sgl_kernel import (
7
+ cutlass_w4a8_moe_mm,
8
+ get_cutlass_w4a8_moe_mm_data,
9
+ sgl_per_tensor_quant_fp8,
10
+ silu_and_mul,
11
+ )
12
+
13
+ from sglang.srt.layers.moe.ep_moe.kernels import (
14
+ post_reorder_triton_kernel,
15
+ pre_reorder_triton_kernel_for_cutlass_moe,
16
+ run_cutlass_moe_ep_preproess,
17
+ )
18
+
19
+
20
+ def cutlass_w4a8_moe(
21
+ start_expert_id: int,
22
+ end_expert_id: int,
23
+ total_num_experts: int,
24
+ a: torch.Tensor,
25
+ w1_q: torch.Tensor,
26
+ w2_q: torch.Tensor,
27
+ w1_scale: torch.Tensor,
28
+ w2_scale: torch.Tensor,
29
+ topk_weights: torch.Tensor,
30
+ topk_ids_: torch.Tensor,
31
+ local_topk_ids: torch.Tensor,
32
+ a_strides1: torch.Tensor,
33
+ b_strides1: torch.Tensor,
34
+ c_strides1: torch.Tensor,
35
+ a_strides2: torch.Tensor,
36
+ b_strides2: torch.Tensor,
37
+ c_strides2: torch.Tensor,
38
+ s_strides13: torch.Tensor,
39
+ s_strides2: torch.Tensor,
40
+ expert_offsets: torch.Tensor,
41
+ problem_sizes1: torch.Tensor,
42
+ problem_sizes2: torch.Tensor,
43
+ a1_scale: Optional[torch.Tensor] = None,
44
+ a2_scale: Optional[torch.Tensor] = None,
45
+ apply_router_weight_on_input: bool = False,
46
+ ) -> torch.Tensor:
47
+ """
48
+ This function computes a w4a8-quantized Mixture of Experts (MoE) layer
49
+ using two sets of quantized weights, w1_q and w2_q, and top-k gating
50
+ mechanism. The matrix multiplications are implemented with CUTLASS
51
+ grouped gemm.
52
+
53
+ Parameters:
54
+ - a (torch.Tensor): The input tensor to the MoE layer.
55
+ Shape: [M, K]
56
+ - w1_q (torch.Tensor): The first set of int4-quantized expert weights.
57
+ Shape: [num_experts, N * 2, K // 2]
58
+ (the weights are passed transposed and int4-packed)
59
+ - w2_q (torch.Tensor): The second set of int4-quantized expert weights.
60
+ Shape: [num_experts, K, N // 2]
61
+ (the weights are passed transposed and int4-packed)
62
+ - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q.
63
+ Shape: [num_experts, K // 512, N * 8]
64
+ - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q.
65
+ Shape: [num_experts, N // 512, K * 4]
66
+ - topk_weights (torch.Tensor): The weights of each token->expert mapping.
67
+ - a_strides1 (torch.Tensor): The input strides of the first grouped gemm.
68
+ - b_strides1 (torch.Tensor): The weights strides of the first grouped gemm.
69
+ - c_strides1 (torch.Tensor): The output strides of the first grouped gemm.
70
+ - a_strides2 (torch.Tensor): The input strides of the second grouped gemm.
71
+ - b_strides2 (torch.Tensor): The weights strides of the second grouped gemm.
72
+ - c_strides2 (torch.Tensor): The output strides of the second grouped gemm.
73
+ - s_strides13 (torch.Tensor): The input and scale strides of the first grouped gemm.
74
+ - s_strides2 (torch.Tensor): The scale strides of the second grouped gemm.
75
+ - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a.
76
+ Shape: scalar or [1, K]
77
+ - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to
78
+ quantize the intermediate result between the gemms.
79
+ Shape: scalar or [1, N]
80
+ - apply_router_weight_on_input (bool): When true, the topk weights are
81
+ applied directly on the inputs. This is only applicable when topk is 1.
82
+
83
+ Returns:
84
+ - torch.Tensor: The fp8 output tensor after applying the MoE layer.
85
+ """
86
+ assert topk_weights.shape == topk_ids_.shape, "topk shape mismatch"
87
+ assert w1_q.dtype == torch.int8
88
+ assert w2_q.dtype == torch.int8
89
+ assert a.shape[1] // 2 == w1_q.shape[2], "Hidden size mismatch w1"
90
+ assert w1_q.shape[2] * 2 == w2_q.shape[1], "Hidden size mismatch w2"
91
+ assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
92
+ assert w1_q.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch"
93
+ assert w1_q.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch"
94
+ assert (
95
+ w1_scale.shape[1] == w1_q.shape[2] * 2 / 512
96
+ and w1_scale.shape[2] == w1_q.shape[1] * 4
97
+ ), "W1 scale shape mismatch"
98
+ assert (
99
+ w2_scale.shape[1] == w2_q.shape[2] * 2 / 512
100
+ and w2_scale.shape[2] == w2_q.shape[1] * 4
101
+ ), "W2 scale shape mismatch"
102
+
103
+ assert a_strides1.shape[0] == w1_q.shape[0], "A Strides 1 expert number mismatch"
104
+ assert b_strides1.shape[0] == w1_q.shape[0], "B Strides 1 expert number mismatch"
105
+ assert a_strides2.shape[0] == w2_q.shape[0], "A Strides 2 expert number mismatch"
106
+ assert b_strides2.shape[0] == w2_q.shape[0], "B Strides 2 expert number mismatch"
107
+ num_experts = w1_q.size(0)
108
+ m = a.size(0)
109
+ k = w1_q.size(2) * 2 # w1_q is transposed and packed
110
+ n = w2_q.size(2) * 2 # w2_q is transposed and packed
111
+ topk = topk_ids_.size(1)
112
+
113
+ if apply_router_weight_on_input:
114
+ assert topk == 1, "apply_router_weight_on_input is only implemented for topk=1"
115
+
116
+ device = a.device
117
+
118
+ _, src2dst, _ = run_cutlass_moe_ep_preproess(
119
+ local_topk_ids,
120
+ num_experts,
121
+ )
122
+
123
+ gateup_input = torch.empty(
124
+ (m * topk, k),
125
+ device=device,
126
+ dtype=torch.float8_e4m3fn,
127
+ )
128
+
129
+ pre_reorder_triton_kernel_for_cutlass_moe[(m,)](
130
+ a,
131
+ gateup_input,
132
+ src2dst,
133
+ local_topk_ids,
134
+ a1_scale,
135
+ total_num_experts,
136
+ topk,
137
+ k,
138
+ BLOCK_SIZE=512,
139
+ )
140
+
141
+ # NOTE: a_map and c_map are not used in the get_cutlass_w4a8_moe_mm_data kernel,
142
+ # they are kept to allow for a quick switch of the permutation logic
143
+ # from the current triton kernel implementation to the cutlass-based one if needed.
144
+ a_map = torch.empty((local_topk_ids.numel()), dtype=torch.int32, device=device)
145
+ c_map = torch.empty((local_topk_ids.numel()), dtype=torch.int32, device=device)
146
+ get_cutlass_w4a8_moe_mm_data(
147
+ local_topk_ids,
148
+ expert_offsets,
149
+ problem_sizes1,
150
+ problem_sizes2,
151
+ a_map,
152
+ c_map,
153
+ num_experts,
154
+ n,
155
+ k,
156
+ )
157
+
+    c1 = torch.empty((m * topk, n * 2), device=device, dtype=torch.half)
+    c2 = torch.zeros((m * topk, k), device=device, dtype=torch.half)
+
+    cutlass_w4a8_moe_mm(
+        c1,
+        gateup_input,
+        w1_q,
+        a1_scale.float(),
+        w1_scale,
+        expert_offsets[:-1],
+        problem_sizes1,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        s_strides13,
+        128,
+        topk,
+    )
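+    # c1 holds the fused gate/up projection, i.e. the n gate and n up halves,
+    # which silu_and_mul below combines into a single n-wide activation.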
176
+
+    intermediate = torch.empty((m * topk, n), device=device, dtype=torch.half)
+    silu_and_mul(c1, intermediate)
+
+    intermediate_q = torch.empty(
+        intermediate.shape, dtype=torch.float8_e4m3fn, device=device
+    )
+    sgl_per_tensor_quant_fp8(intermediate, intermediate_q, a2_scale.float(), True)
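+    # The activation is requantized to fp8 with the static per-tensor scale
+    # a2_scale before feeding the second (down-projection) grouped GEMM.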
184
+
+    cutlass_w4a8_moe_mm(
+        c2,
+        intermediate_q,
+        w2_q,
+        a2_scale.float(),
+        w2_scale,
+        expert_offsets[:-1],
+        problem_sizes2,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides2,
+        128,
+        topk,
+    )
200
+
+    output = torch.empty_like(a)
+    post_reorder_triton_kernel[(m,)](
+        c2,
+        output,
+        src2dst,
+        topk_ids_,
+        topk_weights,
+        start_expert_id,
+        end_expert_id,
+        topk,
+        k,
+        0,
+        BLOCK_SIZE=512,
+    )
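+    # post_reorder gathers each token's top-k expert outputs from c2, combines
+    # them with topk_weights, and writes the result back in original token order.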
215
+    return output
@@ -146,6 +146,7 @@ def compute_seg_indptr_triton_kernel(reorder_topk_ids, seg_indptr, num_toks):
 
 def run_moe_ep_preproess(topk_ids: torch.Tensor, num_experts: int):
     reorder_topk_ids, reorder_ids = torch.sort(topk_ids.view(-1), stable=True)
+
     seg_indptr = torch.zeros(num_experts + 1, device=topk_ids.device, dtype=torch.int64)
     src2dst = torch.empty(topk_ids.numel(), device=topk_ids.device, dtype=torch.int32)
 
@@ -158,9 +159,66 @@ def run_moe_ep_preproess(topk_ids: torch.Tensor, num_experts: int):
     compute_src2dst_triton_kernel[grid](
         reorder_ids, src2dst, topk_ids.numel(), BLOCK_SIZE
     )
+
     return reorder_topk_ids, src2dst, seg_indptr
 
 
+def run_cutlass_moe_ep_preproess(local_topk_ids: torch.Tensor, local_num_experts: int):
+    reorder_topk_ids, reorder_ids = torch.sort(local_topk_ids.view(-1), stable=True)
+
+    seg_indptr = torch.zeros(
+        local_num_experts + 1, device=local_topk_ids.device, dtype=torch.int64
+    )
+    src2dst = torch.empty(
+        local_topk_ids.numel(), device=local_topk_ids.device, dtype=torch.int32
+    )
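+    # seg_indptr is returned only to mirror run_moe_ep_preproess's interface;
+    # it stays all-zero because no segment-indptr kernel is launched here.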
175
+
+    BLOCK_SIZE = 512
+    grid = (triton.cdiv(local_topk_ids.numel(), BLOCK_SIZE),)
+    compute_src2dst_triton_kernel[grid](
+        reorder_ids, src2dst, local_topk_ids.numel(), BLOCK_SIZE
+    )
+
+    return reorder_topk_ids, src2dst, seg_indptr
+
+
185
+@triton.jit
+def pre_reorder_triton_kernel_for_cutlass_moe(
+    input_ptr,
+    gateup_input_ptr,
+    src2dst_ptr,
+    topk_ids_ptr,
+    a1_scales_ptr,
+    num_experts,
+    topk,
+    hidden_size,
+    BLOCK_SIZE: tl.constexpr,
+):
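+    # One program handles one source token: it copies the token's hidden vector
+    # into each routed expert's destination slot (via src2dst), rescaling by
+    # 1 / a1_scale and casting to the output dtype (fp8 at the call site above).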
197
+    OutDtype = gateup_input_ptr.dtype.element_ty
+
+    src_idx = tl.program_id(0)
+    src2dst_ptr = src2dst_ptr + src_idx * topk
+    topk_ids_ptr = topk_ids_ptr + src_idx * topk
+
+    src_ptr = input_ptr + src_idx * hidden_size
+    for idx in range(topk):
+        expert_id = tl.load(topk_ids_ptr + idx)
+        if expert_id != num_experts:
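+            # `expert_id == num_experts` appears to be a sentinel for slots that
+            # are not routed to a locally held expert, so those copies are skipped.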
207
+            if a1_scales_ptr is not None:
+                scale = 1.0 / tl.load(a1_scales_ptr)
+            else:
+                scale = 1.0
+
+            dst_idx = tl.load(src2dst_ptr + idx)
+            dst_ptr = gateup_input_ptr + dst_idx * hidden_size
+            for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
+                offset = start_offset + tl.arange(0, BLOCK_SIZE)
+                mask = offset < hidden_size
+                in_data = tl.load(src_ptr + offset, mask=mask).to(tl.float32)
+                out_data = (in_data * scale).to(OutDtype)
+                tl.store(dst_ptr + offset, out_data, mask=mask)
+
+
 @triton.jit
 def pre_reorder_triton_kernel(
     input_ptr,