sglang 0.4.6.post1.tar.gz → 0.4.6.post2.tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
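
A comparison like this can be reproduced locally. The following is a minimal sketch, assuming `pip` and a POSIX `diff` are on PATH and PyPI is reachable; the sdist file names follow the `sglang-<version>.tar.gz` convention from the header above.

    # Sketch: download both source distributions, unpack them, and list the
    # files that differ -- the raw form of the "Files changed" table below.
    import pathlib
    import subprocess
    import tarfile
    import tempfile

    OLD, NEW = "0.4.6.post1", "0.4.6.post2"

    def fetch_sdist(version: str, dest: pathlib.Path) -> pathlib.Path:
        """Download and unpack the source distribution for one version."""
        subprocess.run(
            ["pip", "download", f"sglang=={version}",
             "--no-deps", "--no-binary", ":all:", "-d", str(dest)],
            check=True,
        )
        with tarfile.open(dest / f"sglang-{version}.tar.gz") as tar:
            tar.extractall(dest)
        return dest / f"sglang-{version}"

    with tempfile.TemporaryDirectory() as tmp:
        root = pathlib.Path(tmp)
        old_dir = fetch_sdist(OLD, root)
        new_dir = fetch_sdist(NEW, root)
        # `diff -rq` reports files that were added, removed, or modified;
        # per-file +/- line counts come from `diff -ru` or a diff viewer.
        subprocess.run(["diff", "-rq", str(old_dir), str(new_dir)])
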
Files changed (640)
  1. {sglang-0.4.6.post1/sglang.egg-info → sglang-0.4.6.post2}/PKG-INFO +5 -4
  2. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/pyproject.toml +5 -4
  3. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/bench_one_batch.py +2 -0
  4. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/check_env.py +3 -3
  5. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/configs/__init__.py +4 -0
  6. sglang-0.4.6.post2/sglang/srt/configs/kimi_vl.py +38 -0
  7. sglang-0.4.6.post2/sglang/srt/configs/kimi_vl_moonvit.py +32 -0
  8. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/configs/model_config.py +15 -0
  9. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/conversation.py +122 -1
  10. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/entrypoints/engine.py +44 -22
  11. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/function_call_parser.py +97 -0
  12. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/hf_transformers_utils.py +2 -0
  13. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
  14. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/flashinfer_backend.py +107 -82
  15. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
  16. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/flashmla_backend.py +3 -0
  17. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/dp_attention.py +5 -2
  18. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
  19. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  20. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  21. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  22. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  23. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -6
  24. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/__init__.py +2 -2
  25. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/deep_gemm.py +1 -1
  26. sglang-0.4.6.post2/sglang/srt/layers/utils.py +35 -0
  27. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/layers.py +35 -9
  28. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/lora_manager.py +84 -35
  29. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/data_parallel_controller.py +52 -34
  30. sglang-0.4.6.post2/sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
  31. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/schedule_batch.py +25 -15
  32. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/scheduler.py +263 -59
  33. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
  34. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/tp_worker.py +51 -16
  35. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
  36. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/mem_cache/memory_pool.py +70 -36
  37. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/model_executor/cuda_graph_runner.py +82 -19
  38. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/model_executor/forward_batch_info.py +31 -1
  39. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/model_executor/model_runner.py +115 -57
  40. sglang-0.4.6.post2/sglang/srt/models/deepseek_nextn.py +183 -0
  41. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/deepseek_v2.py +78 -18
  42. sglang-0.4.6.post2/sglang/srt/models/kimi_vl.py +308 -0
  43. sglang-0.4.6.post2/sglang/srt/models/kimi_vl_moonvit.py +639 -0
  44. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/llama.py +92 -30
  45. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/llama4.py +2 -1
  46. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/llama_eagle.py +4 -1
  47. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/llama_eagle3.py +4 -1
  48. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/qwen2_moe.py +8 -3
  49. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/qwen2_vl.py +0 -12
  50. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/qwen3_moe.py +8 -3
  51. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/openai_api/adapter.py +34 -22
  52. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/openai_api/protocol.py +11 -1
  53. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/server_args.py +67 -22
  54. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/speculative/eagle_worker.py +3 -2
  55. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/utils.py +88 -9
  56. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/runners.py +4 -0
  57. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/test_utils.py +29 -0
  58. sglang-0.4.6.post2/sglang/version.py +1 -0
  59. {sglang-0.4.6.post1 → sglang-0.4.6.post2/sglang.egg-info}/PKG-INFO +5 -4
  60. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang.egg-info/SOURCES.txt +10 -0
  61. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang.egg-info/requires.txt +4 -3
  62. sglang-0.4.6.post1/sglang/srt/models/deepseek_nextn.py +0 -439
  63. sglang-0.4.6.post1/sglang/version.py +0 -1
  64. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/LICENSE +0 -0
  65. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/README.md +0 -0
  66. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/setup.cfg +0 -0
  67. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/__init__.py +0 -0
  68. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/api.py +0 -0
  69. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/bench_offline_throughput.py +0 -0
  70. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/bench_one_batch_server.py +0 -0
  71. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/bench_serving.py +0 -0
  72. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/compile_deep_gemm.py +0 -0
  73. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/global_config.py +0 -0
  74. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/lang/backend/__init__.py +0 -0
  75. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/lang/backend/anthropic.py +0 -0
  76. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/lang/backend/base_backend.py +0 -0
  77. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/lang/backend/litellm.py +0 -0
  78. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/lang/backend/openai.py +0 -0
  79. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
  80. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/lang/backend/vertexai.py +0 -0
  81. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/lang/chat_template.py +0 -0
  82. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/lang/choices.py +0 -0
  83. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/lang/compiler.py +0 -0
  84. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/lang/interpreter.py +0 -0
  85. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/lang/ir.py +0 -0
  86. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/lang/tracer.py +0 -0
  87. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/launch_server.py +0 -0
  88. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/llama3_eval.py +0 -0
  89. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/_custom_ops.py +0 -0
  90. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/aio_rwlock.py +0 -0
  91. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/code_completion_parser.py +0 -0
  92. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/configs/chatglm.py +0 -0
  93. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/configs/dbrx.py +0 -0
  94. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/configs/deepseekvl2.py +0 -0
  95. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/configs/device_config.py +0 -0
  96. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/configs/exaone.py +0 -0
  97. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/configs/janus_pro.py +0 -0
  98. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/configs/load_config.py +0 -0
  99. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/configs/utils.py +0 -0
  100. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/connector/__init__.py +0 -0
  101. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/connector/base_connector.py +0 -0
  102. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/connector/redis.py +0 -0
  103. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/connector/s3.py +0 -0
  104. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/connector/serde/__init__.py +0 -0
  105. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/connector/serde/safe_serde.py +0 -0
  106. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/connector/serde/serde.py +0 -0
  107. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/connector/utils.py +0 -0
  108. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  109. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/constrained/llguidance_backend.py +0 -0
  110. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/constrained/outlines_backend.py +0 -0
  111. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  112. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
  113. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
  114. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  115. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/custom_op.py +0 -0
  116. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/disaggregation/base/__init__.py +0 -0
  117. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/disaggregation/base/conn.py +0 -0
  118. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/disaggregation/decode.py +0 -0
  119. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/disaggregation/fake/__init__.py +0 -0
  120. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/disaggregation/fake/conn.py +0 -0
  121. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/disaggregation/mini_lb.py +0 -0
  122. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
  123. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/disaggregation/mooncake/conn.py +0 -0
  124. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/disaggregation/mooncake/transfer_engine.py +0 -0
  125. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
  126. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/disaggregation/nixl/conn.py +0 -0
  127. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/disaggregation/prefill.py +0 -0
  128. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/disaggregation/utils.py +0 -0
  129. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/distributed/__init__.py +0 -0
  130. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/distributed/communication_op.py +0 -0
  131. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  132. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  133. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  134. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  135. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  136. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  137. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  138. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  139. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/distributed/parallel_state.py +0 -0
  140. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/distributed/utils.py +0 -0
  141. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/entrypoints/EngineBase.py +0 -0
  142. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/entrypoints/http_server.py +0 -0
  143. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/entrypoints/http_server_engine.py +0 -0
  144. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/entrypoints/verl_engine.py +0 -0
  145. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/activation.py +0 -0
  146. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
  147. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  148. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/flashattention_backend.py +0 -0
  149. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  150. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/triton_backend.py +0 -0
  151. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  152. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  153. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  154. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  155. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
  156. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/utils.py +0 -0
  157. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/attention/vision.py +0 -0
  158. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/elementwise.py +0 -0
  159. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/layernorm.py +0 -0
  160. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/linear.py +0 -0
  161. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/logits_processor.py +0 -0
  162. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  163. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  164. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
  165. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  166. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  167. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  168. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  169. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  170. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  171. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  172. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  173. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  174. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  175. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  176. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  177. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  178. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  179. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +0 -0
  180. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +0 -0
  181. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  182. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +0 -0
  183. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  184. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +0 -0
  185. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  186. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  187. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +0 -0
  188. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  189. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +0 -0
  190. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +0 -0
  191. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  192. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  193. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
  194. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  195. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  196. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  197. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  198. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  199. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  200. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  201. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  202. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  203. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  204. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  205. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  206. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  207. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  208. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  209. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  210. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  211. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  212. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  213. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  214. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  215. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  216. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  217. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  218. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  219. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  220. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  221. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  222. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
  223. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  224. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  225. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  226. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  227. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  228. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  229. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  230. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  231. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  232. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
  233. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
  234. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  235. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  236. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  237. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  238. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  239. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  240. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  241. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  242. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  243. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  244. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  245. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  246. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  247. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  248. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  249. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  250. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  251. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  252. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  253. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  254. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  255. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  256. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  257. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  258. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  259. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  260. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  261. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  262. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  263. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  264. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  265. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  266. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  267. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  268. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  269. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  270. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  271. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  272. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  273. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  274. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  275. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  276. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  277. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  278. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  279. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  280. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  281. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  282. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  283. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  284. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  285. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  286. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  287. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  288. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  289. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  290. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  291. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  292. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  293. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  294. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  295. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  296. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  297. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  298. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  299. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  300. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  301. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  302. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  303. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  304. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  305. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  306. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  307. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  308. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  309. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  310. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  311. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  312. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  313. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  314. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/layer.py +0 -0
  315. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/router.py +0 -0
  316. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/moe/topk.py +0 -0
  317. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/parameter.py +0 -0
  318. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/pooler.py +0 -0
  319. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/awq.py +0 -0
  320. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
  321. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
  322. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  323. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
  324. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
  325. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
  326. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
  327. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +0 -0
  328. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
  329. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
  330. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  331. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  332. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  333. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  334. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  335. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  336. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  337. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  338. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  339. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  340. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  341. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  342. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  343. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  344. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  345. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  346. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  347. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  348. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  349. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  350. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  351. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  352. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  353. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  354. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  355. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  356. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  357. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  358. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  359. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  360. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  361. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  362. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  363. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  364. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  365. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  366. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  367. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  368. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  369. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  370. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  371. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  372. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  373. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  374. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  375. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  376. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  377. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  378. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  379. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  380. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  381. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  382. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  383. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  384. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  385. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  386. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  387. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  388. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  389. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  390. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  391. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  392. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  393. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  394. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  395. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  396. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  397. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  398. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  399. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  400. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  401. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  402. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  403. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  404. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  405. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  406. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  407. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  408. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  409. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  410. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  411. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  412. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  413. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  414. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  415. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  416. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  417. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  418. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  420. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  421. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  422. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  423. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  424. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  425. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  426. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  427. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  428. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  429. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  430. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  431. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  432. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  433. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  434. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  435. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  436. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  437. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  438. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  439. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  440. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  441. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  442. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  443. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  444. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  445. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  446. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  447. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  448. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  449. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  450. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  451. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  452. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  453. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  454. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  455. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  456. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  457. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  458. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  459. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  460. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  461. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  462. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  463. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  464. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  465. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  466. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  467. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  468. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  469. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  470. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  471. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  472. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  473. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  474. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  475. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  476. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  477. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  478. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  479. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  480. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  481. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  482. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/fp8.py +0 -0
  483. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
  484. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
  485. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/gptq.py +0 -0
  486. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  487. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  488. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/kv_cache.py +0 -0
  489. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
  490. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/moe_wna16.py +0 -0
  491. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/utils.py +0 -0
  492. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
  493. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
  494. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/radix_attention.py +0 -0
  495. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/rotary_embedding.py +0 -0
  496. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/sampler.py +0 -0
  497. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/torchao_utils.py +0 -0
  498. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  499. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/backend/base_backend.py +0 -0
  500. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
  501. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/backend/triton_backend.py +0 -0
  502. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/lora.py +0 -0
  503. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/lora_config.py +0 -0
  504. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/mem_pool.py +0 -0
  505. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  506. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
  507. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
  508. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
  509. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
  510. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/lora/utils.py +0 -0
  511. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/cache_controller.py +0 -0
  512. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/configure_logging.py +0 -0
  513. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/detokenizer_manager.py +0 -0
  514. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/expert_distribution.py +0 -0
  515. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/io_struct.py +0 -0
  516. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/mm_utils.py +0 -0
  517. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processor.py +0 -0
  518. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/base_processor.py +0 -0
  519. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/clip.py +0 -0
  520. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +0 -0
  521. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/gemma3.py +0 -0
  522. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/janus_pro.py +0 -0
  523. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/llava.py +0 -0
  524. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/minicpm.py +0 -0
  525. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/mlama.py +0 -0
  526. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/mllama4.py +0 -0
  527. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/qwen_vl.py +0 -0
  528. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/schedule_policy.py +0 -0
  529. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/session_controller.py +0 -0
  530. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/tokenizer_manager.py +0 -0
  531. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/managers/utils.py +0 -0
  532. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  533. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  534. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
  535. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/mem_cache/hiradix_cache.py +0 -0
  536. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/mem_cache/paged_allocator.py +0 -0
  537. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
  538. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/metrics/collector.py +0 -0
  539. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/metrics/func_timer.py +0 -0
  540. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/mm_utils.py +0 -0
  541. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/model_loader/__init__.py +0 -0
  542. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/model_loader/loader.py +0 -0
  543. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/model_loader/utils.py +0 -0
  544. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/model_loader/weight_utils.py +0 -0
  545. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/model_parallel.py +0 -0
  546. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/baichuan.py +0 -0
  547. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/bert.py +0 -0
  548. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/chatglm.py +0 -0
  549. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/clip.py +0 -0
  550. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/commandr.py +0 -0
  551. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/dbrx.py +0 -0
  552. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/deepseek.py +0 -0
  553. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/deepseek_janus_pro.py +0 -0
  554. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/deepseek_vl2.py +0 -0
  555. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/exaone.py +0 -0
  556. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/gemma.py +0 -0
  557. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/gemma2.py +0 -0
  558. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/gemma2_reward.py +0 -0
  559. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/gemma3_causal.py +0 -0
  560. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/gemma3_mm.py +0 -0
  561. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/gpt2.py +0 -0
  562. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
  563. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/granite.py +0 -0
  564. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/grok.py +0 -0
  565. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/internlm2.py +0 -0
  566. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/internlm2_reward.py +0 -0
  567. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/llama_classification.py +0 -0
  568. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/llama_embedding.py +0 -0
  569. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/llama_reward.py +0 -0
  570. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/llava.py +0 -0
  571. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/llavavid.py +0 -0
  572. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/minicpm.py +0 -0
  573. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/minicpm3.py +0 -0
  574. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/minicpmo.py +0 -0
  575. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/minicpmv.py +0 -0
  576. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/mistral.py +0 -0
  577. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/mixtral.py +0 -0
  578. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/mixtral_quant.py +0 -0
  579. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/mllama.py +0 -0
  580. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/mllama4.py +0 -0
  581. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/olmo.py +0 -0
  582. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/olmo2.py +0 -0
  583. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/olmoe.py +0 -0
  584. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/phi3_small.py +0 -0
  585. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/qwen.py +0 -0
  586. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/qwen2.py +0 -0
  587. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/qwen2_5_vl.py +0 -0
  588. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/qwen2_classification.py +0 -0
  589. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/qwen2_eagle.py +0 -0
  590. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/qwen2_rm.py +0 -0
  591. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/qwen3.py +0 -0
  592. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/registry.py +0 -0
  593. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/roberta.py +0 -0
  594. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/stablelm.py +0 -0
  595. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/torch_native_llama.py +0 -0
  596. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/xverse.py +0 -0
  597. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/xverse_moe.py +0 -0
  598. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/models/yivl.py +0 -0
  599. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/patch_torch.py +0 -0
  600. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/platforms/interface.py +0 -0
  601. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/reasoning_parser.py +0 -0
  602. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  603. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  604. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
  605. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  606. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  607. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
  608. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  609. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/sampling/sampling_params.py +0 -0
  610. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  611. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
  612. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/speculative/eagle_utils.py +0 -0
  613. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/speculative/spec_info.py +0 -0
  614. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  615. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/srt/warmup.py +0 -0
  616. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/__init__.py +0 -0
  617. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/attention/__init__.py +0 -0
  618. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/attention/test_flashattn_backend.py +0 -0
  619. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
  620. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
  621. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/few_shot_gsm8k.py +0 -0
  622. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  623. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/run_eval.py +0 -0
  624. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/send_one.py +0 -0
  625. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/simple_eval_common.py +0 -0
  626. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/simple_eval_gpqa.py +0 -0
  627. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/simple_eval_humaneval.py +0 -0
  628. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/simple_eval_math.py +0 -0
  629. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/simple_eval_mgsm.py +0 -0
  630. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/simple_eval_mmlu.py +0 -0
  631. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/test_activation.py +0 -0
  632. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/test_block_fp8.py +0 -0
  633. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/test_block_fp8_ep.py +0 -0
  634. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/test_custom_ops.py +0 -0
  635. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/test_dynamic_grad_mode.py +0 -0
  636. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/test_layernorm.py +0 -0
  637. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/test/test_programs.py +0 -0
  638. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang/utils.py +0 -0
  639. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang.egg-info/dependency_links.txt +0 -0
  640. {sglang-0.4.6.post1 → sglang-0.4.6.post2}/sglang.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sglang
- Version: 0.4.6.post1
+ Version: 0.4.6.post2
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -238,15 +238,16 @@ Requires-Dist: pynvml; extra == "runtime-common"
  Requires-Dist: python-multipart; extra == "runtime-common"
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
  Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
- Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
+ Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
  Requires-Dist: transformers==4.51.1; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
  Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
+ Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
- Requires-Dist: sgl-kernel==0.1.0; extra == "srt"
- Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
+ Requires-Dist: sgl-kernel==0.1.1; extra == "srt"
+ Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
  Requires-Dist: torch==2.6.0; extra == "srt"
  Requires-Dist: torchvision==0.21.0; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.4.6.post1"
+ version = "0.4.6.post2"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -37,17 +37,18 @@ runtime_common = [
      "python-multipart",
      "pyzmq>=25.1.2",
      "soundfile==0.13.1",
-     "torchao>=0.7.0",
+     "torchao>=0.9.0",
      "transformers==4.51.1",
      "uvicorn",
      "uvloop",
      "xgrammar==0.1.17",
+     "blobfile==3.0.0"
  ]

  srt = [
      "sglang[runtime_common]",
-     "sgl-kernel==0.1.0",
-     "flashinfer_python==0.2.3",
+     "sgl-kernel==0.1.1",
+     "flashinfer_python==0.2.5",
      "torch==2.6.0",
      "torchvision==0.21.0",
      "cuda-python",
sglang/bench_one_batch.py
@@ -154,6 +154,8 @@ def load_model(server_args, port_args, tp_rank):
          gpu_id=tp_rank,
          tp_rank=tp_rank,
          tp_size=server_args.tp_size,
+         pp_rank=0,
+         pp_size=1,
          nccl_port=port_args.nccl_port,
          server_args=server_args,
      )
sglang/check_env.py
@@ -20,7 +20,7 @@ def is_cuda_v2():
  PACKAGE_LIST = [
      "sglang",
      "sgl_kernel",
-     "flashinfer",
+     "flashinfer_python",
      "triton",
      "transformers",
      "torchao",
@@ -36,8 +36,8 @@ PACKAGE_LIST = [
      "packaging",
      "psutil",
      "pydantic",
-     "multipart",
-     "zmq",
+     "python-multipart",
+     "pyzmq",
      "torchao",
      "uvicorn",
      "uvloop",
sglang/srt/configs/__init__.py
@@ -3,6 +3,8 @@ from sglang.srt.configs.dbrx import DbrxConfig
  from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config
  from sglang.srt.configs.exaone import ExaoneConfig
  from sglang.srt.configs.janus_pro import MultiModalityConfig
+ from sglang.srt.configs.kimi_vl import KimiVLConfig
+ from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig

  __all__ = [
      "ExaoneConfig",
@@ -10,4 +12,6 @@ __all__ = [
      "DbrxConfig",
      "DeepseekVL2Config",
      "MultiModalityConfig",
+     "KimiVLConfig",
+     "MoonViTConfig",
  ]
sglang/srt/configs/kimi_vl.py (new file)
@@ -0,0 +1,38 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+ from typing import Optional, Union
+
+ from transformers.configuration_utils import PretrainedConfig
+
+ from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
+ from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+
+
+ class KimiVLConfig(PretrainedConfig):
+     model_type = "kimi_vl"
+
+     def __init__(
+         self,
+         vision_config: Optional[Union[dict, MoonViTConfig]] = None,
+         text_config: Optional[Union[dict, DeepseekV2Config]] = None,
+         ignore_index: int = -100,
+         media_placeholder_token_id: int = 163605,
+         pad_token_id: int = 0,
+         **kwargs
+     ):
+         if vision_config is None:
+             vision_config = MoonViTConfig()
+         elif isinstance(vision_config, dict):
+             vision_config = MoonViTConfig(**vision_config)
+         self.vision_config = vision_config
+
+         if text_config is None:
+             text_config = DeepseekV2Config()
+         elif isinstance(text_config, dict):
+             text_config = DeepseekV2Config(**text_config)
+         self.text_config = text_config
+
+         self.ignore_index = ignore_index
+         self.media_placeholder_token_id = media_placeholder_token_id
+
+         super().__init__(pad_token_id=pad_token_id, **kwargs)
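A minimal usage sketch of the new config class; the nested-dict form mirrors how HuggingFace `config.json` files deserialize, and the field values below are illustrative rather than taken from a real checkpoint:

from sglang.srt.configs import KimiVLConfig, MoonViTConfig

# Defaults: a MoonViTConfig vision tower and a DeepseekV2Config text backbone.
cfg = KimiVLConfig()
print(type(cfg.vision_config).__name__)  # MoonViTConfig

# Nested dicts (as found in a checkpoint's config.json) are promoted to
# config objects by __init__.
cfg = KimiVLConfig(vision_config={"patch_size": 14, "hidden_size": 1152})
assert isinstance(cfg.vision_config, MoonViTConfig)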
sglang/srt/configs/kimi_vl_moonvit.py (new file)
@@ -0,0 +1,32 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+ from transformers.configuration_utils import PretrainedConfig
+
+
+ class MoonViTConfig(PretrainedConfig):
+     model_type = "moonvit"
+
+     def __init__(
+         self,
+         patch_size: int = 14,
+         init_pos_emb_height: int = 64,
+         init_pos_emb_width: int = 64,
+         num_attention_heads: int = 16,
+         num_hidden_layers: int = 27,
+         hidden_size: int = 1152,
+         intermediate_size: int = 4304,
+         merge_kernel_size: tuple[int, int] = (2, 2),
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.patch_size = patch_size
+         # Positional embedding config
+         self.init_pos_emb_height = init_pos_emb_height
+         self.init_pos_emb_width = init_pos_emb_width
+         # Transformer config
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         # Patch merger config
+         self.merge_kernel_size = merge_kernel_size
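The defaults imply a straightforward per-image token budget: a patch_size of 14 turns an H×W image into (H/14)×(W/14) patches, and a (2, 2) merge kernel then fuses each 2×2 patch block into one token. A back-of-the-envelope sketch under those assumptions (the 448×448 input resolution is illustrative, not a value from this diff):

patch_size = 14
merge_h, merge_w = 2, 2  # merge_kernel_size

h = w = 448  # illustrative input resolution
patches = (h // patch_size) * (w // patch_size)  # 32 * 32 = 1024
tokens = patches // (merge_h * merge_w)          # 1024 // 4 = 256
print(patches, tokens)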
sglang/srt/configs/model_config.py
@@ -47,6 +47,7 @@ class ModelConfig:
          dtype: str = "auto",
          quantization: Optional[str] = None,
          override_config_file: Optional[str] = None,
+         is_draft_model: bool = False,
      ) -> None:

          self.model_path = model_path
@@ -85,6 +86,12 @@ class ModelConfig:
          else:
              enable_multimodal = True

+         if (
+             is_draft_model
+             and self.hf_config.architectures[0] == "DeepseekV3ForCausalLM"
+         ):
+             self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
+
          # Check model type
          self.is_generation = is_generation_model(
              self.hf_config.architectures, is_embedding
@@ -169,6 +176,13 @@ class ModelConfig:
              self.attention_arch = AttentionArch.MLA
              self.kv_lora_rank = self.hf_text_config.kv_lora_rank
              self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+         elif "KimiVLForConditionalGeneration" in self.hf_config.architectures:
+             self.head_dim = 256
+             self.attention_arch = AttentionArch.MLA
+             self.kv_lora_rank = self.hf_text_config.kv_lora_rank
+             self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+             self.v_head_dim = self.hf_text_config.v_head_dim
+             self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
          else:
              self.attention_arch = AttentionArch.MHA

@@ -523,6 +537,7 @@ multimodal_model_archs = [
      "Qwen2VLForConditionalGeneration",
      "Qwen2_5_VLForConditionalGeneration",
      "CLIPModel",
+     "KimiVLForConditionalGeneration",
  ]

sglang/srt/conversation.py
@@ -17,7 +17,7 @@
  # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
  import dataclasses
  from enum import IntEnum, auto
- from typing import Dict, List, Optional, Tuple, Union
+ from typing import Callable, Dict, List, Optional, Tuple, Union

  from sglang.srt.openai_api.protocol import ChatCompletionRequest
@@ -407,6 +407,7 @@ class Conversation:

  # A global registry for all conversation templates
  chat_templates: Dict[str, Conversation] = {}
+ matching_function_registry: List[Callable] = []


  def register_conv_template(template: Conversation, override: bool = False):
@@ -419,6 +420,18 @@ def register_conv_template(template: Conversation, override: bool = False):
      chat_templates[template.name] = template


+ def register_conv_template_matching_function(func):
+     matching_function_registry.append(func)
+
+
+ def get_conv_template_by_model_path(model_path):
+     for matching_func in matching_function_registry:
+         conv_name = matching_func(model_path)
+         if conv_name is not None:
+             return conv_name
+     return None
+
+
  def chat_template_exists(template_name: str) -> bool:
      return template_name in chat_templates

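The registry turns chat-template selection into a first-match scan: each decorated function inspects the model path and either returns a template name or falls through. A small sketch of how a caller might extend it; the model path is made up, and "chatml" is assumed to be one of sglang's registered template names:

from sglang.srt.conversation import (
    get_conv_template_by_model_path,
    register_conv_template_matching_function,
)

@register_conv_template_matching_function
def match_my_model(model_path: str):
    # Return a registered template name, or None to let later matchers try.
    if "my-finetune" in model_path.lower():
        return "chatml"

print(get_conv_template_by_model_path("org/my-finetune-7b"))  # "chatml"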
@@ -792,3 +805,111 @@ register_conv_template(
          audio_token="(<audio>./</audio>)",
      )
  )
+
+ # Reference: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/chat_template.jinja
+ register_conv_template(
+     Conversation(
+         name="kimi-vl",
+         system_message="You are a helpful assistant",
+         system_template="<|im_system|>system<|im_middle|>{system_message}",
+         roles=(
+             "<|im_user|>user<|im_middle|>",
+             "<|im_assistant|>assistant<|im_middle|>",
+         ),
+         messages=[],
+         sep="<|im_end|>",
+         sep_style=SeparatorStyle.NO_COLON_SINGLE,
+         stop_str="<|im_end|>",
+         image_token="<|media_start|>image<|media_content|><|media_pad|><|media_end|>",
+     )
+ )
+
+
+ @register_conv_template_matching_function
+ def match_llama_3_vision(model_path: str):
+     if (
+         "llama" in model_path.lower()
+         and "3.2" in model_path.lower()
+         and "vision" in model_path.lower()
+     ):
+         return "llama_3_vision"
+
+
+ @register_conv_template_matching_function
+ def match_deepseek_janus_pro(model_path: str):
+     if "janus" in model_path.lower():
+         return "janus-pro"
+
+
+ @register_conv_template_matching_function
+ def match_vicuna(model_path: str):
+     if "vicuna" in model_path.lower():
+         return "vicuna_v1.1"
+     if "llava-v1.5" in model_path.lower():
+         return "vicuna_v1.1"
+     if "llava-next-video-7b" in model_path.lower():
+         return "vicuna_v1.1"
+
+
+ @register_conv_template_matching_function
+ def match_llama2_chat(model_path: str):
+     model_path = model_path.lower()
+     if "llama-2" in model_path and "chat" in model_path:
+         return "llama-2"
+     if (
+         "mistral" in model_path or "mixtral" in model_path
+     ) and "instruct" in model_path:
+         return "llama-2"
+     if "codellama" in model_path and "instruct" in model_path:
+         return "llama-2"
+
+
+ @register_conv_template_matching_function
+ def match_deepseek_vl(model_path: str):
+     model_path = model_path.lower()
+     if "deepseek" in model_path and "vl2" in model_path:
+         return "deepseek-vl2"
+
+
+ @register_conv_template_matching_function
+ def match_chat_ml(model_path: str):
+     model_path = model_path.lower()
+     # Now the suffix for the qwen2 chat models is "instruct"
+     if "gme" in model_path and "qwen" in model_path and "vl" in model_path:
+         return "gme-qwen2-vl"
+     if "qwen" in model_path and "vl" in model_path:
+         return "qwen2-vl"
+     if (
+         "llava-v1.6-34b" in model_path
+         or "llava-v1.6-yi-34b" in model_path
+         or "llava-next-video-34b" in model_path
+         or "llava-onevision-qwen2" in model_path
+     ):
+         return "chatml-llava"
+
+
+ @register_conv_template_matching_function
+ def match_gemma_it(model_path: str):
+     model_path = model_path.lower()
+     if "gemma" in model_path and "it" in model_path:
+         return "gemma-it"
+     if "gemma-3" in model_path and "1b" not in model_path:
+         # gemma-3-1b-it is a completion model
+         return "gemma-it"
+
+
+ @register_conv_template_matching_function
+ def match_openbmb_minicpm(model_path: str):
+     model_path = model_path.lower()
+     if "minicpm-v" in model_path:
+         return "minicpmv"
+     elif "minicpm-o" in model_path:
+         return "minicpmo"
+
+
+ @register_conv_template_matching_function
+ def match_moonshot_kimivl(model_path: str):
+     model_path = model_path.lower()
+     if "kimi" in model_path and "vl" in model_path:
+         return "kimi-vl"
sglang/srt/entrypoints/engine.py
@@ -58,7 +58,10 @@ from sglang.srt.managers.io_struct import (
  )
  from sglang.srt.managers.scheduler import run_scheduler_process
  from sglang.srt.managers.tokenizer_manager import TokenizerManager
- from sglang.srt.openai_api.adapter import load_chat_template_for_openai_api
+ from sglang.srt.openai_api.adapter import (
+     guess_chat_template_name_from_model_path,
+     load_chat_template_for_openai_api,
+ )
  from sglang.srt.server_args import PortArgs, ServerArgs
  from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
  from sglang.srt.utils import (
@@ -123,7 +126,6 @@ class Engine(EngineBase):
              server_args=server_args,
              port_args=port_args,
          )
-
          self.server_args = server_args
          self.tokenizer_manager = tokenizer_manager
          self.scheduler_info = scheduler_info
@@ -298,7 +300,6 @@
          internal_states = loop.run_until_complete(
              self.tokenizer_manager.get_internal_state()
          )
-
          return {
              **dataclasses.asdict(self.tokenizer_manager.server_args),
              **self.scheduler_info,
@@ -450,7 +451,7 @@ def _set_envs_and_config(server_args: ServerArgs):
      if server_args.attention_backend == "flashinfer":
          assert_pkg_version(
              "flashinfer_python",
-             "0.2.3",
+             "0.2.5",
              "Please uninstall the old version and "
              "reinstall the latest version by following the instructions "
              "at https://docs.flashinfer.ai/installation.html.",
@@ -458,7 +459,7 @@ def _set_envs_and_config(server_args: ServerArgs):
      if _is_cuda:
          assert_pkg_version(
              "sgl-kernel",
-             "0.1.0",
+             "0.1.1",
              "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
          )

@@ -517,25 +518,44 @@ def _launch_subprocesses(
      )

      scheduler_pipe_readers = []
-     tp_size_per_node = server_args.tp_size // server_args.nnodes
+
+     nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
+     tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
      tp_rank_range = range(
-         tp_size_per_node * server_args.node_rank,
-         tp_size_per_node * (server_args.node_rank + 1),
+         tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
+         tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
      )
-     for tp_rank in tp_rank_range:
-         reader, writer = mp.Pipe(duplex=False)
-         gpu_id = (
-             server_args.base_gpu_id
-             + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
-         )
-         proc = mp.Process(
-             target=run_scheduler_process,
-             args=(server_args, port_args, gpu_id, tp_rank, None, writer),
-         )
-         with memory_saver_adapter.configure_subprocess():
-             proc.start()
-         scheduler_procs.append(proc)
-         scheduler_pipe_readers.append(reader)
+
+     pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
+     pp_rank_range = range(
+         pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
+         pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
+     )
+
+     for pp_rank in pp_rank_range:
+         for tp_rank in tp_rank_range:
+             reader, writer = mp.Pipe(duplex=False)
+             gpu_id = (
+                 server_args.base_gpu_id
+                 + ((pp_rank % pp_size_per_node) * tp_size_per_node)
+                 + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+             )
+             proc = mp.Process(
+                 target=run_scheduler_process,
+                 args=(
+                     server_args,
+                     port_args,
+                     gpu_id,
+                     tp_rank,
+                     pp_rank,
+                     None,
+                     writer,
+                 ),
+             )
+             with memory_saver_adapter.configure_subprocess():
+                 proc.start()
+             scheduler_procs.append(proc)
+             scheduler_pipe_readers.append(reader)
  else:
      # Launch the data parallel controller
      reader, writer = mp.Pipe(duplex=False)
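The new nested loop partitions nodes into TP groups of nnodes // pp_size nodes each and derives a local GPU index from both ranks. A worked sketch of the arithmetic, mirroring the formulas above, for an illustrative layout (pp_size=2, tp_size=8, nnodes=2, so each node hosts one full pipeline stage):

def plan(node_rank, nnodes=2, pp_size=2, tp_size=8, base_gpu_id=0, gpu_id_step=1):
    # Mirrors the rank math in _launch_subprocesses; values are illustrative.
    nnodes_per_tp_group = max(nnodes // pp_size, 1)
    tp_size_per_node = tp_size // nnodes_per_tp_group
    tp_ranks = range(
        tp_size_per_node * (node_rank % nnodes_per_tp_group),
        tp_size_per_node * (node_rank % nnodes_per_tp_group + 1),
    )
    pp_size_per_node = max(pp_size // nnodes, 1)
    pp_ranks = range(
        pp_size_per_node * (node_rank // nnodes_per_tp_group),
        pp_size_per_node * (node_rank // nnodes_per_tp_group + 1),
    )
    for pp_rank in pp_ranks:
        for tp_rank in tp_ranks:
            gpu_id = (
                base_gpu_id
                + (pp_rank % pp_size_per_node) * tp_size_per_node
                + (tp_rank % tp_size_per_node) * gpu_id_step
            )
            print(f"node {node_rank}: pp={pp_rank} tp={tp_rank} -> gpu {gpu_id}")

plan(node_rank=0)  # pp=0, tp=0..7 -> gpus 0..7
plan(node_rank=1)  # pp=1, tp=0..7 -> gpus 0..7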
@@ -584,6 +604,8 @@
          load_chat_template_for_openai_api(
              tokenizer_manager, server_args.chat_template, server_args.model_path
          )
+     else:
+         guess_chat_template_name_from_model_path(server_args.model_path)

      if server_args.completion_template:
          load_completion_template_for_openai_api(server_args.completion_template)
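With no explicit chat template configured, the server now falls back to guessing one from the model path, presumably routing through the matching-function registry added in conversation.py. A hedged sketch of what that fallback amounts to; the helper body below is an assumption for illustration, not the actual adapter code:

from sglang.srt.conversation import get_conv_template_by_model_path

def guess_template(model_path: str):
    # Hypothetical stand-in for guess_chat_template_name_from_model_path.
    name = get_conv_template_by_model_path(model_path)
    if name is not None:
        print(f"Inferred chat template: {name}")
    return name

guess_template("moonshotai/Kimi-VL-A3B-Instruct")  # -> "kimi-vl"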
sglang/srt/function_call_parser.py
@@ -1,3 +1,4 @@
+ import ast
  import json
  import logging
  import re
@@ -664,6 +665,101 @@ class MultiFormatParser:
          return final_normal_text, final_calls


+ class PythonicDetector(BaseFormatDetector):
+     """
+     Detector for Llama-3.2 and Llama-4 models with pythonic tool call format.
+     Assumes function call format:
+         [tool1(arg1=val1, arg2=val2), tool2(arg1=val3)]
+     Arguments are Python literals (not JSON).
+     """
+
+     def __init__(self):
+         super().__init__()
+         self.tool_call_regex = re.compile(
+             r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
+             re.DOTALL,
+         )
+
+     def has_tool_call(self, text: str) -> bool:
+         return bool(self.tool_call_regex.match(text.strip()))
+
+     def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+         # Try parsing the text as a Python list of function calls
+         text = text.strip()
+         if not (text.startswith("[") and text.endswith("]")):
+             # Not a pythonic tool call format
+             return StreamingParseResult(normal_text=text, calls=[])
+         try:
+             module = ast.parse(text)
+             parsed = getattr(module.body[0], "value", None)
+             if not (
+                 isinstance(parsed, ast.List)
+                 and all(isinstance(e, ast.Call) for e in parsed.elts)
+             ):
+                 return StreamingParseResult(normal_text=text, calls=[])
+             calls = []
+             tool_indices = {
+                 tool.function.name: i
+                 for i, tool in enumerate(tools)
+                 if tool.function.name
+             }
+             for call in parsed.elts:
+                 if not isinstance(call.func, ast.Name):
+                     continue
+                 function_name = call.func.id
+                 arguments = {}
+                 for keyword in call.keywords:
+                     arguments[keyword.arg] = self._get_parameter_value(keyword.value)
+                 calls.append(
+                     ToolCallItem(
+                         tool_index=tool_indices.get(function_name, -1),
+                         name=function_name,
+                         parameters=json.dumps(arguments, ensure_ascii=False),
+                     )
+                 )
+             return StreamingParseResult(normal_text="", calls=calls)
+         except Exception:
+             logger.exception("Error in pythonic tool call parsing.")
+             return StreamingParseResult(normal_text=text, calls=[])
+
+     def parse_streaming_increment(
+         self, new_text: str, tools: List[Tool]
+     ) -> StreamingParseResult:
+         """
+         Streaming incremental parsing for pythonic tool calls.
+         Buffers input until a complete pythonic tool call (from [ to ]) is found,
+         then parses and emits any detected calls.
+         """
+         self._buffer += new_text
+         start = self._buffer.find("[")
+         end = self._buffer.find("]", start)
+         if start != -1 and end != -1:
+             call_text = self._buffer[start : end + 1]
+             result = self.detect_and_parse(call_text, tools)
+             self._buffer = self._buffer[end + 1 :]
+             return result
+         return StreamingParseResult(normal_text="")
+
+     def _get_parameter_value(self, val):
+         if isinstance(val, ast.Constant):
+             return val.value
+         elif isinstance(val, ast.Dict):
+             return {
+                 k.value: self._get_parameter_value(v)
+                 for k, v in zip(val.keys, val.values)
+             }
+         elif isinstance(val, ast.List):
+             return [self._get_parameter_value(v) for v in val.elts]
+         else:
+             raise ValueError("Tool call arguments must be literals")
+
+     def structure_info(self) -> _GetInfoFunc:
+         def info(name: str):
+             return StructureInfo(begin="[", end="]", trigger="")
+
+         return info
+
+
  class FunctionCallParser:
      """
      In streaming scenarios, each time new_text is received, it calls multi_format_parser.parse_streaming_increment
@@ -675,6 +771,7 @@ class FunctionCallParser:
      "qwen25": Qwen25Detector,
      "mistral": MistralDetector,
      "deepseekv3": DeepSeekV3Detector,
+     "pythonic": PythonicDetector,
  }

  def __init__(self, tools: List[Tool], tool_call_parser: str):
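The detector's core trick is that a pythonic tool call is a valid Python expression, so `ast` does the parsing. A standalone sketch of the same idea, independent of sglang's Tool/ToolCallItem types:

import ast
import json

def parse_pythonic_calls(text: str):
    # Parse "[f(a=1), g(b='x')]" into (name, json_arguments) pairs.
    parsed = ast.parse(text.strip()).body[0].value
    assert isinstance(parsed, ast.List)
    calls = []
    for call in parsed.elts:
        args = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
        calls.append((call.func.id, json.dumps(args)))
    return calls

print(parse_pythonic_calls('[get_weather(city="Paris", unit="celsius")]'))
# [('get_weather', '{"city": "Paris", "unit": "celsius"}')]

One caveat visible in parse_streaming_increment above: it cuts the buffer at the first "]", so list-valued arguments are safer through detect_and_parse on the full text.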
sglang/srt/hf_transformers_utils.py
@@ -35,6 +35,7 @@ from sglang.srt.configs import (
      DbrxConfig,
      DeepseekVL2Config,
      ExaoneConfig,
+     KimiVLConfig,
      MultiModalityConfig,
  )
  from sglang.srt.connector import create_remote_connector
@@ -46,6 +47,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
      ExaoneConfig.model_type: ExaoneConfig,
      DeepseekVL2Config.model_type: DeepseekVL2Config,
      MultiModalityConfig.model_type: MultiModalityConfig,
+     KimiVLConfig.model_type: KimiVLConfig,
  }

  for name, cls in _CONFIG_REGISTRY.items():
sglang/srt/layers/attention/cutlass_mla_backend.py
@@ -268,7 +268,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
          reshape_q = q.view(-1, layer.tp_q_head_num, layer.head_dim)

          o = cutlass_mla_decode(
-             q_nope_and_q_pe=reshape_q,
+             q_nope_and_q_pe=reshape_q.to(self.q_data_type),
              kv_c_and_k_pe_cache=k_cache.view(-1, PAGE_SIZE, self.kv_cache_dim),
              seq_lens=forward_batch.seq_lens.to(torch.int32),
              page_table=self.forward_metadata.block_kv_indices,
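The one-line fix casts the query to the backend's expected dtype before invoking the CUTLASS MLA decode kernel, guarding against a q produced in a different precision than the kernel expects. A minimal sketch of the pattern; shapes and dtypes here are illustrative:

import torch

q = torch.randn(8, 16, 576, dtype=torch.float16)  # illustrative query tensor
q_data_type = torch.bfloat16                      # dtype the kernel expects

# .to() returns the tensor unchanged when dtypes already match, so the cast
# is free in the common case and corrective otherwise.
q = q.to(q_data_type)
assert q.dtype == q_data_type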