sglang 0.4.6__tar.gz → 0.4.6.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (641)
  1. {sglang-0.4.6/sglang.egg-info → sglang-0.4.6.post2}/PKG-INFO +5 -4
  2. {sglang-0.4.6 → sglang-0.4.6.post2}/pyproject.toml +5 -4
  3. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/bench_one_batch.py +2 -0
  4. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/check_env.py +3 -3
  5. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/configs/__init__.py +4 -0
  6. sglang-0.4.6.post2/sglang/srt/configs/kimi_vl.py +38 -0
  7. sglang-0.4.6.post2/sglang/srt/configs/kimi_vl_moonvit.py +32 -0
  8. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/configs/model_config.py +15 -0
  9. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/conversation.py +122 -1
  10. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/decode.py +8 -2
  11. sglang-0.4.6.post2/sglang/srt/disaggregation/fake/__init__.py +1 -0
  12. sglang-0.4.6.post2/sglang/srt/disaggregation/fake/conn.py +88 -0
  13. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/prefill.py +12 -3
  14. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/utils.py +16 -2
  15. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/entrypoints/engine.py +52 -21
  16. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/entrypoints/http_server.py +27 -2
  17. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/function_call_parser.py +97 -0
  18. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/hf_transformers_utils.py +2 -0
  19. sglang-0.4.6.post2/sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
  20. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/flashinfer_backend.py +107 -82
  21. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
  22. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/flashmla_backend.py +3 -0
  23. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/utils.py +1 -1
  24. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/dp_attention.py +5 -2
  25. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
  26. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  27. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  28. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  29. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  30. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  31. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  33. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  35. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  36. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  37. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  39. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  40. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  41. sglang-0.4.6.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  42. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +10 -8
  43. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
  44. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/__init__.py +2 -2
  45. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/deep_gemm.py +1 -1
  46. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/fp8.py +20 -22
  47. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/fp8_utils.py +2 -2
  48. sglang-0.4.6.post2/sglang/srt/layers/utils.py +35 -0
  49. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/layers.py +35 -9
  50. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/lora_manager.py +84 -35
  51. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/data_parallel_controller.py +52 -34
  52. sglang-0.4.6.post2/sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
  53. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/schedule_batch.py +34 -15
  54. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/scheduler.py +273 -67
  55. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/scheduler_output_processor_mixin.py +26 -10
  56. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/tp_worker.py +52 -17
  57. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +18 -7
  58. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/mem_cache/memory_pool.py +70 -36
  59. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/model_executor/cuda_graph_runner.py +82 -19
  60. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/model_executor/forward_batch_info.py +31 -1
  61. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/model_executor/model_runner.py +123 -58
  62. sglang-0.4.6.post2/sglang/srt/models/deepseek_nextn.py +183 -0
  63. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/deepseek_v2.py +78 -18
  64. sglang-0.4.6.post2/sglang/srt/models/kimi_vl.py +308 -0
  65. sglang-0.4.6.post2/sglang/srt/models/kimi_vl_moonvit.py +639 -0
  66. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/llama.py +92 -30
  67. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/llama4.py +2 -1
  68. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/llama_eagle.py +4 -1
  69. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/llama_eagle3.py +4 -1
  70. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/qwen2_moe.py +8 -3
  71. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/qwen2_vl.py +0 -12
  72. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/qwen3_moe.py +8 -3
  73. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/openai_api/adapter.py +49 -8
  74. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/openai_api/protocol.py +13 -1
  75. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/reasoning_parser.py +25 -1
  76. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/server_args.py +83 -24
  77. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/speculative/eagle_worker.py +3 -2
  78. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/utils.py +91 -9
  79. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/runners.py +4 -0
  80. sglang-0.4.6.post2/sglang/test/send_one.py +144 -0
  81. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/test_utils.py +67 -0
  82. sglang-0.4.6.post2/sglang/version.py +1 -0
  83. {sglang-0.4.6 → sglang-0.4.6.post2/sglang.egg-info}/PKG-INFO +5 -4
  84. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang.egg-info/SOURCES.txt +25 -0
  85. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang.egg-info/requires.txt +4 -3
  86. sglang-0.4.6/sglang/srt/models/deepseek_nextn.py +0 -439
  87. sglang-0.4.6/sglang/test/send_one.py +0 -88
  88. sglang-0.4.6/sglang/version.py +0 -1
  89. {sglang-0.4.6 → sglang-0.4.6.post2}/LICENSE +0 -0
  90. {sglang-0.4.6 → sglang-0.4.6.post2}/README.md +0 -0
  91. {sglang-0.4.6 → sglang-0.4.6.post2}/setup.cfg +0 -0
  92. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/__init__.py +0 -0
  93. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/api.py +0 -0
  94. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/bench_offline_throughput.py +0 -0
  95. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/bench_one_batch_server.py +0 -0
  96. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/bench_serving.py +0 -0
  97. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/compile_deep_gemm.py +0 -0
  98. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/global_config.py +0 -0
  99. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/lang/backend/__init__.py +0 -0
  100. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/lang/backend/anthropic.py +0 -0
  101. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/lang/backend/base_backend.py +0 -0
  102. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/lang/backend/litellm.py +0 -0
  103. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/lang/backend/openai.py +0 -0
  104. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
  105. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/lang/backend/vertexai.py +0 -0
  106. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/lang/chat_template.py +0 -0
  107. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/lang/choices.py +0 -0
  108. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/lang/compiler.py +0 -0
  109. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/lang/interpreter.py +0 -0
  110. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/lang/ir.py +0 -0
  111. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/lang/tracer.py +0 -0
  112. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/launch_server.py +0 -0
  113. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/llama3_eval.py +0 -0
  114. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/_custom_ops.py +0 -0
  115. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/aio_rwlock.py +0 -0
  116. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/code_completion_parser.py +0 -0
  117. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/configs/chatglm.py +0 -0
  118. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/configs/dbrx.py +0 -0
  119. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/configs/deepseekvl2.py +0 -0
  120. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/configs/device_config.py +0 -0
  121. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/configs/exaone.py +0 -0
  122. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/configs/janus_pro.py +0 -0
  123. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/configs/load_config.py +0 -0
  124. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/configs/utils.py +0 -0
  125. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/connector/__init__.py +0 -0
  126. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/connector/base_connector.py +0 -0
  127. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/connector/redis.py +0 -0
  128. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/connector/s3.py +0 -0
  129. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/connector/serde/__init__.py +0 -0
  130. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/connector/serde/safe_serde.py +0 -0
  131. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/connector/serde/serde.py +0 -0
  132. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/connector/utils.py +0 -0
  133. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  134. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/constrained/llguidance_backend.py +0 -0
  135. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/constrained/outlines_backend.py +0 -0
  136. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  137. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
  138. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
  139. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  140. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/custom_op.py +0 -0
  141. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/base/__init__.py +0 -0
  142. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/base/conn.py +0 -0
  143. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/mini_lb.py +0 -0
  144. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
  145. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/mooncake/conn.py +0 -0
  146. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/mooncake/transfer_engine.py +0 -0
  147. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
  148. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/nixl/conn.py +0 -0
  149. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/distributed/__init__.py +0 -0
  150. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/distributed/communication_op.py +0 -0
  151. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  152. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  153. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  154. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  155. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  156. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  157. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  158. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  159. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/distributed/parallel_state.py +0 -0
  160. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/distributed/utils.py +0 -0
  161. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/entrypoints/EngineBase.py +0 -0
  162. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/entrypoints/http_server_engine.py +0 -0
  163. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/entrypoints/verl_engine.py +0 -0
  164. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/activation.py +0 -0
  165. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
  166. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  167. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/flashattention_backend.py +0 -0
  168. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  169. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/triton_backend.py +0 -0
  170. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  171. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  172. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  173. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  174. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
  175. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/attention/vision.py +0 -0
  176. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/elementwise.py +0 -0
  177. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/layernorm.py +0 -0
  178. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/linear.py +0 -0
  179. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/logits_processor.py +0 -0
  180. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  181. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  182. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
  183. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  184. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  185. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  186. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  187. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  188. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  189. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  190. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  191. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  192. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  193. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  194. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  195. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  196. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  197. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  198. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  199. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
  200. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  201. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  202. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  203. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  204. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  205. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  206. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  207. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  208. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  209. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  210. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  211. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  212. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  213. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  214. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  215. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  216. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  217. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  218. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  219. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  220. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  221. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  222. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  223. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  224. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  225. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  226. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  227. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  228. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
  229. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  230. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  231. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  232. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  233. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  234. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  235. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  236. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  237. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  238. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
  239. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
  240. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  241. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  242. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  243. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
  244. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  245. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  246. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  247. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  248. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  249. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  250. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  251. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  252. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  253. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  254. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  255. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  256. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  257. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  258. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  259. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  260. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  261. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  262. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  263. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  264. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  265. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  266. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  267. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  268. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  269. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  270. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
  271. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
  272. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  273. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  274. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  275. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  276. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
  277. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
  278. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  279. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  280. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  281. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  282. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  283. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  284. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  285. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  286. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  287. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  288. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  289. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
  290. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
  291. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  292. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  293. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  294. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  295. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  296. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  297. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  298. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  299. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  300. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  301. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  302. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  303. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  304. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  305. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  306. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  307. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  308. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
  309. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
  310. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  311. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  312. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  313. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  314. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  315. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  316. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
  317. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
  318. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  319. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  320. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/router.py +0 -0
  321. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/moe/topk.py +0 -0
  322. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/parameter.py +0 -0
  323. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/pooler.py +0 -0
  324. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/awq.py +0 -0
  325. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
  326. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
  327. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  328. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
  329. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
  330. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
  331. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
  332. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +0 -0
  333. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
  334. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
  335. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  336. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  337. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  338. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  339. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  340. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  341. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  342. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  343. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  344. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  345. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  346. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  347. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  348. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  349. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  350. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  351. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  352. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  353. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  354. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  355. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  356. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  357. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  358. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  359. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  360. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  361. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  362. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  363. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  364. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  365. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  366. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  367. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  368. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  369. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  370. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  371. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  372. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  373. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  374. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  375. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  376. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  377. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  378. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  379. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  380. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  381. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  382. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  383. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  384. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  385. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  386. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  387. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  388. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  389. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  390. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  391. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  392. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  393. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  394. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  395. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  396. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  397. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  398. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  399. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  400. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  401. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  402. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  403. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  404. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  405. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  406. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  407. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  408. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  409. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  410. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  411. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  412. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  413. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  414. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  415. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  416. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  417. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  418. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  419. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  420. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  421. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  422. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  423. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  424. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  425. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  426. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  427. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  428. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  429. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  430. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  431. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  432. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  433. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  434. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  435. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  436. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  437. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  438. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  439. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  440. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  441. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  442. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  443. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  444. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  445. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  446. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  447. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  448. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  449. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  450. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  451. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  452. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  453. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  454. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  455. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  456. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  457. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  458. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  459. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  460. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  461. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  462. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  463. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  464. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  465. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  466. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  467. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  468. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  469. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  470. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  471. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  472. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  473. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  474. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  475. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  476. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  477. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  478. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  479. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  480. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  481. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  482. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  483. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  484. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  485. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
  486. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  487. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
  488. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/gptq.py +0 -0
  489. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  490. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/int8_utils.py +0 -0
  491. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/kv_cache.py +0 -0
  492. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
  493. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/moe_wna16.py +0 -0
  494. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/utils.py +0 -0
  495. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
  496. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
  497. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/radix_attention.py +0 -0
  498. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/rotary_embedding.py +0 -0
  499. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/sampler.py +0 -0
  500. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/torchao_utils.py +0 -0
  501. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  502. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/backend/base_backend.py +0 -0
  503. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
  504. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/backend/triton_backend.py +0 -0
  505. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/lora.py +0 -0
  506. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/lora_config.py +0 -0
  507. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/mem_pool.py +0 -0
  508. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/triton_ops/__init__.py +0 -0
  509. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
  510. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
  511. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
  512. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
  513. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/lora/utils.py +0 -0
  514. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/cache_controller.py +0 -0
  515. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/configure_logging.py +0 -0
  516. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/detokenizer_manager.py +0 -0
  517. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/expert_distribution.py +0 -0
  518. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/io_struct.py +0 -0
  519. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/mm_utils.py +0 -0
  520. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processor.py +0 -0
  521. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/base_processor.py +0 -0
  522. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/clip.py +0 -0
  523. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +0 -0
  524. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/gemma3.py +0 -0
  525. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/janus_pro.py +0 -0
  526. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/llava.py +0 -0
  527. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/minicpm.py +0 -0
  528. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/mlama.py +0 -0
  529. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/mllama4.py +0 -0
  530. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/multimodal_processors/qwen_vl.py +0 -0
  531. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/schedule_policy.py +0 -0
  532. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/session_controller.py +0 -0
  533. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/tokenizer_manager.py +0 -0
  534. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/managers/utils.py +0 -0
  535. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  536. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  537. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
  538. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/mem_cache/hiradix_cache.py +0 -0
  539. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/mem_cache/paged_allocator.py +0 -0
  540. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
  541. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/metrics/collector.py +0 -0
  542. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/metrics/func_timer.py +0 -0
  543. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/mm_utils.py +0 -0
  544. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/model_loader/__init__.py +0 -0
  545. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/model_loader/loader.py +0 -0
  546. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/model_loader/utils.py +0 -0
  547. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/model_loader/weight_utils.py +0 -0
  548. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/model_parallel.py +0 -0
  549. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/baichuan.py +0 -0
  550. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/bert.py +0 -0
  551. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/chatglm.py +0 -0
  552. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/clip.py +0 -0
  553. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/commandr.py +0 -0
  554. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/dbrx.py +0 -0
  555. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/deepseek.py +0 -0
  556. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/deepseek_janus_pro.py +0 -0
  557. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/deepseek_vl2.py +0 -0
  558. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/exaone.py +0 -0
  559. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/gemma.py +0 -0
  560. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/gemma2.py +0 -0
  561. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/gemma2_reward.py +0 -0
  562. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/gemma3_causal.py +0 -0
  563. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/gemma3_mm.py +0 -0
  564. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/gpt2.py +0 -0
  565. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
  566. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/granite.py +0 -0
  567. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/grok.py +0 -0
  568. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/internlm2.py +0 -0
  569. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/internlm2_reward.py +0 -0
  570. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/llama_classification.py +0 -0
  571. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/llama_embedding.py +0 -0
  572. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/llama_reward.py +0 -0
  573. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/llava.py +0 -0
  574. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/llavavid.py +0 -0
  575. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/minicpm.py +0 -0
  576. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/minicpm3.py +0 -0
  577. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/minicpmo.py +0 -0
  578. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/minicpmv.py +0 -0
  579. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/mistral.py +0 -0
  580. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/mixtral.py +0 -0
  581. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/mixtral_quant.py +0 -0
  582. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/mllama.py +0 -0
  583. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/mllama4.py +0 -0
  584. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/olmo.py +0 -0
  585. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/olmo2.py +0 -0
  586. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/olmoe.py +0 -0
  587. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/phi3_small.py +0 -0
  588. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/qwen.py +0 -0
  589. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/qwen2.py +0 -0
  590. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/qwen2_5_vl.py +0 -0
  591. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/qwen2_classification.py +0 -0
  592. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/qwen2_eagle.py +0 -0
  593. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/qwen2_rm.py +0 -0
  594. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/qwen3.py +0 -0
  595. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/registry.py +0 -0
  596. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/roberta.py +0 -0
  597. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/stablelm.py +0 -0
  598. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/torch_native_llama.py +0 -0
  599. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/xverse.py +0 -0
  600. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/xverse_moe.py +0 -0
  601. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/models/yivl.py +0 -0
  602. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/patch_torch.py +0 -0
  603. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/platforms/interface.py +0 -0
  604. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  605. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  606. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
  607. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
  608. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  609. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
  610. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  611. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/sampling/sampling_params.py +0 -0
  612. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/speculative/build_eagle_tree.py +0 -0
  613. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
  614. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/speculative/eagle_utils.py +0 -0
  615. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/speculative/spec_info.py +0 -0
  616. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  617. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/warmup.py +0 -0
  618. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/__init__.py +0 -0
  619. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/attention/__init__.py +0 -0
  620. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/attention/test_flashattn_backend.py +0 -0
  621. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
  622. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
  623. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/few_shot_gsm8k.py +0 -0
  624. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  625. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/run_eval.py +0 -0
  626. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/simple_eval_common.py +0 -0
  627. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/simple_eval_gpqa.py +0 -0
  628. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/simple_eval_humaneval.py +0 -0
  629. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/simple_eval_math.py +0 -0
  630. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/simple_eval_mgsm.py +0 -0
  631. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/simple_eval_mmlu.py +0 -0
  632. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/test_activation.py +0 -0
  633. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/test_block_fp8.py +0 -0
  634. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/test_block_fp8_ep.py +0 -0
  635. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/test_custom_ops.py +0 -0
  636. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/test_dynamic_grad_mode.py +0 -0
  637. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/test_layernorm.py +0 -0
  638. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/test/test_programs.py +0 -0
  639. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang/utils.py +0 -0
  640. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang.egg-info/dependency_links.txt +0 -0
  641. {sglang-0.4.6 → sglang-0.4.6.post2}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.4.6
3
+ Version: 0.4.6.post2
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -238,15 +238,16 @@ Requires-Dist: pynvml; extra == "runtime-common"
238
238
  Requires-Dist: python-multipart; extra == "runtime-common"
239
239
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
240
240
  Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
241
- Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
241
+ Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
242
242
  Requires-Dist: transformers==4.51.1; extra == "runtime-common"
243
243
  Requires-Dist: uvicorn; extra == "runtime-common"
244
244
  Requires-Dist: uvloop; extra == "runtime-common"
245
245
  Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
246
+ Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
246
247
  Provides-Extra: srt
247
248
  Requires-Dist: sglang[runtime_common]; extra == "srt"
248
- Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
249
- Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
249
+ Requires-Dist: sgl-kernel==0.1.1; extra == "srt"
250
+ Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
250
251
  Requires-Dist: torch==2.6.0; extra == "srt"
251
252
  Requires-Dist: torchvision==0.21.0; extra == "srt"
252
253
  Requires-Dist: cuda-python; extra == "srt"
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.6"
7
+ version = "0.4.6.post2"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -37,17 +37,18 @@ runtime_common = [
37
37
  "python-multipart",
38
38
  "pyzmq>=25.1.2",
39
39
  "soundfile==0.13.1",
40
- "torchao>=0.7.0",
40
+ "torchao>=0.9.0",
41
41
  "transformers==4.51.1",
42
42
  "uvicorn",
43
43
  "uvloop",
44
44
  "xgrammar==0.1.17",
45
+ "blobfile==3.0.0"
45
46
  ]
46
47
 
47
48
  srt = [
48
49
  "sglang[runtime_common]",
49
- "sgl-kernel==0.0.9.post2",
50
- "flashinfer_python==0.2.3",
50
+ "sgl-kernel==0.1.1",
51
+ "flashinfer_python==0.2.5",
51
52
  "torch==2.6.0",
52
53
  "torchvision==0.21.0",
53
54
  "cuda-python",
@@ -154,6 +154,8 @@ def load_model(server_args, port_args, tp_rank):
154
154
  gpu_id=tp_rank,
155
155
  tp_rank=tp_rank,
156
156
  tp_size=server_args.tp_size,
157
+ pp_rank=0,
158
+ pp_size=1,
157
159
  nccl_port=port_args.nccl_port,
158
160
  server_args=server_args,
159
161
  )
@@ -20,7 +20,7 @@ def is_cuda_v2():
20
20
  PACKAGE_LIST = [
21
21
  "sglang",
22
22
  "sgl_kernel",
23
- "flashinfer",
23
+ "flashinfer_python",
24
24
  "triton",
25
25
  "transformers",
26
26
  "torchao",
@@ -36,8 +36,8 @@ PACKAGE_LIST = [
36
36
  "packaging",
37
37
  "psutil",
38
38
  "pydantic",
39
- "multipart",
40
- "zmq",
39
+ "python-multipart",
40
+ "pyzmq",
41
41
  "torchao",
42
42
  "uvicorn",
43
43
  "uvloop",
@@ -3,6 +3,8 @@ from sglang.srt.configs.dbrx import DbrxConfig
3
3
  from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config
4
4
  from sglang.srt.configs.exaone import ExaoneConfig
5
5
  from sglang.srt.configs.janus_pro import MultiModalityConfig
6
+ from sglang.srt.configs.kimi_vl import KimiVLConfig
7
+ from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
6
8
 
7
9
  __all__ = [
8
10
  "ExaoneConfig",
@@ -10,4 +12,6 @@ __all__ = [
10
12
  "DbrxConfig",
11
13
  "DeepseekVL2Config",
12
14
  "MultiModalityConfig",
15
+ "KimiVLConfig",
16
+ "MoonViTConfig",
13
17
  ]
@@ -0,0 +1,38 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
3
+ from typing import Optional, Union
4
+
5
+ from transformers.configuration_utils import PretrainedConfig
6
+
7
+ from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
8
+ from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
9
+
10
+
11
+ class KimiVLConfig(PretrainedConfig):
12
+ model_type = "kimi_vl"
13
+
14
+ def __init__(
15
+ self,
16
+ vision_config: Optional[Union[dict, MoonViTConfig]] = None,
17
+ text_config: Optional[Union[dict, DeepseekV2Config]] = None,
18
+ ignore_index: int = -100,
19
+ media_placeholder_token_id: int = 163605,
20
+ pad_token_id: int = 0,
21
+ **kwargs
22
+ ):
23
+ if vision_config is None:
24
+ vision_config = MoonViTConfig()
25
+ elif isinstance(vision_config, dict):
26
+ vision_config = MoonViTConfig(**vision_config)
27
+ self.vision_config = vision_config
28
+
29
+ if text_config is None:
30
+ text_config = DeepseekV2Config()
31
+ elif isinstance(text_config, dict):
32
+ text_config = DeepseekV2Config(**text_config)
33
+ self.text_config = text_config
34
+
35
+ self.ignore_index = ignore_index
36
+ self.media_placeholder_token_id = media_placeholder_token_id
37
+
38
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -0,0 +1,32 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
3
+ from transformers.configuration_utils import PretrainedConfig
4
+
5
+
6
+ class MoonViTConfig(PretrainedConfig):
7
+ model_type = "moonvit"
8
+
9
+ def __init__(
10
+ self,
11
+ patch_size: int = 14,
12
+ init_pos_emb_height: int = 64,
13
+ init_pos_emb_width: int = 64,
14
+ num_attention_heads: int = 16,
15
+ num_hidden_layers: int = 27,
16
+ hidden_size: int = 1152,
17
+ intermediate_size: int = 4304,
18
+ merge_kernel_size: tuple[int, int] = (2, 2),
19
+ **kwargs,
20
+ ):
21
+ super().__init__(**kwargs)
22
+ self.patch_size = patch_size
23
+ # Positional embedding config
24
+ self.init_pos_emb_height = init_pos_emb_height
25
+ self.init_pos_emb_width = init_pos_emb_width
26
+ # Transformer config
27
+ self.num_hidden_layers = num_hidden_layers
28
+ self.num_attention_heads = num_attention_heads
29
+ self.hidden_size = hidden_size
30
+ self.intermediate_size = intermediate_size
31
+ # Patch merger config
32
+ self.merge_kernel_size = merge_kernel_size
@@ -47,6 +47,7 @@ class ModelConfig:
47
47
  dtype: str = "auto",
48
48
  quantization: Optional[str] = None,
49
49
  override_config_file: Optional[str] = None,
50
+ is_draft_model: bool = False,
50
51
  ) -> None:
51
52
 
52
53
  self.model_path = model_path
@@ -85,6 +86,12 @@ class ModelConfig:
85
86
  else:
86
87
  enable_multimodal = True
87
88
 
89
+ if (
90
+ is_draft_model
91
+ and self.hf_config.architectures[0] == "DeepseekV3ForCausalLM"
92
+ ):
93
+ self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
94
+
88
95
  # Check model type
89
96
  self.is_generation = is_generation_model(
90
97
  self.hf_config.architectures, is_embedding
@@ -169,6 +176,13 @@ class ModelConfig:
169
176
  self.attention_arch = AttentionArch.MLA
170
177
  self.kv_lora_rank = self.hf_text_config.kv_lora_rank
171
178
  self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
179
+ elif "KimiVLForConditionalGeneration" in self.hf_config.architectures:
180
+ self.head_dim = 256
181
+ self.attention_arch = AttentionArch.MLA
182
+ self.kv_lora_rank = self.hf_text_config.kv_lora_rank
183
+ self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
184
+ self.v_head_dim = self.hf_text_config.v_head_dim
185
+ self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
172
186
  else:
173
187
  self.attention_arch = AttentionArch.MHA
174
188
 
@@ -523,6 +537,7 @@ multimodal_model_archs = [
523
537
  "Qwen2VLForConditionalGeneration",
524
538
  "Qwen2_5_VLForConditionalGeneration",
525
539
  "CLIPModel",
540
+ "KimiVLForConditionalGeneration",
526
541
  ]
527
542
 
528
543
 
@@ -17,7 +17,7 @@
17
17
  # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
18
18
  import dataclasses
19
19
  from enum import IntEnum, auto
20
- from typing import Dict, List, Optional, Tuple, Union
20
+ from typing import Callable, Dict, List, Optional, Tuple, Union
21
21
 
22
22
  from sglang.srt.openai_api.protocol import ChatCompletionRequest
23
23
 
@@ -407,6 +407,7 @@ class Conversation:
407
407
 
408
408
  # A global registry for all conversation templates
409
409
  chat_templates: Dict[str, Conversation] = {}
410
+ matching_function_registry: List[Callable] = []
410
411
 
411
412
 
412
413
  def register_conv_template(template: Conversation, override: bool = False):
@@ -419,6 +420,18 @@ def register_conv_template(template: Conversation, override: bool = False):
419
420
  chat_templates[template.name] = template
420
421
 
421
422
 
423
+ def register_conv_template_matching_function(func):
424
+ matching_function_registry.append(func)
425
+
426
+
427
+ def get_conv_template_by_model_path(model_path):
428
+ for matching_func in matching_function_registry:
429
+ conv_name = matching_func(model_path)
430
+ if conv_name is not None:
431
+ return conv_name
432
+ return None
433
+
434
+
422
435
  def chat_template_exists(template_name: str) -> bool:
423
436
  return template_name in chat_templates
424
437
 
@@ -792,3 +805,111 @@ register_conv_template(
792
805
  audio_token="(<audio>./</audio>)",
793
806
  )
794
807
  )
808
+
809
+ # Reference: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/chat_template.jinja
810
+ register_conv_template(
811
+ Conversation(
812
+ name="kimi-vl",
813
+ system_message="You are a helpful assistant",
814
+ system_template="<|im_system|>system<|im_middle|>{system_message}",
815
+ roles=(
816
+ "<|im_user|>user<|im_middle|>",
817
+ "<|im_assistant|>assistant<|im_middle|>",
818
+ ),
819
+ messages=[],
820
+ sep="<|im_end|>",
821
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
822
+ stop_str="<|im_end|>",
823
+ image_token="<|media_start|>image<|media_content|><|media_pad|><|media_end|>",
824
+ )
825
+ )
826
+
827
+
828
+ @register_conv_template_matching_function
829
+ def match_deepseek_janus_pro(model_path: str):
830
+ if (
831
+ "llama" in model_path.lower()
832
+ and "3.2" in model_path.lower()
833
+ and "vision" in model_path.lower()
834
+ ):
835
+ return "llama_3_vision"
836
+
837
+
838
+ @register_conv_template_matching_function
839
+ def match_deepseek_janus_pro(model_path: str):
840
+ if "janus" in model_path.lower():
841
+ return "janus-pro"
842
+
843
+
844
+ @register_conv_template_matching_function
845
+ def match_vicuna(model_path: str):
846
+ if "vicuna" in model_path.lower():
847
+ return "vicuna_v1.1"
848
+ if "llava-v1.5" in model_path.lower():
849
+ return "vicuna_v1.1"
850
+ if "llava-next-video-7b" in model_path.lower():
851
+ return "vicuna_v1.1"
852
+
853
+
854
+ @register_conv_template_matching_function
855
+ def match_llama2_chat(model_path: str):
856
+ model_path = model_path.lower()
857
+ if "llama-2" in model_path and "chat" in model_path:
858
+ return "llama-2"
859
+ if (
860
+ "mistral" in model_path or "mixtral" in model_path
861
+ ) and "instruct" in model_path:
862
+ return "llama-2"
863
+ if "codellama" in model_path and "instruct" in model_path:
864
+ return "llama-2"
865
+
866
+
867
+ @register_conv_template_matching_function
868
+ def match_deepseek_vl(model_path: str):
869
+ model_path = model_path.lower()
870
+ if "deepseek" in model_path and "vl2" in model_path:
871
+ return "deepseek-vl2"
872
+
873
+
874
+ @register_conv_template_matching_function
875
+ def match_chat_ml(model_path: str):
876
+ # import pdb;pdb.set_trace()
877
+ model_path = model_path.lower()
878
+ # Now the suffix for qwen2 chat model is "instruct"
879
+ if "gme" in model_path and "qwen" in model_path and "vl" in model_path:
880
+ return "gme-qwen2-vl"
881
+ if "qwen" in model_path and "vl" in model_path:
882
+ return "qwen2-vl"
883
+ if (
884
+ "llava-v1.6-34b" in model_path
885
+ or "llava-v1.6-yi-34b" in model_path
886
+ or "llava-next-video-34b" in model_path
887
+ or "llava-onevision-qwen2" in model_path
888
+ ):
889
+ return "chatml-llava"
890
+
891
+
892
+ @register_conv_template_matching_function
893
+ def match_gemma_it(model_path: str):
894
+ model_path = model_path.lower()
895
+ if "gemma" in model_path and "it" in model_path:
896
+ return "gemma-it"
897
+ if "gemma-3" in model_path and "1b" not in model_path:
898
+ # gemma-3-1b-it is completion model
899
+ return "gemma-it"
900
+
901
+
902
+ @register_conv_template_matching_function
903
+ def match_openbmb_minicpm(model_path: str):
904
+ model_path = model_path.lower()
905
+ if "minicpm-v" in model_path:
906
+ return "minicpmv"
907
+ elif "minicpm-o" in model_path:
908
+ return "minicpmo"
909
+
910
+
911
+ @register_conv_template_matching_function
912
+ def match_moonshot_kimivl(model_path: str):
913
+ model_path = model_path.lower()
914
+ if "kimi" in model_path and "vl" in model_path:
915
+ return "kimi-vl"
@@ -32,6 +32,7 @@ from torch.distributed import ProcessGroup
32
32
  from sglang.srt.disaggregation.base import BaseKVManager, BaseKVReceiver, KVArgs, KVPoll
33
33
  from sglang.srt.disaggregation.utils import (
34
34
  DisaggregationMode,
35
+ FakeBootstrapHost,
35
36
  KVClassType,
36
37
  ReqToMetadataIdxAllocator,
37
38
  TransferBackend,
@@ -133,8 +134,13 @@ class DecodePreallocQueue:
133
134
 
134
135
  def add(self, req: Req) -> None:
135
136
  """Add a request to the pending queue."""
136
-
137
- kv_receiver_class = get_kv_class(self.transfer_backend, KVClassType.RECEIVER)
137
+ if req.bootstrap_host == FakeBootstrapHost:
138
+ # Fake transfer for warmup reqs
139
+ kv_receiver_class = get_kv_class(TransferBackend.FAKE, KVClassType.RECEIVER)
140
+ else:
141
+ kv_receiver_class = get_kv_class(
142
+ self.transfer_backend, KVClassType.RECEIVER
143
+ )
138
144
  kv_receiver = kv_receiver_class(
139
145
  mgr=self.kv_manager,
140
146
  bootstrap_addr=f"{req.bootstrap_host}:{req.bootstrap_port}",
@@ -0,0 +1 @@
1
+ from .conn import FakeKVReceiver, FakeKVSender
@@ -0,0 +1,88 @@
1
+ import logging
2
+ from typing import Dict, List, Optional, Tuple, Union
3
+
4
+ import numpy as np
5
+ import numpy.typing as npt
6
+
7
+ from sglang.srt.disaggregation.base.conn import (
8
+ BaseKVManager,
9
+ BaseKVReceiver,
10
+ BaseKVSender,
11
+ KVArgs,
12
+ KVPoll,
13
+ )
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ # For warmup reqs, we don't kv transfer, we use the fake sender and receiver
19
+ class FakeKVSender(BaseKVSender):
20
+ def __init__(self, mgr: BaseKVManager, bootstrap_addr: str, bootstrap_room: int):
21
+ self.has_sent = False
22
+
23
+ def poll(self) -> KVPoll:
24
+ if self.has_sent is False:
25
+ # Assume handshake completed instantly
26
+ return KVPoll.WaitingForInput
27
+ else:
28
+ # Assume transfer completed instantly
29
+ logger.info("FakeKVSender poll success")
30
+ return KVPoll.Success
31
+
32
+ def init(
33
+ self,
34
+ kv_indices: list[int],
35
+ aux_index: Optional[int] = None,
36
+ dest_ranks: Optional[list[int]] = None,
37
+ ):
38
+ logger.info(
39
+ f"FakeKVSender init with kv_indices: {kv_indices}, aux_index: {aux_index}, dest_ranks: {dest_ranks}"
40
+ )
41
+ pass
42
+
43
+ def send(
44
+ self,
45
+ kv_indices: npt.NDArray[np.int64],
46
+ index_slice: slice,
47
+ is_last: bool,
48
+ ):
49
+ logger.info(
50
+ f"FakeKVSender send with kv_indices: {kv_indices}, index_slice: {index_slice}, is_last: {is_last}"
51
+ )
52
+ if is_last:
53
+ self.has_sent = True
54
+ logger.info(f"FakeKVSender send success")
55
+ else:
56
+ self.has_sent = False
57
+ logger.info(f"FakeKVSender send fake transfering")
58
+
59
+ def failure_exception(self):
60
+ raise Exception("Fake KVSender Exception")
61
+
62
+
63
+ class FakeKVReceiver(BaseKVReceiver):
64
+ def __init__(
65
+ self,
66
+ mgr: BaseKVManager,
67
+ bootstrap_addr: str,
68
+ bootstrap_room: Optional[int] = None,
69
+ ):
70
+ self.has_init = False
71
+
72
+ def poll(self) -> KVPoll:
73
+ if self.has_init is False:
74
+ # Assume handshake completed instantly
75
+ return KVPoll.WaitingForInput
76
+ else:
77
+ # Assume transfer completed instantly
78
+ logger.info("FakeKVReceiver poll success")
79
+ return KVPoll.Success
80
+
81
+ def init(self, kv_indices: list[int], aux_index: Optional[int] = None):
82
+ self.has_init = True
83
+ logger.info(
84
+ f"FakeKVReceiver init with kv_indices: {kv_indices}, aux_index: {aux_index}"
85
+ )
86
+
87
+ def failure_exception(self):
88
+ raise Exception("Fake KVReceiver Exception")
@@ -20,6 +20,7 @@ Life cycle of a request in the prefill server
20
20
  from __future__ import annotations
21
21
 
22
22
  import logging
23
+ import threading
23
24
  from collections import deque
24
25
  from typing import TYPE_CHECKING, List, Optional
25
26
 
@@ -28,6 +29,7 @@ import torch
28
29
  from sglang.srt.disaggregation.base import BaseKVManager, KVArgs, KVPoll
29
30
  from sglang.srt.disaggregation.utils import (
30
31
  DisaggregationMode,
32
+ FakeBootstrapHost,
31
33
  KVClassType,
32
34
  ReqToMetadataIdxAllocator,
33
35
  TransferBackend,
@@ -115,7 +117,11 @@ class PrefillBootstrapQueue:
115
117
  return kv_manager
116
118
 
117
119
  def add(self, req: Req) -> None:
118
- kv_sender_class = get_kv_class(self.transfer_backend, KVClassType.SENDER)
120
+ if req.bootstrap_host == FakeBootstrapHost:
121
+ # Fake transfer for warmup reqs
122
+ kv_sender_class = get_kv_class(TransferBackend.FAKE, KVClassType.SENDER)
123
+ else:
124
+ kv_sender_class = get_kv_class(self.transfer_backend, KVClassType.SENDER)
119
125
  req.disagg_kv_sender = kv_sender_class(
120
126
  mgr=self.kv_manager,
121
127
  bootstrap_addr=f"{req.bootstrap_host}:{self.bootstrap_port}",
@@ -256,7 +262,10 @@ class SchedulerDisaggregationPrefillMixin:
256
262
  self.running_batch.batch_is_full = False
257
263
 
258
264
  def process_batch_result_disagg_prefill(
259
- self: Scheduler, batch: ScheduleBatch, result: GenerationBatchResult
265
+ self: Scheduler,
266
+ batch: ScheduleBatch,
267
+ result: GenerationBatchResult,
268
+ launch_done: Optional[threading.Event] = None,
260
269
  ) -> None:
261
270
  """
262
271
  Transfer kv for prefill completed requests and add it into disagg_prefill_inflight_queue
@@ -280,7 +289,7 @@ class SchedulerDisaggregationPrefillMixin:
280
289
  # Transfer kv for prefill completed requests and add it into disagg_prefill_infight_queue
281
290
  if self.enable_overlap:
282
291
  # wait
283
- _, next_token_ids = self.tp_worker.resolve_batch_result(bid)
292
+ _, next_token_ids = self.tp_worker.resolve_last_batch_result(launch_done)
284
293
  else:
285
294
  next_token_ids = result.next_token_ids.tolist()
286
295
 
@@ -15,6 +15,9 @@ class DisaggregationMode(Enum):
15
15
  DECODE = "decode"
16
16
 
17
17
 
18
+ FakeBootstrapHost = "2.2.2.2"
19
+
20
+
18
21
  def poll_and_all_reduce(pollers, gloo_group):
19
22
  polls = [int(poller.poll()) for poller in pollers]
20
23
  tensor_to_reduce = torch.tensor(polls, dtype=torch.uint8, device="cpu")
@@ -59,6 +62,8 @@ class KVClassType(Enum):
59
62
 
60
63
 
61
64
  def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
65
+ from sglang.srt.disaggregation.fake import FakeKVReceiver, FakeKVSender
66
+
62
67
  if transfer_backend == TransferBackend.MOONCAKE:
63
68
  from sglang.srt.disaggregation.mooncake import (
64
69
  MooncakeKVBootstrapServer,
@@ -70,7 +75,7 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
70
75
  class_mapping = {
71
76
  KVClassType.MANAGER: MooncakeKVManager,
72
77
  KVClassType.SENDER: MooncakeKVSender,
73
- KVClassType.RECEIVER: MooncakeKVReceiver,
78
+ KVClassType.RECEIVER: (MooncakeKVReceiver),
74
79
  KVClassType.BOOTSTRAP_SERVER: MooncakeKVBootstrapServer,
75
80
  }
76
81
  return class_mapping.get(class_type)
@@ -85,10 +90,19 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
85
90
  class_mapping = {
86
91
  KVClassType.MANAGER: NixlKVManager,
87
92
  KVClassType.SENDER: NixlKVSender,
88
- KVClassType.RECEIVER: NixlKVReceiver,
93
+ KVClassType.RECEIVER: (NixlKVReceiver),
89
94
  KVClassType.BOOTSTRAP_SERVER: NixlKVBootstrapServer,
90
95
  }
91
96
  return class_mapping.get(class_type)
97
+ if transfer_backend == TransferBackend.FAKE:
98
+ from sglang.srt.disaggregation.fake import FakeKVReceiver, FakeKVSender
99
+
100
+ class_mapping = {
101
+ KVClassType.SENDER: FakeKVSender,
102
+ KVClassType.RECEIVER: (FakeKVReceiver),
103
+ }
104
+ return class_mapping.get(class_type)
105
+
92
106
  raise ValueError(f"Unsupported transfer backend: {transfer_backend}")
93
107
 
94
108