sglang 0.4.3.post1__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. sglang/api.py +1 -1
  2. sglang/bench_offline_throughput.py +19 -0
  3. sglang/bench_one_batch.py +2 -2
  4. sglang/bench_serving.py +123 -79
  5. sglang/global_config.py +8 -3
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/lang/ir.py +1 -1
  8. sglang/srt/_custom_ops.py +83 -91
  9. sglang/srt/configs/load_config.py +4 -1
  10. sglang/srt/configs/model_config.py +48 -2
  11. sglang/srt/configs/qwen2_5_vl_config.py +5 -2
  12. sglang/srt/constrained/base_grammar_backend.py +117 -15
  13. sglang/srt/constrained/llguidance_backend.py +151 -0
  14. sglang/srt/constrained/outlines_backend.py +24 -33
  15. sglang/srt/constrained/xgrammar_backend.py +69 -38
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
  17. sglang/srt/distributed/parallel_state.py +48 -3
  18. sglang/srt/entrypoints/engine.py +67 -9
  19. sglang/srt/entrypoints/http_server.py +190 -41
  20. sglang/srt/entrypoints/verl_engine.py +147 -0
  21. sglang/srt/function_call_parser.py +0 -1
  22. sglang/srt/layers/activation.py +11 -0
  23. sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
  24. sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +208 -295
  26. sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
  27. sglang/srt/layers/attention/torch_native_backend.py +1 -1
  28. sglang/srt/layers/attention/triton_backend.py +9 -6
  29. sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
  30. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
  31. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
  32. sglang/srt/layers/attention/utils.py +39 -0
  33. sglang/srt/layers/attention/vision.py +60 -63
  34. sglang/srt/layers/dp_attention.py +142 -1
  35. sglang/srt/layers/layernorm.py +1 -1
  36. sglang/srt/layers/linear.py +3 -1
  37. sglang/srt/layers/logits_processor.py +281 -45
  38. sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
  39. sglang/srt/layers/moe/ep_moe/layer.py +140 -28
  40. sglang/srt/layers/moe/fused_moe_native.py +2 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
  61. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
  62. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
  63. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
  64. sglang/srt/layers/moe/topk.py +13 -4
  65. sglang/srt/layers/quantization/__init__.py +111 -7
  66. sglang/srt/layers/quantization/blockwise_int8.py +409 -0
  67. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  69. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  71. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  72. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  73. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  75. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  76. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  77. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  78. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  80. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  82. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  83. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  84. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  85. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  87. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  89. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  90. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  91. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  93. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  94. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  95. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  96. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  97. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  98. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  99. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  100. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  101. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  102. sglang/srt/layers/quantization/fp8.py +69 -28
  103. sglang/srt/layers/quantization/fp8_utils.py +17 -1
  104. sglang/srt/layers/quantization/gptq.py +416 -0
  105. sglang/srt/layers/quantization/int8_kernel.py +327 -0
  106. sglang/srt/layers/quantization/int8_utils.py +73 -0
  107. sglang/srt/layers/quantization/modelopt_quant.py +18 -1
  108. sglang/srt/layers/radix_attention.py +1 -0
  109. sglang/srt/layers/rotary_embedding.py +0 -1
  110. sglang/srt/layers/sampler.py +76 -31
  111. sglang/srt/layers/vocab_parallel_embedding.py +14 -13
  112. sglang/srt/lora/lora.py +17 -1
  113. sglang/srt/lora/lora_config.py +5 -0
  114. sglang/srt/lora/lora_manager.py +1 -3
  115. sglang/srt/managers/cache_controller.py +193 -62
  116. sglang/srt/managers/configure_logging.py +2 -1
  117. sglang/srt/managers/data_parallel_controller.py +6 -2
  118. sglang/srt/managers/detokenizer_manager.py +124 -102
  119. sglang/srt/managers/image_processor.py +2 -1
  120. sglang/srt/managers/io_struct.py +143 -6
  121. sglang/srt/managers/schedule_batch.py +238 -197
  122. sglang/srt/managers/schedule_policy.py +29 -29
  123. sglang/srt/managers/scheduler.py +681 -259
  124. sglang/srt/managers/session_controller.py +6 -2
  125. sglang/srt/managers/tokenizer_manager.py +224 -68
  126. sglang/srt/managers/tp_worker.py +15 -4
  127. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  128. sglang/srt/mem_cache/chunk_cache.py +18 -11
  129. sglang/srt/mem_cache/hiradix_cache.py +394 -0
  130. sglang/srt/mem_cache/memory_pool.py +44 -18
  131. sglang/srt/mem_cache/radix_cache.py +58 -47
  132. sglang/srt/metrics/collector.py +94 -36
  133. sglang/srt/model_executor/cuda_graph_runner.py +55 -24
  134. sglang/srt/model_executor/forward_batch_info.py +49 -16
  135. sglang/srt/model_executor/model_runner.py +209 -28
  136. sglang/srt/model_loader/loader.py +3 -3
  137. sglang/srt/model_loader/weight_utils.py +36 -14
  138. sglang/srt/models/baichuan.py +31 -6
  139. sglang/srt/models/chatglm.py +39 -7
  140. sglang/srt/models/commandr.py +29 -5
  141. sglang/srt/models/dbrx.py +31 -5
  142. sglang/srt/models/deepseek.py +43 -6
  143. sglang/srt/models/deepseek_nextn.py +32 -19
  144. sglang/srt/models/deepseek_v2.py +265 -29
  145. sglang/srt/models/exaone.py +19 -9
  146. sglang/srt/models/gemma.py +22 -8
  147. sglang/srt/models/gemma2.py +25 -12
  148. sglang/srt/models/gemma2_reward.py +5 -1
  149. sglang/srt/models/gpt2.py +28 -13
  150. sglang/srt/models/gpt_bigcode.py +27 -5
  151. sglang/srt/models/granite.py +21 -9
  152. sglang/srt/models/grok.py +21 -4
  153. sglang/srt/models/internlm2.py +36 -6
  154. sglang/srt/models/internlm2_reward.py +5 -1
  155. sglang/srt/models/llama.py +26 -9
  156. sglang/srt/models/llama_classification.py +5 -1
  157. sglang/srt/models/llama_eagle.py +17 -4
  158. sglang/srt/models/llama_embedding.py +5 -1
  159. sglang/srt/models/llama_reward.py +7 -2
  160. sglang/srt/models/llava.py +19 -3
  161. sglang/srt/models/llavavid.py +10 -1
  162. sglang/srt/models/minicpm.py +26 -2
  163. sglang/srt/models/minicpm3.py +39 -3
  164. sglang/srt/models/minicpmv.py +45 -14
  165. sglang/srt/models/mixtral.py +20 -9
  166. sglang/srt/models/mixtral_quant.py +50 -8
  167. sglang/srt/models/mllama.py +57 -11
  168. sglang/srt/models/olmo.py +34 -6
  169. sglang/srt/models/olmo2.py +34 -13
  170. sglang/srt/models/olmoe.py +26 -4
  171. sglang/srt/models/phi3_small.py +29 -10
  172. sglang/srt/models/qwen.py +26 -3
  173. sglang/srt/models/qwen2.py +26 -4
  174. sglang/srt/models/qwen2_5_vl.py +46 -8
  175. sglang/srt/models/qwen2_eagle.py +17 -5
  176. sglang/srt/models/qwen2_moe.py +44 -6
  177. sglang/srt/models/qwen2_rm.py +78 -0
  178. sglang/srt/models/qwen2_vl.py +39 -8
  179. sglang/srt/models/stablelm.py +32 -5
  180. sglang/srt/models/torch_native_llama.py +5 -2
  181. sglang/srt/models/xverse.py +21 -9
  182. sglang/srt/models/xverse_moe.py +45 -7
  183. sglang/srt/models/yivl.py +2 -1
  184. sglang/srt/openai_api/adapter.py +109 -24
  185. sglang/srt/openai_api/protocol.py +17 -1
  186. sglang/srt/reasoning_parser.py +154 -0
  187. sglang/srt/sampling/penaltylib/__init__.py +4 -6
  188. sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
  189. sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
  190. sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
  191. sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
  192. sglang/srt/sampling/sampling_batch_info.py +79 -157
  193. sglang/srt/sampling/sampling_params.py +16 -13
  194. sglang/srt/server_args.py +136 -52
  195. sglang/srt/speculative/build_eagle_tree.py +2 -8
  196. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
  197. sglang/srt/speculative/eagle_utils.py +92 -58
  198. sglang/srt/speculative/eagle_worker.py +186 -94
  199. sglang/srt/speculative/spec_info.py +1 -13
  200. sglang/srt/utils.py +43 -17
  201. sglang/srt/warmup.py +47 -0
  202. sglang/test/few_shot_gsm8k.py +4 -1
  203. sglang/test/runners.py +389 -126
  204. sglang/test/send_one.py +88 -0
  205. sglang/test/test_block_fp8_ep.py +361 -0
  206. sglang/test/test_programs.py +1 -1
  207. sglang/test/test_utils.py +138 -84
  208. sglang/utils.py +50 -60
  209. sglang/version.py +1 -1
  210. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
  211. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +214 -166
  212. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
  213. sglang/bench_latency.py +0 -1
  214. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
  215. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
  216. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
  217. sglang/test/srt/sampling/penaltylib/utils.py +0 -344
  218. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
  219. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
@@ -5,7 +5,7 @@
5
5
  "BLOCK_SIZE_K": 256,
6
6
  "GROUP_SIZE_M": 1,
7
7
  "num_warps": 2,
8
- "num_stages": 0,
8
+ "num_stages": 2,
9
9
  "waves_per_eu": 0,
10
10
  "matrix_instr_nonkdim": 16,
11
11
  "kpack": 2
@@ -16,7 +16,7 @@
16
16
  "BLOCK_SIZE_K": 128,
17
17
  "GROUP_SIZE_M": 1,
18
18
  "num_warps": 4,
19
- "num_stages": 0,
19
+ "num_stages": 2,
20
20
  "waves_per_eu": 0,
21
21
  "matrix_instr_nonkdim": 16,
22
22
  "kpack": 1
@@ -27,7 +27,7 @@
27
27
  "BLOCK_SIZE_K": 128,
28
28
  "GROUP_SIZE_M": 1,
29
29
  "num_warps": 4,
30
- "num_stages": 0,
30
+ "num_stages": 2,
31
31
  "waves_per_eu": 0,
32
32
  "matrix_instr_nonkdim": 16,
33
33
  "kpack": 2
@@ -38,7 +38,7 @@
38
38
  "BLOCK_SIZE_K": 256,
39
39
  "GROUP_SIZE_M": 1,
40
40
  "num_warps": 2,
41
- "num_stages": 0,
41
+ "num_stages": 2,
42
42
  "waves_per_eu": 0,
43
43
  "matrix_instr_nonkdim": 16,
44
44
  "kpack": 2
@@ -49,7 +49,7 @@
49
49
  "BLOCK_SIZE_K": 64,
50
50
  "GROUP_SIZE_M": 1,
51
51
  "num_warps": 8,
52
- "num_stages": 0,
52
+ "num_stages": 2,
53
53
  "waves_per_eu": 0,
54
54
  "matrix_instr_nonkdim": 16,
55
55
  "kpack": 1
@@ -60,7 +60,7 @@
60
60
  "BLOCK_SIZE_K": 64,
61
61
  "GROUP_SIZE_M": 1,
62
62
  "num_warps": 4,
63
- "num_stages": 0,
63
+ "num_stages": 2,
64
64
  "waves_per_eu": 0,
65
65
  "matrix_instr_nonkdim": 16,
66
66
  "kpack": 1
@@ -71,7 +71,7 @@
71
71
  "BLOCK_SIZE_K": 256,
72
72
  "GROUP_SIZE_M": 4,
73
73
  "num_warps": 2,
74
- "num_stages": 0,
74
+ "num_stages": 2,
75
75
  "waves_per_eu": 0,
76
76
  "matrix_instr_nonkdim": 16,
77
77
  "kpack": 2
@@ -82,7 +82,7 @@
82
82
  "BLOCK_SIZE_K": 64,
83
83
  "GROUP_SIZE_M": 1,
84
84
  "num_warps": 8,
85
- "num_stages": 0,
85
+ "num_stages": 2,
86
86
  "waves_per_eu": 0,
87
87
  "matrix_instr_nonkdim": 16,
88
88
  "kpack": 2
@@ -93,7 +93,7 @@
93
93
  "BLOCK_SIZE_K": 64,
94
94
  "GROUP_SIZE_M": 1,
95
95
  "num_warps": 2,
96
- "num_stages": 0,
96
+ "num_stages": 2,
97
97
  "waves_per_eu": 0,
98
98
  "matrix_instr_nonkdim": 16,
99
99
  "kpack": 1
@@ -104,7 +104,7 @@
104
104
  "BLOCK_SIZE_K": 256,
105
105
  "GROUP_SIZE_M": 4,
106
106
  "num_warps": 4,
107
- "num_stages": 0,
107
+ "num_stages": 2,
108
108
  "waves_per_eu": 0,
109
109
  "matrix_instr_nonkdim": 16,
110
110
  "kpack": 2
@@ -115,7 +115,7 @@
115
115
  "BLOCK_SIZE_K": 64,
116
116
  "GROUP_SIZE_M": 4,
117
117
  "num_warps": 8,
118
- "num_stages": 0,
118
+ "num_stages": 2,
119
119
  "waves_per_eu": 0,
120
120
  "matrix_instr_nonkdim": 16,
121
121
  "kpack": 1
@@ -126,7 +126,7 @@
126
126
  "BLOCK_SIZE_K": 64,
127
127
  "GROUP_SIZE_M": 4,
128
128
  "num_warps": 8,
129
- "num_stages": 0,
129
+ "num_stages": 2,
130
130
  "waves_per_eu": 0,
131
131
  "matrix_instr_nonkdim": 16,
132
132
  "kpack": 1
@@ -137,7 +137,7 @@
137
137
  "BLOCK_SIZE_K": 128,
138
138
  "GROUP_SIZE_M": 1,
139
139
  "num_warps": 8,
140
- "num_stages": 0,
140
+ "num_stages": 2,
141
141
  "waves_per_eu": 0,
142
142
  "matrix_instr_nonkdim": 16,
143
143
  "kpack": 2
@@ -148,7 +148,7 @@
148
148
  "BLOCK_SIZE_K": 64,
149
149
  "GROUP_SIZE_M": 1,
150
150
  "num_warps": 8,
151
- "num_stages": 0,
151
+ "num_stages": 2,
152
152
  "waves_per_eu": 0,
153
153
  "matrix_instr_nonkdim": 16,
154
154
  "kpack": 1
@@ -159,7 +159,7 @@
159
159
  "BLOCK_SIZE_K": 64,
160
160
  "GROUP_SIZE_M": 1,
161
161
  "num_warps": 8,
162
- "num_stages": 0,
162
+ "num_stages": 2,
163
163
  "waves_per_eu": 0,
164
164
  "matrix_instr_nonkdim": 16,
165
165
  "kpack": 1
@@ -170,7 +170,7 @@
170
170
  "BLOCK_SIZE_K": 64,
171
171
  "GROUP_SIZE_M": 1,
172
172
  "num_warps": 8,
173
- "num_stages": 0,
173
+ "num_stages": 2,
174
174
  "waves_per_eu": 0,
175
175
  "matrix_instr_nonkdim": 16,
176
176
  "kpack": 2
@@ -181,7 +181,7 @@
181
181
  "BLOCK_SIZE_K": 64,
182
182
  "GROUP_SIZE_M": 1,
183
183
  "num_warps": 8,
184
- "num_stages": 0,
184
+ "num_stages": 2,
185
185
  "waves_per_eu": 0,
186
186
  "matrix_instr_nonkdim": 16,
187
187
  "kpack": 1
@@ -192,7 +192,7 @@
192
192
  "BLOCK_SIZE_K": 64,
193
193
  "GROUP_SIZE_M": 1,
194
194
  "num_warps": 8,
195
- "num_stages": 0,
195
+ "num_stages": 2,
196
196
  "waves_per_eu": 0,
197
197
  "matrix_instr_nonkdim": 16,
198
198
  "kpack": 1
@@ -5,7 +5,7 @@
5
5
  "BLOCK_SIZE_K": 256,
6
6
  "GROUP_SIZE_M": 1,
7
7
  "num_warps": 2,
8
- "num_stages": 0,
8
+ "num_stages": 2,
9
9
  "waves_per_eu": 0,
10
10
  "matrix_instr_nonkdim": 16,
11
11
  "kpack": 2
@@ -16,7 +16,7 @@
16
16
  "BLOCK_SIZE_K": 128,
17
17
  "GROUP_SIZE_M": 1,
18
18
  "num_warps": 4,
19
- "num_stages": 0,
19
+ "num_stages": 2,
20
20
  "waves_per_eu": 0,
21
21
  "matrix_instr_nonkdim": 16,
22
22
  "kpack": 1
@@ -27,7 +27,7 @@
27
27
  "BLOCK_SIZE_K": 128,
28
28
  "GROUP_SIZE_M": 1,
29
29
  "num_warps": 4,
30
- "num_stages": 0,
30
+ "num_stages": 2,
31
31
  "waves_per_eu": 0,
32
32
  "matrix_instr_nonkdim": 16,
33
33
  "kpack": 2
@@ -38,7 +38,7 @@
38
38
  "BLOCK_SIZE_K": 256,
39
39
  "GROUP_SIZE_M": 1,
40
40
  "num_warps": 2,
41
- "num_stages": 0,
41
+ "num_stages": 2,
42
42
  "waves_per_eu": 0,
43
43
  "matrix_instr_nonkdim": 16,
44
44
  "kpack": 2
@@ -49,7 +49,7 @@
49
49
  "BLOCK_SIZE_K": 64,
50
50
  "GROUP_SIZE_M": 1,
51
51
  "num_warps": 8,
52
- "num_stages": 0,
52
+ "num_stages": 2,
53
53
  "waves_per_eu": 0,
54
54
  "matrix_instr_nonkdim": 16,
55
55
  "kpack": 1
@@ -60,7 +60,7 @@
60
60
  "BLOCK_SIZE_K": 64,
61
61
  "GROUP_SIZE_M": 1,
62
62
  "num_warps": 4,
63
- "num_stages": 0,
63
+ "num_stages": 2,
64
64
  "waves_per_eu": 0,
65
65
  "matrix_instr_nonkdim": 16,
66
66
  "kpack": 1
@@ -71,7 +71,7 @@
71
71
  "BLOCK_SIZE_K": 256,
72
72
  "GROUP_SIZE_M": 4,
73
73
  "num_warps": 2,
74
- "num_stages": 0,
74
+ "num_stages": 2,
75
75
  "waves_per_eu": 0,
76
76
  "matrix_instr_nonkdim": 16,
77
77
  "kpack": 2
@@ -82,7 +82,7 @@
82
82
  "BLOCK_SIZE_K": 64,
83
83
  "GROUP_SIZE_M": 1,
84
84
  "num_warps": 8,
85
- "num_stages": 0,
85
+ "num_stages": 2,
86
86
  "waves_per_eu": 0,
87
87
  "matrix_instr_nonkdim": 16,
88
88
  "kpack": 2
@@ -93,7 +93,7 @@
93
93
  "BLOCK_SIZE_K": 64,
94
94
  "GROUP_SIZE_M": 1,
95
95
  "num_warps": 2,
96
- "num_stages": 0,
96
+ "num_stages": 2,
97
97
  "waves_per_eu": 0,
98
98
  "matrix_instr_nonkdim": 16,
99
99
  "kpack": 1
@@ -104,7 +104,7 @@
104
104
  "BLOCK_SIZE_K": 256,
105
105
  "GROUP_SIZE_M": 4,
106
106
  "num_warps": 4,
107
- "num_stages": 0,
107
+ "num_stages": 2,
108
108
  "waves_per_eu": 0,
109
109
  "matrix_instr_nonkdim": 16,
110
110
  "kpack": 2
@@ -115,7 +115,7 @@
115
115
  "BLOCK_SIZE_K": 64,
116
116
  "GROUP_SIZE_M": 4,
117
117
  "num_warps": 8,
118
- "num_stages": 0,
118
+ "num_stages": 2,
119
119
  "waves_per_eu": 0,
120
120
  "matrix_instr_nonkdim": 16,
121
121
  "kpack": 1
@@ -126,7 +126,7 @@
126
126
  "BLOCK_SIZE_K": 64,
127
127
  "GROUP_SIZE_M": 4,
128
128
  "num_warps": 8,
129
- "num_stages": 0,
129
+ "num_stages": 2,
130
130
  "waves_per_eu": 0,
131
131
  "matrix_instr_nonkdim": 16,
132
132
  "kpack": 1
@@ -137,7 +137,7 @@
137
137
  "BLOCK_SIZE_K": 128,
138
138
  "GROUP_SIZE_M": 1,
139
139
  "num_warps": 8,
140
- "num_stages": 0,
140
+ "num_stages": 2,
141
141
  "waves_per_eu": 0,
142
142
  "matrix_instr_nonkdim": 16,
143
143
  "kpack": 2
@@ -148,7 +148,7 @@
148
148
  "BLOCK_SIZE_K": 64,
149
149
  "GROUP_SIZE_M": 1,
150
150
  "num_warps": 8,
151
- "num_stages": 0,
151
+ "num_stages": 2,
152
152
  "waves_per_eu": 0,
153
153
  "matrix_instr_nonkdim": 16,
154
154
  "kpack": 1
@@ -159,7 +159,7 @@
159
159
  "BLOCK_SIZE_K": 64,
160
160
  "GROUP_SIZE_M": 1,
161
161
  "num_warps": 8,
162
- "num_stages": 0,
162
+ "num_stages": 2,
163
163
  "waves_per_eu": 0,
164
164
  "matrix_instr_nonkdim": 16,
165
165
  "kpack": 1
@@ -170,7 +170,7 @@
170
170
  "BLOCK_SIZE_K": 64,
171
171
  "GROUP_SIZE_M": 1,
172
172
  "num_warps": 8,
173
- "num_stages": 0,
173
+ "num_stages": 2,
174
174
  "waves_per_eu": 0,
175
175
  "matrix_instr_nonkdim": 16,
176
176
  "kpack": 2
@@ -181,7 +181,7 @@
181
181
  "BLOCK_SIZE_K": 64,
182
182
  "GROUP_SIZE_M": 1,
183
183
  "num_warps": 8,
184
- "num_stages": 0,
184
+ "num_stages": 2,
185
185
  "waves_per_eu": 0,
186
186
  "matrix_instr_nonkdim": 16,
187
187
  "kpack": 1
@@ -192,7 +192,7 @@
192
192
  "BLOCK_SIZE_K": 64,
193
193
  "GROUP_SIZE_M": 1,
194
194
  "num_warps": 8,
195
- "num_stages": 0,
195
+ "num_stages": 2,
196
196
  "waves_per_eu": 0,
197
197
  "matrix_instr_nonkdim": 16,
198
198
  "kpack": 1
@@ -5,7 +5,7 @@
5
5
  "BLOCK_SIZE_K": 256,
6
6
  "GROUP_SIZE_M": 1,
7
7
  "num_warps": 2,
8
- "num_stages": 0,
8
+ "num_stages": 2,
9
9
  "waves_per_eu": 0,
10
10
  "matrix_instr_nonkdim": 16,
11
11
  "kpack": 2
@@ -16,7 +16,7 @@
16
16
  "BLOCK_SIZE_K": 128,
17
17
  "GROUP_SIZE_M": 1,
18
18
  "num_warps": 4,
19
- "num_stages": 0,
19
+ "num_stages": 2,
20
20
  "waves_per_eu": 0,
21
21
  "matrix_instr_nonkdim": 16,
22
22
  "kpack": 1
@@ -27,7 +27,7 @@
27
27
  "BLOCK_SIZE_K": 128,
28
28
  "GROUP_SIZE_M": 1,
29
29
  "num_warps": 4,
30
- "num_stages": 0,
30
+ "num_stages": 2,
31
31
  "waves_per_eu": 0,
32
32
  "matrix_instr_nonkdim": 16,
33
33
  "kpack": 2
@@ -38,7 +38,7 @@
38
38
  "BLOCK_SIZE_K": 256,
39
39
  "GROUP_SIZE_M": 1,
40
40
  "num_warps": 2,
41
- "num_stages": 0,
41
+ "num_stages": 2,
42
42
  "waves_per_eu": 0,
43
43
  "matrix_instr_nonkdim": 16,
44
44
  "kpack": 2
@@ -49,7 +49,7 @@
49
49
  "BLOCK_SIZE_K": 64,
50
50
  "GROUP_SIZE_M": 1,
51
51
  "num_warps": 8,
52
- "num_stages": 0,
52
+ "num_stages": 2,
53
53
  "waves_per_eu": 0,
54
54
  "matrix_instr_nonkdim": 16,
55
55
  "kpack": 1
@@ -60,7 +60,7 @@
60
60
  "BLOCK_SIZE_K": 64,
61
61
  "GROUP_SIZE_M": 1,
62
62
  "num_warps": 4,
63
- "num_stages": 0,
63
+ "num_stages": 2,
64
64
  "waves_per_eu": 0,
65
65
  "matrix_instr_nonkdim": 16,
66
66
  "kpack": 1
@@ -71,7 +71,7 @@
71
71
  "BLOCK_SIZE_K": 256,
72
72
  "GROUP_SIZE_M": 4,
73
73
  "num_warps": 2,
74
- "num_stages": 0,
74
+ "num_stages": 2,
75
75
  "waves_per_eu": 0,
76
76
  "matrix_instr_nonkdim": 16,
77
77
  "kpack": 2
@@ -82,7 +82,7 @@
82
82
  "BLOCK_SIZE_K": 64,
83
83
  "GROUP_SIZE_M": 1,
84
84
  "num_warps": 8,
85
- "num_stages": 0,
85
+ "num_stages": 2,
86
86
  "waves_per_eu": 0,
87
87
  "matrix_instr_nonkdim": 16,
88
88
  "kpack": 2
@@ -93,7 +93,7 @@
93
93
  "BLOCK_SIZE_K": 64,
94
94
  "GROUP_SIZE_M": 1,
95
95
  "num_warps": 2,
96
- "num_stages": 0,
96
+ "num_stages": 2,
97
97
  "waves_per_eu": 0,
98
98
  "matrix_instr_nonkdim": 16,
99
99
  "kpack": 1
@@ -104,7 +104,7 @@
104
104
  "BLOCK_SIZE_K": 256,
105
105
  "GROUP_SIZE_M": 4,
106
106
  "num_warps": 4,
107
- "num_stages": 0,
107
+ "num_stages": 2,
108
108
  "waves_per_eu": 0,
109
109
  "matrix_instr_nonkdim": 16,
110
110
  "kpack": 2
@@ -115,7 +115,7 @@
115
115
  "BLOCK_SIZE_K": 64,
116
116
  "GROUP_SIZE_M": 4,
117
117
  "num_warps": 8,
118
- "num_stages": 0,
118
+ "num_stages": 2,
119
119
  "waves_per_eu": 0,
120
120
  "matrix_instr_nonkdim": 16,
121
121
  "kpack": 1
@@ -126,7 +126,7 @@
126
126
  "BLOCK_SIZE_K": 64,
127
127
  "GROUP_SIZE_M": 4,
128
128
  "num_warps": 8,
129
- "num_stages": 0,
129
+ "num_stages": 2,
130
130
  "waves_per_eu": 0,
131
131
  "matrix_instr_nonkdim": 16,
132
132
  "kpack": 1
@@ -137,7 +137,7 @@
137
137
  "BLOCK_SIZE_K": 128,
138
138
  "GROUP_SIZE_M": 1,
139
139
  "num_warps": 8,
140
- "num_stages": 0,
140
+ "num_stages": 2,
141
141
  "waves_per_eu": 0,
142
142
  "matrix_instr_nonkdim": 16,
143
143
  "kpack": 2
@@ -148,7 +148,7 @@
148
148
  "BLOCK_SIZE_K": 64,
149
149
  "GROUP_SIZE_M": 1,
150
150
  "num_warps": 8,
151
- "num_stages": 0,
151
+ "num_stages": 2,
152
152
  "waves_per_eu": 0,
153
153
  "matrix_instr_nonkdim": 16,
154
154
  "kpack": 1
@@ -159,7 +159,7 @@
159
159
  "BLOCK_SIZE_K": 64,
160
160
  "GROUP_SIZE_M": 1,
161
161
  "num_warps": 8,
162
- "num_stages": 0,
162
+ "num_stages": 2,
163
163
  "waves_per_eu": 0,
164
164
  "matrix_instr_nonkdim": 16,
165
165
  "kpack": 1
@@ -170,7 +170,7 @@
170
170
  "BLOCK_SIZE_K": 64,
171
171
  "GROUP_SIZE_M": 1,
172
172
  "num_warps": 8,
173
- "num_stages": 0,
173
+ "num_stages": 2,
174
174
  "waves_per_eu": 0,
175
175
  "matrix_instr_nonkdim": 16,
176
176
  "kpack": 2
@@ -181,7 +181,7 @@
181
181
  "BLOCK_SIZE_K": 64,
182
182
  "GROUP_SIZE_M": 1,
183
183
  "num_warps": 8,
184
- "num_stages": 0,
184
+ "num_stages": 2,
185
185
  "waves_per_eu": 0,
186
186
  "matrix_instr_nonkdim": 16,
187
187
  "kpack": 1
@@ -192,7 +192,7 @@
192
192
  "BLOCK_SIZE_K": 64,
193
193
  "GROUP_SIZE_M": 1,
194
194
  "num_warps": 8,
195
- "num_stages": 0,
195
+ "num_stages": 2,
196
196
  "waves_per_eu": 0,
197
197
  "matrix_instr_nonkdim": 16,
198
198
  "kpack": 1
@@ -5,7 +5,7 @@
5
5
  "BLOCK_SIZE_K": 128,
6
6
  "GROUP_SIZE_M": 1,
7
7
  "num_warps": 2,
8
- "num_stages": 0,
8
+ "num_stages": 2,
9
9
  "waves_per_eu": 0,
10
10
  "matrix_instr_nonkdim": 16,
11
11
  "kpack": 1
@@ -16,7 +16,7 @@
16
16
  "BLOCK_SIZE_K": 64,
17
17
  "GROUP_SIZE_M": 1,
18
18
  "num_warps": 2,
19
- "num_stages": 0,
19
+ "num_stages": 2,
20
20
  "waves_per_eu": 0,
21
21
  "matrix_instr_nonkdim": 16,
22
22
  "kpack": 2
@@ -27,7 +27,7 @@
27
27
  "BLOCK_SIZE_K": 256,
28
28
  "GROUP_SIZE_M": 1,
29
29
  "num_warps": 2,
30
- "num_stages": 0,
30
+ "num_stages": 2,
31
31
  "waves_per_eu": 0,
32
32
  "matrix_instr_nonkdim": 16,
33
33
  "kpack": 2
@@ -38,7 +38,7 @@
38
38
  "BLOCK_SIZE_K": 256,
39
39
  "GROUP_SIZE_M": 1,
40
40
  "num_warps": 2,
41
- "num_stages": 0,
41
+ "num_stages": 2,
42
42
  "waves_per_eu": 0,
43
43
  "matrix_instr_nonkdim": 16,
44
44
  "kpack": 2
@@ -49,7 +49,7 @@
49
49
  "BLOCK_SIZE_K": 256,
50
50
  "GROUP_SIZE_M": 1,
51
51
  "num_warps": 2,
52
- "num_stages": 0,
52
+ "num_stages": 2,
53
53
  "waves_per_eu": 0,
54
54
  "matrix_instr_nonkdim": 16,
55
55
  "kpack": 2
@@ -60,7 +60,7 @@
60
60
  "BLOCK_SIZE_K": 64,
61
61
  "GROUP_SIZE_M": 1,
62
62
  "num_warps": 4,
63
- "num_stages": 0,
63
+ "num_stages": 2,
64
64
  "waves_per_eu": 0,
65
65
  "matrix_instr_nonkdim": 16,
66
66
  "kpack": 1
@@ -71,7 +71,7 @@
71
71
  "BLOCK_SIZE_K": 256,
72
72
  "GROUP_SIZE_M": 4,
73
73
  "num_warps": 2,
74
- "num_stages": 0,
74
+ "num_stages": 2,
75
75
  "waves_per_eu": 0,
76
76
  "matrix_instr_nonkdim": 16,
77
77
  "kpack": 2
@@ -82,7 +82,7 @@
82
82
  "BLOCK_SIZE_K": 256,
83
83
  "GROUP_SIZE_M": 1,
84
84
  "num_warps": 2,
85
- "num_stages": 0,
85
+ "num_stages": 2,
86
86
  "waves_per_eu": 0,
87
87
  "matrix_instr_nonkdim": 16,
88
88
  "kpack": 2
@@ -93,7 +93,7 @@
93
93
  "BLOCK_SIZE_K": 256,
94
94
  "GROUP_SIZE_M": 4,
95
95
  "num_warps": 4,
96
- "num_stages": 0,
96
+ "num_stages": 2,
97
97
  "waves_per_eu": 0,
98
98
  "matrix_instr_nonkdim": 16,
99
99
  "kpack": 2
@@ -104,7 +104,7 @@
104
104
  "BLOCK_SIZE_K": 128,
105
105
  "GROUP_SIZE_M": 4,
106
106
  "num_warps": 4,
107
- "num_stages": 0,
107
+ "num_stages": 2,
108
108
  "waves_per_eu": 0,
109
109
  "matrix_instr_nonkdim": 16,
110
110
  "kpack": 1
@@ -115,7 +115,7 @@
115
115
  "BLOCK_SIZE_K": 128,
116
116
  "GROUP_SIZE_M": 4,
117
117
  "num_warps": 8,
118
- "num_stages": 0,
118
+ "num_stages": 2,
119
119
  "waves_per_eu": 0,
120
120
  "matrix_instr_nonkdim": 16,
121
121
  "kpack": 1
@@ -126,7 +126,7 @@
126
126
  "BLOCK_SIZE_K": 64,
127
127
  "GROUP_SIZE_M": 4,
128
128
  "num_warps": 8,
129
- "num_stages": 0,
129
+ "num_stages": 2,
130
130
  "waves_per_eu": 0,
131
131
  "matrix_instr_nonkdim": 16,
132
132
  "kpack": 1
@@ -137,7 +137,7 @@
137
137
  "BLOCK_SIZE_K": 64,
138
138
  "GROUP_SIZE_M": 1,
139
139
  "num_warps": 8,
140
- "num_stages": 0,
140
+ "num_stages": 2,
141
141
  "waves_per_eu": 0,
142
142
  "matrix_instr_nonkdim": 32,
143
143
  "kpack": 2
@@ -148,7 +148,7 @@
148
148
  "BLOCK_SIZE_K": 64,
149
149
  "GROUP_SIZE_M": 1,
150
150
  "num_warps": 8,
151
- "num_stages": 0,
151
+ "num_stages": 2,
152
152
  "waves_per_eu": 0,
153
153
  "matrix_instr_nonkdim": 16,
154
154
  "kpack": 1
@@ -159,7 +159,7 @@
159
159
  "BLOCK_SIZE_K": 64,
160
160
  "GROUP_SIZE_M": 1,
161
161
  "num_warps": 8,
162
- "num_stages": 0,
162
+ "num_stages": 2,
163
163
  "waves_per_eu": 0,
164
164
  "matrix_instr_nonkdim": 16,
165
165
  "kpack": 2
@@ -170,7 +170,7 @@
170
170
  "BLOCK_SIZE_K": 64,
171
171
  "GROUP_SIZE_M": 1,
172
172
  "num_warps": 8,
173
- "num_stages": 0,
173
+ "num_stages": 2,
174
174
  "waves_per_eu": 0,
175
175
  "matrix_instr_nonkdim": 16,
176
176
  "kpack": 1
@@ -181,7 +181,7 @@
181
181
  "BLOCK_SIZE_K": 64,
182
182
  "GROUP_SIZE_M": 1,
183
183
  "num_warps": 8,
184
- "num_stages": 0,
184
+ "num_stages": 2,
185
185
  "waves_per_eu": 0,
186
186
  "matrix_instr_nonkdim": 16,
187
187
  "kpack": 2
@@ -192,7 +192,7 @@
192
192
  "BLOCK_SIZE_K": 64,
193
193
  "GROUP_SIZE_M": 1,
194
194
  "num_warps": 8,
195
- "num_stages": 0,
195
+ "num_stages": 2,
196
196
  "waves_per_eu": 0,
197
197
  "matrix_instr_nonkdim": 16,
198
198
  "kpack": 1