sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +302 -414
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +13 -8
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +144 -6
- sglang/srt/managers/schedule_batch.py +237 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +773 -334
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +225 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +68 -37
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +102 -36
- sglang/srt/model_executor/cuda_graph_runner.py +56 -31
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +280 -81
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -32
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +135 -60
- sglang/srt/speculative/build_eagle_tree.py +8 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -12
- sglang/srt/speculative/eagle_utils.py +92 -57
- sglang/srt/speculative/eagle_worker.py +238 -111
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/METADATA +22 -15
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/RECORD +200 -166
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
"BLOCK_SIZE_K": 256,
|
6
6
|
"GROUP_SIZE_M": 1,
|
7
7
|
"num_warps": 2,
|
8
|
-
"num_stages":
|
8
|
+
"num_stages": 2,
|
9
9
|
"waves_per_eu": 0,
|
10
10
|
"matrix_instr_nonkdim": 16,
|
11
11
|
"kpack": 2
|
@@ -16,7 +16,7 @@
|
|
16
16
|
"BLOCK_SIZE_K": 128,
|
17
17
|
"GROUP_SIZE_M": 1,
|
18
18
|
"num_warps": 4,
|
19
|
-
"num_stages":
|
19
|
+
"num_stages": 2,
|
20
20
|
"waves_per_eu": 0,
|
21
21
|
"matrix_instr_nonkdim": 16,
|
22
22
|
"kpack": 1
|
@@ -27,7 +27,7 @@
|
|
27
27
|
"BLOCK_SIZE_K": 128,
|
28
28
|
"GROUP_SIZE_M": 1,
|
29
29
|
"num_warps": 4,
|
30
|
-
"num_stages":
|
30
|
+
"num_stages": 2,
|
31
31
|
"waves_per_eu": 0,
|
32
32
|
"matrix_instr_nonkdim": 16,
|
33
33
|
"kpack": 2
|
@@ -38,7 +38,7 @@
|
|
38
38
|
"BLOCK_SIZE_K": 256,
|
39
39
|
"GROUP_SIZE_M": 1,
|
40
40
|
"num_warps": 2,
|
41
|
-
"num_stages":
|
41
|
+
"num_stages": 2,
|
42
42
|
"waves_per_eu": 0,
|
43
43
|
"matrix_instr_nonkdim": 16,
|
44
44
|
"kpack": 2
|
@@ -49,7 +49,7 @@
|
|
49
49
|
"BLOCK_SIZE_K": 64,
|
50
50
|
"GROUP_SIZE_M": 1,
|
51
51
|
"num_warps": 8,
|
52
|
-
"num_stages":
|
52
|
+
"num_stages": 2,
|
53
53
|
"waves_per_eu": 0,
|
54
54
|
"matrix_instr_nonkdim": 16,
|
55
55
|
"kpack": 1
|
@@ -60,7 +60,7 @@
|
|
60
60
|
"BLOCK_SIZE_K": 64,
|
61
61
|
"GROUP_SIZE_M": 1,
|
62
62
|
"num_warps": 4,
|
63
|
-
"num_stages":
|
63
|
+
"num_stages": 2,
|
64
64
|
"waves_per_eu": 0,
|
65
65
|
"matrix_instr_nonkdim": 16,
|
66
66
|
"kpack": 1
|
@@ -71,7 +71,7 @@
|
|
71
71
|
"BLOCK_SIZE_K": 256,
|
72
72
|
"GROUP_SIZE_M": 4,
|
73
73
|
"num_warps": 2,
|
74
|
-
"num_stages":
|
74
|
+
"num_stages": 2,
|
75
75
|
"waves_per_eu": 0,
|
76
76
|
"matrix_instr_nonkdim": 16,
|
77
77
|
"kpack": 2
|
@@ -82,7 +82,7 @@
|
|
82
82
|
"BLOCK_SIZE_K": 64,
|
83
83
|
"GROUP_SIZE_M": 1,
|
84
84
|
"num_warps": 8,
|
85
|
-
"num_stages":
|
85
|
+
"num_stages": 2,
|
86
86
|
"waves_per_eu": 0,
|
87
87
|
"matrix_instr_nonkdim": 16,
|
88
88
|
"kpack": 2
|
@@ -93,7 +93,7 @@
|
|
93
93
|
"BLOCK_SIZE_K": 64,
|
94
94
|
"GROUP_SIZE_M": 1,
|
95
95
|
"num_warps": 2,
|
96
|
-
"num_stages":
|
96
|
+
"num_stages": 2,
|
97
97
|
"waves_per_eu": 0,
|
98
98
|
"matrix_instr_nonkdim": 16,
|
99
99
|
"kpack": 1
|
@@ -104,7 +104,7 @@
|
|
104
104
|
"BLOCK_SIZE_K": 256,
|
105
105
|
"GROUP_SIZE_M": 4,
|
106
106
|
"num_warps": 4,
|
107
|
-
"num_stages":
|
107
|
+
"num_stages": 2,
|
108
108
|
"waves_per_eu": 0,
|
109
109
|
"matrix_instr_nonkdim": 16,
|
110
110
|
"kpack": 2
|
@@ -115,7 +115,7 @@
|
|
115
115
|
"BLOCK_SIZE_K": 64,
|
116
116
|
"GROUP_SIZE_M": 4,
|
117
117
|
"num_warps": 8,
|
118
|
-
"num_stages":
|
118
|
+
"num_stages": 2,
|
119
119
|
"waves_per_eu": 0,
|
120
120
|
"matrix_instr_nonkdim": 16,
|
121
121
|
"kpack": 1
|
@@ -126,7 +126,7 @@
|
|
126
126
|
"BLOCK_SIZE_K": 64,
|
127
127
|
"GROUP_SIZE_M": 4,
|
128
128
|
"num_warps": 8,
|
129
|
-
"num_stages":
|
129
|
+
"num_stages": 2,
|
130
130
|
"waves_per_eu": 0,
|
131
131
|
"matrix_instr_nonkdim": 16,
|
132
132
|
"kpack": 1
|
@@ -137,7 +137,7 @@
|
|
137
137
|
"BLOCK_SIZE_K": 128,
|
138
138
|
"GROUP_SIZE_M": 1,
|
139
139
|
"num_warps": 8,
|
140
|
-
"num_stages":
|
140
|
+
"num_stages": 2,
|
141
141
|
"waves_per_eu": 0,
|
142
142
|
"matrix_instr_nonkdim": 16,
|
143
143
|
"kpack": 2
|
@@ -148,7 +148,7 @@
|
|
148
148
|
"BLOCK_SIZE_K": 64,
|
149
149
|
"GROUP_SIZE_M": 1,
|
150
150
|
"num_warps": 8,
|
151
|
-
"num_stages":
|
151
|
+
"num_stages": 2,
|
152
152
|
"waves_per_eu": 0,
|
153
153
|
"matrix_instr_nonkdim": 16,
|
154
154
|
"kpack": 1
|
@@ -159,7 +159,7 @@
|
|
159
159
|
"BLOCK_SIZE_K": 64,
|
160
160
|
"GROUP_SIZE_M": 1,
|
161
161
|
"num_warps": 8,
|
162
|
-
"num_stages":
|
162
|
+
"num_stages": 2,
|
163
163
|
"waves_per_eu": 0,
|
164
164
|
"matrix_instr_nonkdim": 16,
|
165
165
|
"kpack": 1
|
@@ -170,7 +170,7 @@
|
|
170
170
|
"BLOCK_SIZE_K": 64,
|
171
171
|
"GROUP_SIZE_M": 1,
|
172
172
|
"num_warps": 8,
|
173
|
-
"num_stages":
|
173
|
+
"num_stages": 2,
|
174
174
|
"waves_per_eu": 0,
|
175
175
|
"matrix_instr_nonkdim": 16,
|
176
176
|
"kpack": 2
|
@@ -181,7 +181,7 @@
|
|
181
181
|
"BLOCK_SIZE_K": 64,
|
182
182
|
"GROUP_SIZE_M": 1,
|
183
183
|
"num_warps": 8,
|
184
|
-
"num_stages":
|
184
|
+
"num_stages": 2,
|
185
185
|
"waves_per_eu": 0,
|
186
186
|
"matrix_instr_nonkdim": 16,
|
187
187
|
"kpack": 1
|
@@ -192,7 +192,7 @@
|
|
192
192
|
"BLOCK_SIZE_K": 64,
|
193
193
|
"GROUP_SIZE_M": 1,
|
194
194
|
"num_warps": 8,
|
195
|
-
"num_stages":
|
195
|
+
"num_stages": 2,
|
196
196
|
"waves_per_eu": 0,
|
197
197
|
"matrix_instr_nonkdim": 16,
|
198
198
|
"kpack": 1
|
sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
"BLOCK_SIZE_K": 256,
|
6
6
|
"GROUP_SIZE_M": 1,
|
7
7
|
"num_warps": 2,
|
8
|
-
"num_stages":
|
8
|
+
"num_stages": 2,
|
9
9
|
"waves_per_eu": 0,
|
10
10
|
"matrix_instr_nonkdim": 16,
|
11
11
|
"kpack": 2
|
@@ -16,7 +16,7 @@
|
|
16
16
|
"BLOCK_SIZE_K": 128,
|
17
17
|
"GROUP_SIZE_M": 1,
|
18
18
|
"num_warps": 4,
|
19
|
-
"num_stages":
|
19
|
+
"num_stages": 2,
|
20
20
|
"waves_per_eu": 0,
|
21
21
|
"matrix_instr_nonkdim": 16,
|
22
22
|
"kpack": 1
|
@@ -27,7 +27,7 @@
|
|
27
27
|
"BLOCK_SIZE_K": 128,
|
28
28
|
"GROUP_SIZE_M": 1,
|
29
29
|
"num_warps": 4,
|
30
|
-
"num_stages":
|
30
|
+
"num_stages": 2,
|
31
31
|
"waves_per_eu": 0,
|
32
32
|
"matrix_instr_nonkdim": 16,
|
33
33
|
"kpack": 2
|
@@ -38,7 +38,7 @@
|
|
38
38
|
"BLOCK_SIZE_K": 256,
|
39
39
|
"GROUP_SIZE_M": 1,
|
40
40
|
"num_warps": 2,
|
41
|
-
"num_stages":
|
41
|
+
"num_stages": 2,
|
42
42
|
"waves_per_eu": 0,
|
43
43
|
"matrix_instr_nonkdim": 16,
|
44
44
|
"kpack": 2
|
@@ -49,7 +49,7 @@
|
|
49
49
|
"BLOCK_SIZE_K": 64,
|
50
50
|
"GROUP_SIZE_M": 1,
|
51
51
|
"num_warps": 8,
|
52
|
-
"num_stages":
|
52
|
+
"num_stages": 2,
|
53
53
|
"waves_per_eu": 0,
|
54
54
|
"matrix_instr_nonkdim": 16,
|
55
55
|
"kpack": 1
|
@@ -60,7 +60,7 @@
|
|
60
60
|
"BLOCK_SIZE_K": 64,
|
61
61
|
"GROUP_SIZE_M": 1,
|
62
62
|
"num_warps": 4,
|
63
|
-
"num_stages":
|
63
|
+
"num_stages": 2,
|
64
64
|
"waves_per_eu": 0,
|
65
65
|
"matrix_instr_nonkdim": 16,
|
66
66
|
"kpack": 1
|
@@ -71,7 +71,7 @@
|
|
71
71
|
"BLOCK_SIZE_K": 256,
|
72
72
|
"GROUP_SIZE_M": 4,
|
73
73
|
"num_warps": 2,
|
74
|
-
"num_stages":
|
74
|
+
"num_stages": 2,
|
75
75
|
"waves_per_eu": 0,
|
76
76
|
"matrix_instr_nonkdim": 16,
|
77
77
|
"kpack": 2
|
@@ -82,7 +82,7 @@
|
|
82
82
|
"BLOCK_SIZE_K": 64,
|
83
83
|
"GROUP_SIZE_M": 1,
|
84
84
|
"num_warps": 8,
|
85
|
-
"num_stages":
|
85
|
+
"num_stages": 2,
|
86
86
|
"waves_per_eu": 0,
|
87
87
|
"matrix_instr_nonkdim": 16,
|
88
88
|
"kpack": 2
|
@@ -93,7 +93,7 @@
|
|
93
93
|
"BLOCK_SIZE_K": 64,
|
94
94
|
"GROUP_SIZE_M": 1,
|
95
95
|
"num_warps": 2,
|
96
|
-
"num_stages":
|
96
|
+
"num_stages": 2,
|
97
97
|
"waves_per_eu": 0,
|
98
98
|
"matrix_instr_nonkdim": 16,
|
99
99
|
"kpack": 1
|
@@ -104,7 +104,7 @@
|
|
104
104
|
"BLOCK_SIZE_K": 256,
|
105
105
|
"GROUP_SIZE_M": 4,
|
106
106
|
"num_warps": 4,
|
107
|
-
"num_stages":
|
107
|
+
"num_stages": 2,
|
108
108
|
"waves_per_eu": 0,
|
109
109
|
"matrix_instr_nonkdim": 16,
|
110
110
|
"kpack": 2
|
@@ -115,7 +115,7 @@
|
|
115
115
|
"BLOCK_SIZE_K": 64,
|
116
116
|
"GROUP_SIZE_M": 4,
|
117
117
|
"num_warps": 8,
|
118
|
-
"num_stages":
|
118
|
+
"num_stages": 2,
|
119
119
|
"waves_per_eu": 0,
|
120
120
|
"matrix_instr_nonkdim": 16,
|
121
121
|
"kpack": 1
|
@@ -126,7 +126,7 @@
|
|
126
126
|
"BLOCK_SIZE_K": 64,
|
127
127
|
"GROUP_SIZE_M": 4,
|
128
128
|
"num_warps": 8,
|
129
|
-
"num_stages":
|
129
|
+
"num_stages": 2,
|
130
130
|
"waves_per_eu": 0,
|
131
131
|
"matrix_instr_nonkdim": 16,
|
132
132
|
"kpack": 1
|
@@ -137,7 +137,7 @@
|
|
137
137
|
"BLOCK_SIZE_K": 128,
|
138
138
|
"GROUP_SIZE_M": 1,
|
139
139
|
"num_warps": 8,
|
140
|
-
"num_stages":
|
140
|
+
"num_stages": 2,
|
141
141
|
"waves_per_eu": 0,
|
142
142
|
"matrix_instr_nonkdim": 16,
|
143
143
|
"kpack": 2
|
@@ -148,7 +148,7 @@
|
|
148
148
|
"BLOCK_SIZE_K": 64,
|
149
149
|
"GROUP_SIZE_M": 1,
|
150
150
|
"num_warps": 8,
|
151
|
-
"num_stages":
|
151
|
+
"num_stages": 2,
|
152
152
|
"waves_per_eu": 0,
|
153
153
|
"matrix_instr_nonkdim": 16,
|
154
154
|
"kpack": 1
|
@@ -159,7 +159,7 @@
|
|
159
159
|
"BLOCK_SIZE_K": 64,
|
160
160
|
"GROUP_SIZE_M": 1,
|
161
161
|
"num_warps": 8,
|
162
|
-
"num_stages":
|
162
|
+
"num_stages": 2,
|
163
163
|
"waves_per_eu": 0,
|
164
164
|
"matrix_instr_nonkdim": 16,
|
165
165
|
"kpack": 1
|
@@ -170,7 +170,7 @@
|
|
170
170
|
"BLOCK_SIZE_K": 64,
|
171
171
|
"GROUP_SIZE_M": 1,
|
172
172
|
"num_warps": 8,
|
173
|
-
"num_stages":
|
173
|
+
"num_stages": 2,
|
174
174
|
"waves_per_eu": 0,
|
175
175
|
"matrix_instr_nonkdim": 16,
|
176
176
|
"kpack": 2
|
@@ -181,7 +181,7 @@
|
|
181
181
|
"BLOCK_SIZE_K": 64,
|
182
182
|
"GROUP_SIZE_M": 1,
|
183
183
|
"num_warps": 8,
|
184
|
-
"num_stages":
|
184
|
+
"num_stages": 2,
|
185
185
|
"waves_per_eu": 0,
|
186
186
|
"matrix_instr_nonkdim": 16,
|
187
187
|
"kpack": 1
|
@@ -192,7 +192,7 @@
|
|
192
192
|
"BLOCK_SIZE_K": 64,
|
193
193
|
"GROUP_SIZE_M": 1,
|
194
194
|
"num_warps": 8,
|
195
|
-
"num_stages":
|
195
|
+
"num_stages": 2,
|
196
196
|
"waves_per_eu": 0,
|
197
197
|
"matrix_instr_nonkdim": 16,
|
198
198
|
"kpack": 1
|
sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
"BLOCK_SIZE_K": 128,
|
6
6
|
"GROUP_SIZE_M": 1,
|
7
7
|
"num_warps": 2,
|
8
|
-
"num_stages":
|
8
|
+
"num_stages": 2,
|
9
9
|
"waves_per_eu": 0,
|
10
10
|
"matrix_instr_nonkdim": 16,
|
11
11
|
"kpack": 1
|
@@ -16,7 +16,7 @@
|
|
16
16
|
"BLOCK_SIZE_K": 64,
|
17
17
|
"GROUP_SIZE_M": 1,
|
18
18
|
"num_warps": 2,
|
19
|
-
"num_stages":
|
19
|
+
"num_stages": 2,
|
20
20
|
"waves_per_eu": 0,
|
21
21
|
"matrix_instr_nonkdim": 16,
|
22
22
|
"kpack": 2
|
@@ -27,7 +27,7 @@
|
|
27
27
|
"BLOCK_SIZE_K": 256,
|
28
28
|
"GROUP_SIZE_M": 1,
|
29
29
|
"num_warps": 2,
|
30
|
-
"num_stages":
|
30
|
+
"num_stages": 2,
|
31
31
|
"waves_per_eu": 0,
|
32
32
|
"matrix_instr_nonkdim": 16,
|
33
33
|
"kpack": 2
|
@@ -38,7 +38,7 @@
|
|
38
38
|
"BLOCK_SIZE_K": 256,
|
39
39
|
"GROUP_SIZE_M": 1,
|
40
40
|
"num_warps": 2,
|
41
|
-
"num_stages":
|
41
|
+
"num_stages": 2,
|
42
42
|
"waves_per_eu": 0,
|
43
43
|
"matrix_instr_nonkdim": 16,
|
44
44
|
"kpack": 2
|
@@ -49,7 +49,7 @@
|
|
49
49
|
"BLOCK_SIZE_K": 256,
|
50
50
|
"GROUP_SIZE_M": 1,
|
51
51
|
"num_warps": 2,
|
52
|
-
"num_stages":
|
52
|
+
"num_stages": 2,
|
53
53
|
"waves_per_eu": 0,
|
54
54
|
"matrix_instr_nonkdim": 16,
|
55
55
|
"kpack": 2
|
@@ -60,7 +60,7 @@
|
|
60
60
|
"BLOCK_SIZE_K": 64,
|
61
61
|
"GROUP_SIZE_M": 1,
|
62
62
|
"num_warps": 4,
|
63
|
-
"num_stages":
|
63
|
+
"num_stages": 2,
|
64
64
|
"waves_per_eu": 0,
|
65
65
|
"matrix_instr_nonkdim": 16,
|
66
66
|
"kpack": 1
|
@@ -71,7 +71,7 @@
|
|
71
71
|
"BLOCK_SIZE_K": 256,
|
72
72
|
"GROUP_SIZE_M": 4,
|
73
73
|
"num_warps": 2,
|
74
|
-
"num_stages":
|
74
|
+
"num_stages": 2,
|
75
75
|
"waves_per_eu": 0,
|
76
76
|
"matrix_instr_nonkdim": 16,
|
77
77
|
"kpack": 2
|
@@ -82,7 +82,7 @@
|
|
82
82
|
"BLOCK_SIZE_K": 256,
|
83
83
|
"GROUP_SIZE_M": 1,
|
84
84
|
"num_warps": 2,
|
85
|
-
"num_stages":
|
85
|
+
"num_stages": 2,
|
86
86
|
"waves_per_eu": 0,
|
87
87
|
"matrix_instr_nonkdim": 16,
|
88
88
|
"kpack": 2
|
@@ -93,7 +93,7 @@
|
|
93
93
|
"BLOCK_SIZE_K": 256,
|
94
94
|
"GROUP_SIZE_M": 4,
|
95
95
|
"num_warps": 4,
|
96
|
-
"num_stages":
|
96
|
+
"num_stages": 2,
|
97
97
|
"waves_per_eu": 0,
|
98
98
|
"matrix_instr_nonkdim": 16,
|
99
99
|
"kpack": 2
|
@@ -104,7 +104,7 @@
|
|
104
104
|
"BLOCK_SIZE_K": 128,
|
105
105
|
"GROUP_SIZE_M": 4,
|
106
106
|
"num_warps": 4,
|
107
|
-
"num_stages":
|
107
|
+
"num_stages": 2,
|
108
108
|
"waves_per_eu": 0,
|
109
109
|
"matrix_instr_nonkdim": 16,
|
110
110
|
"kpack": 1
|
@@ -115,7 +115,7 @@
|
|
115
115
|
"BLOCK_SIZE_K": 128,
|
116
116
|
"GROUP_SIZE_M": 4,
|
117
117
|
"num_warps": 8,
|
118
|
-
"num_stages":
|
118
|
+
"num_stages": 2,
|
119
119
|
"waves_per_eu": 0,
|
120
120
|
"matrix_instr_nonkdim": 16,
|
121
121
|
"kpack": 1
|
@@ -126,7 +126,7 @@
|
|
126
126
|
"BLOCK_SIZE_K": 64,
|
127
127
|
"GROUP_SIZE_M": 4,
|
128
128
|
"num_warps": 8,
|
129
|
-
"num_stages":
|
129
|
+
"num_stages": 2,
|
130
130
|
"waves_per_eu": 0,
|
131
131
|
"matrix_instr_nonkdim": 16,
|
132
132
|
"kpack": 1
|
@@ -137,7 +137,7 @@
|
|
137
137
|
"BLOCK_SIZE_K": 64,
|
138
138
|
"GROUP_SIZE_M": 1,
|
139
139
|
"num_warps": 8,
|
140
|
-
"num_stages":
|
140
|
+
"num_stages": 2,
|
141
141
|
"waves_per_eu": 0,
|
142
142
|
"matrix_instr_nonkdim": 32,
|
143
143
|
"kpack": 2
|
@@ -148,7 +148,7 @@
|
|
148
148
|
"BLOCK_SIZE_K": 64,
|
149
149
|
"GROUP_SIZE_M": 1,
|
150
150
|
"num_warps": 8,
|
151
|
-
"num_stages":
|
151
|
+
"num_stages": 2,
|
152
152
|
"waves_per_eu": 0,
|
153
153
|
"matrix_instr_nonkdim": 16,
|
154
154
|
"kpack": 1
|
@@ -159,7 +159,7 @@
|
|
159
159
|
"BLOCK_SIZE_K": 64,
|
160
160
|
"GROUP_SIZE_M": 1,
|
161
161
|
"num_warps": 8,
|
162
|
-
"num_stages":
|
162
|
+
"num_stages": 2,
|
163
163
|
"waves_per_eu": 0,
|
164
164
|
"matrix_instr_nonkdim": 16,
|
165
165
|
"kpack": 2
|
@@ -170,7 +170,7 @@
|
|
170
170
|
"BLOCK_SIZE_K": 64,
|
171
171
|
"GROUP_SIZE_M": 1,
|
172
172
|
"num_warps": 8,
|
173
|
-
"num_stages":
|
173
|
+
"num_stages": 2,
|
174
174
|
"waves_per_eu": 0,
|
175
175
|
"matrix_instr_nonkdim": 16,
|
176
176
|
"kpack": 1
|
@@ -181,7 +181,7 @@
|
|
181
181
|
"BLOCK_SIZE_K": 64,
|
182
182
|
"GROUP_SIZE_M": 1,
|
183
183
|
"num_warps": 8,
|
184
|
-
"num_stages":
|
184
|
+
"num_stages": 2,
|
185
185
|
"waves_per_eu": 0,
|
186
186
|
"matrix_instr_nonkdim": 16,
|
187
187
|
"kpack": 2
|
@@ -192,7 +192,7 @@
|
|
192
192
|
"BLOCK_SIZE_K": 64,
|
193
193
|
"GROUP_SIZE_M": 1,
|
194
194
|
"num_warps": 8,
|
195
|
-
"num_stages":
|
195
|
+
"num_stages": 2,
|
196
196
|
"waves_per_eu": 0,
|
197
197
|
"matrix_instr_nonkdim": 16,
|
198
198
|
"kpack": 1
|
sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
"BLOCK_SIZE_K": 128,
|
6
6
|
"GROUP_SIZE_M": 1,
|
7
7
|
"num_warps": 2,
|
8
|
-
"num_stages":
|
8
|
+
"num_stages": 2,
|
9
9
|
"waves_per_eu": 0,
|
10
10
|
"matrix_instr_nonkdim": 16,
|
11
11
|
"kpack": 1
|
@@ -16,7 +16,7 @@
|
|
16
16
|
"BLOCK_SIZE_K": 64,
|
17
17
|
"GROUP_SIZE_M": 1,
|
18
18
|
"num_warps": 2,
|
19
|
-
"num_stages":
|
19
|
+
"num_stages": 2,
|
20
20
|
"waves_per_eu": 0,
|
21
21
|
"matrix_instr_nonkdim": 16,
|
22
22
|
"kpack": 2
|
@@ -27,7 +27,7 @@
|
|
27
27
|
"BLOCK_SIZE_K": 256,
|
28
28
|
"GROUP_SIZE_M": 1,
|
29
29
|
"num_warps": 2,
|
30
|
-
"num_stages":
|
30
|
+
"num_stages": 2,
|
31
31
|
"waves_per_eu": 0,
|
32
32
|
"matrix_instr_nonkdim": 16,
|
33
33
|
"kpack": 2
|
@@ -38,7 +38,7 @@
|
|
38
38
|
"BLOCK_SIZE_K": 256,
|
39
39
|
"GROUP_SIZE_M": 1,
|
40
40
|
"num_warps": 2,
|
41
|
-
"num_stages":
|
41
|
+
"num_stages": 2,
|
42
42
|
"waves_per_eu": 0,
|
43
43
|
"matrix_instr_nonkdim": 16,
|
44
44
|
"kpack": 2
|
@@ -49,7 +49,7 @@
|
|
49
49
|
"BLOCK_SIZE_K": 256,
|
50
50
|
"GROUP_SIZE_M": 1,
|
51
51
|
"num_warps": 2,
|
52
|
-
"num_stages":
|
52
|
+
"num_stages": 2,
|
53
53
|
"waves_per_eu": 0,
|
54
54
|
"matrix_instr_nonkdim": 16,
|
55
55
|
"kpack": 2
|
@@ -60,7 +60,7 @@
|
|
60
60
|
"BLOCK_SIZE_K": 64,
|
61
61
|
"GROUP_SIZE_M": 1,
|
62
62
|
"num_warps": 4,
|
63
|
-
"num_stages":
|
63
|
+
"num_stages": 2,
|
64
64
|
"waves_per_eu": 0,
|
65
65
|
"matrix_instr_nonkdim": 16,
|
66
66
|
"kpack": 1
|
@@ -71,7 +71,7 @@
|
|
71
71
|
"BLOCK_SIZE_K": 256,
|
72
72
|
"GROUP_SIZE_M": 4,
|
73
73
|
"num_warps": 2,
|
74
|
-
"num_stages":
|
74
|
+
"num_stages": 2,
|
75
75
|
"waves_per_eu": 0,
|
76
76
|
"matrix_instr_nonkdim": 16,
|
77
77
|
"kpack": 2
|
@@ -82,7 +82,7 @@
|
|
82
82
|
"BLOCK_SIZE_K": 256,
|
83
83
|
"GROUP_SIZE_M": 1,
|
84
84
|
"num_warps": 2,
|
85
|
-
"num_stages":
|
85
|
+
"num_stages": 2,
|
86
86
|
"waves_per_eu": 0,
|
87
87
|
"matrix_instr_nonkdim": 16,
|
88
88
|
"kpack": 2
|
@@ -93,7 +93,7 @@
|
|
93
93
|
"BLOCK_SIZE_K": 256,
|
94
94
|
"GROUP_SIZE_M": 4,
|
95
95
|
"num_warps": 4,
|
96
|
-
"num_stages":
|
96
|
+
"num_stages": 2,
|
97
97
|
"waves_per_eu": 0,
|
98
98
|
"matrix_instr_nonkdim": 16,
|
99
99
|
"kpack": 2
|
@@ -104,7 +104,7 @@
|
|
104
104
|
"BLOCK_SIZE_K": 128,
|
105
105
|
"GROUP_SIZE_M": 4,
|
106
106
|
"num_warps": 4,
|
107
|
-
"num_stages":
|
107
|
+
"num_stages": 2,
|
108
108
|
"waves_per_eu": 0,
|
109
109
|
"matrix_instr_nonkdim": 16,
|
110
110
|
"kpack": 1
|
@@ -115,7 +115,7 @@
|
|
115
115
|
"BLOCK_SIZE_K": 128,
|
116
116
|
"GROUP_SIZE_M": 4,
|
117
117
|
"num_warps": 8,
|
118
|
-
"num_stages":
|
118
|
+
"num_stages": 2,
|
119
119
|
"waves_per_eu": 0,
|
120
120
|
"matrix_instr_nonkdim": 16,
|
121
121
|
"kpack": 1
|
@@ -126,7 +126,7 @@
|
|
126
126
|
"BLOCK_SIZE_K": 64,
|
127
127
|
"GROUP_SIZE_M": 4,
|
128
128
|
"num_warps": 8,
|
129
|
-
"num_stages":
|
129
|
+
"num_stages": 2,
|
130
130
|
"waves_per_eu": 0,
|
131
131
|
"matrix_instr_nonkdim": 16,
|
132
132
|
"kpack": 1
|
@@ -137,7 +137,7 @@
|
|
137
137
|
"BLOCK_SIZE_K": 64,
|
138
138
|
"GROUP_SIZE_M": 1,
|
139
139
|
"num_warps": 8,
|
140
|
-
"num_stages":
|
140
|
+
"num_stages": 2,
|
141
141
|
"waves_per_eu": 0,
|
142
142
|
"matrix_instr_nonkdim": 32,
|
143
143
|
"kpack": 2
|
@@ -148,7 +148,7 @@
|
|
148
148
|
"BLOCK_SIZE_K": 64,
|
149
149
|
"GROUP_SIZE_M": 1,
|
150
150
|
"num_warps": 8,
|
151
|
-
"num_stages":
|
151
|
+
"num_stages": 2,
|
152
152
|
"waves_per_eu": 0,
|
153
153
|
"matrix_instr_nonkdim": 16,
|
154
154
|
"kpack": 1
|
@@ -159,7 +159,7 @@
|
|
159
159
|
"BLOCK_SIZE_K": 64,
|
160
160
|
"GROUP_SIZE_M": 1,
|
161
161
|
"num_warps": 8,
|
162
|
-
"num_stages":
|
162
|
+
"num_stages": 2,
|
163
163
|
"waves_per_eu": 0,
|
164
164
|
"matrix_instr_nonkdim": 16,
|
165
165
|
"kpack": 2
|
@@ -170,7 +170,7 @@
|
|
170
170
|
"BLOCK_SIZE_K": 64,
|
171
171
|
"GROUP_SIZE_M": 1,
|
172
172
|
"num_warps": 8,
|
173
|
-
"num_stages":
|
173
|
+
"num_stages": 2,
|
174
174
|
"waves_per_eu": 0,
|
175
175
|
"matrix_instr_nonkdim": 16,
|
176
176
|
"kpack": 1
|
@@ -181,7 +181,7 @@
|
|
181
181
|
"BLOCK_SIZE_K": 64,
|
182
182
|
"GROUP_SIZE_M": 1,
|
183
183
|
"num_warps": 8,
|
184
|
-
"num_stages":
|
184
|
+
"num_stages": 2,
|
185
185
|
"waves_per_eu": 0,
|
186
186
|
"matrix_instr_nonkdim": 16,
|
187
187
|
"kpack": 2
|
@@ -192,7 +192,7 @@
|
|
192
192
|
"BLOCK_SIZE_K": 64,
|
193
193
|
"GROUP_SIZE_M": 1,
|
194
194
|
"num_warps": 8,
|
195
|
-
"num_stages":
|
195
|
+
"num_stages": 2,
|
196
196
|
"waves_per_eu": 0,
|
197
197
|
"matrix_instr_nonkdim": 16,
|
198
198
|
"kpack": 1
|