wafer-cli 0.2.32__py3-none-any.whl → 0.2.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/GUIDE.md +1 -1
- wafer/agent_defaults.py +157 -2
- wafer/billing.py +6 -6
- wafer/cli.py +432 -348
- wafer/corpus.py +6 -72
- wafer/evaluate.py +143 -81
- wafer/global_config.py +0 -13
- wafer/kernel_scope.py +1 -1
- wafer/ncu_analyze.py +1 -1
- wafer/nsys_analyze.py +1 -1
- wafer/skills/wafer-guide/SKILL.md +6 -22
- wafer/ssh_keys.py +6 -6
- wafer/targets_ops.py +2 -29
- wafer/templates/aiter_optimize.py +59 -0
- wafer/templates/optimize_kernel.py +2 -4
- wafer/templates/optimize_kernelbench.py +62 -17
- wafer/templates/optimize_vllm.py +156 -0
- wafer/trace_compare.py +48 -139
- wafer/wevin_cli.py +1 -12
- wafer/workspaces.py +8 -8
- wafer_cli-0.2.34.dist-info/METADATA +260 -0
- {wafer_cli-0.2.32.dist-info → wafer_cli-0.2.34.dist-info}/RECORD +25 -23
- wafer_cli-0.2.32.dist-info/METADATA +0 -107
- {wafer_cli-0.2.32.dist-info → wafer_cli-0.2.34.dist-info}/WHEEL +0 -0
- {wafer_cli-0.2.32.dist-info → wafer_cli-0.2.34.dist-info}/entry_points.txt +0 -0
- {wafer_cli-0.2.32.dist-info → wafer_cli-0.2.34.dist-info}/top_level.txt +0 -0
wafer/templates/aiter_optimize.py
ADDED
@@ -0,0 +1,59 @@
+"""Template for optimizing AMD aiter operators.
+
+Usage:
+    wafer agent -t aiter-optimize --args op=gemm_a8w8 --args target=mi300x "Optimize this operator"
+    wafer agent -t aiter-optimize --args op=mha --args target=runpod-mi300x-rocm7 "Improve MHA performance"
+"""
+
+try:
+    from wafer.agent_defaults import (
+        AITER_BASH_ALLOWLIST,
+        AITER_ENABLED_TOOLS,
+        AITER_SYSTEM_PROMPT,
+    )
+except ImportError:
+    # Fallback for when wafer-cli package isn't installed
+    AITER_ENABLED_TOOLS = ["read", "write", "edit", "glob", "grep", "bash"]
+    AITER_BASH_ALLOWLIST = [
+        "ls", "cat", "head", "tail", "wc", "find", "grep", "rg", "pwd", "tree",
+        "which", "diff", "sort", "mkdir", "cp", "mv", "git diff", "git status",
+        "git log", "hipcc", "g++", "gcc", "clang", "python", "python3", "pip",
+        "pytest", "./", "wafer evaluate aiter", "wafer amd rocprof-compute",
+        "wafer amd rocprof-sdk", "wafer amd rocprof-systems", "wafer amd isa",
+        "wafer agent -t ask-docs", "timeout",
+    ]
+    AITER_SYSTEM_PROMPT = "You are a GPU kernel optimization expert for AMD MI300X and aiter."
+
+try:
+    from wafer_core.rollouts.templates import TemplateConfig
+except ImportError:
+    from rollouts.templates import TemplateConfig
+
+# Format system prompt with template variables ($op, $target become {op}, {target})
+# The template loader will substitute these at runtime
+_SYSTEM_PROMPT = AITER_SYSTEM_PROMPT.replace("{op}", "$op").replace("{target_flag}", "--target $target")
+
+template = TemplateConfig(
+    # Identity
+    name="aiter-optimize",
+    description="Optimize AMD aiter operators for better performance on MI300X",
+    # System prompt - uses shared prompt from agent_defaults
+    system_prompt=_SYSTEM_PROMPT,
+    # Tools - full coding environment
+    tools=AITER_ENABLED_TOOLS,
+    bash_allowlist=AITER_BASH_ALLOWLIST,
+    # Network access required for wafer evaluate (connects to remote GPU)
+    allow_network=True,
+    # Model config - use thinking for optimization analysis
+    model="anthropic/claude-sonnet-4-5-20250929",
+    max_tokens=16384,
+    thinking=True,
+    thinking_budget=10000,
+    # Multi-turn for iterative optimization
+    single_turn=False,
+    # Template variables
+    defaults={
+        "op": "gemm_a8w8",
+        "target": "mi300x",  # Required - user must specify their target
+    },
+)
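The new template leaves `$op`/`$target` placeholders in the system prompt and supplies values via the `defaults` dict. A minimal sketch of the substitution the inline comment describes, using `string.Template`; the real loader lives in `wafer_core.rollouts.templates` and is not part of this diff, and the prompt text below is illustrative only:

```python
# Illustrative only: $-style placeholder substitution as described by the
# "template loader will substitute these at runtime" comment above.
from string import Template

defaults = {"op": "gemm_a8w8", "target": "mi300x"}  # values taken from the diff
prompt = Template("Optimize the $op operator on --target $target.")  # hypothetical prompt text

print(prompt.safe_substitute(defaults))
# Optimize the gemm_a8w8 operator on --target mi300x.
```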
wafer/templates/optimize_kernel.py
CHANGED
@@ -35,8 +35,7 @@ Strategy:
 Commands:
 - `wafer evaluate --impl <file> --reference <ref> --test-cases <tests>` - Run evaluation
 - `wafer evaluate --impl <file> --reference <ref> --test-cases <tests> --profile` - With NCU profiling
-- `wafer
-- `wafer targets exec <target> -- <command>` - Run commands on a configured target via SSH
+- `wafer remote-run "<command>"` - Run arbitrary commands on remote GPU
 
 Output:
 - Summary of optimizations applied
@@ -49,8 +48,7 @@ IMPORTANT: Always verify correctness with wafer evaluate before claiming success
     tools=["read", "write", "edit", "glob", "grep", "bash"],
     bash_allowlist=[
        "wafer evaluate",
-        "wafer
-        "wafer targets exec",
+        "wafer remote-run",
        "wafer nvidia ncu",
        "wafer nvidia nsys",
        "wafer nvidia perfetto",
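These hunks replace `wafer targets exec` with `wafer remote-run` in both the prompt's command list and the template's bash allowlist. How the allowlist is enforced is not shown in this diff; a plausible sketch, assuming the entries above act as simple command prefixes:

```python
# Assumed semantics only: bash_allowlist entries treated as command prefixes.
# The real check lives in the agent runtime, outside this diff.
def is_allowed(command: str, allowlist: list[str]) -> bool:
    cmd = command.strip()
    return any(cmd == entry or cmd.startswith(entry + " ") for entry in allowlist)

allow = ["wafer evaluate", "wafer remote-run", "wafer nvidia ncu"]
assert is_allowed('wafer remote-run "nvidia-smi"', allow)
assert not is_allowed("wafer targets exec my-gpu -- ls", allow)
```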
wafer/templates/optimize_kernelbench.py
CHANGED
@@ -1,4 +1,4 @@
-"""Template for KernelBench optimization.
+"""Template for KernelBench optimization - matches eval system prompt.
 
 Usage:
     # Run on a specific problem
@@ -26,18 +26,12 @@ try:
 except ImportError:
     from rollouts.templates import TemplateConfig
 
-
-
-# Task-specific instructions only — must stay in sync with the eval's SYSTEM_PROMPT
-# in research/evals/optimize_kernelbench_eval/.../base_config.py.
-# Run test_eval_cli_parity.py to verify.
-# Wafer CLI command docs are auto-generated from --help text and composed
-# at runtime by wevin_cli.py (see wafer.cli_instructions.build_cli_instructions).
-# TODO: Consider having both eval and template import SYSTEM_PROMPT from a shared
-# module so there's only one copy to maintain.
+# System prompt matches optimize_kernelbench_eval/base_config.py SYSTEM_PROMPT
 SYSTEM_PROMPT = """\
 You are a GPU kernel optimization expert. Your task is to write optimized GPU kernels that are correct and faster than the PyTorch baseline.
 
+IMPORTANT: You do NOT have a local GPU. You MUST use `wafer evaluate kernelbench` to test kernels on remote GPU hardware.
+
 ## Kernel Format (KernelBench)
 
 The reference file contains a PyTorch `Model` class. You must write a `ModelNew` class that:
@@ -49,14 +43,49 @@ The reference file also provides:
 - `get_inputs()` - generates test inputs for forward()
 - `get_init_inputs()` - generates constructor arguments
 
+## Available Tools
+
+- read(file_path): Read source files
+- write(file_path, content): Write your optimized kernel
+- glob(pattern): Find files by pattern
+- grep(pattern): Search code
+- bash(command): Run shell commands including wafer CLI
+
 ## Workflow
 
 1. Read the reference problem file to understand what `Model` does
 2. Analyze the computation and identify optimization opportunities
 3. Write an optimized `ModelNew` class with custom $backend_upper kernels using `__global__` kernel definitions and `torch.utils.cpp_extension.load_inline`
-4. Test with: `wafer evaluate kernelbench $target_flag --backend $backend --impl
+4. Test with: `wafer evaluate kernelbench $target_flag --backend $backend --impl <your_file.py> --reference <problem.py> --benchmark`
 5. Iterate based on feedback until correct and fast
 
+## Example Command
+
+```bash
+wafer evaluate kernelbench \\
+    $target_flag \\
+    --backend $backend \\
+    --impl optimized_kernel.py \\
+    --reference $reference \\
+    --benchmark
+```
+
+## Profiling Tools (USE THESE!)
+
+When your kernel is slower than expected, use profiling to understand WHY:
+
+- `wafer rocprof profile --impl <file> --reference <ref>` - AMD GPU profiling
+- `wafer nvidia ncu --impl <file> --reference <ref>` - NVIDIA NCU profiling
+
+## CRITICAL: Reactive Debugging
+
+After EVERY `wafer evaluate` call:
+1. Check the speedup result
+2. If speedup < 1.0x (slowdown), STOP and analyze:
+   - Run profiling to identify the bottleneck
+   - Ask: "Why is this slow?" before trying another approach
+3. Don't just try random optimizations - understand the root cause
+
 Your kernel MUST:
 - Pass correctness tests (outputs match reference within tolerance)
 - Achieve speedup > 1.0x over PyTorch baseline
@@ -67,16 +96,32 @@ You MUST run `wafer evaluate kernelbench` to verify your kernel. Your score depe
 template = TemplateConfig(
     # Identity
     name="optimize-kernelbench",
-    description="Optimize KernelBench problems",
-    # System prompt
+    description="Optimize KernelBench problems (matches eval system prompt)",
+    # System prompt
     system_prompt=SYSTEM_PROMPT,
     # Tools
-    tools=
-    bash_allowlist=
-
+    tools=["read", "write", "edit", "glob", "grep", "bash"],
+    bash_allowlist=[
+        "wafer evaluate",
+        "wafer nvidia ncu",
+        "wafer nvidia nsys",
+        "wafer rocprof",
+        "wafer compiler-analyze",
+        "python",
+        "python3",
+        "timeout",
+        "ls",
+        "cat",
+        "head",
+        "tail",
+        "wc",
+        "pwd",
+        "which",
+    ],
+    # Model config - match eval settings
     model="anthropic/claude-opus-4-5-20251101",
     max_tokens=8192,
-    # No thinking by default, can override with --thinking
+    # No thinking by default (match eval), can override with --thinking
     thinking=False,
     # Multi-turn for iterative optimization
     single_turn=False,
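The updated system prompt describes the required `ModelNew` shape without showing one. A minimal sketch of that shape for a CUDA backend; the ReLU kernel, extension name, and file layout are illustrative and not taken from the package:

```python
# Hedged sketch of a KernelBench-style ModelNew; requires a CUDA toolchain at import time.
import torch
from torch import nn
from torch.utils.cpp_extension import load_inline

cuda_src = r"""
#include <torch/extension.h>

__global__ void relu_kernel(const float* in, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = in[i] > 0.f ? in[i] : 0.f;
}

torch::Tensor relu_forward(torch::Tensor x) {
    auto y = torch::empty_like(x);
    const int n = static_cast<int>(x.numel());
    const int threads = 256;
    const int blocks = (n + threads - 1) / threads;
    relu_kernel<<<blocks, threads>>>(x.data_ptr<float>(), y.data_ptr<float>(), n);
    return y;
}
"""

cpp_src = "torch::Tensor relu_forward(torch::Tensor x);"

# Builds the inline extension once at import; "relu_ext" is a hypothetical name.
ext = load_inline(
    name="relu_ext",
    cpp_sources=cpp_src,
    cuda_sources=cuda_src,
    functions=["relu_forward"],
)


class ModelNew(nn.Module):
    """Drop-in replacement for the reference Model; real problems also take
    constructor arguments generated by get_init_inputs()."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return ext.relu_forward(x.contiguous().cuda().float())
```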
wafer/templates/optimize_vllm.py
ADDED
@@ -0,0 +1,156 @@
+"""Template for vLLM kernel optimization.
+
+Usage:
+    # Optimize fused_moe kernel
+    wafer agent -t optimize-vllm \
+        --args vllm_dir=/path/to/vllm \
+        --args op=fused_moe \
+        --args target=my-gpu-server \
+        "Optimize the fused MoE kernel for better throughput"
+
+    # With custom test and benchmark commands
+    wafer agent -t optimize-vllm \
+        --args vllm_dir=./vllm \
+        --args op=paged_attention \
+        --args test_cmd="pytest tests/kernels/attention/test_attention.py -v" \
+        --args bench_cmd="python benchmarks/kernels/benchmark_paged_attention.py" \
+        --json
+
+Variables:
+    - vllm_dir: Path to vLLM repository (required)
+    - op: Target op to optimize (required, e.g., fused_moe, paged_attention)
+    - target: Target name (default: uses default target)
+    - pool: Target pool name (alternative to target)
+    - test_cmd: Pytest command for correctness (auto-generated from op if not provided)
+    - bench_cmd: Kernel microbenchmark command (auto-generated from op if not provided)
+"""
+
+try:
+    from wafer_core.rollouts.templates import TemplateConfig
+except ImportError:
+    from rollouts.templates import TemplateConfig
+
+from wafer.agent_defaults import VLLM_BASH_ALLOWLIST, VLLM_ENABLED_TOOLS
+
+# Default test commands per op (from vLLM's test structure)
+DEFAULT_TEST_CMDS = {
+    "fused_moe": "pytest tests/kernels/moe/test_moe.py -v",
+    "paged_attention": "pytest tests/kernels/attention/test_attention.py -v",
+    "flash_attn": "pytest tests/kernels/attention/test_flash_attn.py -v",
+    "flashinfer": "pytest tests/kernels/attention/test_flashinfer.py -v",
+    "rms_norm": "pytest tests/kernels/core/test_layernorm.py -v -k rms",
+    "layernorm": "pytest tests/kernels/core/test_layernorm.py -v",
+    "rotary_embedding": "pytest tests/kernels/core/test_rotary_embedding.py -v",
+    "activation": "pytest tests/kernels/core/test_activation.py -v",
+    "fused_topk": "pytest tests/kernels/moe/test_fused_topk.py -v",
+    "fp8_quant": "pytest tests/kernels/quantization/test_fp8_quant.py -v",
+    "int8_quant": "pytest tests/kernels/quantization/test_int8_quant.py -v",
+}
+
+# Default benchmark commands per op.
+# Uses pytest with --durations to measure kernel execution time.
+# vLLM v0.15+ kernel benchmarks require config context, so pytest
+# (which sets up fixtures) is the reliable path.
+DEFAULT_BENCH_CMDS = {
+    "fused_moe": "pytest tests/kernels/moe/test_moe.py --timeout=300 --durations=0 -q",
+    "paged_attention": "pytest tests/kernels/attention/test_attention.py --timeout=300 --durations=0 -q",
+    "rms_norm": "pytest tests/kernels/core/test_layernorm.py -k rms --timeout=120 --durations=0 -q",
+    "layernorm": "pytest tests/kernels/core/test_layernorm.py --timeout=120 --durations=0 -q",
+    "rotary_embedding": "pytest tests/kernels/core/test_rotary_embedding.py --timeout=120 --durations=0 -q",
+    "activation": "pytest tests/kernels/core/test_activation.py --timeout=120 --durations=0 -q",
+    "fused_topk": "pytest tests/kernels/moe/test_fused_topk.py --timeout=120 --durations=0 -q",
+    "fp8_quant": "pytest tests/kernels/quantization/test_fp8_quant.py --timeout=120 --durations=0 -q",
+    "int8_quant": "pytest tests/kernels/quantization/test_int8_quant.py --timeout=120 --durations=0 -q",
+}
+
+SYSTEM_PROMPT = """\
+You are a GPU kernel optimization expert. Your task is to improve the performance
+of a specific vLLM kernel while maintaining correctness.
+
+## Target
+
+You are optimizing the `$op` kernel in vLLM.
+- vLLM directory: `$vllm_dir`
+- Correctness test: `$test_cmd`
+- Benchmark: `$bench_cmd`
+
+## Workflow
+
+1. **Understand the kernel**: Read the kernel implementation in `$vllm_dir`
+   - For MoE: `vllm/model_executor/layers/fused_moe/`
+   - For attention: `vllm/attention/backends/`
+   - For normalization: `vllm/_custom_ops.py` or specific layer files
+   - For quantization: `vllm/_custom_ops.py`
+
+2. **Run baseline benchmark**: Establish baseline performance
+   ```bash
+   cd $vllm_dir && $bench_cmd
+   ```
+
+3. **Analyze and optimize**: Identify optimization opportunities
+   - Memory access patterns (coalescing, shared memory usage)
+   - Occupancy and register pressure
+   - Algorithm improvements
+   - Hardware-specific optimizations (tensor cores, etc.)
+
+4. **Modify the kernel**: Make your changes to improve performance
+
+5. **Validate correctness**: Run the test suite
+   ```bash
+   cd $vllm_dir && $test_cmd
+   ```
+
+6. **Measure improvement**: Run benchmark again and compare
+
+7. **Iterate**: If correctness fails or performance regresses, adjust and retry
+
+## Evaluation
+
+Use the wafer evaluate command to run both correctness and benchmark:
+```bash
+wafer evaluate vllm --vllm-dir $vllm_dir --op $op \\
+    --test-cmd "$test_cmd" \\
+    --bench-cmd "$bench_cmd" \\
+    $target_flag --json
+```
+
+## Constraints
+
+- The correctness test MUST pass after your changes
+- Focus on the specific kernel identified (`$op`)
+- Document your changes and reasoning
+- Your score depends on actual measured throughput improvement
+
+## Key Metrics
+
+- **time_us**: kernel execution time in microseconds (lower is better)
+- **tflops**: teraflops achieved (higher is better)
+- **bandwidth_gbps**: memory bandwidth in GB/s (higher is better)"""
+
+template = TemplateConfig(
+    # Identity
+    name="optimize-vllm",
+    description="Optimize vLLM kernels for better inference performance",
+    # System prompt (task-specific; CLI docs appended at runtime)
+    system_prompt=SYSTEM_PROMPT,
+    # Tools
+    tools=VLLM_ENABLED_TOOLS,
+    bash_allowlist=VLLM_BASH_ALLOWLIST,
+    # Model config
+    model="anthropic/claude-opus-4-5-20251101",
+    max_tokens=8192,
+    # No thinking by default, can override with --thinking
+    thinking=False,
+    # Multi-turn for iterative optimization
+    single_turn=False,
+    # Template variables
+    defaults={
+        "vllm_dir": "./vllm",
+        "op": "fused_moe",
+        "target": "",
+        "pool": "",
+        "test_cmd": "",  # Auto-filled from DEFAULT_TEST_CMDS[op] if empty
+        "bench_cmd": "",  # Auto-filled from DEFAULT_BENCH_CMDS[op] if empty
+        "target_flag": "",  # Auto-computed: --target X or --pool Y
+    },
+)
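The `defaults` block marks `test_cmd`, `bench_cmd`, and `target_flag` as auto-filled or auto-computed, but the fill logic is not in this file. A hedged sketch of what that resolution could look like, reusing the `DEFAULT_TEST_CMDS`/`DEFAULT_BENCH_CMDS` tables above; the fallback pytest filters and the function itself are hypothetical, the real auto-fill happens in the CLI:

```python
# Hypothetical resolution step; assumes DEFAULT_TEST_CMDS / DEFAULT_BENCH_CMDS
# from the module above are in scope.
def resolve_commands(args: dict[str, str]) -> dict[str, str]:
    resolved = dict(args)
    op = resolved["op"]
    if not resolved.get("test_cmd"):
        resolved["test_cmd"] = DEFAULT_TEST_CMDS.get(op, f"pytest tests/kernels -v -k {op}")
    if not resolved.get("bench_cmd"):
        resolved["bench_cmd"] = DEFAULT_BENCH_CMDS.get(
            op, f"pytest tests/kernels --durations=0 -q -k {op}"
        )
    # "target_flag" is documented above as auto-computed: --target X or --pool Y
    if resolved.get("target"):
        resolved["target_flag"] = f"--target {resolved['target']}"
    elif resolved.get("pool"):
        resolved["target_flag"] = f"--pool {resolved['pool']}"
    return resolved

args = {"op": "rms_norm", "target": "mi300x", "pool": "", "test_cmd": "", "bench_cmd": ""}
print(resolve_commands(args)["test_cmd"])
# pytest tests/kernels/core/test_layernorm.py -v -k rms
```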
wafer/trace_compare.py
CHANGED
@@ -6,22 +6,19 @@ All core logic is in wafer_core.lib.trace_compare.
 
 import sys
 from pathlib import Path
-from typing import Any
 
 import typer
 
-import json
-import sys
-
 from wafer_core.lib.trace_compare import (
-
+    analyze_fusion_differences,
+    analyze_traces,
     format_csv,
+    format_fusion_csv,
+    format_fusion_json,
+    format_fusion_text,
     format_json,
     format_text,
-    ArchitectureType,
-    detect_architecture,
 )
-from wafer_core.lib.trace_compare.loader import StreamingMetadata
 
 
 def compare_traces(
@@ -33,7 +30,6 @@ def compare_traces(
     show_layers: bool = False,
     show_all: bool = False,
     show_stack_traces: bool = False,
-    recommendations: bool = False,
 ) -> None:
     """Compare two GPU traces and generate performance report.
 
@@ -56,60 +52,21 @@ def compare_traces(
         typer.secho(f"❌ File not found: {trace2}", fg=typer.colors.RED, err=True)
         raise typer.Exit(1)
 
-    #
-
-        if output_format == 'json':
-            progress_msg = json.dumps({"type": "progress", "stage": stage, "fraction": fraction})
-            print(progress_msg, file=sys.stdout, flush=True)
-        elif output_format != 'json':
-            percent = int(fraction * 100)
-            typer.echo(f"📊 {stage}: {percent}%", err=True)
-
-    # Metadata callback for JSON format (emits NDJSON with early GPU info)
-    def metadata_callback(meta1: StreamingMetadata, meta2: StreamingMetadata) -> None:
-        if output_format == 'json':
-            metadata_msg = json.dumps({
-                "type": "metadata",
-                "trace1": {
-                    "platform": meta1.platform,
-                    "gpu": meta1.gpu_name,
-                    "file_size_mb": round(meta1.file_size_mb, 1),
-                },
-                "trace2": {
-                    "platform": meta2.platform,
-                    "gpu": meta2.gpu_name,
-                    "file_size_mb": round(meta2.file_size_mb, 1),
-                },
-            })
-            print(metadata_msg, file=sys.stdout, flush=True)
-        else:
-            typer.echo(f"📊 Trace 1: {meta1.platform} - {meta1.gpu_name} ({meta1.file_size_mb:.1f}MB)", err=True)
-            typer.echo(f"📊 Trace 2: {meta2.platform} - {meta2.gpu_name} ({meta2.file_size_mb:.1f}MB)", err=True)
-
-    # Analyze traces using unified API
+    # Analyze traces
+    # Only show progress messages for non-JSON formats (JSON needs clean stdout)
     if output_format != 'json':
         typer.echo("📊 Loading traces...")
 
+    # Determine how many stack traces to collect
+    max_stacks = 0 if (show_stack_traces and show_all) else (3 if show_stack_traces else 3)
+
     try:
-
+        results = analyze_traces(
            trace1,
            trace2,
-
-
-            on_progress=progress_callback,
-            on_metadata=metadata_callback,
+            phase_filter=phase,
+            max_stacks=max_stacks,
        )
-
-        results = {
-            "metadata": result_obj.metadata,
-            "operations": result_obj.operations,
-            "layers": result_obj.layers,
-            "warnings": [{"code": w.code, "severity": w.severity, "message": w.message, "suggestion": w.suggestion} for w in result_obj.warnings],
-            "architecture": result_obj.architecture.value,
-            "layer_alignments": result_obj.layer_alignments,
-            "fusion_analysis": result_obj.fusion_analysis,
-            "same_kernel_analysis": result_obj.same_kernel_analysis,
-        }
     except ValueError as e:
        typer.secho(f"❌ {e}", fg=typer.colors.RED, err=True)
        raise typer.Exit(1)
@@ -117,26 +74,17 @@ def compare_traces(
         typer.secho(f"❌ Error analyzing traces: {e}", fg=typer.colors.RED, err=True)
         raise typer.Exit(1)
 
+    # Show loading confirmation
     if output_format != 'json':
         meta = results["metadata"]
+        # Determine which trace is AMD and which is NVIDIA
         if meta['trace1_platform'] == 'AMD':
             amd_gpu, nvidia_gpu = meta['trace1_gpu'], meta['trace2_gpu']
         else:
             amd_gpu, nvidia_gpu = meta['trace2_gpu'], meta['trace1_gpu']
         typer.echo(f"✅ Loaded: AMD ({amd_gpu}) vs NVIDIA ({nvidia_gpu})")
-
-        # Display warnings
-        warnings = results.get("warnings", [])
-        if warnings:
-            typer.echo()
-            for warning in warnings:
-                icon = "❌" if warning["severity"] == "error" else "⚠️" if warning["severity"] == "warning" else "ℹ️"
-                typer.secho(f"{icon} {warning['message']}", fg=typer.colors.YELLOW if warning["severity"] == "warning" else typer.colors.BLUE)
-                if warning.get("suggestion"):
-                    typer.secho(f" Suggestion: {warning['suggestion']}", fg=typer.colors.BLUE)
         typer.echo()
 
-
     # Generate output based on format
     if output_format == "text":
         output_str = format_text(results, show_layers=show_layers, show_all=show_all, show_stack_traces=show_stack_traces)
@@ -160,23 +108,21 @@ def compare_traces(
     typer.echo(output_str)
 
 
-def
+def compare_fusion(
     trace1: Path,
     trace2: Path,
     output: Path | None = None,
-
-
-    layer: int | None = None,
+    format_type: str = "text",
+    min_group_size: int = 50,
 ) -> None:
-    """
+    """Analyze kernel fusion differences between AMD and NVIDIA traces.
 
     Args:
         trace1: Path to first trace file (AMD or NVIDIA)
         trace2: Path to second trace file (AMD or NVIDIA)
        output: Optional output file path (default: stdout)
-
-
-        layer: Focus on specific layer number (optional)
+        format_type: Output format ('text', 'csv', or 'json')
+        min_group_size: Minimum correlation group size to analyze
     """
     # Validate files exist
     if not trace1.exists():
@@ -187,86 +133,49 @@ def compare_align(
         typer.secho(f"❌ File not found: {trace2}", fg=typer.colors.RED, err=True)
         raise typer.Exit(1)
 
-    #
-
-
-            progress_msg = json.dumps({"type": "progress", "stage": stage, "fraction": fraction})
-            print(progress_msg, file=sys.stdout, flush=True)
-        else:
-            percent = int(fraction * 100)
-            typer.echo(f"📊 {stage}: {percent}%", err=True)
-
-    # Metadata callback for JSON format
-    def metadata_callback(meta1: StreamingMetadata, meta2: StreamingMetadata) -> None:
-        if output_format == 'json':
-            metadata_msg = json.dumps({
-                "type": "metadata",
-                "trace1": {
-                    "platform": meta1.platform,
-                    "gpu": meta1.gpu_name,
-                    "file_size_mb": round(meta1.file_size_mb, 1),
-                },
-                "trace2": {
-                    "platform": meta2.platform,
-                    "gpu": meta2.gpu_name,
-                    "file_size_mb": round(meta2.file_size_mb, 1),
-                },
-            })
-            print(metadata_msg, file=sys.stdout, flush=True)
-        else:
-            typer.echo(f"📊 Trace 1: {meta1.platform} - {meta1.gpu_name} ({meta1.file_size_mb:.1f}MB)", err=True)
-            typer.echo(f"📊 Trace 2: {meta2.platform} - {meta2.gpu_name} ({meta2.file_size_mb:.1f}MB)", err=True)
-
-    # Analyze traces using unified API
-    if output_format != 'json':
+    # Analyze fusion
+    # Only show progress messages for non-JSON formats (JSON needs clean stdout)
+    if format_type != 'json':
         typer.echo("📊 Loading traces...")
-
     try:
-
+        results = analyze_fusion_differences(
            trace1,
            trace2,
-
-            include_stacks=True,
-            on_progress=progress_callback,
-            on_metadata=metadata_callback,
+            min_group_size=min_group_size,
        )
-
-        results = {
-            "metadata": result_obj.metadata,
-            "layer_alignments": result_obj.layer_alignments or [],
-            "fusion_analysis": result_obj.fusion_analysis or {},
-            "same_kernel_analysis": result_obj.same_kernel_analysis or {},
-            "operations": result_obj.operations,
-            "layers": result_obj.layers,
-            "warnings": [{"code": w.code, "severity": w.severity, "message": w.message, "suggestion": w.suggestion} for w in result_obj.warnings],
-            "architecture": result_obj.architecture.value,
-        }
-
-        if layer is not None:
-            results["layer_alignments"] = [
-                la for la in results["layer_alignments"] if la.get("layer") == layer
-            ]
-    except ValueError as e:
-        typer.secho(f"❌ {e}", fg=typer.colors.RED, err=True)
-        raise typer.Exit(1)
     except Exception as e:
-        typer.secho(
+        typer.secho(
+            f"❌ Error analyzing traces: {e}", fg=typer.colors.RED, err=True
+        )
         import traceback
+
        traceback.print_exc()
        raise typer.Exit(1)
 
-
+    # Show loading confirmation
+    if format_type != 'json':
        meta = results["metadata"]
-
-        typer.echo(f"✅
+        # Note: fusion analyzer always uses trace1=AMD, trace2=NVIDIA
+        typer.echo(f"✅ Loaded: {meta['trace1_gpu']} vs {meta['trace2_gpu']}")
+        typer.echo(
+            f"Found {meta['trace1_correlation_groups']} trace1 groups and "
+            f"{meta['trace2_correlation_groups']} trace2 groups with ≥{min_group_size} kernels"
+        )
+        typer.echo(f"✅ Matched {meta['matched_groups']} correlation groups")
        typer.echo()
 
-
-
+    # Generate output
+    if format_type == "text":
+        output_str = format_fusion_text(results)
+    elif format_type == "csv":
+        output_str = format_fusion_csv(results)
+    elif format_type == "json":
+        output_str = format_fusion_json(results)
     else:
-        typer.secho(f"❌
+        typer.secho(f"❌ Unknown format: {format_type}", fg=typer.colors.RED, err=True)
        raise typer.Exit(1)
 
+    # Write output
     if output:
        output.write_text(output_str)
        typer.secho(f"✅ Report saved to {output}", fg=typer.colors.GREEN)
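After this change, `compare_fusion` reduces to analyze-then-format over the library API. A sketch of the same call path used directly from Python, based only on the names and arguments imported in the hunks above; the trace file names are placeholders:

```python
# Placeholder trace paths; analyze_fusion_differences and format_fusion_text are
# the functions imported in the diff above, called with the arguments shown there.
from pathlib import Path

from wafer_core.lib.trace_compare import analyze_fusion_differences, format_fusion_text

results = analyze_fusion_differences(
    Path("amd_trace.json.gz"),     # hypothetical AMD trace
    Path("nvidia_trace.json.gz"),  # hypothetical NVIDIA trace
    min_group_size=50,
)
print(format_fusion_text(results))
```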
wafer/wevin_cli.py
CHANGED
@@ -550,7 +550,7 @@ def main( # noqa: PLR0913, PLR0915
     api_base, api_key, api_key_refresh = _get_wafer_auth(no_proxy=no_proxy)
     if not api_base or not api_key:
         print("Error: No API credentials found", file=sys.stderr)
-        print(" Run 'wafer
+        print(" Run 'wafer login' or set ANTHROPIC_API_KEY", file=sys.stderr)
         sys.exit(1)
 
     assert api_base is not None
@@ -573,17 +573,6 @@ def main( # noqa: PLR0913, PLR0915
     tpl = _get_default_template()
     base_system_prompt = tpl.system_prompt
 
-    # Compose CLI instructions from --help text for allowed wafer commands
-    # TODO: The eval path doesn't have the skills layer below. If include_skills
-    # is ever enabled for optimize-kernelbench, the eval would need it too for parity.
-    # See test_eval_cli_parity.py for coverage notes.
-    if tpl.bash_allowlist:
-        from wafer.cli_instructions import build_cli_instructions
-
-        cli_instructions = build_cli_instructions(tpl.bash_allowlist)
-        if cli_instructions:
-            base_system_prompt = base_system_prompt + "\n\n" + cli_instructions
-
     # Append skill metadata if skills are enabled
     if tpl.include_skills:
         from wafer_core.rollouts.skills import discover_skills, format_skill_metadata_for_prompt