wafer-cli 0.2.32__py3-none-any.whl → 0.2.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
+ """Template for optimizing AMD aiter operators.
+
+ Usage:
+     wafer agent -t aiter-optimize --args op=gemm_a8w8 --args target=mi300x "Optimize this operator"
+     wafer agent -t aiter-optimize --args op=mha --args target=runpod-mi300x-rocm7 "Improve MHA performance"
+ """
+
+ try:
+     from wafer.agent_defaults import (
+         AITER_BASH_ALLOWLIST,
+         AITER_ENABLED_TOOLS,
+         AITER_SYSTEM_PROMPT,
+     )
+ except ImportError:
+     # Fallback for when wafer-cli package isn't installed
+     AITER_ENABLED_TOOLS = ["read", "write", "edit", "glob", "grep", "bash"]
+     AITER_BASH_ALLOWLIST = [
+         "ls", "cat", "head", "tail", "wc", "find", "grep", "rg", "pwd", "tree",
+         "which", "diff", "sort", "mkdir", "cp", "mv", "git diff", "git status",
+         "git log", "hipcc", "g++", "gcc", "clang", "python", "python3", "pip",
+         "pytest", "./", "wafer evaluate aiter", "wafer amd rocprof-compute",
+         "wafer amd rocprof-sdk", "wafer amd rocprof-systems", "wafer amd isa",
+         "wafer agent -t ask-docs", "timeout",
+     ]
+     AITER_SYSTEM_PROMPT = "You are a GPU kernel optimization expert for AMD MI300X and aiter."
+
+ try:
+     from wafer_core.rollouts.templates import TemplateConfig
+ except ImportError:
+     from rollouts.templates import TemplateConfig
+
+ # Convert the prompt's {op} and {target_flag} placeholders into $-style template
+ # variables ($op, --target $target); the template loader substitutes them at runtime
+ _SYSTEM_PROMPT = AITER_SYSTEM_PROMPT.replace("{op}", "$op").replace("{target_flag}", "--target $target")
+
+ template = TemplateConfig(
+     # Identity
+     name="aiter-optimize",
+     description="Optimize AMD aiter operators for better performance on MI300X",
+     # System prompt - uses shared prompt from agent_defaults
+     system_prompt=_SYSTEM_PROMPT,
+     # Tools - full coding environment
+     tools=AITER_ENABLED_TOOLS,
+     bash_allowlist=AITER_BASH_ALLOWLIST,
+     # Network access required for wafer evaluate (connects to remote GPU)
+     allow_network=True,
+     # Model config - use thinking for optimization analysis
+     model="anthropic/claude-sonnet-4-5-20250929",
+     max_tokens=16384,
+     thinking=True,
+     thinking_budget=10000,
+     # Multi-turn for iterative optimization
+     single_turn=False,
+     # Template variables
+     defaults={
+         "op": "gemm_a8w8",
+         "target": "mi300x",  # Default target; override with --args target=<name>
+     },
+ )
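
The new template relies on the loader turning `$op`/`$target` into concrete values at run time. Below is a minimal sketch of that flow, assuming a `string.Template`-style substitution; the real loader lives in `wafer_core.rollouts.templates`, and the prompt text used here is invented for illustration.

```python
# Illustration only: the real AITER_SYSTEM_PROMPT lives in wafer.agent_defaults and
# the real substitution happens inside wafer_core.rollouts.templates (not in this diff).
from string import Template

example_prompt = "Optimize the {op} operator. Evaluate with: wafer evaluate aiter {target_flag}"

# Same conversion as _SYSTEM_PROMPT above: {placeholders} -> $variables
converted = example_prompt.replace("{op}", "$op").replace("{target_flag}", "--target $target")

defaults = {"op": "gemm_a8w8", "target": "mi300x"}
user_args = {"op": "mha"}  # e.g. from --args op=mha on the command line

# Loader-style substitution: defaults first, user overrides last
rendered = Template(converted).safe_substitute({**defaults, **user_args})
print(rendered)
# Optimize the mha operator. Evaluate with: wafer evaluate aiter --target mi300x
```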
@@ -35,8 +35,7 @@ Strategy:
  Commands:
  - `wafer evaluate --impl <file> --reference <ref> --test-cases <tests>` - Run evaluation
  - `wafer evaluate --impl <file> --reference <ref> --test-cases <tests> --profile` - With NCU profiling
- - `wafer workspaces exec -- <command>` - Run arbitrary commands on remote GPU
- - `wafer targets exec <target> -- <command>` - Run commands on a configured target via SSH
+ - `wafer remote-run "<command>"` - Run arbitrary commands on remote GPU

  Output:
  - Summary of optimizations applied
@@ -49,8 +48,7 @@ IMPORTANT: Always verify correctness with wafer evaluate before claiming success
      tools=["read", "write", "edit", "glob", "grep", "bash"],
      bash_allowlist=[
          "wafer evaluate",
-         "wafer workspaces exec",
-         "wafer targets exec",
+         "wafer remote-run",
          "wafer nvidia ncu",
          "wafer nvidia nsys",
          "wafer nvidia perfetto",
@@ -1,4 +1,4 @@
- """Template for KernelBench optimization.
+ """Template for KernelBench optimization - matches eval system prompt.

  Usage:
      # Run on a specific problem
@@ -26,18 +26,12 @@ try:
  except ImportError:
      from rollouts.templates import TemplateConfig

- from wafer.agent_defaults import ENABLED_TOOLS, KERNELBENCH_BASH_ALLOWLIST
-
- # Task-specific instructions only — must stay in sync with the eval's SYSTEM_PROMPT
- # in research/evals/optimize_kernelbench_eval/.../base_config.py.
- # Run test_eval_cli_parity.py to verify.
- # Wafer CLI command docs are auto-generated from --help text and composed
- # at runtime by wevin_cli.py (see wafer.cli_instructions.build_cli_instructions).
- # TODO: Consider having both eval and template import SYSTEM_PROMPT from a shared
- # module so there's only one copy to maintain.
+ # System prompt matches optimize_kernelbench_eval/base_config.py SYSTEM_PROMPT
  SYSTEM_PROMPT = """\
  You are a GPU kernel optimization expert. Your task is to write optimized GPU kernels that are correct and faster than the PyTorch baseline.

+ IMPORTANT: You do NOT have a local GPU. You MUST use `wafer evaluate kernelbench` to test kernels on remote GPU hardware.
+
  ## Kernel Format (KernelBench)

  The reference file contains a PyTorch `Model` class. You must write a `ModelNew` class that:
@@ -49,14 +43,49 @@ The reference file also provides:
  - `get_inputs()` - generates test inputs for forward()
  - `get_init_inputs()` - generates constructor arguments

+ ## Available Tools
+
+ - read(file_path): Read source files
+ - write(file_path, content): Write your optimized kernel
+ - glob(pattern): Find files by pattern
+ - grep(pattern): Search code
+ - bash(command): Run shell commands including wafer CLI
+
  ## Workflow

  1. Read the reference problem file to understand what `Model` does
  2. Analyze the computation and identify optimization opportunities
  3. Write an optimized `ModelNew` class with custom $backend_upper kernels using `__global__` kernel definitions and `torch.utils.cpp_extension.load_inline`
- 4. Test with: `wafer evaluate kernelbench $target_flag --backend $backend --impl optimized.py --reference <problem.py> --benchmark`
+ 4. Test with: `wafer evaluate kernelbench $target_flag --backend $backend --impl <your_file.py> --reference <problem.py> --benchmark`
  5. Iterate based on feedback until correct and fast

+ ## Example Command
+
+ ```bash
+ wafer evaluate kernelbench \\
+     $target_flag \\
+     --backend $backend \\
+     --impl optimized_kernel.py \\
+     --reference $reference \\
+     --benchmark
+ ```
+
+ ## Profiling Tools (USE THESE!)
+
+ When your kernel is slower than expected, use profiling to understand WHY:
+
+ - `wafer rocprof profile --impl <file> --reference <ref>` - AMD GPU profiling
+ - `wafer nvidia ncu --impl <file> --reference <ref>` - NVIDIA NCU profiling
+
+ ## CRITICAL: Reactive Debugging
+
+ After EVERY `wafer evaluate` call:
+ 1. Check the speedup result
+ 2. If speedup < 1.0x (slowdown), STOP and analyze:
+    - Run profiling to identify the bottleneck
+    - Ask: "Why is this slow?" before trying another approach
+ 3. Don't just try random optimizations - understand the root cause
+
  Your kernel MUST:
  - Pass correctness tests (outputs match reference within tolerance)
  - Achieve speedup > 1.0x over PyTorch baseline
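
For reference, a minimal sketch of the `ModelNew` shape this prompt asks for, using `torch.utils.cpp_extension.load_inline` with a trivial fused add. It is not taken from the package and assumes a reference `Model` whose `forward()` simply adds two float tensors.

```python
# Hypothetical KernelBench-style answer file: not part of the wafer-cli package.
import torch
from torch.utils.cpp_extension import load_inline

cuda_src = r"""
#include <torch/extension.h>

__global__ void add_kernel(const float* a, const float* b, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = a[i] + b[i];
}

torch::Tensor fused_add(torch::Tensor a, torch::Tensor b) {
    auto out = torch::empty_like(a);
    int n = a.numel();
    int threads = 256;
    int blocks = (n + threads - 1) / threads;
    add_kernel<<<blocks, threads>>>(a.data_ptr<float>(), b.data_ptr<float>(),
                                    out.data_ptr<float>(), n);
    return out;
}
"""

cpp_src = "torch::Tensor fused_add(torch::Tensor a, torch::Tensor b);"

# Compile the inline extension once at import time
_ext = load_inline(name="fused_add_ext", cpp_sources=cpp_src,
                   cuda_sources=cuda_src, functions=["fused_add"])


class ModelNew(torch.nn.Module):
    def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        return _ext.fused_add(a.contiguous(), b.contiguous())
```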
@@ -67,16 +96,32 @@ You MUST run `wafer evaluate kernelbench` to verify your kernel. Your score depe
  template = TemplateConfig(
      # Identity
      name="optimize-kernelbench",
-     description="Optimize KernelBench problems",
-     # System prompt (task-specific; CLI docs appended at runtime)
+     description="Optimize KernelBench problems (matches eval system prompt)",
+     # System prompt
      system_prompt=SYSTEM_PROMPT,
      # Tools
-     tools=ENABLED_TOOLS,
-     bash_allowlist=KERNELBENCH_BASH_ALLOWLIST,
-     # Model config
+     tools=["read", "write", "edit", "glob", "grep", "bash"],
+     bash_allowlist=[
+         "wafer evaluate",
+         "wafer nvidia ncu",
+         "wafer nvidia nsys",
+         "wafer rocprof",
+         "wafer compiler-analyze",
+         "python",
+         "python3",
+         "timeout",
+         "ls",
+         "cat",
+         "head",
+         "tail",
+         "wc",
+         "pwd",
+         "which",
+     ],
+     # Model config - match eval settings
      model="anthropic/claude-opus-4-5-20251101",
      max_tokens=8192,
-     # No thinking by default, can override with --thinking
+     # No thinking by default (match eval), can override with --thinking
      thinking=False,
      # Multi-turn for iterative optimization
      single_turn=False,
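
The `bash_allowlist` entries above are command prefixes. A hypothetical sketch of how such prefix matching could be enforced; the actual enforcement lives in the agent runtime, not in this diff.

```python
# Hypothetical helper for illustration only.
def is_allowed(command: str, allowlist: list[str]) -> bool:
    """Allow a command if it exactly matches, or extends, an allowlisted prefix."""
    stripped = command.strip()
    return any(stripped == entry or stripped.startswith(entry + " ") for entry in allowlist)

assert is_allowed("wafer evaluate kernelbench --impl k.py", ["wafer evaluate", "python3"])
assert not is_allowed("rm -rf /", ["wafer evaluate", "python3"])
```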
@@ -0,0 +1,156 @@
+ """Template for vLLM kernel optimization.
+
+ Usage:
+     # Optimize fused_moe kernel
+     wafer agent -t optimize-vllm \
+         --args vllm_dir=/path/to/vllm \
+         --args op=fused_moe \
+         --args target=my-gpu-server \
+         "Optimize the fused MoE kernel for better throughput"
+
+     # With custom test and benchmark commands
+     wafer agent -t optimize-vllm \
+         --args vllm_dir=./vllm \
+         --args op=paged_attention \
+         --args test_cmd="pytest tests/kernels/attention/test_attention.py -v" \
+         --args bench_cmd="python benchmarks/kernels/benchmark_paged_attention.py" \
+         --json
+
+ Variables:
+     - vllm_dir: Path to vLLM repository (required)
+     - op: Target op to optimize (required, e.g., fused_moe, paged_attention)
+     - target: Target name (default: uses default target)
+     - pool: Target pool name (alternative to target)
+     - test_cmd: Pytest command for correctness (auto-generated from op if not provided)
+     - bench_cmd: Kernel microbenchmark command (auto-generated from op if not provided)
+ """
+
+ try:
+     from wafer_core.rollouts.templates import TemplateConfig
+ except ImportError:
+     from rollouts.templates import TemplateConfig
+
+ from wafer.agent_defaults import VLLM_BASH_ALLOWLIST, VLLM_ENABLED_TOOLS
+
+ # Default test commands per op (from vLLM's test structure)
+ DEFAULT_TEST_CMDS = {
+     "fused_moe": "pytest tests/kernels/moe/test_moe.py -v",
+     "paged_attention": "pytest tests/kernels/attention/test_attention.py -v",
+     "flash_attn": "pytest tests/kernels/attention/test_flash_attn.py -v",
+     "flashinfer": "pytest tests/kernels/attention/test_flashinfer.py -v",
+     "rms_norm": "pytest tests/kernels/core/test_layernorm.py -v -k rms",
+     "layernorm": "pytest tests/kernels/core/test_layernorm.py -v",
+     "rotary_embedding": "pytest tests/kernels/core/test_rotary_embedding.py -v",
+     "activation": "pytest tests/kernels/core/test_activation.py -v",
+     "fused_topk": "pytest tests/kernels/moe/test_fused_topk.py -v",
+     "fp8_quant": "pytest tests/kernels/quantization/test_fp8_quant.py -v",
+     "int8_quant": "pytest tests/kernels/quantization/test_int8_quant.py -v",
+ }
+
+ # Default benchmark commands per op.
+ # Uses pytest with --durations to measure kernel execution time.
+ # vLLM v0.15+ kernel benchmarks require config context, so pytest
+ # (which sets up fixtures) is the reliable path.
+ DEFAULT_BENCH_CMDS = {
+     "fused_moe": "pytest tests/kernels/moe/test_moe.py --timeout=300 --durations=0 -q",
+     "paged_attention": "pytest tests/kernels/attention/test_attention.py --timeout=300 --durations=0 -q",
+     "rms_norm": "pytest tests/kernels/core/test_layernorm.py -k rms --timeout=120 --durations=0 -q",
+     "layernorm": "pytest tests/kernels/core/test_layernorm.py --timeout=120 --durations=0 -q",
+     "rotary_embedding": "pytest tests/kernels/core/test_rotary_embedding.py --timeout=120 --durations=0 -q",
+     "activation": "pytest tests/kernels/core/test_activation.py --timeout=120 --durations=0 -q",
+     "fused_topk": "pytest tests/kernels/moe/test_fused_topk.py --timeout=120 --durations=0 -q",
+     "fp8_quant": "pytest tests/kernels/quantization/test_fp8_quant.py --timeout=120 --durations=0 -q",
+     "int8_quant": "pytest tests/kernels/quantization/test_int8_quant.py --timeout=120 --durations=0 -q",
+ }
+
+ SYSTEM_PROMPT = """\
+ You are a GPU kernel optimization expert. Your task is to improve the performance
+ of a specific vLLM kernel while maintaining correctness.
+
+ ## Target
+
+ You are optimizing the `$op` kernel in vLLM.
+ - vLLM directory: `$vllm_dir`
+ - Correctness test: `$test_cmd`
+ - Benchmark: `$bench_cmd`
+
+ ## Workflow
+
+ 1. **Understand the kernel**: Read the kernel implementation in `$vllm_dir`
+    - For MoE: `vllm/model_executor/layers/fused_moe/`
+    - For attention: `vllm/attention/backends/`
+    - For normalization: `vllm/_custom_ops.py` or specific layer files
+    - For quantization: `vllm/_custom_ops.py`
+
+ 2. **Run baseline benchmark**: Establish baseline performance
+    ```bash
+    cd $vllm_dir && $bench_cmd
+    ```
+
+ 3. **Analyze and optimize**: Identify optimization opportunities
+    - Memory access patterns (coalescing, shared memory usage)
+    - Occupancy and register pressure
+    - Algorithm improvements
+    - Hardware-specific optimizations (tensor cores, etc.)
+
+ 4. **Modify the kernel**: Make your changes to improve performance
+
+ 5. **Validate correctness**: Run the test suite
+    ```bash
+    cd $vllm_dir && $test_cmd
+    ```
+
+ 6. **Measure improvement**: Run benchmark again and compare
+
+ 7. **Iterate**: If correctness fails or performance regresses, adjust and retry
+
+ ## Evaluation
+
+ Use the wafer evaluate command to run both correctness and benchmark:
+ ```bash
+ wafer evaluate vllm --vllm-dir $vllm_dir --op $op \\
+     --test-cmd "$test_cmd" \\
+     --bench-cmd "$bench_cmd" \\
+     $target_flag --json
+ ```
+
+ ## Constraints
+
+ - The correctness test MUST pass after your changes
+ - Focus on the specific kernel identified (`$op`)
+ - Document your changes and reasoning
+ - Your score depends on actual measured throughput improvement
+
+ ## Key Metrics
+
+ - **time_us**: kernel execution time in microseconds (lower is better)
+ - **tflops**: teraflops achieved (higher is better)
+ - **bandwidth_gbps**: memory bandwidth in GB/s (higher is better)"""
+
+ template = TemplateConfig(
+     # Identity
+     name="optimize-vllm",
+     description="Optimize vLLM kernels for better inference performance",
+     # System prompt (task-specific; CLI docs appended at runtime)
+     system_prompt=SYSTEM_PROMPT,
+     # Tools
+     tools=VLLM_ENABLED_TOOLS,
+     bash_allowlist=VLLM_BASH_ALLOWLIST,
+     # Model config
+     model="anthropic/claude-opus-4-5-20251101",
+     max_tokens=8192,
+     # No thinking by default, can override with --thinking
+     thinking=False,
+     # Multi-turn for iterative optimization
+     single_turn=False,
+     # Template variables
+     defaults={
+         "vllm_dir": "./vllm",
+         "op": "fused_moe",
+         "target": "",
+         "pool": "",
+         "test_cmd": "",  # Auto-filled from DEFAULT_TEST_CMDS[op] if empty
+         "bench_cmd": "",  # Auto-filled from DEFAULT_BENCH_CMDS[op] if empty
+         "target_flag": "",  # Auto-computed: --target X or --pool Y
+     },
+ )
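
The empty `test_cmd`, `bench_cmd`, and `target_flag` defaults are documented as auto-filled. A hypothetical sketch of that resolution, reusing the `DEFAULT_TEST_CMDS`/`DEFAULT_BENCH_CMDS` tables defined in this file; the real logic lives elsewhere in wafer-cli and is not part of this diff.

```python
# Hypothetical illustration of the auto-fill described by the comments above.
def resolve_vllm_args(args: dict) -> dict:
    resolved = dict(args)
    op = resolved["op"]
    if not resolved.get("test_cmd"):
        resolved["test_cmd"] = DEFAULT_TEST_CMDS[op]
    if not resolved.get("bench_cmd"):
        # Fall back to the test command when no dedicated benchmark is listed (assumption)
        resolved["bench_cmd"] = DEFAULT_BENCH_CMDS.get(op, DEFAULT_TEST_CMDS[op])
    # target_flag: --target takes precedence, otherwise --pool, otherwise empty
    if resolved.get("target"):
        resolved["target_flag"] = f"--target {resolved['target']}"
    elif resolved.get("pool"):
        resolved["target_flag"] = f"--pool {resolved['pool']}"
    else:
        resolved["target_flag"] = ""
    return resolved

# Example: resolve_vllm_args({**template.defaults, "op": "rms_norm", "target": "my-gpu-server"})
```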
wafer/trace_compare.py CHANGED
@@ -6,22 +6,19 @@ All core logic is in wafer_core.lib.trace_compare.

  import sys
  from pathlib import Path
- from typing import Any

  import typer

- import json
- import sys
-
  from wafer_core.lib.trace_compare import (
-     analyze_trace_pair,
+     analyze_fusion_differences,
+     analyze_traces,
      format_csv,
+     format_fusion_csv,
+     format_fusion_json,
+     format_fusion_text,
      format_json,
      format_text,
-     ArchitectureType,
-     detect_architecture,
  )
- from wafer_core.lib.trace_compare.loader import StreamingMetadata


  def compare_traces(
@@ -33,7 +30,6 @@ def compare_traces(
      show_layers: bool = False,
      show_all: bool = False,
      show_stack_traces: bool = False,
-     recommendations: bool = False,
  ) -> None:
      """Compare two GPU traces and generate performance report.

@@ -56,60 +52,21 @@ def compare_traces(
          typer.secho(f"❌ File not found: {trace2}", fg=typer.colors.RED, err=True)
          raise typer.Exit(1)

-     # Progress callback for JSON format (emits NDJSON to stdout)
-     def progress_callback(stage: str, fraction: float) -> None:
-         if output_format == 'json':
-             progress_msg = json.dumps({"type": "progress", "stage": stage, "fraction": fraction})
-             print(progress_msg, file=sys.stdout, flush=True)
-         elif output_format != 'json':
-             percent = int(fraction * 100)
-             typer.echo(f"📊 {stage}: {percent}%", err=True)
-
-     # Metadata callback for JSON format (emits NDJSON with early GPU info)
-     def metadata_callback(meta1: StreamingMetadata, meta2: StreamingMetadata) -> None:
-         if output_format == 'json':
-             metadata_msg = json.dumps({
-                 "type": "metadata",
-                 "trace1": {
-                     "platform": meta1.platform,
-                     "gpu": meta1.gpu_name,
-                     "file_size_mb": round(meta1.file_size_mb, 1),
-                 },
-                 "trace2": {
-                     "platform": meta2.platform,
-                     "gpu": meta2.gpu_name,
-                     "file_size_mb": round(meta2.file_size_mb, 1),
-                 },
-             })
-             print(metadata_msg, file=sys.stdout, flush=True)
-         else:
-             typer.echo(f"📊 Trace 1: {meta1.platform} - {meta1.gpu_name} ({meta1.file_size_mb:.1f}MB)", err=True)
-             typer.echo(f"📊 Trace 2: {meta2.platform} - {meta2.gpu_name} ({meta2.file_size_mb:.1f}MB)", err=True)
-
-     # Analyze traces using unified API
+     # Analyze traces
+     # Only show progress messages for non-JSON formats (JSON needs clean stdout)
      if output_format != 'json':
          typer.echo("📊 Loading traces...")

+     # Determine how many stack traces to collect
+     max_stacks = 0 if (show_stack_traces and show_all) else (3 if show_stack_traces else 3)
+
      try:
-         result_obj = analyze_trace_pair(
+         results = analyze_traces(
              trace1,
              trace2,
-             phase=phase,
-             include_stacks=True,
-             on_progress=progress_callback,
-             on_metadata=metadata_callback,
+             phase_filter=phase,
+             max_stacks=max_stacks,
          )
-
-         results = {
-             "metadata": result_obj.metadata,
-             "operations": result_obj.operations,
-             "layers": result_obj.layers,
-             "warnings": [{"code": w.code, "severity": w.severity, "message": w.message, "suggestion": w.suggestion} for w in result_obj.warnings],
-             "architecture": result_obj.architecture.value,
-             "layer_alignments": result_obj.layer_alignments,
-             "fusion_analysis": result_obj.fusion_analysis,
-             "same_kernel_analysis": result_obj.same_kernel_analysis,
-         }
      except ValueError as e:
          typer.secho(f"❌ {e}", fg=typer.colors.RED, err=True)
          raise typer.Exit(1)
@@ -117,26 +74,17 @@ def compare_traces(
          typer.secho(f"❌ Error analyzing traces: {e}", fg=typer.colors.RED, err=True)
          raise typer.Exit(1)

+     # Show loading confirmation
      if output_format != 'json':
          meta = results["metadata"]
+         # Determine which trace is AMD and which is NVIDIA
          if meta['trace1_platform'] == 'AMD':
              amd_gpu, nvidia_gpu = meta['trace1_gpu'], meta['trace2_gpu']
          else:
              amd_gpu, nvidia_gpu = meta['trace2_gpu'], meta['trace1_gpu']
          typer.echo(f"✅ Loaded: AMD ({amd_gpu}) vs NVIDIA ({nvidia_gpu})")
-
-         # Display warnings
-         warnings = results.get("warnings", [])
-         if warnings:
-             typer.echo()
-             for warning in warnings:
-                 icon = "❌" if warning["severity"] == "error" else "⚠️" if warning["severity"] == "warning" else "ℹ️"
-                 typer.secho(f"{icon} {warning['message']}", fg=typer.colors.YELLOW if warning["severity"] == "warning" else typer.colors.BLUE)
-                 if warning.get("suggestion"):
-                     typer.secho(f" Suggestion: {warning['suggestion']}", fg=typer.colors.BLUE)
          typer.echo()

-
      # Generate output based on format
      if output_format == "text":
          output_str = format_text(results, show_layers=show_layers, show_all=show_all, show_stack_traces=show_stack_traces)
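
The refactored `compare_traces` now delegates everything to `analyze_traces`. Below is a usage sketch built only from the call shapes and metadata keys visible in this diff; the trace file names are placeholders.

```python
# Sketch, assuming the analyze_traces/format_json signatures shown in this diff.
from pathlib import Path

from wafer_core.lib.trace_compare import analyze_traces, format_json

results = analyze_traces(
    Path("amd_trace.json.gz"),      # placeholder path
    Path("nvidia_trace.json.gz"),   # placeholder path
    phase_filter="decode",          # 'all', 'prefill', or 'decode', per the docstrings in this file
    max_stacks=3,                   # same value the CLI computes above
)

meta = results["metadata"]
print(meta["trace1_platform"], meta["trace1_gpu"], "vs", meta["trace2_gpu"])
print(format_json(results))  # clean JSON on stdout, no progress noise
```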
@@ -160,23 +108,21 @@ def compare_traces(
      typer.echo(output_str)


- def compare_align(
+ def compare_fusion(
      trace1: Path,
      trace2: Path,
      output: Path | None = None,
-     output_format: str = "json",
-     phase: str = "all",
-     layer: int | None = None,
+     format_type: str = "text",
+     min_group_size: int = 50,
  ) -> None:
-     """Align kernels at layer level for exact kernel-to-kernel comparison.
+     """Analyze kernel fusion differences between AMD and NVIDIA traces.

      Args:
          trace1: Path to first trace file (AMD or NVIDIA)
          trace2: Path to second trace file (AMD or NVIDIA)
          output: Optional output file path (default: stdout)
-         output_format: Output format ('json' only for now)
-         phase: Filter by phase ('all', 'prefill', or 'decode')
-         layer: Focus on specific layer number (optional)
+         format_type: Output format ('text', 'csv', or 'json')
+         min_group_size: Minimum correlation group size to analyze
      """
      # Validate files exist
      if not trace1.exists():
@@ -187,86 +133,49 @@ def compare_align(
          typer.secho(f"❌ File not found: {trace2}", fg=typer.colors.RED, err=True)
          raise typer.Exit(1)

-     # Progress callback for JSON format (emits NDJSON to stdout)
-     def progress_callback(stage: str, fraction: float) -> None:
-         if output_format == 'json':
-             progress_msg = json.dumps({"type": "progress", "stage": stage, "fraction": fraction})
-             print(progress_msg, file=sys.stdout, flush=True)
-         else:
-             percent = int(fraction * 100)
-             typer.echo(f"📊 {stage}: {percent}%", err=True)
-
-     # Metadata callback for JSON format
-     def metadata_callback(meta1: StreamingMetadata, meta2: StreamingMetadata) -> None:
-         if output_format == 'json':
-             metadata_msg = json.dumps({
-                 "type": "metadata",
-                 "trace1": {
-                     "platform": meta1.platform,
-                     "gpu": meta1.gpu_name,
-                     "file_size_mb": round(meta1.file_size_mb, 1),
-                 },
-                 "trace2": {
-                     "platform": meta2.platform,
-                     "gpu": meta2.gpu_name,
-                     "file_size_mb": round(meta2.file_size_mb, 1),
-                 },
-             })
-             print(metadata_msg, file=sys.stdout, flush=True)
-         else:
-             typer.echo(f"📊 Trace 1: {meta1.platform} - {meta1.gpu_name} ({meta1.file_size_mb:.1f}MB)", err=True)
-             typer.echo(f"📊 Trace 2: {meta2.platform} - {meta2.gpu_name} ({meta2.file_size_mb:.1f}MB)", err=True)
-
-     # Analyze traces using unified API
-     if output_format != 'json':
+     # Analyze fusion
+     # Only show progress messages for non-JSON formats (JSON needs clean stdout)
+     if format_type != 'json':
          typer.echo("📊 Loading traces...")
-
      try:
-         result_obj = analyze_trace_pair(
+         results = analyze_fusion_differences(
              trace1,
              trace2,
-             phase=phase,
-             include_stacks=True,
-             on_progress=progress_callback,
-             on_metadata=metadata_callback,
+             min_group_size=min_group_size,
          )
-
-         results = {
-             "metadata": result_obj.metadata,
-             "layer_alignments": result_obj.layer_alignments or [],
-             "fusion_analysis": result_obj.fusion_analysis or {},
-             "same_kernel_analysis": result_obj.same_kernel_analysis or {},
-             "operations": result_obj.operations,
-             "layers": result_obj.layers,
-             "warnings": [{"code": w.code, "severity": w.severity, "message": w.message, "suggestion": w.suggestion} for w in result_obj.warnings],
-             "architecture": result_obj.architecture.value,
-         }
-
-         if layer is not None:
-             results["layer_alignments"] = [
-                 la for la in results["layer_alignments"] if la.get("layer") == layer
-             ]
-     except ValueError as e:
-         typer.secho(f"❌ {e}", fg=typer.colors.RED, err=True)
-         raise typer.Exit(1)
      except Exception as e:
-         typer.secho(f"❌ Error analyzing traces: {e}", fg=typer.colors.RED, err=True)
+         typer.secho(
+             f"❌ Error analyzing traces: {e}", fg=typer.colors.RED, err=True
+         )
          import traceback
+
          traceback.print_exc()
          raise typer.Exit(1)

-     if output_format != 'json':
+     # Show loading confirmation
+     if format_type != 'json':
          meta = results["metadata"]
-         typer.echo(f"✅ Loaded: {meta.get('amd_gpu', 'Unknown')} vs {meta.get('nvidia_gpu', 'Unknown')}")
-         typer.echo(f"✅ Found {len(results['layer_alignments'])} layers")
+         # Note: fusion analyzer always uses trace1=AMD, trace2=NVIDIA
+         typer.echo(f"✅ Loaded: {meta['trace1_gpu']} vs {meta['trace2_gpu']}")
+         typer.echo(
+             f"Found {meta['trace1_correlation_groups']} trace1 groups and "
+             f"{meta['trace2_correlation_groups']} trace2 groups with ≥{min_group_size} kernels"
+         )
+         typer.echo(f"✅ Matched {meta['matched_groups']} correlation groups")
          typer.echo()

-     if output_format == "json":
-         output_str = format_json(results)
+     # Generate output
+     if format_type == "text":
+         output_str = format_fusion_text(results)
+     elif format_type == "csv":
+         output_str = format_fusion_csv(results)
+     elif format_type == "json":
+         output_str = format_fusion_json(results)
      else:
-         typer.secho(f"❌ Format {output_format} not yet supported for align command. Use 'json'.", fg=typer.colors.RED, err=True)
+         typer.secho(f"❌ Unknown format: {format_type}", fg=typer.colors.RED, err=True)
          raise typer.Exit(1)

+     # Write output
      if output:
          output.write_text(output_str)
          typer.secho(f"✅ Report saved to {output}", fg=typer.colors.GREEN)
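
Similarly, the new `compare_fusion` path boils down to one analysis call plus a formatter. A sketch using only the functions and metadata keys shown above; the trace paths are placeholders.

```python
# Sketch, assuming the analyze_fusion_differences/format_fusion_text API shown in this diff.
from pathlib import Path

from wafer_core.lib.trace_compare import (
    analyze_fusion_differences,
    format_fusion_text,
)

results = analyze_fusion_differences(
    Path("amd_trace.json.gz"),      # placeholder path
    Path("nvidia_trace.json.gz"),   # placeholder path
    min_group_size=50,              # same default as compare_fusion()
)

meta = results["metadata"]
print(f"{meta['trace1_gpu']} vs {meta['trace2_gpu']}: "
      f"{meta['matched_groups']} matched correlation groups")
print(format_fusion_text(results))
```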
wafer/wevin_cli.py CHANGED
@@ -550,7 +550,7 @@ def main( # noqa: PLR0913, PLR0915
      api_base, api_key, api_key_refresh = _get_wafer_auth(no_proxy=no_proxy)
      if not api_base or not api_key:
          print("Error: No API credentials found", file=sys.stderr)
-         print(" Run 'wafer auth login' or set ANTHROPIC_API_KEY", file=sys.stderr)
+         print(" Run 'wafer login' or set ANTHROPIC_API_KEY", file=sys.stderr)
          sys.exit(1)

      assert api_base is not None
@@ -573,17 +573,6 @@ def main( # noqa: PLR0913, PLR0915
      tpl = _get_default_template()
      base_system_prompt = tpl.system_prompt

-     # Compose CLI instructions from --help text for allowed wafer commands
-     # TODO: The eval path doesn't have the skills layer below. If include_skills
-     # is ever enabled for optimize-kernelbench, the eval would need it too for parity.
-     # See test_eval_cli_parity.py for coverage notes.
-     if tpl.bash_allowlist:
-         from wafer.cli_instructions import build_cli_instructions
-
-         cli_instructions = build_cli_instructions(tpl.bash_allowlist)
-         if cli_instructions:
-             base_system_prompt = base_system_prompt + "\n\n" + cli_instructions
-

      # Append skill metadata if skills are enabled
      if tpl.include_skills: