wafer-cli 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,61 @@
+ """Template for querying GPU documentation.
+
+ Usage:
+     wafer wevin -t ask-docs "How do bank conflicts occur?"
+     wafer wevin -t ask-docs --args corpus=./cuda-docs/ "Explain warp divergence"
+ """
+
+ try:
+     from wafer_core.rollouts.templates import TemplateConfig
+ except ImportError:
+     from rollouts.templates import TemplateConfig
+
+ # NOTE: Agent tends to prefer bash (find, ls) over glob/grep tools despite system prompt
+ # guidance. Expanded allowlist so this works. TODO: improve error display when blocked
+ # commands are attempted (currently shows ❌ but error message not visible in TUI).
+ template = TemplateConfig(
+     # Identity
+     name="ask-docs",
+     description="Query GPU documentation to answer technical questions",
+     # System prompt
+     system_prompt="""You are a GPU programming expert helping answer questions about CUDA, GPU architecture, and kernel optimization.
+
+ Your task: Answer the user's question using the available documentation and tools.
+
+ You have these tools available:
+ - **glob**: Find files by pattern (e.g., glob pattern="**/*.md")
+ - **grep**: Search file contents (e.g., grep pattern="shared memory" path=".")
+ - **read**: Read file contents (e.g., read file_path="./guide.md")
+ - **bash**: Run shell commands (ls, find, cat, head, tail, wc, jq, python -c)
+
+ Strategy:
+ 1. Use the glob tool to find relevant documentation files (e.g., glob pattern="**/*.md")
+ 2. Use the grep tool to search for relevant content (e.g., grep pattern="your topic")
+ 3. Use the read tool to examine promising files
+ 4. Synthesize a clear, accurate answer
+
+ Prefer glob/grep/read tools over bash equivalents when possible, but bash is available for common commands.
+
+ Output your answer directly. Be concise but thorough. Include code examples when relevant.
+ """,
+     # Tools
+     tools=["read", "glob", "grep", "bash"],
+     bash_allowlist=[
+         "ls",
+         "find",
+         "cat",
+         "head",
+         "tail",
+         "wc",
+         "jq",
+         "python -c",
+     ],
+     # Model config
+     model="anthropic/claude-sonnet-4-5-20250929",
+     max_tokens=8192,
+     # Thinking config - disabled for simple doc queries
+     thinking=False,
+     thinking_budget=10000,
+     # Execution mode - multi-turn for follow-up questions
+     single_turn=False,
+ )
@@ -0,0 +1,71 @@
+ """Template for optimizing GPU kernels.
+
+ Usage:
+     wafer wevin -t optimize-kernel --args kernel=./matmul.cu "Optimize for H100"
+     wafer wevin -t optimize-kernel --args kernel=./attention.cu --args target=A100 "Reduce memory bandwidth"
+ """
+
+ try:
+     from wafer_core.rollouts.templates import TemplateConfig
+ except ImportError:
+     from rollouts.templates import TemplateConfig
+
+ template = TemplateConfig(
+     # Identity
+     name="optimize-kernel",
+     description="Optimize GPU kernel implementations for performance",
+     # System prompt
+     system_prompt="""You are a GPU kernel optimization expert. Your task is to optimize kernel code for maximum performance.
+
+ Kernel file(s): $kernel
+ Target GPU: $target
+
+ Strategy:
+ 1. Read and understand the current implementation
+ 2. Run `wafer evaluate` to get baseline performance metrics
+ 3. Identify optimization opportunities:
+    - Memory access patterns (coalescing, bank conflicts)
+    - Occupancy and register usage
+    - Warp divergence
+    - Instruction-level parallelism
+ 4. Implement optimizations using edit tool
+ 5. Re-run `wafer evaluate` to verify improvements
+ 6. Iterate until target performance is achieved
+
+ Commands:
+ - `wafer evaluate --impl <file> --reference <ref> --test-cases <tests>` - Run evaluation
+ - `wafer evaluate --impl <file> --reference <ref> --test-cases <tests> --profile` - With NCU profiling
+ - `wafer remote-run "<command>"` - Run arbitrary commands on remote GPU
+
+ Output:
+ - Summary of optimizations applied
+ - Before/after performance comparison
+ - Explanation of key changes
+
+ IMPORTANT: Always verify correctness with wafer evaluate before claiming success.
+ """,
+     # Tools
+     tools=["read", "write", "edit", "glob", "grep", "bash"],
+     bash_allowlist=[
+         "wafer evaluate",
+         "wafer remote-run",
+         "wafer nvidia ncu",
+         "wafer nvidia nsys",
+         "wafer nvidia perfetto",
+         "jq",
+         "python -c",
+     ],
+     # Model config - use thinking for complex optimization reasoning
+     model="anthropic/claude-sonnet-4-5-20250929",
+     max_tokens=16384,
+     # Thinking config - enabled for complex kernel optimization
+     thinking=True,
+     thinking_budget=10000,
+     # Execution mode - multi-turn for iterative optimization
+     single_turn=False,
+     # Template variables
+     defaults={
+         "kernel": "./kernel.cu",
+         "target": "H100",
+     },
+ )
@@ -0,0 +1,137 @@
+ """Template for KernelBench optimization - matches eval system prompt.
+
+ Usage:
+     # Run on a specific problem
+     wafer agent -t optimize-kernelbench \
+         --args reference=/path/to/problem.py \
+         --args pool=kernelbench-pool \
+         --args backend=hip \
+         --json \
+         "Optimize the Softmax kernel"
+
+     # Watch in real-time with JSON streaming
+     wafer agent -t optimize-kernelbench \
+         --args reference=./23_Softmax.py \
+         --json
+
+ Variables:
+     - reference: Path to the KernelBench problem file (required)
+     - pool: Target pool name (default: kernelbench-pool)
+     - target: Single target name (alternative to pool)
+     - backend: Backend type - hip or cuda (default: hip)
+ """
+
+ try:
+     from wafer_core.rollouts.templates import TemplateConfig
+ except ImportError:
+     from rollouts.templates import TemplateConfig
+
+ # System prompt matches optimize_kernelbench_eval/base_config.py SYSTEM_PROMPT
+ SYSTEM_PROMPT = """\
+ You are a GPU kernel optimization expert. Your task is to write optimized GPU kernels that are correct and faster than the PyTorch baseline.
+
+ IMPORTANT: You do NOT have a local GPU. You MUST use `wafer evaluate kernelbench` to test kernels on remote GPU hardware.
+
+ ## Kernel Format (KernelBench)
+
+ The reference file contains a PyTorch `Model` class. You must write a `ModelNew` class that:
+ 1. Has the same `__init__` signature as `Model`
+ 2. Has a `forward()` method with the same input/output signature
+ 3. Uses custom $backend_upper kernels for the computation (NOT PyTorch ops like F.scaled_dot_product_attention or torch.matmul)
+
+ The reference file also provides:
+ - `get_inputs()` - generates test inputs for forward()
+ - `get_init_inputs()` - generates constructor arguments
+
+ ## Available Tools
+
+ - read(file_path): Read source files
+ - write(file_path, content): Write your optimized kernel
+ - glob(pattern): Find files by pattern
+ - grep(pattern): Search code
+ - bash(command): Run shell commands including wafer CLI
+
+ ## Workflow
+
+ 1. Read the reference problem file to understand what `Model` does
+ 2. Analyze the computation and identify optimization opportunities
+ 3. Write an optimized `ModelNew` class with custom $backend_upper kernels using `__global__` kernel definitions and `torch.utils.cpp_extension.load_inline`
+ 4. Test with: `wafer evaluate kernelbench $target_flag --backend $backend --impl <your_file.py> --reference <problem.py> --benchmark`
+ 5. Iterate based on feedback until correct and fast
+
+ ## Example Command
+
+ ```bash
+ wafer evaluate kernelbench \\
+   $target_flag \\
+   --backend $backend \\
+   --impl optimized_kernel.py \\
+   --reference $reference \\
+   --benchmark
+ ```
+
+ ## Profiling Tools (USE THESE!)
+
+ When your kernel is slower than expected, use profiling to understand WHY:
+
+ - `wafer rocprof profile --impl <file> --reference <ref>` - AMD GPU profiling
+ - `wafer nvidia ncu --impl <file> --reference <ref>` - NVIDIA NCU profiling
+
+ ## CRITICAL: Reactive Debugging
+
+ After EVERY `wafer evaluate` call:
+ 1. Check the speedup result
+ 2. If speedup < 1.0x (slowdown), STOP and analyze:
+    - Run profiling to identify the bottleneck
+    - Ask: "Why is this slow?" before trying another approach
+ 3. Don't just try random optimizations - understand the root cause
+
+ Your kernel MUST:
+ - Pass correctness tests (outputs match reference within tolerance)
+ - Achieve speedup > 1.0x over PyTorch baseline
+ - Use actual $backend_upper kernels (with `__global__` definitions), NOT PyTorch ops
+
+ You MUST run `wafer evaluate kernelbench` to verify your kernel. Your score depends on actual measured results."""
+
+ template = TemplateConfig(
+     # Identity
+     name="optimize-kernelbench",
+     description="Optimize KernelBench problems (matches eval system prompt)",
+     # System prompt
+     system_prompt=SYSTEM_PROMPT,
+     # Tools
+     tools=["read", "write", "edit", "glob", "grep", "bash"],
+     bash_allowlist=[
+         "wafer evaluate",
+         "wafer nvidia ncu",
+         "wafer nvidia nsys",
+         "wafer rocprof",
+         "wafer compiler-analyze",
+         "python",
+         "python3",
+         "timeout",
+         "ls",
+         "cat",
+         "head",
+         "tail",
+         "wc",
+         "pwd",
+         "which",
+     ],
+     # Model config - match eval settings
+     model="anthropic/claude-opus-4-5-20251101",
+     max_tokens=8192,
+     # No thinking by default (match eval), can override with --thinking
+     thinking=False,
+     # Multi-turn for iterative optimization
+     single_turn=False,
+     # Template variables
+     defaults={
+         "reference": "./problem.py",
+         "pool": "kernelbench-pool",
+         "target": "",  # If set, overrides pool
+         "backend": "hip",
+         "backend_upper": "HIP",  # Auto-computed from backend
+         "target_flag": "--pool kernelbench-pool",  # Auto-computed
+     },
+ )
@@ -0,0 +1,74 @@
+ """Template for analyzing GPU performance traces.
+
+ Usage:
+     wafer wevin -t trace-analyze --args trace=./profile.ncu-rep "What's the bottleneck?"
+     wafer wevin -t trace-analyze --args trace=./trace.nsys-rep "Why is kernel X slow?"
+     wafer wevin -t trace-analyze --args trace=./trace.json "Analyze this PyTorch trace"
+ """
+
+ try:
+     from wafer_core.rollouts.templates import TemplateConfig
+ except ImportError:
+     from rollouts.templates import TemplateConfig
+
+ template = TemplateConfig(
+     # Identity
+     name="trace-analyze",
+     description="Analyze GPU performance traces (NCU, NSYS, Perfetto, PyTorch)",
+     # System prompt
+     system_prompt="""You are a GPU performance analysis expert. Your task is to analyze performance traces and identify optimization opportunities.
+
+ Trace file: $trace
+
+ Strategy:
+ 1. Identify the trace type by extension:
+    - `.ncu-rep` → NVIDIA Nsight Compute profile
+    - `.nsys-rep` → NVIDIA Nsight Systems trace
+    - `.json` or `.pt.trace.json` → PyTorch profiler trace (Chrome trace format)
+    - `.perfetto` or `.pftrace` → Perfetto trace
+
+ 2. Use the appropriate wafer analyze command:
+    - `wafer nvidia ncu analyze <file>` for NCU profiles
+    - `wafer nvidia nsys analyze <file>` for NSYS traces
+    - `wafer nvidia perfetto query <file> "<SQL>"` for Perfetto OR PyTorch JSON traces
+    - `wafer nvidia perfetto tables <file>` to list available tables
+
+ 3. For PyTorch/Perfetto traces, useful SQL queries:
+    - `SELECT DISTINCT cat FROM slice` - list event categories
+    - `SELECT name, dur/1000000.0 as dur_ms FROM slice WHERE cat = 'kernel' ORDER BY dur DESC LIMIT 20` - slowest GPU kernels
+    - `SELECT name, SUM(dur)/1000000.0 as total_ms, COUNT(*) as count FROM slice WHERE cat = 'kernel' GROUP BY name ORDER BY total_ms DESC` - kernel time breakdown
+    - `SELECT name, dur/1000000.0 as dur_ms FROM slice WHERE cat = 'cpu_op' ORDER BY dur DESC LIMIT 20` - slowest CPU ops
+
+ 4. Identify bottlenecks and provide actionable recommendations
+
+ Output format:
+ - Summary of key findings
+ - Performance bottlenecks identified (ranked by impact)
+ - Specific optimization recommendations with expected improvements
+ - Code changes if applicable
+
+ Use `--json` flags when available for structured output that's easier to parse.
+ """,
+     # Tools
+     tools=["read", "glob", "grep", "bash"],
+     bash_allowlist=[
+         "wafer nvidia ncu",
+         "wafer nvidia nsys",
+         "wafer nvidia perfetto",
+         "wafer nvidia tracelens",
+         "jq",
+         "python -c",
+     ],
+     # Model config
+     model="anthropic/claude-sonnet-4-5-20250929",
+     max_tokens=8192,
+     # Thinking config - disabled for trace analysis (mostly parsing)
+     thinking=False,
+     thinking_budget=10000,
+     # Execution mode - single turn for one-shot analysis
+     single_turn=True,
+     # Template variables
+     defaults={
+         "trace": "./profile.ncu-rep",
+     },
+ )
wafer/tracelens.py ADDED
@@ -0,0 +1,218 @@
+ """TraceLens CLI wrapper.
+
+ Provides human-readable CLI interface for TraceLens operations.
+ This follows the same pattern as rocprof_sdk.py and other CLI wrappers.
+ """
+
+ import json
+ import sys
+ from dataclasses import asdict
+
+
+ def print_usage() -> None:
+     """Print CLI usage information."""
+     print("Usage: wafer tracelens <subcommand> [options]", file=sys.stderr)
+     print("", file=sys.stderr)
+     print("Subcommands:", file=sys.stderr)
+     print("  check              Check TraceLens installation status", file=sys.stderr)
+     print("  report TRACE       Generate performance report from trace file", file=sys.stderr)
+     print("  compare A B        Compare two performance reports", file=sys.stderr)
+     print("  collective DIR     Generate multi-rank collective report", file=sys.stderr)
+     print("", file=sys.stderr)
+     print("Report Options:", file=sys.stderr)
+     print("  --output PATH      Output file path", file=sys.stderr)
+     print("  --format FORMAT    Trace format: auto, pytorch, rocprof, jax", file=sys.stderr)
+     print("  --short-kernel     Include short kernel analysis", file=sys.stderr)
+     print("  --kernel-details   Include detailed kernel breakdown", file=sys.stderr)
+     print("  --json             Output result as JSON", file=sys.stderr)
+     print("", file=sys.stderr)
+     print("Examples:", file=sys.stderr)
+     print("  wafer tracelens check", file=sys.stderr)
+     print("  wafer tracelens report trace.json", file=sys.stderr)
+     print("  wafer tracelens report trace.json --format pytorch --kernel-details", file=sys.stderr)
+     print("  wafer tracelens compare baseline.xlsx candidate.xlsx", file=sys.stderr)
+     print("  wafer tracelens collective ./traces --world-size 8", file=sys.stderr)
+
+
+ def check_command(json_output: bool = False) -> str:
+     """CLI wrapper for checking TraceLens installation.
+
+     Args:
+         json_output: If True, return JSON; otherwise print human-readable
+
+     Returns:
+         Status message or JSON string
+     """
+     from wafer_core.lib.tracelens import check_installation
+
+     result = check_installation()
+
+     if json_output:
+         return json.dumps(asdict(result), indent=2)
+     else:
+         if result.installed:
+             print("✓ TraceLens is installed", file=sys.stderr)
+             if result.version:
+                 print(f"  Version: {result.version}", file=sys.stderr)
+             if result.commands_available:
+                 print("  Available commands:", file=sys.stderr)
+                 for cmd in result.commands_available:
+                     print(f"    - {cmd}", file=sys.stderr)
+             return "TraceLens is installed"
+         else:
+             print("✗ TraceLens is not installed", file=sys.stderr)
+             if result.install_command:
+                 print(f"  Install: {result.install_command}", file=sys.stderr)
+             return "TraceLens is not installed"
+
+
+ def report_command(
+     trace_path: str,
+     output_path: str | None = None,
+     trace_format: str = "auto",
+     short_kernel: bool = False,
+     kernel_details: bool = False,
+     json_output: bool = False,
+ ) -> str:
+     """CLI wrapper for generating performance report.
+
+     Args:
+         trace_path: Path to trace file
+         output_path: Optional output path for Excel report
+         trace_format: Trace format (auto, pytorch, rocprof, jax)
+         short_kernel: Include short kernel analysis
+         kernel_details: Include detailed kernel breakdown
+         json_output: If True, return JSON; otherwise print human-readable
+
+     Returns:
+         Success message or JSON string
+
+     Raises:
+         RuntimeError: If report generation fails
+     """
+     from wafer_core.lib.tracelens import generate_perf_report
+     from wafer_core.lib.tracelens.types import TraceFormat
+
+     format_map = {
+         "auto": TraceFormat.AUTO,
+         "pytorch": TraceFormat.PYTORCH,
+         "rocprof": TraceFormat.ROCPROF,
+         "jax": TraceFormat.JAX,
+     }
+
+     result = generate_perf_report(
+         trace_path=trace_path,
+         output_path=output_path,
+         trace_format=format_map.get(trace_format, TraceFormat.AUTO),
+         short_kernel_study=short_kernel,
+         kernel_details=kernel_details,
+     )
+
+     if json_output:
+         return json.dumps(asdict(result), indent=2)
+     else:
+         if result.success:
+             print("✓ Report generated successfully", file=sys.stderr)
+             print(f"  Output: {result.output_path}", file=sys.stderr)
+             print(f"  Format: {result.trace_format}", file=sys.stderr)
+             return "Report generated"
+         else:
+             print("✗ Report generation failed", file=sys.stderr)
+             if result.error:
+                 print(f"  Error: {result.error}", file=sys.stderr)
+             if result.stderr:
+                 print("  stderr:", file=sys.stderr)
+                 print(result.stderr, file=sys.stderr)
+             raise RuntimeError(result.error or "Report generation failed")
+
+
+ def compare_command(
+     baseline_path: str,
+     candidate_path: str,
+     output_path: str | None = None,
+     baseline_name: str = "baseline",
+     candidate_name: str = "candidate",
+     json_output: bool = False,
+ ) -> str:
+     """CLI wrapper for comparing two performance reports.
+
+     Args:
+         baseline_path: Path to baseline Excel report
+         candidate_path: Path to candidate Excel report
+         output_path: Optional output path for comparison file
+         baseline_name: Display name for baseline
+         candidate_name: Display name for candidate
+         json_output: If True, return JSON; otherwise print human-readable
+
+     Returns:
+         Success message or JSON string
+
+     Raises:
+         RuntimeError: If comparison fails
+     """
+     from wafer_core.lib.tracelens import compare_reports
+
+     result = compare_reports(
+         baseline_path=baseline_path,
+         candidate_path=candidate_path,
+         output_path=output_path,
+         baseline_name=baseline_name,
+         candidate_name=candidate_name,
+     )
+
+     if json_output:
+         return json.dumps(asdict(result), indent=2)
+     else:
+         if result.success:
+             print("✓ Comparison complete", file=sys.stderr)
+             print(f"  Output: {result.output_path}", file=sys.stderr)
+             return "Comparison complete"
+         else:
+             print("✗ Comparison failed", file=sys.stderr)
+             if result.error:
+                 print(f"  Error: {result.error}", file=sys.stderr)
+             raise RuntimeError(result.error or "Comparison failed")
+
+
+ def collective_command(
+     trace_dir: str,
+     world_size: int,
+     output_path: str | None = None,
+     json_output: bool = False,
+ ) -> str:
+     """CLI wrapper for generating multi-rank collective report.
+
+     Args:
+         trace_dir: Directory containing trace files for all ranks
+         world_size: Number of ranks (GPUs)
+         output_path: Optional output path for report
+         json_output: If True, return JSON; otherwise print human-readable
+
+     Returns:
+         Success message or JSON string
+
+     Raises:
+         RuntimeError: If report generation fails
+     """
+     from wafer_core.lib.tracelens import generate_collective_report
+
+     result = generate_collective_report(
+         trace_dir=trace_dir,
+         world_size=world_size,
+         output_path=output_path,
+     )
+
+     if json_output:
+         return json.dumps(asdict(result), indent=2)
+     else:
+         if result.success:
+             print("✓ Collective report generated", file=sys.stderr)
+             print(f"  World size: {result.world_size}", file=sys.stderr)
+             if result.output_path:
+                 print(f"  Output: {result.output_path}", file=sys.stderr)
+             return "Collective report generated"
+         else:
+             print("✗ Collective report failed", file=sys.stderr)
+             if result.error:
+                 print(f"  Error: {result.error}", file=sys.stderr)
+             raise RuntimeError(result.error or "Collective report failed")
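
The module above exposes per-subcommand wrappers but no argument parsing; the wiring into `wafer tracelens` lives elsewhere in the CLI. Below is a minimal dispatcher sketch for two of the subcommands, using argparse flags that mirror `print_usage()`; it is illustrative only, not the actual wafer entry point, and the `wafer.tracelens` module path is assumed.

```python
import argparse
import sys

from wafer.tracelens import check_command, report_command  # assumed module path


def main(argv: list[str] | None = None) -> int:
    parser = argparse.ArgumentParser(prog="wafer tracelens")
    sub = parser.add_subparsers(dest="subcommand", required=True)

    check = sub.add_parser("check")
    check.add_argument("--json", action="store_true")

    report = sub.add_parser("report")
    report.add_argument("trace")
    report.add_argument("--output")
    report.add_argument("--format", default="auto")
    report.add_argument("--short-kernel", action="store_true")
    report.add_argument("--kernel-details", action="store_true")
    report.add_argument("--json", action="store_true")

    args = parser.parse_args(argv)
    if args.subcommand == "check":
        out = check_command(json_output=args.json)
    else:
        out = report_command(
            trace_path=args.trace,
            output_path=args.output,
            trace_format=args.format,
            short_kernel=args.short_kernel,
            kernel_details=args.kernel_details,
            json_output=args.json,
        )
    if args.json:
        print(out)  # wrappers return a JSON string when --json is set
    return 0


if __name__ == "__main__":
    sys.exit(main())
```
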