wafer-cli 0.2.19__tar.gz → 0.2.36__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. wafer_cli-0.2.36/PKG-INFO +260 -0
  2. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/pyproject.toml +3 -2
  3. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_cli_coverage.py +14 -0
  4. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_wevin_cli.py +86 -7
  5. wafer_cli-0.2.36/wafer/agent_defaults.py +197 -0
  6. wafer_cli-0.2.36/wafer/baseline.py +661 -0
  7. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/cli.py +607 -27
  8. wafer_cli-0.2.36/wafer/cli_instructions.py +143 -0
  9. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/corpus.py +241 -9
  10. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/evaluate.py +424 -8
  11. wafer_cli-0.2.36/wafer/specs_cli.py +157 -0
  12. wafer_cli-0.2.36/wafer/targets_cli.py +472 -0
  13. wafer_cli-0.2.36/wafer/templates/aiter_optimize.py +59 -0
  14. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/templates/ask_docs.py +1 -1
  15. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/templates/optimize_kernel.py +3 -1
  16. wafer_cli-0.2.36/wafer/templates/optimize_vllm.py +156 -0
  17. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/templates/trace_analyze.py +1 -1
  18. wafer_cli-0.2.36/wafer/tests/test_eval_cli_parity.py +199 -0
  19. wafer_cli-0.2.36/wafer/trace_compare.py +183 -0
  20. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/wevin_cli.py +113 -25
  21. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/workspaces.py +96 -0
  22. wafer_cli-0.2.36/wafer_cli.egg-info/PKG-INFO +260 -0
  23. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer_cli.egg-info/SOURCES.txt +9 -0
  24. wafer_cli-0.2.19/PKG-INFO +0 -16
  25. wafer_cli-0.2.19/wafer_cli.egg-info/PKG-INFO +0 -16
  26. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/README.md +0 -0
  27. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/setup.cfg +0 -0
  28. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_analytics.py +0 -0
  29. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_auth.py +0 -0
  30. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_billing.py +0 -0
  31. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_cli_parity_integration.py +0 -0
  32. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_config_integration.py +0 -0
  33. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_file_operations_integration.py +0 -0
  34. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_kernel_scope_cli.py +0 -0
  35. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_nsys_analyze.py +0 -0
  36. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_nsys_profile.py +0 -0
  37. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_output.py +0 -0
  38. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_rocprof_compute_integration.py +0 -0
  39. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_skill_commands.py +0 -0
  40. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_ssh_integration.py +0 -0
  41. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_targets_ops.py +0 -0
  42. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/tests/test_workflow_integration.py +0 -0
  43. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/GUIDE.md +0 -0
  44. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/__init__.py +0 -0
  45. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/analytics.py +0 -0
  46. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/api_client.py +0 -0
  47. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/auth.py +0 -0
  48. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/autotuner.py +0 -0
  49. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/billing.py +0 -0
  50. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/config.py +0 -0
  51. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/global_config.py +0 -0
  52. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/gpu_run.py +0 -0
  53. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/inference.py +0 -0
  54. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/kernel_scope.py +0 -0
  55. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/ncu_analyze.py +0 -0
  56. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/nsys_analyze.py +0 -0
  57. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/nsys_profile.py +0 -0
  58. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/output.py +0 -0
  59. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/problems.py +0 -0
  60. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/rocprof_compute.py +0 -0
  61. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/rocprof_sdk.py +0 -0
  62. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/rocprof_systems.py +0 -0
  63. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/skills/wafer-guide/SKILL.md +0 -0
  64. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/ssh_keys.py +0 -0
  65. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/target_lock.py +0 -0
  66. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/targets.py +0 -0
  67. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/targets_ops.py +0 -0
  68. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/templates/__init__.py +0 -0
  69. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/templates/optimize_kernelbench.py +0 -0
  70. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer/tracelens.py +0 -0
  71. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer_cli.egg-info/dependency_links.txt +0 -0
  72. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer_cli.egg-info/entry_points.txt +0 -0
  73. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer_cli.egg-info/requires.txt +0 -0
  74. {wafer_cli-0.2.19 → wafer_cli-0.2.36}/wafer_cli.egg-info/top_level.txt +0 -0
@@ -0,0 +1,260 @@
+ Metadata-Version: 2.4
+ Name: wafer-cli
+ Version: 0.2.36
+ Summary: CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ Requires-Dist: typer>=0.12.0
+ Requires-Dist: trio>=0.24.0
+ Requires-Dist: trio-asyncio>=0.15.0
+ Requires-Dist: wafer-core>=0.1.0
+ Requires-Dist: perfetto>=0.16.0
+ Requires-Dist: posthog>=3.0.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
+ Requires-Dist: diff-cover>=8.0.0; extra == "dev"
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
+
+ # Wafer CLI
+
+ Run GPU workloads, optimize kernels, and query GPU documentation.
+
+ ## Getting Started
+
+ ```bash
+ # Install
+ cd apps/wafer-cli && uv sync
+
+ # Use staging (workspaces and other features require staging)
+ wafer config set api.environment staging
+
+ # Login
+ wafer login
+
+ # Run a command on a remote GPU
+ wafer remote-run -- nvidia-smi
+ ```
+
+ ## Commands
+
+ ### `wafer login` / `wafer logout` / `wafer whoami`
+
+ Authenticate with GitHub OAuth.
+
+ ```bash
+ wafer login   # Opens browser for GitHub OAuth
+ wafer whoami  # Show current user
+ wafer logout  # Remove credentials
+ ```
+
+ ### `wafer remote-run`
+
+ Run any command on a remote GPU.
+
+ ```bash
+ wafer remote-run -- nvidia-smi
+ wafer remote-run --upload-dir ./my_code -- python3 train.py
+ ```
+
+ ### `wafer workspaces`
+
+ Create and manage persistent GPU environments.
+
+ **Available GPUs:**
+
+ - `MI300X` - AMD Instinct MI300X (192GB HBM3, ROCm)
+ - `B200` - NVIDIA Blackwell B200 (180GB HBM3e, CUDA) - default
+
+ ```bash
+ wafer workspaces list
+ wafer workspaces create my-workspace --gpu B200 --wait  # NVIDIA B200
+ wafer workspaces create amd-dev --gpu MI300X            # AMD MI300X
+ wafer workspaces ssh <workspace-id>
+ wafer workspaces delete <workspace-id>
+ ```
+
+ ### `wafer agent`
+
+ AI assistant for GPU kernel development. Helps with CUDA/Triton optimization, documentation queries, and performance analysis.
+
+ ```bash
+ wafer agent "What is TMEM in CuTeDSL?"
+ wafer agent -s "optimize this kernel" < kernel.py
+ ```
+
+ ### `wafer evaluate`
+
+ Evaluate kernel correctness and performance against a reference implementation.
+
+ **Functional format** (default):
+ ```bash
+ # Generate template files
+ wafer evaluate make-template ./my-kernel
+
+ # Run evaluation
+ wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json --benchmark
+ ```
+
+ The implementation must define `custom_kernel(inputs)`, the reference must define `ref_kernel(inputs)` and `generate_input(**params)`.
+
+ **KernelBench format** (ModelNew class):
+ ```bash
+ # Extract a KernelBench problem as template
+ wafer evaluate kernelbench make-template level1/1
+
+ # Run evaluation
+ wafer evaluate kernelbench --impl my_kernel.py --reference problem.py --benchmark
+ ```
+
+ The implementation must define `class ModelNew(nn.Module)`, the reference must define `class Model`, `get_inputs()`, and `get_init_inputs()`.
+
+ ### `wafer wevin -t ask-docs`
+
+ Query GPU documentation using the docs template.
+
+ ```bash
+ wafer wevin -t ask-docs --json -s "What causes bank conflicts in shared memory?"
+ ```
+
+ ### `wafer corpus`
+
+ Download documentation to local filesystem for agents to search.
+
+ ```bash
+ wafer corpus list
+ wafer corpus download cuda-programming-guide
+ ```
+
+ ---
+
+ ## Customization
+
+ ### `wafer remote-run` options
+
+ ```bash
+ wafer remote-run --image pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel -- python3 script.py
+ wafer remote-run --require-hwc -- ncu --set full python3 bench.py  # Hardware counters for NCU
+ ```
+
+ ### `wafer evaluate` options
+
+ ```bash
+ wafer evaluate --impl k.py --reference r.py --test-cases t.json \
+     --target vultr-b200 \  # Specific GPU target
+     --benchmark \          # Measure performance
+     --profile              # Enable torch.profiler + NCU
+ ```
+
+ ### `wafer push` for multi-command workflows
+
+ ```bash
+ WORKSPACE=$(wafer push ./project)
+ wafer remote-run --workspace-id $WORKSPACE -- python3 test1.py
+ wafer remote-run --workspace-id $WORKSPACE -- python3 test2.py
+ ```
+
+ ### Profile analysis
+
+ ```bash
+ wafer nvidia ncu analyze profile.ncu-rep
+ wafer nvidia nsys analyze profile.nsys-rep
+ ```
+
+ ---
+
+ ## Advanced
+
+ ### Local targets
+
+ Bypass the API and SSH directly to your own GPUs:
+
+ ```bash
+ wafer targets list
+ wafer targets add ./my-gpu.toml
+ wafer targets default my-gpu
+ ```
+
+ ### Defensive evaluation
+
+ Detect evaluation hacking (stream injection, lazy evaluation, etc.):
+
+ ```bash
+ wafer evaluate --impl k.py --reference r.py --test-cases t.json --benchmark --defensive
+ ```
+
+ ### Other tools
+
+ ```bash
+ wafer perfetto <trace.json> --query "SELECT * FROM slice"  # Perfetto SQL queries
+ wafer capture ./script.py                                  # Capture execution snapshot
+ wafer compiler-analyze kernel.ptx                          # Analyze PTX/SASS
+ ```
+
+ ### ROCm profiling (AMD GPUs)
+
+ ```bash
+ wafer rocprof-sdk ...
+ wafer rocprof-systems ...
+ wafer rocprof-compute ...
+ ```
+
+ ---
+
+ ## Shell Completion
+
+ Enable tab completion for commands, options, and target names:
+
+ ```bash
+ # Install completion (zsh/bash/fish)
+ wafer --install-completion
+
+ # Then restart your terminal, or source your shell config:
+ source ~/.zshrc  # or ~/.bashrc
+ ```
+
+ Now you can tab-complete:
+ - Commands: `wafer eva<TAB>` → `wafer evaluate`
+ - Options: `wafer evaluate --<TAB>`
+ - Target names: `wafer evaluate --target v<TAB>` → `wafer evaluate --target vultr-b200`
+ - File paths: `wafer evaluate --impl ./<TAB>`
+
+ ---
+
+ ## AI Assistant Skills
+
+ Install the Wafer CLI skill to make wafer commands discoverable by your AI coding assistant:
+
+ ```bash
+ # Install for all supported tools (Claude Code, Codex CLI, Cursor)
+ wafer skill install
+
+ # Install for a specific tool
+ wafer skill install -t cursor  # Cursor
+ wafer skill install -t claude  # Claude Code
+ wafer skill install -t codex   # Codex CLI
+
+ # Check installation status
+ wafer skill status
+
+ # Uninstall
+ wafer skill uninstall
+ ```
+
+ ### Installing from GitHub (Cursor)
+
+ You can also install the skill directly from GitHub in Cursor:
+
+ 1. Open Cursor Settings (Cmd+Shift+J / Ctrl+Shift+J)
+ 2. Navigate to **Rules** → **Add Rule** → **Remote Rule (Github)**
+ 3. Enter: `https://github.com/wafer-ai/skills`
+ 4. Cursor will automatically discover skills in `.cursor/skills/`
+
+ The skill provides comprehensive guidance for GPU kernel development, including documentation lookup, trace analysis, kernel evaluation, and optimization workflows.
+
+ ---
+
+ ## Requirements
+
+ - Python 3.10+
+ - GitHub account (for authentication)
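
The functional-format contract in the README above is terse, so here is a minimal sketch of a matching implementation/reference pair. Only the three entry-point names (`custom_kernel`, `ref_kernel`, `generate_input`) come from the README; the torch usage, shapes, and test-case parameters are illustrative assumptions.

```python
# ref.py — reference side of the functional format (a sketch; only the
# entry-point names come from the README, bodies and shapes are assumed)
import torch

def generate_input(**params):
    # Build inputs for one test case; the {"n": ...} parameter and the
    # tests.json schema behind it are assumptions for illustration.
    n = params.get("n", 4096)
    return torch.randn(n, device="cuda")

def ref_kernel(inputs):
    # Ground truth the candidate kernel is compared against.
    return torch.relu(inputs)


# kernel.py — candidate side; must match ref_kernel on the generated inputs
def custom_kernel(inputs):
    # Stand-in for an optimized Triton/CUDA kernel.
    return inputs.clamp_min(0)
```

With those two files plus a `tests.json` of parameter sets, the `wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json --benchmark` invocation shown in the README is fully specified.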
@@ -1,7 +1,8 @@
  [project]
  name = "wafer-cli"
- version = "0.2.19"
- description = "CLI tool for running commands on remote GPUs and GPU kernel optimization agent"
+ version = "0.2.36"
+ description = "CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels"
+ readme = "README.md"
  requires-python = ">=3.11"
  dependencies = [
      "typer>=0.12.0",
@@ -719,3 +719,17 @@ class TestWorkspacesExecFlagPassthrough:
          "workspaces", "exec", "test-ws", "--", "cmd", "--output=/tmp/out"
      ])
      assert "no such option" not in result.output.lower()
+
+
+ class TestAgentNoSandboxOption:
+     """Test --no-sandbox option in wafer agent command."""
+
+     def test_agent_no_sandbox_option_exists(self) -> None:
+         """Test that --no-sandbox option is accepted by wafer agent command."""
+         result = runner.invoke(app, ["agent", "--help"])
+         assert result.exit_code == 0
+         # Strip ANSI escape codes before checking (help output may contain color codes)
+         ansi_escape = re.compile(r'\x1b\[[0-9;]*m')
+         clean_output = ansi_escape.sub('', result.stdout)
+         assert "--no-sandbox" in clean_output
+         assert "liability" in clean_output.lower()  # Warning text should be in help
@@ -634,35 +634,114 @@ def test_streaming_frontend_session_start_state_without_session_id():
  
  def test_streaming_frontend_session_start_resumed_then_new():
      """Test session_start emission when resuming but states have different session_id.
- 
+
      Edge case: --resume used but states return different session_id (should use states one).
      """
      import trio
  
      from wafer.wevin_cli import StreamingChunkFrontend
- 
+
      async def run_test() -> None:
          # Start with resumed session_id
          frontend = StreamingChunkFrontend(
              session_id="resumed-session-123",
              model="claude-sonnet-4.5"
          )
- 
+
          emitted_events = []
  
          def mock_emit(obj) -> None:
              emitted_events.append(obj)
- 
+
          frontend._emit = mock_emit
- 
+
          # start() emits session_start for resumed session
          await frontend.start()
          assert len(emitted_events) == 1
          assert emitted_events[0]["session_id"] == "resumed-session-123"
- 
+
          # If states have different session_id (shouldn't happen, but handle gracefully)
          # The logic in main() checks `if first_session_id and not session_id`
          # So if session_id was set, it won't emit again
          # This is correct behavior - use the one from --resume
- 
+
      trio.run(run_test)
+
+
+ # =============================================================================
+ # --no-sandbox flag tests
+ # =============================================================================
+
+
+ def test_no_sandbox_parameter_accepted():
+     """Test that no_sandbox parameter exists in wevin_main signature."""
+     import inspect
+
+     from wafer.wevin_cli import main as wevin_main
+
+     sig = inspect.signature(wevin_main)
+     params = sig.parameters
+
+     # Verify parameter exists
+     assert 'no_sandbox' in params
+
+     # Verify type and default
+     assert str(params['no_sandbox'].annotation) in ('bool', "<class 'bool'>")
+     assert params['no_sandbox'].default is False
+
+
+ def test_build_environment_accepts_no_sandbox():
+     """Test that _build_environment accepts no_sandbox parameter."""
+     import inspect
+
+     from wafer.wevin_cli import _build_environment
+
+     sig = inspect.signature(_build_environment)
+     params = sig.parameters
+
+     assert 'no_sandbox' in params
+     assert params['no_sandbox'].default is False
+
+
+ def test_build_environment_with_no_sandbox_false():
+     """Test _build_environment creates env with sandbox ENABLED when no_sandbox=False."""
+     from wafer_core.rollouts.templates import TemplateConfig
+     from wafer_core.sandbox import SandboxMode
+
+     from wafer.wevin_cli import _build_environment
+
+     tpl = TemplateConfig(
+         name="test",
+         description="Test template",
+         system_prompt="Test",
+         tools=["read"],
+     )
+
+     # This will raise RuntimeError if sandbox is unavailable on this system
+     # That's expected - we're testing that sandbox is ENABLED by default
+     try:
+         env = _build_environment(tpl, None, None, no_sandbox=False)
+         # If we get here, sandbox is available - verify it's enabled
+         assert env.sandbox_mode == SandboxMode.ENABLED
+     except RuntimeError as e:
+         # Sandbox unavailable - that's OK, the error proves ENABLED is set
+         assert "sandboxing is not available" in str(e)
+
+
+ def test_build_environment_with_no_sandbox_true():
+     """Test _build_environment creates env with sandbox DISABLED when no_sandbox=True."""
+     from wafer_core.rollouts.templates import TemplateConfig
+     from wafer_core.sandbox import SandboxMode
+
+     from wafer.wevin_cli import _build_environment
+
+     tpl = TemplateConfig(
+         name="test",
+         description="Test template",
+         system_prompt="Test",
+         tools=["read"],
+     )
+
+     # This should NOT raise - sandbox is disabled
+     env = _build_environment(tpl, None, None, no_sandbox=True)
+     assert env.sandbox_mode == SandboxMode.DISABLED
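
These new tests pin down the sandbox contract without showing the implementation: `no_sandbox` defaults to `False`, `no_sandbox=True` yields `SandboxMode.DISABLED`, and the default path either enables the sandbox or raises. A minimal sketch of logic that would satisfy them (the helper name is hypothetical; the real code lives in `wafer/wevin_cli.py` and may differ):

```python
from wafer_core.sandbox import SandboxMode  # import path taken from the tests above

def _resolve_sandbox_mode(no_sandbox: bool = False) -> SandboxMode:
    # Hypothetical helper: explicit opt-out disables the sandbox; otherwise it
    # is required, and _build_environment raises RuntimeError("... sandboxing
    # is not available ...") on hosts that cannot enable it.
    return SandboxMode.DISABLED if no_sandbox else SandboxMode.ENABLED
```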
@@ -0,0 +1,197 @@
+ """Shared agent defaults for kernel optimization tasks.
+
+ Single source of truth for bash allowlists and enabled tools used by both:
+ - CLI templates (apps/wafer-cli/wafer/templates/*.py)
+ - Eval configs (research/evals/*_eval/*.py)
+
+ Import from here instead of defining your own copy.
+ """
+
+ from __future__ import annotations
+
+ # Tools available to the agent (coding environment tools)
+ ENABLED_TOOLS: list[str] = ["read", "write", "edit", "glob", "grep", "bash"]
+
+ # vLLM-specific tools (same as ENABLED_TOOLS for now)
+ VLLM_ENABLED_TOOLS: list[str] = ["read", "write", "edit", "glob", "grep", "bash"]
+
+ # Bash commands allowed for kernel optimization agents.
+ # Uses prefix matching — "wafer evaluate" also allows "wafer evaluate kernelbench".
+ KERNELBENCH_BASH_ALLOWLIST: list[str] = [
+     # Kernel evaluation
+     "wafer evaluate",
+     # Profiling — AMD
+     "wafer amd rocprof-compute",
+     "wafer amd rocprof-sdk",
+     "wafer amd rocprof-systems",
+     # Profiling — NVIDIA
+     "wafer nvidia ncu",
+     "wafer nvidia nsys",
+     # Analysis
+     "wafer compiler-analyze",
+     # Sub-agents
+     "wafer agent -t ask-docs",
+     # General utilities
+     "python",
+     "python3",
+     "timeout",
+     "ls",
+     "cat",
+     "head",
+     "tail",
+     "wc",
+     "pwd",
+     "which",
+ ]
+
+ # Tools available to aiter optimization agents (full coding environment)
+ AITER_ENABLED_TOOLS: list[str] = ["read", "write", "edit", "glob", "grep", "bash"]
+
+ # System prompt for aiter optimization (shared between eval and template)
+ # Uses {op_name}, {test_file}, {target_flag} placeholders
+ AITER_SYSTEM_PROMPT = """\
+ You are a GPU kernel optimization expert specializing in AMD MI300X and the aiter library.
+
+ ## Context
+
+ aiter (ROCm/aiter) is AMD's centralized repository for high-performance AI operators.
+ Operators are implemented using Triton kernels, Composable Kernel (CK), or HIP/ROCm.
+
+ Each operator has a test in `op_tests/test_{{op}}.py` that validates correctness and
+ measures performance against a reference implementation.
+
+ ## Your Task
+
+ 1. **Understand the operator**: Read the test file and trace imports to find implementation
+ 2. **Establish baseline**: Run the evaluation to measure current performance
+    ```bash
+    # Quick check with one shape (fast iteration)
+    wafer evaluate aiter --aiter-dir . --cmd "python op_tests/test_{{op}}.py --mnk 128,32,8192" {target_flag}
+
+    # Full test suite (final validation)
+    wafer evaluate aiter --aiter-dir . --cmd "python op_tests/test_{{op}}.py" {target_flag}
+    ```
+ 3. **Identify optimizations**: Look for memory access patterns, occupancy, instruction selection
+ 4. **Implement changes**: Modify the operator to improve performance
+ 5. **Validate**: Re-run evaluation to verify correctness and measure speedup
+ 6. **Iterate**: Use quick checks during development, full suite for final validation
+
+ ## Finding Source Files
+
+ The aiter codebase structure varies by operator. To find implementation files:
+
+ 1. **Start with the test file**: `op_tests/test_{{op}}.py`
+    - Read imports to see what modules are used
+    - Look for the main function being tested
+
+ 2. **Check common locations** (not all ops have all of these):
+    - `aiter/ops/{{op}}.py` — High-level Python API (some ops)
+    - `aiter/triton_kernels/` — Triton kernel implementations
+    - `csrc/kernels/` — CUDA/HIP kernel implementations
+    - `csrc/py_itfs_cu/` — Python interface CUDA files
+    - `csrc/cktile_*/` — Composable Kernel tile implementations
+
+ 3. **Search for the op name**:
+    ```bash
+    find . -name "*{{op}}*" -type f | grep -v __pycache__
+    grep -r "def {{function_name}}" aiter/ csrc/ --include="*.py" --include="*.cu"
+    ```
+
+ ## Key Directories
+
+ - `aiter/` — Main package with operator implementations
+ - `aiter/ops/` — High-level operator APIs (some ops)
+ - `aiter/triton_kernels/` — Triton kernel implementations
+ - `csrc/` — C++/CUDA/HIP implementations
+ - `op_tests/` — Tests for each operator
+ - `aiter/configs/` — Tuned configurations (CSV files)
+
+ ## Output
+
+ Your goal is to produce:
+ 1. Modified operator code with optimizations
+ 2. Benchmark results showing correctness and speedup
+ 3. A summary of what you changed and why
+
+ The optimization should be correct (pass the op_test) and faster than baseline."""
+
+ # Bash commands allowed for aiter optimization agents.
+ AITER_BASH_ALLOWLIST: list[str] = [
+     # Read-only
+     "ls",
+     "cat",
+     "head",
+     "tail",
+     "wc",
+     "find",
+     "grep",
+     "rg",
+     "pwd",
+     "tree",
+     "which",
+     "diff",
+     "sort",
+     # Filesystem
+     "mkdir",
+     "cp",
+     "mv",
+     # Git
+     "git diff",
+     "git status",
+     "git log",
+     # Compilation
+     "hipcc",
+     "g++",
+     "gcc",
+     "clang",
+     "python",
+     "python3",
+     "pip",
+     "pytest",
+     # Execution — allows running compiled binaries and python scripts
+     "./",
+     # Kernel evaluation
+     "wafer evaluate aiter",
+     # Profiling — AMD
+     "wafer amd rocprof-compute",
+     "wafer amd rocprof-sdk",
+     "wafer amd rocprof-systems",
+     "wafer amd isa",
+     # Sub-agents
+     "wafer agent -t ask-docs",
+     # Misc
+     "timeout",
+ ]
+
+ # Bash commands allowed for vLLM kernel optimization agents.
+ VLLM_BASH_ALLOWLIST: list[str] = [
+     # vLLM evaluation
+     "wafer evaluate vllm",
+     # vLLM's own test and benchmark commands (run inside vllm dir)
+     "pytest",
+     # Profiling — AMD
+     "wafer amd rocprof-compute",
+     "wafer amd rocprof-sdk",
+     "wafer amd rocprof-systems",
+     # Profiling — NVIDIA
+     "wafer nvidia ncu",
+     "wafer nvidia nsys",
+     # Analysis
+     "wafer compiler-analyze",
+     # Sub-agents
+     "wafer agent -t ask-docs",
+     # General utilities
+     "python",
+     "python3",
+     "pip",
+     "timeout",
+     "ls",
+     "cat",
+     "head",
+     "tail",
+     "wc",
+     "pwd",
+     "which",
+     "cd",
+     "git",
+ ]
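
The allowlists in this new module rely on prefix matching, per the module comment ("wafer evaluate" also allows "wafer evaluate kernelbench"). A minimal sketch of such a check (the `is_allowed` helper is hypothetical; the actual enforcement lives in the agent's bash tool, outside this file):

```python
from wafer.agent_defaults import KERNELBENCH_BASH_ALLOWLIST

def is_allowed(command: str, allowlist: list[str]) -> bool:
    # Prefix match: "wafer evaluate" admits "wafer evaluate kernelbench ..." too.
    return any(command.startswith(entry) for entry in allowlist)

assert is_allowed("wafer evaluate kernelbench --impl k.py", KERNELBENCH_BASH_ALLOWLIST)
assert not is_allowed("rm -rf /", KERNELBENCH_BASH_ALLOWLIST)
```

Plain `startswith` also explains the bare `"./"` entry in `AITER_BASH_ALLOWLIST`: it admits any command that runs a local binary.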