wafer-cli 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/GUIDE.md +18 -7
- wafer/api_client.py +4 -0
- wafer/auth.py +85 -0
- wafer/cli.py +2339 -404
- wafer/corpus.py +158 -32
- wafer/evaluate.py +1232 -201
- wafer/gpu_run.py +5 -1
- wafer/kernel_scope.py +554 -0
- wafer/nsys_analyze.py +903 -73
- wafer/nsys_profile.py +511 -0
- wafer/output.py +241 -0
- wafer/problems.py +357 -0
- wafer/skills/wafer-guide/SKILL.md +13 -0
- wafer/ssh_keys.py +261 -0
- wafer/target_lock.py +270 -0
- wafer/targets.py +490 -0
- wafer/targets_ops.py +718 -0
- wafer/wevin_cli.py +129 -18
- wafer/workspaces.py +282 -182
- {wafer_cli-0.2.8.dist-info → wafer_cli-0.2.10.dist-info}/METADATA +1 -1
- wafer_cli-0.2.10.dist-info/RECORD +40 -0
- wafer_cli-0.2.8.dist-info/RECORD +0 -33
- {wafer_cli-0.2.8.dist-info → wafer_cli-0.2.10.dist-info}/WHEEL +0 -0
- {wafer_cli-0.2.8.dist-info → wafer_cli-0.2.10.dist-info}/entry_points.txt +0 -0
- {wafer_cli-0.2.8.dist-info → wafer_cli-0.2.10.dist-info}/top_level.txt +0 -0
wafer/gpu_run.py
CHANGED
|
@@ -19,7 +19,10 @@ CONTAINER_WORKSPACE = "/workspace"
|
|
|
19
19
|
class PushResult:
|
|
20
20
|
"""Result of pushing a directory to remote target."""
|
|
21
21
|
|
|
22
|
-
|
|
22
|
+
workspace_name: str # Just the workspace name (e.g., "project")
|
|
23
|
+
workspace_path: (
|
|
24
|
+
str # Full absolute path on remote (e.g., "/home/user/.wafer/workspaces/project")
|
|
25
|
+
)
|
|
23
26
|
files_uploaded: list[str] # Relative paths of uploaded files
|
|
24
27
|
|
|
25
28
|
|
|
@@ -71,6 +74,7 @@ def push_directory(
|
|
|
71
74
|
files_uploaded.append(str(file.relative_to(local_path)))
|
|
72
75
|
|
|
73
76
|
return PushResult(
|
|
77
|
+
workspace_name=workspace_name,
|
|
74
78
|
workspace_path=expanded_workspace,
|
|
75
79
|
files_uploaded=files_uploaded,
|
|
76
80
|
)
|
wafer/kernel_scope.py
ADDED
|
@@ -0,0 +1,554 @@
|
|
|
1
|
+
"""Unified ISA Analyzer - CLI for static ISA analysis of AMD GPU kernels.
|
|
2
|
+
|
|
3
|
+
This module provides the CLI wrapper for the `wafer amd isa` command.
|
|
4
|
+
It supports analysis of:
|
|
5
|
+
- AMD GPU code objects (.co) - Via API server with ROCm tools
|
|
6
|
+
- AMDGCN ISA files (.s, .gcn, .asm) - Local parsing
|
|
7
|
+
- LLVM-IR files (.ll) - Local parsing
|
|
8
|
+
- TTGIR files (.ttgir, .ttir, .mlir) - Local parsing
|
|
9
|
+
|
|
10
|
+
Design: Wafer-436 - AMD Kernel Scope / ISA Analyzer
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def print_usage() -> None:
    """Write the help text for `wafer amd isa` to standard error.

    The text covers the subcommands, the supported file types, the options
    accepted by `analyze`, and a handful of example invocations. Nothing is
    returned; each line is printed separately to ``sys.stderr``.
    """
    usage_lines = (
        "Usage: wafer amd isa <subcommand> [options]",
        "",
        "Subcommands:",
        " analyze <file|directory> Analyze ISA files (.co, .s, .ll, .ttgir)",
        " metrics List available metrics",
        " targets List supported GPU targets",
        "",
        "Supported File Types:",
        " .co AMD GPU code objects (requires API authentication)",
        " .s, .gcn, .asm AMDGCN ISA assembly (local parsing)",
        " .ll, .bc LLVM-IR (local parsing)",
        " .ttgir, .ttir, .mlir TTGIR / Triton IR (local parsing)",
        "",
        "Analyze Options:",
        " --json Output as JSON",
        " --csv Output as CSV",
        " --recursive / -r Scan directories recursively",
        " --filter EXPR Filter results (e.g., 'spills > 0')",
        " --output / -o FILE Write output to file",
        " --kernel INDEX Kernel index if multiple in file",
        "",
        "Examples:",
        " wafer amd isa analyze kernel.co # Analyze code object (requires login)",
        " wafer amd isa analyze kernel.s # Analyze ISA assembly",
        " wafer amd isa analyze kernel.s --json # Output as JSON",
        " wafer amd isa analyze ~/.triton/cache/ --filter 'spills > 0'",
        " wafer amd isa analyze . -r --csv -o metrics.csv",
        " wafer amd isa metrics # List available metrics",
        " wafer amd isa targets # List supported GPU targets",
    )
    for text in usage_lines:
        print(text, file=sys.stderr)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def analyze_command(
    path: str,
    json_output: bool = False,
    csv_output: bool = False,
    recursive: bool = True,
    filter_expr: Optional[str] = None,
    output_file: Optional[str] = None,
    kernel_index: int = 0,
    api_url: Optional[str] = None,
    auth_headers: Optional[dict[str, str]] = None,
) -> str:
    """Analyze an ISA / LLVM-IR / TTGIR / .co file, or every file in a directory.

    Args:
        path: File or directory to analyze; a leading ``~`` is expanded.
        json_output: Render the result as JSON (takes precedence over CSV).
        csv_output: Render the result as CSV.
        recursive: Passed to the directory scan; unused for a single file.
        filter_expr: Filter such as ``"spills > 0"``. Applied only when
            ``path`` is a directory.
        output_file: When set, the rendered output is written to this file
            (UTF-8, leading ``~`` expanded) instead of being returned.
        kernel_index: Kernel to pick in a multi-kernel file. Forwarded only
            for ``.s`` / ``.gcn`` / ``.asm`` files.
        api_url: API URL; mandatory for ``.co`` files.
        auth_headers: Auth headers; mandatory for ``.co`` files.

    Returns:
        The rendered analysis, or ``"Results saved to <output_file>"`` when
        ``output_file`` is given.

    Raises:
        FileNotFoundError: ``path`` does not exist.
        RuntimeError: A ``.co`` file was given without API credentials, or a
            single-file analysis reported failure.
    """
    target_path = Path(path).expanduser()

    # Fail fast on a bad path before paying for the analyzer import.
    if not target_path.exists():
        raise FileNotFoundError(f"Path not found: {path}")

    from wafer_core.lib.kernel_scope import (
        analyze_code_object,
        analyze_directory,
        analyze_file,
        analyze_isa_file,
    )

    if target_path.is_file():
        # Single file analysis: pick the analyzer from the file extension.
        suffix = target_path.suffix.lower()

        if suffix == ".co":
            # Code objects are analyzed through the API, so credentials are required.
            if not api_url or not auth_headers:
                raise RuntimeError(
                    "API authentication required for .co file analysis. "
                    "Run 'wafer login' first."
                )
            result = analyze_code_object(target_path, api_url, auth_headers)
        elif suffix in (".s", ".gcn", ".asm"):
            # ISA files are the only ones that honour kernel_index.
            result = analyze_isa_file(target_path, kernel_index=kernel_index)
        else:
            result = analyze_file(target_path, api_url=api_url, auth_headers=auth_headers)

        if not result.success:
            raise RuntimeError(f"Analysis failed: {result.error}")

        output = _format_single_result(result, json_output, csv_output)
    else:
        # Directory analysis
        batch_result = analyze_directory(
            target_path,
            recursive=recursive,
            api_url=api_url,
            auth_headers=auth_headers,
        )

        if filter_expr:
            batch_result = _apply_filter(batch_result, filter_expr)

        output = _format_batch_result(batch_result, json_output, csv_output)

    if output_file:
        # The text report contains non-ASCII status glyphs, so the encoding is
        # pinned rather than left to the platform default.
        Path(output_file).expanduser().write_text(output, encoding="utf-8")
        print(f"Output written to {output_file}", file=sys.stderr)
        return f"Results saved to {output_file}"

    return output
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def metrics_command() -> str:
    """Build the help text that lists every reported metric.

    Returns:
        A newline-joined string: a title block, then for each metric its name
        and description followed by a "Derivation" line and a blank line, and
        finally the list of instruction categories.
    """
    # (metric name, short description, how the value is derived)
    catalog = (
        ("vgpr_count", "Vector GPR allocation", "From .amdhsa_next_free_vgpr directive"),
        ("sgpr_count", "Scalar GPR allocation", "From .amdhsa_next_free_sgpr directive"),
        ("agpr_count", "Accumulator GPR count", "For MFMA operations (MI100+)"),
        ("lds_size", "LDS allocation (bytes)", "From .amdhsa_group_segment_fixed_size"),
        ("scratch_size", "Scratch memory (bytes)", "From .amdhsa_private_segment_fixed_size"),
        ("spill_count", "Register spill operations", "Count of scratch_store/load instructions"),
        ("mfma_count", "MFMA instructions", "Count of v_mfma_* instructions"),
        ("mfma_density_pct", "MFMA density (%)", "MFMA / total VALU * 100"),
        ("packed_ops_count", "Packed instructions", "Count of v_pk_* instructions"),
        ("fma_count", "FMA instructions", "Count of v_fma_* instructions"),
        ("barrier_count", "Barriers", "Count of s_barrier instructions"),
        ("full_stall_count", "Full stalls", "Count of waitcnt 0 instructions"),
        ("global_load_count", "Global loads", "Count of global_load_* instructions"),
        ("global_store_count", "Global stores", "Count of global_store_* instructions"),
        ("lds_ops_count", "LDS operations", "Count of ds_read/write instructions"),
        ("theoretical_occupancy", "Max waves/CU", "Limited by VGPR/SGPR/LDS"),
    )

    out = ["Available Metrics for Kernel Scope Analysis", "=" * 60, ""]

    # Three output lines per metric: summary, derivation, blank separator.
    for metric_name, summary, source in catalog:
        out += [
            f" {metric_name:<25} {summary}",
            f" {'':<25} Derivation: {source}",
            "",
        ]

    out += [
        "Instruction Categories:",
        " VALU - Vector ALU (v_add_*, v_mul_*, v_fma_*)",
        " SALU - Scalar ALU (s_add_*, s_mul_*)",
        " VMEM - Vector memory (global_load_*, global_store_*)",
        " SMEM - Scalar memory (s_load_*, s_buffer_load_*)",
        " LDS - Local Data Share (ds_read_*, ds_write_*)",
        " MFMA - Matrix FMA (v_mfma_f32_*, v_mfma_f16_*)",
        " SYNC - Synchronization (s_barrier, s_waitcnt)",
        " SPILL - Spill operations (scratch_store_*, scratch_load_*)",
    ]

    return "\n".join(out)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def targets_command() -> str:
    """Build a table of the supported GPU targets.

    One row is produced per entry of ``SUPPORTED_TARGETS``, using the specs
    returned by ``get_target_specs`` for that entry.

    Returns:
        Newline-joined text: title, column header, one row per target, and a
        closing note about unknown architectures.
    """
    from wafer_core.lib.kernel_scope.targets import SUPPORTED_TARGETS, get_target_specs

    column_header = (
        f"{'Architecture':<12} {'Series':<10} {'VGPRs/CU':<10} "
        f"{'SGPRs/CU':<10} {'LDS/CU':<10} {'Max Waves':<10}"
    )
    out = ["Supported GPU Targets", "=" * 60, "", column_header, "-" * 60]

    for arch in SUPPORTED_TARGETS:
        info = get_target_specs(arch)
        left = f"{info.name:<12} {info.series:<10} {info.vgprs_per_cu:<10} "
        right = f"{info.sgprs_per_cu:<10} {info.lds_per_cu:<10} {info.max_waves_per_cu:<10}"
        out.append(left + right)

    out.append("")
    out.append("Note: Default values are used for unknown architectures.")

    return "\n".join(out)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _format_single_result(result, json_output: bool, csv_output: bool) -> str:
|
|
221
|
+
"""Format a single analysis result."""
|
|
222
|
+
if json_output:
|
|
223
|
+
return result.to_json()
|
|
224
|
+
|
|
225
|
+
if csv_output:
|
|
226
|
+
return _result_to_csv(result)
|
|
227
|
+
|
|
228
|
+
return _result_to_text(result)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _format_batch_result(batch_result, json_output: bool, csv_output: bool) -> str:
|
|
232
|
+
"""Format batch analysis results."""
|
|
233
|
+
if json_output:
|
|
234
|
+
return batch_result.to_json()
|
|
235
|
+
|
|
236
|
+
if csv_output:
|
|
237
|
+
return _batch_to_csv(batch_result)
|
|
238
|
+
|
|
239
|
+
return _batch_to_text(batch_result)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _result_to_text(result) -> str:
|
|
243
|
+
"""Format single result as human-readable text."""
|
|
244
|
+
lines = []
|
|
245
|
+
|
|
246
|
+
if result.code_object_analysis:
|
|
247
|
+
# .co file analysis (via API)
|
|
248
|
+
a = result.code_object_analysis
|
|
249
|
+
lines.extend([
|
|
250
|
+
f"Kernel: {a.kernel_name}",
|
|
251
|
+
f"Architecture: {a.architecture}",
|
|
252
|
+
f"Source: Code Object (.co)",
|
|
253
|
+
"",
|
|
254
|
+
"=== Registers ===",
|
|
255
|
+
f" VGPRs: {a.vgpr_count}",
|
|
256
|
+
f" SGPRs: {a.sgpr_count}",
|
|
257
|
+
f" AGPRs: {a.agpr_count}",
|
|
258
|
+
])
|
|
259
|
+
|
|
260
|
+
if a.vgpr_spill_count > 0 or a.sgpr_spill_count > 0:
|
|
261
|
+
lines.extend([
|
|
262
|
+
"",
|
|
263
|
+
"!!! SPILLS DETECTED !!!",
|
|
264
|
+
f" VGPR spills: {a.vgpr_spill_count}",
|
|
265
|
+
f" SGPR spills: {a.sgpr_spill_count}",
|
|
266
|
+
])
|
|
267
|
+
else:
|
|
268
|
+
lines.append(" Spills: None (good)")
|
|
269
|
+
|
|
270
|
+
lines.extend([
|
|
271
|
+
"",
|
|
272
|
+
"=== Memory ===",
|
|
273
|
+
f" LDS: {a.lds_bytes} bytes",
|
|
274
|
+
f" Global loads: {a.global_loads}",
|
|
275
|
+
f" Global stores: {a.global_stores}",
|
|
276
|
+
f" LDS ops: {a.lds_ops}",
|
|
277
|
+
"",
|
|
278
|
+
"=== Instructions ===",
|
|
279
|
+
f" MFMA: {a.mfma_count}",
|
|
280
|
+
f" FMA: {a.fma_count}",
|
|
281
|
+
f" Packed (v_pk_*): {a.packed_ops_count}",
|
|
282
|
+
f" Full stalls (waitcnt 0): {a.waitcnt_full_stalls}",
|
|
283
|
+
f" Barriers: {a.barriers}",
|
|
284
|
+
])
|
|
285
|
+
|
|
286
|
+
elif result.isa_analysis:
|
|
287
|
+
# .s/.gcn/.asm file analysis (local parsing)
|
|
288
|
+
a = result.isa_analysis
|
|
289
|
+
lines.extend([
|
|
290
|
+
f"Kernel: {a.kernel_name}",
|
|
291
|
+
f"Architecture: {a.architecture}",
|
|
292
|
+
f"Source: ISA Assembly (.s)",
|
|
293
|
+
"",
|
|
294
|
+
"=== Registers ===",
|
|
295
|
+
f" VGPRs: {a.vgpr_count}",
|
|
296
|
+
f" SGPRs: {a.sgpr_count}",
|
|
297
|
+
f" AGPRs: {a.agpr_count}",
|
|
298
|
+
])
|
|
299
|
+
|
|
300
|
+
if a.spill_count > 0:
|
|
301
|
+
lines.extend([
|
|
302
|
+
"",
|
|
303
|
+
"!!! SPILLS DETECTED !!!",
|
|
304
|
+
f" Total spills: {a.spill_count}",
|
|
305
|
+
f" VGPR spills: {a.vgpr_spill_count}",
|
|
306
|
+
f" SGPR spills: {a.sgpr_spill_count}",
|
|
307
|
+
])
|
|
308
|
+
else:
|
|
309
|
+
lines.append(" Spills: None (good)")
|
|
310
|
+
|
|
311
|
+
lines.extend([
|
|
312
|
+
"",
|
|
313
|
+
"=== Memory ===",
|
|
314
|
+
f" LDS: {a.lds_size} bytes",
|
|
315
|
+
f" Scratch: {a.scratch_size} bytes",
|
|
316
|
+
f" Global loads: {a.global_load_count}",
|
|
317
|
+
f" Global stores: {a.global_store_count}",
|
|
318
|
+
f" LDS ops: {a.lds_ops_count}",
|
|
319
|
+
"",
|
|
320
|
+
"=== Instructions ===",
|
|
321
|
+
f" MFMA: {a.mfma_count} ({a.mfma_density_pct:.1f}% density)",
|
|
322
|
+
f" FMA: {a.fma_count}",
|
|
323
|
+
f" Packed (v_pk_*): {a.packed_ops_count}",
|
|
324
|
+
f" Barriers: {a.barrier_count}",
|
|
325
|
+
f" Full stalls: {a.full_stall_count}",
|
|
326
|
+
"",
|
|
327
|
+
"=== Instruction Mix ===",
|
|
328
|
+
f" VALU: {a.instruction_mix.valu_count}",
|
|
329
|
+
f" SALU: {a.instruction_mix.salu_count}",
|
|
330
|
+
f" VMEM: {a.instruction_mix.vmem_count}",
|
|
331
|
+
f" SMEM: {a.instruction_mix.smem_count}",
|
|
332
|
+
f" LDS: {a.instruction_mix.lds_count}",
|
|
333
|
+
f" MFMA: {a.instruction_mix.mfma_count}",
|
|
334
|
+
f" Sync: {a.instruction_mix.sync_count}",
|
|
335
|
+
f" Total: {a.instruction_mix.total_count}",
|
|
336
|
+
"",
|
|
337
|
+
"=== Occupancy ===",
|
|
338
|
+
f" Max waves (VGPR): {a.max_waves_vgpr}",
|
|
339
|
+
f" Max waves (SGPR): {a.max_waves_sgpr}",
|
|
340
|
+
f" Max waves (LDS): {a.max_waves_lds}",
|
|
341
|
+
f" Theoretical: {a.theoretical_occupancy} waves/CU",
|
|
342
|
+
])
|
|
343
|
+
|
|
344
|
+
if a.warnings:
|
|
345
|
+
lines.extend([
|
|
346
|
+
"",
|
|
347
|
+
"=== Warnings ===",
|
|
348
|
+
])
|
|
349
|
+
for warning in a.warnings:
|
|
350
|
+
lines.append(f" {warning}")
|
|
351
|
+
|
|
352
|
+
elif result.ttgir_analysis:
|
|
353
|
+
a = result.ttgir_analysis
|
|
354
|
+
lines.extend([
|
|
355
|
+
"TTGIR Analysis",
|
|
356
|
+
"",
|
|
357
|
+
"=== Operations ===",
|
|
358
|
+
f" tt.dot: {a.dot_count}",
|
|
359
|
+
f" tt.load: {a.load_count}",
|
|
360
|
+
f" tt.store: {a.store_count}",
|
|
361
|
+
f" tt.reduce: {a.reduce_count}",
|
|
362
|
+
f" Barriers: {a.barrier_count}",
|
|
363
|
+
])
|
|
364
|
+
|
|
365
|
+
if a.tile_info:
|
|
366
|
+
lines.extend([
|
|
367
|
+
"",
|
|
368
|
+
"=== Tiling ===",
|
|
369
|
+
f" BLOCK_M: {a.tile_info.block_m}",
|
|
370
|
+
f" BLOCK_N: {a.tile_info.block_n}",
|
|
371
|
+
f" BLOCK_K: {a.tile_info.block_k}",
|
|
372
|
+
f" num_warps: {a.tile_info.num_warps}",
|
|
373
|
+
f" num_stages: {a.tile_info.num_stages}",
|
|
374
|
+
])
|
|
375
|
+
|
|
376
|
+
if a.has_software_pipelining:
|
|
377
|
+
lines.append(" Software pipelining: enabled")
|
|
378
|
+
|
|
379
|
+
if a.estimated_compute_intensity:
|
|
380
|
+
lines.append(f" Compute intensity: {a.estimated_compute_intensity:.1f} FLOPs/byte")
|
|
381
|
+
|
|
382
|
+
elif result.llvm_ir_analysis:
|
|
383
|
+
a = result.llvm_ir_analysis
|
|
384
|
+
lines.extend([
|
|
385
|
+
"LLVM-IR Analysis",
|
|
386
|
+
"",
|
|
387
|
+
f" Functions: {a.function_count}",
|
|
388
|
+
f" Total instructions: {a.total_instructions}",
|
|
389
|
+
f" Functions with loops: {a.functions_with_loops}",
|
|
390
|
+
f" Has vector ops: {a.has_vector_ops}",
|
|
391
|
+
])
|
|
392
|
+
|
|
393
|
+
if a.kernel_functions:
|
|
394
|
+
lines.append(f" Kernel functions: {', '.join(a.kernel_functions)}")
|
|
395
|
+
|
|
396
|
+
return "\n".join(lines)
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
def _result_to_csv(result) -> str:
|
|
400
|
+
"""Format single result as CSV."""
|
|
401
|
+
header = "kernel_name,architecture,source_type,vgpr_count,sgpr_count,vgpr_spills,sgpr_spills,mfma_count,lds_bytes,global_loads,global_stores"
|
|
402
|
+
|
|
403
|
+
if result.code_object_analysis:
|
|
404
|
+
a = result.code_object_analysis
|
|
405
|
+
row = f"{a.kernel_name},{a.architecture},code_object,{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},{a.mfma_count},{a.lds_bytes},{a.global_loads},{a.global_stores}"
|
|
406
|
+
return f"{header}\n{row}"
|
|
407
|
+
|
|
408
|
+
if result.isa_analysis:
|
|
409
|
+
a = result.isa_analysis
|
|
410
|
+
row = f"{a.kernel_name},{a.architecture},isa_assembly,{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},{a.mfma_count},{a.lds_size},{a.global_load_count},{a.global_store_count}"
|
|
411
|
+
return f"{header}\n{row}"
|
|
412
|
+
|
|
413
|
+
return "# Unsupported format for CSV"
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def _batch_to_text(batch_result) -> str:
|
|
417
|
+
"""Format batch results as text."""
|
|
418
|
+
lines = [
|
|
419
|
+
f"Analyzed {batch_result.total_files} files",
|
|
420
|
+
f" Successful: {batch_result.successful}",
|
|
421
|
+
f" Failed: {batch_result.failed}",
|
|
422
|
+
"",
|
|
423
|
+
]
|
|
424
|
+
|
|
425
|
+
if batch_result.summary:
|
|
426
|
+
lines.extend([
|
|
427
|
+
"=== Summary ===",
|
|
428
|
+
f" Avg VGPRs: {batch_result.summary.get('total_vgpr_avg', 0):.1f}",
|
|
429
|
+
f" Avg SGPRs: {batch_result.summary.get('total_sgpr_avg', 0):.1f}",
|
|
430
|
+
f" Total spills: {batch_result.summary.get('total_spills', 0)}",
|
|
431
|
+
f" Files with spills: {batch_result.summary.get('files_with_spills', 0)}",
|
|
432
|
+
f" Total MFMA: {batch_result.summary.get('total_mfma', 0)}",
|
|
433
|
+
f" Avg MFMA density: {batch_result.summary.get('avg_mfma_density', 0):.1f}%",
|
|
434
|
+
"",
|
|
435
|
+
])
|
|
436
|
+
|
|
437
|
+
# Show individual results
|
|
438
|
+
for result in batch_result.results:
|
|
439
|
+
if result.success and result.code_object_analysis:
|
|
440
|
+
a = result.code_object_analysis
|
|
441
|
+
spills = a.vgpr_spill_count + a.sgpr_spill_count
|
|
442
|
+
status = "⚠️" if spills > 0 else "✓"
|
|
443
|
+
lines.append(
|
|
444
|
+
f" {status} {result.file_path}: "
|
|
445
|
+
f"VGPRs={a.vgpr_count}, spills={spills}, MFMA={a.mfma_count}"
|
|
446
|
+
)
|
|
447
|
+
elif result.success and result.isa_analysis:
|
|
448
|
+
a = result.isa_analysis
|
|
449
|
+
status = "⚠️" if a.spill_count > 0 else "✓"
|
|
450
|
+
lines.append(
|
|
451
|
+
f" {status} {result.file_path}: "
|
|
452
|
+
f"VGPRs={a.vgpr_count}, spills={a.spill_count}, MFMA={a.mfma_count}"
|
|
453
|
+
)
|
|
454
|
+
elif not result.success:
|
|
455
|
+
lines.append(f" ✗ {result.file_path}: {result.error}")
|
|
456
|
+
|
|
457
|
+
return "\n".join(lines)
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def _batch_to_csv(batch_result) -> str:
|
|
461
|
+
"""Format batch results as CSV."""
|
|
462
|
+
lines = ["file_path,kernel_name,architecture,source_type,vgpr_count,sgpr_count,vgpr_spills,sgpr_spills,mfma_count,lds_bytes"]
|
|
463
|
+
|
|
464
|
+
for result in batch_result.results:
|
|
465
|
+
if result.success and result.code_object_analysis:
|
|
466
|
+
a = result.code_object_analysis
|
|
467
|
+
lines.append(
|
|
468
|
+
f"{result.file_path},{a.kernel_name},{a.architecture},code_object,"
|
|
469
|
+
f"{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},"
|
|
470
|
+
f"{a.mfma_count},{a.lds_bytes}"
|
|
471
|
+
)
|
|
472
|
+
elif result.success and result.isa_analysis:
|
|
473
|
+
a = result.isa_analysis
|
|
474
|
+
lines.append(
|
|
475
|
+
f"{result.file_path},{a.kernel_name},{a.architecture},isa_assembly,"
|
|
476
|
+
f"{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},"
|
|
477
|
+
f"{a.mfma_count},{a.lds_size}"
|
|
478
|
+
)
|
|
479
|
+
|
|
480
|
+
return "\n".join(lines)
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def _apply_filter(batch_result, filter_expr: str):
|
|
484
|
+
"""Apply filter expression to batch results."""
|
|
485
|
+
# Simple filter parsing: "metric op value"
|
|
486
|
+
# Supported: spills > 0, vgpr_count > 128, mfma_count == 0
|
|
487
|
+
import re
|
|
488
|
+
|
|
489
|
+
match = re.match(r"(\w+)\s*(>|<|>=|<=|==|!=)\s*(\d+)", filter_expr)
|
|
490
|
+
if not match:
|
|
491
|
+
print(f"Warning: Invalid filter expression: {filter_expr}", file=sys.stderr)
|
|
492
|
+
return batch_result
|
|
493
|
+
|
|
494
|
+
metric = match.group(1)
|
|
495
|
+
op = match.group(2)
|
|
496
|
+
value = int(match.group(3))
|
|
497
|
+
|
|
498
|
+
# Map common aliases
|
|
499
|
+
metric_map = {
|
|
500
|
+
"spills": "spill_count",
|
|
501
|
+
"vgpr": "vgpr_count",
|
|
502
|
+
"sgpr": "sgpr_count",
|
|
503
|
+
"mfma": "mfma_count",
|
|
504
|
+
"occupancy": "theoretical_occupancy",
|
|
505
|
+
}
|
|
506
|
+
metric = metric_map.get(metric, metric)
|
|
507
|
+
|
|
508
|
+
# Filter function - supports both isa_analysis and code_object_analysis
|
|
509
|
+
def passes_filter(result):
|
|
510
|
+
if not result.success:
|
|
511
|
+
return False
|
|
512
|
+
|
|
513
|
+
# Try to get metric from either analysis type
|
|
514
|
+
actual = None
|
|
515
|
+
if result.isa_analysis:
|
|
516
|
+
actual = getattr(result.isa_analysis, metric, None)
|
|
517
|
+
elif result.code_object_analysis:
|
|
518
|
+
# Map isa_analysis metric names to code_object_analysis equivalents
|
|
519
|
+
co_metric_map = {
|
|
520
|
+
"spill_count": "vgpr_spill_count", # Use vgpr_spill_count as proxy
|
|
521
|
+
"lds_size": "lds_bytes",
|
|
522
|
+
}
|
|
523
|
+
co_metric = co_metric_map.get(metric, metric)
|
|
524
|
+
actual = getattr(result.code_object_analysis, co_metric, None)
|
|
525
|
+
|
|
526
|
+
if actual is None:
|
|
527
|
+
return False
|
|
528
|
+
|
|
529
|
+
if op == ">":
|
|
530
|
+
return actual > value
|
|
531
|
+
elif op == "<":
|
|
532
|
+
return actual < value
|
|
533
|
+
elif op == ">=":
|
|
534
|
+
return actual >= value
|
|
535
|
+
elif op == "<=":
|
|
536
|
+
return actual <= value
|
|
537
|
+
elif op == "==":
|
|
538
|
+
return actual == value
|
|
539
|
+
elif op == "!=":
|
|
540
|
+
return actual != value
|
|
541
|
+
|
|
542
|
+
return False
|
|
543
|
+
|
|
544
|
+
filtered_results = [r for r in batch_result.results if passes_filter(r)]
|
|
545
|
+
|
|
546
|
+
from wafer_core.lib.kernel_scope.api import BatchAnalysisResult
|
|
547
|
+
|
|
548
|
+
return BatchAnalysisResult(
|
|
549
|
+
total_files=len(filtered_results),
|
|
550
|
+
successful=sum(1 for r in filtered_results if r.success),
|
|
551
|
+
failed=sum(1 for r in filtered_results if not r.success),
|
|
552
|
+
results=tuple(filtered_results),
|
|
553
|
+
summary=batch_result.summary,
|
|
554
|
+
)
|