wafer-cli 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/GUIDE.md +118 -0
- wafer/__init__.py +3 -0
- wafer/analytics.py +306 -0
- wafer/api_client.py +195 -0
- wafer/auth.py +432 -0
- wafer/autotuner.py +1080 -0
- wafer/billing.py +233 -0
- wafer/cli.py +7289 -0
- wafer/config.py +105 -0
- wafer/corpus.py +366 -0
- wafer/evaluate.py +4593 -0
- wafer/global_config.py +350 -0
- wafer/gpu_run.py +307 -0
- wafer/inference.py +148 -0
- wafer/kernel_scope.py +552 -0
- wafer/ncu_analyze.py +651 -0
- wafer/nsys_analyze.py +1042 -0
- wafer/nsys_profile.py +510 -0
- wafer/output.py +248 -0
- wafer/problems.py +357 -0
- wafer/rocprof_compute.py +490 -0
- wafer/rocprof_sdk.py +274 -0
- wafer/rocprof_systems.py +520 -0
- wafer/skills/wafer-guide/SKILL.md +129 -0
- wafer/ssh_keys.py +261 -0
- wafer/target_lock.py +270 -0
- wafer/targets.py +842 -0
- wafer/targets_ops.py +717 -0
- wafer/templates/__init__.py +0 -0
- wafer/templates/ask_docs.py +61 -0
- wafer/templates/optimize_kernel.py +71 -0
- wafer/templates/optimize_kernelbench.py +137 -0
- wafer/templates/trace_analyze.py +74 -0
- wafer/tracelens.py +218 -0
- wafer/wevin_cli.py +577 -0
- wafer/workspaces.py +852 -0
- wafer_cli-0.2.14.dist-info/METADATA +16 -0
- wafer_cli-0.2.14.dist-info/RECORD +41 -0
- wafer_cli-0.2.14.dist-info/WHEEL +5 -0
- wafer_cli-0.2.14.dist-info/entry_points.txt +2 -0
- wafer_cli-0.2.14.dist-info/top_level.txt +1 -0
wafer/inference.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""Pure functions for inferring what files to upload and which environment to use.
|
|
2
|
+
|
|
3
|
+
All functions are pure: same input = same output, no side effects.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import shlex
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from .config import WaferConfig, WaferEnvironment
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def infer_upload_files(command: str, cwd: Path) -> list[Path]:
    """Infer which files to upload based on command.

    Pure function: command + directory -> list of paths.

    Strategy:
    1. Extract file references from command tokens
    2. Add common build files (Makefile, pyproject.toml, etc.)
    3. Add source files matching common patterns

    Args:
        command: Command to execute
        cwd: Current working directory

    Returns:
        Sorted list of file paths to upload

    Example:
        >>> infer_upload_files("nvcc kernel.cu -o kernel", Path("/home/user/cuda"))
        [Path("/home/user/cuda/kernel.cu"), Path("/home/user/cuda/Makefile"), ...]
    """
    # Validate argument *types* first. The original checked cwd.exists()
    # before isinstance(cwd, Path), so a non-Path cwd raised AttributeError
    # instead of the intended AssertionError.
    assert isinstance(command, str), "command must be a string"
    assert isinstance(cwd, Path), "cwd must be a Path"
    assert cwd.exists(), f"cwd does not exist: {cwd}"

    files: set[Path] = set()

    # Extract file references from command tokens.
    try:
        tokens = shlex.split(command)
    except ValueError:
        # If command has unmatched quotes, just split on spaces.
        tokens = command.split()

    # File extensions we care about when scanning command tokens.
    file_extensions = {
        ".cu",
        ".cuh",
        ".py",
        ".cpp",
        ".c",
        ".h",
        ".hpp",
        ".rs",
        ".go",
    }

    for token in tokens:
        token_path = Path(token)
        if token_path.suffix in file_extensions:
            full_path = cwd / token_path
            # Path.is_file() is False for nonexistent paths, so a separate
            # exists() check is redundant.
            if full_path.is_file():
                files.add(full_path)

    # Add common build files if they exist.
    common_files = [
        "Makefile",
        "CMakeLists.txt",
        "pyproject.toml",
        "setup.py",
        "Cargo.toml",
        "go.mod",
        "requirements.txt",
    ]
    for filename in common_files:
        path = cwd / filename
        if path.is_file():
            files.add(path)

    # Add all source files in current directory (not recursive).
    source_extensions = [".cu", ".cuh", ".h", ".hpp", ".c", ".cpp"]
    for ext in source_extensions:
        for path in cwd.glob(f"*{ext}"):
            if path.is_file():
                files.add(path)

    return sorted(files)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def resolve_environment(
    config: WaferConfig,
    env_name: str | None,
) -> WaferEnvironment:
    """Resolve which environment to use.

    Pure function: config + name -> environment.

    Priority:
    1. Explicit env_name argument
    2. Config default_environment
    3. Only environment if there's exactly one

    Args:
        config: Wafer configuration
        env_name: Optional environment name from CLI

    Returns:
        WaferEnvironment to use

    Raises:
        ValueError: If environment cannot be determined

    Example:
        >>> config = WaferConfig(...)
        >>> env = resolve_environment(config, "pytorch")
        >>> env.docker
        'pytorch/pytorch:2.5'
    """
    assert isinstance(config, WaferConfig), "config must be WaferConfig"
    assert env_name is None or isinstance(env_name, str), "env_name must be None or str"

    envs = config.environments

    # Priority 1: explicit name from the caller wins outright.
    if env_name:
        try:
            return envs[env_name]
        except KeyError:
            available = ", ".join(envs.keys())
            raise ValueError(f"Unknown environment: {env_name}. Available: {available}") from None

    # Priority 2: the configured default.
    default = config.default_environment
    if default:
        assert default in envs, "default_environment validated in WaferConfig"
        return envs[default]

    # Priority 3: unambiguous single environment.
    if len(envs) == 1:
        (only_env,) = envs.values()
        return only_env

    # Nothing to go on — tell the user how to disambiguate.
    available = ", ".join(envs.keys())
    raise ValueError(
        f"No environment specified and no default configured. "
        f"Available: {available}. "
        f"Use --env to specify or set default.environment in config."
    )
|
wafer/kernel_scope.py
ADDED
|
@@ -0,0 +1,552 @@
|
|
|
1
|
+
"""Unified ISA Analyzer - CLI for static ISA analysis of AMD GPU kernels.
|
|
2
|
+
|
|
3
|
+
This module provides the CLI wrapper for the `wafer amd isa` command.
|
|
4
|
+
It supports analysis of:
|
|
5
|
+
- AMD GPU code objects (.co) - Via API server with ROCm tools
|
|
6
|
+
- AMDGCN ISA files (.s, .gcn, .asm) - Local parsing
|
|
7
|
+
- LLVM-IR files (.ll) - Local parsing
|
|
8
|
+
- TTGIR files (.ttgir, .ttir, .mlir) - Local parsing
|
|
9
|
+
|
|
10
|
+
Design: Wafer-436 - AMD Kernel Scope / ISA Analyzer
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import sys
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def print_usage() -> None:
    """Print CLI usage information."""
    # A single joined write produces the same stderr bytes as one
    # print() call per line.
    usage = [
        "Usage: wafer amd isa <subcommand> [options]",
        "",
        "Subcommands:",
        " analyze <file|directory> Analyze ISA files (.co, .s, .ll, .ttgir)",
        " metrics List available metrics",
        " targets List supported GPU targets",
        "",
        "Supported File Types:",
        " .co AMD GPU code objects (requires API authentication)",
        " .s, .gcn, .asm AMDGCN ISA assembly (local parsing)",
        " .ll, .bc LLVM-IR (local parsing)",
        " .ttgir, .ttir, .mlir TTGIR / Triton IR (local parsing)",
        "",
        "Analyze Options:",
        " --json Output as JSON",
        " --csv Output as CSV",
        " --recursive / -r Scan directories recursively",
        " --filter EXPR Filter results (e.g., 'spills > 0')",
        " --output / -o FILE Write output to file",
        " --kernel INDEX Kernel index if multiple in file",
        "",
        "Examples:",
        " wafer amd isa analyze kernel.co # Analyze code object (requires login)",
        " wafer amd isa analyze kernel.s # Analyze ISA assembly",
        " wafer amd isa analyze kernel.s --json # Output as JSON",
        " wafer amd isa analyze ~/.triton/cache/ --filter 'spills > 0'",
        " wafer amd isa analyze . -r --csv -o metrics.csv",
        " wafer amd isa metrics # List available metrics",
        " wafer amd isa targets # List supported GPU targets",
    ]
    print("\n".join(usage), file=sys.stderr)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def analyze_command(
    path: str,
    json_output: bool = False,
    csv_output: bool = False,
    recursive: bool = True,
    filter_expr: str | None = None,
    output_file: str | None = None,
    kernel_index: int = 0,
    api_url: str | None = None,
    auth_headers: dict[str, str] | None = None,
) -> str:
    """Analyze ISA/LLVM-IR/TTGIR/.co file or directory.

    Args:
        path: Path to file or directory
        json_output: Output as JSON
        csv_output: Output as CSV
        recursive: Scan directories recursively
        filter_expr: Filter expression (e.g., "spills > 0")
        output_file: Write output to file
        kernel_index: Kernel index for multi-kernel files
        api_url: API URL for .co file analysis (required for .co files)
        auth_headers: Auth headers for .co file analysis

    Returns:
        Analysis output string
    """
    from wafer_core.lib.kernel_scope import (
        analyze_code_object,
        analyze_directory,
        analyze_file,
        analyze_isa_file,
    )

    target = Path(path).expanduser()
    if not target.exists():
        raise FileNotFoundError(f"Path not found: {path}")

    if target.is_file():
        # Single-file analysis: dispatch on the (lowercased) extension.
        ext = target.suffix.lower()

        if ext == ".co":
            # Code objects are disassembled server-side with ROCm tools,
            # so an authenticated API session is mandatory.
            if not (api_url and auth_headers):
                raise RuntimeError(
                    "API authentication required for .co file analysis. "
                    "Run 'wafer login' first."
                )
            analysis = analyze_code_object(target, api_url, auth_headers)
        elif ext in {".s", ".gcn", ".asm"}:
            # ISA assembly is parsed locally; honor the kernel selector.
            analysis = analyze_isa_file(target, kernel_index=kernel_index)
        else:
            analysis = analyze_file(target, api_url=api_url, auth_headers=auth_headers)

        if not analysis.success:
            raise RuntimeError(f"Analysis failed: {analysis.error}")

        rendered = _format_single_result(analysis, json_output, csv_output)
    else:
        # Directory analysis: batch every recognized file beneath the path.
        batch = analyze_directory(
            target,
            recursive=recursive,
            api_url=api_url,
            auth_headers=auth_headers,
        )
        if filter_expr:
            batch = _apply_filter(batch, filter_expr)
        rendered = _format_batch_result(batch, json_output, csv_output)

    if output_file:
        Path(output_file).write_text(rendered)
        print(f"Output written to {output_file}", file=sys.stderr)
        return f"Results saved to {output_file}"

    return rendered
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def metrics_command() -> str:
    """List available metrics.

    Returns:
        Metrics list output
    """
    # (name, description, derivation) triples; each renders as two lines
    # followed by a blank separator.
    metric_rows = (
        ("vgpr_count", "Vector GPR allocation", "From .amdhsa_next_free_vgpr directive"),
        ("sgpr_count", "Scalar GPR allocation", "From .amdhsa_next_free_sgpr directive"),
        ("agpr_count", "Accumulator GPR count", "For MFMA operations (MI100+)"),
        ("lds_size", "LDS allocation (bytes)", "From .amdhsa_group_segment_fixed_size"),
        ("scratch_size", "Scratch memory (bytes)", "From .amdhsa_private_segment_fixed_size"),
        ("spill_count", "Register spill operations", "Count of scratch_store/load instructions"),
        ("mfma_count", "MFMA instructions", "Count of v_mfma_* instructions"),
        ("mfma_density_pct", "MFMA density (%)", "MFMA / total VALU * 100"),
        ("packed_ops_count", "Packed instructions", "Count of v_pk_* instructions"),
        ("fma_count", "FMA instructions", "Count of v_fma_* instructions"),
        ("barrier_count", "Barriers", "Count of s_barrier instructions"),
        ("full_stall_count", "Full stalls", "Count of waitcnt 0 instructions"),
        ("global_load_count", "Global loads", "Count of global_load_* instructions"),
        ("global_store_count", "Global stores", "Count of global_store_* instructions"),
        ("lds_ops_count", "LDS operations", "Count of ds_read/write instructions"),
        ("theoretical_occupancy", "Max waves/CU", "Limited by VGPR/SGPR/LDS"),
    )

    out = [
        "Available Metrics for Kernel Scope Analysis",
        "=" * 60,
        "",
    ]

    for name, description, derivation in metric_rows:
        out += [
            f" {name:<25} {description}",
            f" {'':<25} Derivation: {derivation}",
            "",
        ]

    out += [
        "Instruction Categories:",
        " VALU - Vector ALU (v_add_*, v_mul_*, v_fma_*)",
        " SALU - Scalar ALU (s_add_*, s_mul_*)",
        " VMEM - Vector memory (global_load_*, global_store_*)",
        " SMEM - Scalar memory (s_load_*, s_buffer_load_*)",
        " LDS - Local Data Share (ds_read_*, ds_write_*)",
        " MFMA - Matrix FMA (v_mfma_f32_*, v_mfma_f16_*)",
        " SYNC - Synchronization (s_barrier, s_waitcnt)",
        " SPILL - Spill operations (scratch_store_*, scratch_load_*)",
    ]

    return "\n".join(out)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def targets_command() -> str:
    """List supported GPU targets.

    Returns:
        Targets list output
    """
    from wafer_core.lib.kernel_scope.targets import SUPPORTED_TARGETS, get_target_specs

    header = (
        f"{'Architecture':<12} {'Series':<10} {'VGPRs/CU':<10} "
        f"{'SGPRs/CU':<10} {'LDS/CU':<10} {'Max Waves':<10}"
    )

    # One formatted row per supported architecture.
    rows = []
    for arch in SUPPORTED_TARGETS:
        spec = get_target_specs(arch)
        rows.append(
            f"{spec.name:<12} {spec.series:<10} {spec.vgprs_per_cu:<10} "
            f"{spec.sgprs_per_cu:<10} {spec.lds_per_cu:<10} {spec.max_waves_per_cu:<10}"
        )

    return "\n".join([
        "Supported GPU Targets",
        "=" * 60,
        "",
        header,
        "-" * 60,
        *rows,
        "",
        "Note: Default values are used for unknown architectures.",
    ])
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _format_single_result(result, json_output: bool, csv_output: bool) -> str:
|
|
219
|
+
"""Format a single analysis result."""
|
|
220
|
+
if json_output:
|
|
221
|
+
return result.to_json()
|
|
222
|
+
|
|
223
|
+
if csv_output:
|
|
224
|
+
return _result_to_csv(result)
|
|
225
|
+
|
|
226
|
+
return _result_to_text(result)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _format_batch_result(batch_result, json_output: bool, csv_output: bool) -> str:
|
|
230
|
+
"""Format batch analysis results."""
|
|
231
|
+
if json_output:
|
|
232
|
+
return batch_result.to_json()
|
|
233
|
+
|
|
234
|
+
if csv_output:
|
|
235
|
+
return _batch_to_csv(batch_result)
|
|
236
|
+
|
|
237
|
+
return _batch_to_text(batch_result)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _result_to_text(result) -> str:
    """Format single result as human-readable text.

    Dispatches on which analysis payload is populated, in priority order:
    code_object_analysis, isa_analysis, ttgir_analysis, llvm_ir_analysis.
    Exactly one branch renders; an empty string is returned if none is set.

    Args:
        result: Analysis result object carrying at most one payload.
            NOTE(review): assumed to be wafer_core's single-file result
            type — confirm against wafer_core.lib.kernel_scope.

    Returns:
        Newline-joined report text (no trailing newline).
    """
    lines = []

    if result.code_object_analysis:
        # .co file analysis (via API) — server-side fields use different
        # names than the local ISA parser (e.g. lds_bytes vs lds_size).
        a = result.code_object_analysis
        lines.extend([
            f"Kernel: {a.kernel_name}",
            f"Architecture: {a.architecture}",
            "Source: Code Object (.co)",
            "",
            "=== Registers ===",
            f" VGPRs: {a.vgpr_count}",
            f" SGPRs: {a.sgpr_count}",
            f" AGPRs: {a.agpr_count}",
        ])

        # Spills are called out loudly: any spill is a performance red flag.
        if a.vgpr_spill_count > 0 or a.sgpr_spill_count > 0:
            lines.extend([
                "",
                "!!! SPILLS DETECTED !!!",
                f" VGPR spills: {a.vgpr_spill_count}",
                f" SGPR spills: {a.sgpr_spill_count}",
            ])
        else:
            lines.append(" Spills: None (good)")

        lines.extend([
            "",
            "=== Memory ===",
            f" LDS: {a.lds_bytes} bytes",
            f" Global loads: {a.global_loads}",
            f" Global stores: {a.global_stores}",
            f" LDS ops: {a.lds_ops}",
            "",
            "=== Instructions ===",
            f" MFMA: {a.mfma_count}",
            f" FMA: {a.fma_count}",
            f" Packed (v_pk_*): {a.packed_ops_count}",
            f" Full stalls (waitcnt 0): {a.waitcnt_full_stalls}",
            f" Barriers: {a.barriers}",
        ])

    elif result.isa_analysis:
        # .s/.gcn/.asm file analysis (local parsing) — richer report:
        # also includes instruction mix, occupancy, and parser warnings.
        a = result.isa_analysis
        lines.extend([
            f"Kernel: {a.kernel_name}",
            f"Architecture: {a.architecture}",
            "Source: ISA Assembly (.s)",
            "",
            "=== Registers ===",
            f" VGPRs: {a.vgpr_count}",
            f" SGPRs: {a.sgpr_count}",
            f" AGPRs: {a.agpr_count}",
        ])

        # Local parser exposes a combined spill_count plus the per-kind split.
        if a.spill_count > 0:
            lines.extend([
                "",
                "!!! SPILLS DETECTED !!!",
                f" Total spills: {a.spill_count}",
                f" VGPR spills: {a.vgpr_spill_count}",
                f" SGPR spills: {a.sgpr_spill_count}",
            ])
        else:
            lines.append(" Spills: None (good)")

        lines.extend([
            "",
            "=== Memory ===",
            f" LDS: {a.lds_size} bytes",
            f" Scratch: {a.scratch_size} bytes",
            f" Global loads: {a.global_load_count}",
            f" Global stores: {a.global_store_count}",
            f" LDS ops: {a.lds_ops_count}",
            "",
            "=== Instructions ===",
            f" MFMA: {a.mfma_count} ({a.mfma_density_pct:.1f}% density)",
            f" FMA: {a.fma_count}",
            f" Packed (v_pk_*): {a.packed_ops_count}",
            f" Barriers: {a.barrier_count}",
            f" Full stalls: {a.full_stall_count}",
            "",
            "=== Instruction Mix ===",
            f" VALU: {a.instruction_mix.valu_count}",
            f" SALU: {a.instruction_mix.salu_count}",
            f" VMEM: {a.instruction_mix.vmem_count}",
            f" SMEM: {a.instruction_mix.smem_count}",
            f" LDS: {a.instruction_mix.lds_count}",
            f" MFMA: {a.instruction_mix.mfma_count}",
            f" Sync: {a.instruction_mix.sync_count}",
            f" Total: {a.instruction_mix.total_count}",
            "",
            "=== Occupancy ===",
            f" Max waves (VGPR): {a.max_waves_vgpr}",
            f" Max waves (SGPR): {a.max_waves_sgpr}",
            f" Max waves (LDS): {a.max_waves_lds}",
            f" Theoretical: {a.theoretical_occupancy} waves/CU",
        ])

        if a.warnings:
            lines.extend([
                "",
                "=== Warnings ===",
            ])
            for warning in a.warnings:
                lines.append(f" {warning}")

    elif result.ttgir_analysis:
        # Triton GPU IR: op counts plus optional tiling / pipelining hints.
        a = result.ttgir_analysis
        lines.extend([
            "TTGIR Analysis",
            "",
            "=== Operations ===",
            f" tt.dot: {a.dot_count}",
            f" tt.load: {a.load_count}",
            f" tt.store: {a.store_count}",
            f" tt.reduce: {a.reduce_count}",
            f" Barriers: {a.barrier_count}",
        ])

        if a.tile_info:
            lines.extend([
                "",
                "=== Tiling ===",
                f" BLOCK_M: {a.tile_info.block_m}",
                f" BLOCK_N: {a.tile_info.block_n}",
                f" BLOCK_K: {a.tile_info.block_k}",
                f" num_warps: {a.tile_info.num_warps}",
                f" num_stages: {a.tile_info.num_stages}",
            ])

        if a.has_software_pipelining:
            lines.append(" Software pipelining: enabled")

        # Truthiness check: an intensity of exactly 0.0 is not printed.
        if a.estimated_compute_intensity:
            lines.append(f" Compute intensity: {a.estimated_compute_intensity:.1f} FLOPs/byte")

    elif result.llvm_ir_analysis:
        # LLVM-IR: coarse structural stats only.
        a = result.llvm_ir_analysis
        lines.extend([
            "LLVM-IR Analysis",
            "",
            f" Functions: {a.function_count}",
            f" Total instructions: {a.total_instructions}",
            f" Functions with loops: {a.functions_with_loops}",
            f" Has vector ops: {a.has_vector_ops}",
        ])

        if a.kernel_functions:
            lines.append(f" Kernel functions: {', '.join(a.kernel_functions)}")

    return "\n".join(lines)
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def _result_to_csv(result) -> str:
|
|
398
|
+
"""Format single result as CSV."""
|
|
399
|
+
header = "kernel_name,architecture,source_type,vgpr_count,sgpr_count,vgpr_spills,sgpr_spills,mfma_count,lds_bytes,global_loads,global_stores"
|
|
400
|
+
|
|
401
|
+
if result.code_object_analysis:
|
|
402
|
+
a = result.code_object_analysis
|
|
403
|
+
row = f"{a.kernel_name},{a.architecture},code_object,{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},{a.mfma_count},{a.lds_bytes},{a.global_loads},{a.global_stores}"
|
|
404
|
+
return f"{header}\n{row}"
|
|
405
|
+
|
|
406
|
+
if result.isa_analysis:
|
|
407
|
+
a = result.isa_analysis
|
|
408
|
+
row = f"{a.kernel_name},{a.architecture},isa_assembly,{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},{a.mfma_count},{a.lds_size},{a.global_load_count},{a.global_store_count}"
|
|
409
|
+
return f"{header}\n{row}"
|
|
410
|
+
|
|
411
|
+
return "# Unsupported format for CSV"
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def _batch_to_text(batch_result) -> str:
    """Format batch results as text.

    Emits a three-line count header, an optional aggregate summary, then
    one status line per analyzed file.

    Args:
        batch_result: Batch result with total_files/successful/failed
            counts, an optional summary dict, and per-file results.
            NOTE(review): assumed to be wafer_core's BatchAnalysisResult —
            confirm against wafer_core.lib.kernel_scope.api.

    Returns:
        Newline-joined report text.
    """
    lines = [
        f"Analyzed {batch_result.total_files} files",
        f" Successful: {batch_result.successful}",
        f" Failed: {batch_result.failed}",
        "",
    ]

    # Aggregate stats are optional; .get() defaults keep a partial
    # summary dict from raising KeyError.
    if batch_result.summary:
        lines.extend([
            "=== Summary ===",
            f" Avg VGPRs: {batch_result.summary.get('total_vgpr_avg', 0):.1f}",
            f" Avg SGPRs: {batch_result.summary.get('total_sgpr_avg', 0):.1f}",
            f" Total spills: {batch_result.summary.get('total_spills', 0)}",
            f" Files with spills: {batch_result.summary.get('files_with_spills', 0)}",
            f" Total MFMA: {batch_result.summary.get('total_mfma', 0)}",
            f" Avg MFMA density: {batch_result.summary.get('avg_mfma_density', 0):.1f}%",
            "",
        ])

    # Show individual results: warning marker when spills are present,
    # check mark otherwise, cross for failed analyses. A successful
    # result with neither payload type is silently skipped.
    for result in batch_result.results:
        if result.success and result.code_object_analysis:
            a = result.code_object_analysis
            # Code-object results track VGPR and SGPR spills separately.
            spills = a.vgpr_spill_count + a.sgpr_spill_count
            status = "⚠️" if spills > 0 else "✓"
            lines.append(
                f" {status} {result.file_path}: "
                f"VGPRs={a.vgpr_count}, spills={spills}, MFMA={a.mfma_count}"
            )
        elif result.success and result.isa_analysis:
            a = result.isa_analysis
            status = "⚠️" if a.spill_count > 0 else "✓"
            lines.append(
                f" {status} {result.file_path}: "
                f"VGPRs={a.vgpr_count}, spills={a.spill_count}, MFMA={a.mfma_count}"
            )
        elif not result.success:
            lines.append(f" ✗ {result.file_path}: {result.error}")

    return "\n".join(lines)
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def _batch_to_csv(batch_result) -> str:
|
|
459
|
+
"""Format batch results as CSV."""
|
|
460
|
+
lines = ["file_path,kernel_name,architecture,source_type,vgpr_count,sgpr_count,vgpr_spills,sgpr_spills,mfma_count,lds_bytes"]
|
|
461
|
+
|
|
462
|
+
for result in batch_result.results:
|
|
463
|
+
if result.success and result.code_object_analysis:
|
|
464
|
+
a = result.code_object_analysis
|
|
465
|
+
lines.append(
|
|
466
|
+
f"{result.file_path},{a.kernel_name},{a.architecture},code_object,"
|
|
467
|
+
f"{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},"
|
|
468
|
+
f"{a.mfma_count},{a.lds_bytes}"
|
|
469
|
+
)
|
|
470
|
+
elif result.success and result.isa_analysis:
|
|
471
|
+
a = result.isa_analysis
|
|
472
|
+
lines.append(
|
|
473
|
+
f"{result.file_path},{a.kernel_name},{a.architecture},isa_assembly,"
|
|
474
|
+
f"{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},"
|
|
475
|
+
f"{a.mfma_count},{a.lds_size}"
|
|
476
|
+
)
|
|
477
|
+
|
|
478
|
+
return "\n".join(lines)
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
def _apply_filter(batch_result, filter_expr: str):
    """Apply filter expression to batch results.

    Grammar is "<metric> <op> <int>", e.g. "spills > 0" or
    "vgpr_count > 128"; invalid expressions warn and return the batch
    unchanged.
    """
    import re

    parsed = re.match(r"(\w+)\s*(>|<|>=|<=|==|!=)\s*(\d+)", filter_expr)
    if parsed is None:
        print(f"Warning: Invalid filter expression: {filter_expr}", file=sys.stderr)
        return batch_result

    metric, op_symbol, raw_value = parsed.groups()
    value = int(raw_value)

    # Expand shorthand metric names to the real attribute names.
    alias_to_metric = {
        "spills": "spill_count",
        "vgpr": "vgpr_count",
        "sgpr": "sgpr_count",
        "mfma": "mfma_count",
        "occupancy": "theoretical_occupancy",
    }
    metric = alias_to_metric.get(metric, metric)

    # Dispatch table instead of an if/elif chain; the regex guarantees
    # op_symbol is one of these keys.
    comparators = {
        ">": lambda lhs, rhs: lhs > rhs,
        "<": lambda lhs, rhs: lhs < rhs,
        ">=": lambda lhs, rhs: lhs >= rhs,
        "<=": lambda lhs, rhs: lhs <= rhs,
        "==": lambda lhs, rhs: lhs == rhs,
        "!=": lambda lhs, rhs: lhs != rhs,
    }
    compare = comparators[op_symbol]

    def _metric_value(res):
        # Pull the metric from whichever analysis payload is present.
        if res.isa_analysis:
            return getattr(res.isa_analysis, metric, None)
        if res.code_object_analysis:
            # Translate ISA metric names to code-object equivalents
            # (vgpr_spill_count stands in as a proxy for spill_count).
            co_name = {
                "spill_count": "vgpr_spill_count",
                "lds_size": "lds_bytes",
            }.get(metric, metric)
            return getattr(res.code_object_analysis, co_name, None)
        return None

    kept = []
    for res in batch_result.results:
        if not res.success:
            continue
        observed = _metric_value(res)
        if observed is not None and compare(observed, value):
            kept.append(res)

    from wafer_core.lib.kernel_scope.api import BatchAnalysisResult

    return BatchAnalysisResult(
        total_files=len(kept),
        successful=sum(1 for r in kept if r.success),
        failed=sum(1 for r in kept if not r.success),
        results=tuple(kept),
        summary=batch_result.summary,
    )
|