wafer-cli 0.2.36__py3-none-any.whl → 0.2.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/cli.py +2 -0
- wafer/ncu_analyze.py +500 -178
- wafer/targets.py +13 -7
- {wafer_cli-0.2.36.dist-info → wafer_cli-0.2.38.dist-info}/METADATA +1 -1
- {wafer_cli-0.2.36.dist-info → wafer_cli-0.2.38.dist-info}/RECORD +8 -8
- {wafer_cli-0.2.36.dist-info → wafer_cli-0.2.38.dist-info}/WHEEL +0 -0
- {wafer_cli-0.2.36.dist-info → wafer_cli-0.2.38.dist-info}/entry_points.txt +0 -0
- {wafer_cli-0.2.36.dist-info → wafer_cli-0.2.38.dist-info}/top_level.txt +0 -0
wafer/cli.py
CHANGED
@@ -1497,6 +1497,7 @@ def _make_agent_alias(name: str, doc: str) -> None:
         template_args: list[str] | None = typer.Option(None, "--args"),
         corpus: str | None = typer.Option(None, "--corpus"),
         no_sandbox: bool = typer.Option(False, "--no-sandbox"),
+        no_proxy: bool = typer.Option(False, "--no-proxy"),
     ) -> None:
         agent(
             prompt=prompt,
@@ -1517,6 +1518,7 @@ def _make_agent_alias(name: str, doc: str) -> None:
             template_args=template_args,
             corpus=corpus,
             no_sandbox=no_sandbox,
+            no_proxy=no_proxy,
         )

     alias_cmd.__doc__ = doc
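The change threads a new `--no-proxy` flag through the dynamically generated agent alias commands. For context, a minimal sketch of that dynamic-alias pattern, assuming a simplified `agent` entry point (only the two `typer.Option` lines are taken from the diff; the surrounding scaffolding here is hypothetical):

```python
import typer

app = typer.Typer()

def agent(prompt: str, no_sandbox: bool = False, no_proxy: bool = False) -> None:
    """Stand-in for the real `agent` entry point."""

def _make_agent_alias(name: str, doc: str) -> None:
    @app.command(name=name)
    def alias_cmd(
        prompt: str,
        no_sandbox: bool = typer.Option(False, "--no-sandbox"),
        no_proxy: bool = typer.Option(False, "--no-proxy"),  # new in 0.2.38
    ) -> None:
        # Each alias just forwards its options to the shared agent() call.
        agent(prompt=prompt, no_sandbox=no_sandbox, no_proxy=no_proxy)

    alias_cmd.__doc__ = doc
```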
wafer/ncu_analyze.py
CHANGED
@@ -41,6 +41,32 @@ NCU_PATHS = {
 }


+# GPU SM counts for common NVIDIA GPUs (used for underfill detection)
+GPU_SM_COUNTS = {
+    "B200": 148,
+    "H100": 132,
+    "H200": 132,
+    "A100": 108,
+    "A10": 72,
+    "L4": 58,
+    "L40": 142,
+    "V100": 80,
+    "RTX 4090": 128,
+    "RTX 3090": 82,
+}
+
+
+def _get_sm_count_for_gpu(gpu_name: str) -> int:
+    """Get SM count for a GPU name. Returns 148 (B200) as default."""
+    if not gpu_name:
+        return 148
+    gpu_upper = gpu_name.upper()
+    for gpu_key, sm_count in GPU_SM_COUNTS.items():
+        if gpu_key.upper() in gpu_upper:
+            return sm_count
+    return 148  # Default to B200
+
+
 def _get_platform() -> str:
     """Get normalized platform name."""
     system = platform.system().lower()
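A quick sanity check of the new lookup (illustrative values only): the helper does case-insensitive substring matching against the device strings NCU reports, and dict insertion order means "A100" is tried before "A10".

```python
# Assumes the GPU_SM_COUNTS table and helper added above.
assert _get_sm_count_for_gpu("NVIDIA H100 80GB HBM3") == 132
assert _get_sm_count_for_gpu("NVIDIA A100-SXM4-80GB") == 108  # "A100" matches before "A10"
assert _get_sm_count_for_gpu("") == 148  # empty/unknown falls back to the B200 count
```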
@@ -85,197 +111,471 @@ def _get_install_command() -> str:
     return "Download from https://developer.nvidia.com/nsight-compute"


+def _parse_gpu_from_session(session_output: str) -> str:
+    """Parse GPU name from NCU session output."""
+    assert isinstance(session_output, str)
+
+    for line in session_output.split("\n"):
+        if "display_name" in line:
+            parts = line.split()
+            if len(parts) >= 2:
+                return " ".join(parts[1:])
+    return "Unknown"
+
+
+def _create_kernel_entry(kernel_name: str) -> dict:
+    """Create a new kernel metrics dict with default values."""
+    assert kernel_name, "kernel_name must not be empty"
+
+    return {
+        "name": kernel_name,
+        "duration_us": 0,
+        "duration_ms": 0,
+        "memory_throughput_pct": 0,
+        "compute_throughput_pct": 0,
+        "achieved_occupancy_pct": 0,
+        "theoretical_occupancy_pct": 0,
+        "registers_per_thread": 0,
+        "block_size": 0,
+        "grid_size": 0,
+        "waves_per_sm": 0,
+        "estimated_speedup_pct": 0,
+        "recommendations": [],
+    }
+
+
+def _parse_metric_line(kernel: dict, metric_line: str, parts: list[str], current_section: str | None) -> None:
+    """Parse a metric line and update the kernel dict in place."""
+    assert kernel is not None
+    assert parts, "parts must not be empty"
+
+    # Duration (in us)
+    if metric_line.startswith("Duration") and "us" in metric_line:
+        try:
+            value = float(parts[-1].replace(",", ""))
+            kernel["duration_us"] = value
+            kernel["duration_ms"] = value / 1000
+        except (ValueError, IndexError):
+            pass
+    # Memory Throughput (%)
+    elif "Memory Throughput" in metric_line and "%" in metric_line:
+        try:
+            kernel["memory_throughput_pct"] = float(parts[-1].replace(",", ""))
+        except (ValueError, IndexError):
+            pass
+    # Compute (SM) Throughput (%)
+    elif "Compute (SM) Throughput" in metric_line or "Compute Throughput" in metric_line:
+        try:
+            kernel["compute_throughput_pct"] = float(parts[-1].replace(",", ""))
+        except (ValueError, IndexError):
+            pass
+    # Achieved Occupancy (%)
+    elif "Achieved Occupancy" in metric_line and "%" in metric_line:
+        try:
+            kernel["achieved_occupancy_pct"] = float(parts[-1].replace(",", ""))
+        except (ValueError, IndexError):
+            pass
+    # Registers Per Thread
+    elif "Registers Per Thread" in metric_line:
+        try:
+            kernel["registers_per_thread"] = int(float(parts[-1].replace(",", "")))
+        except (ValueError, IndexError):
+            pass
+    # Block Size (only from Launch Statistics section)
+    elif metric_line.startswith("Block Size") and current_section == "Launch Statistics":
+        try:
+            kernel["block_size"] = int(float(parts[-1].replace(",", "")))
+        except (ValueError, IndexError):
+            pass
+    # Grid Size (only from Launch Statistics section)
+    elif metric_line.startswith("Grid Size") and current_section == "Launch Statistics":
+        try:
+            kernel["grid_size"] = int(float(parts[-1].replace(",", "")))
+        except (ValueError, IndexError):
+            pass
+    # Waves Per SM (key metric for underfill detection)
+    elif "Waves Per SM" in metric_line:
+        try:
+            kernel["waves_per_sm"] = float(parts[-1].replace(",", ""))
+        except (ValueError, IndexError):
+            pass
+    # Theoretical Occupancy (%)
+    elif "Theoretical Occupancy" in metric_line and "%" in metric_line:
+        try:
+            kernel["theoretical_occupancy_pct"] = float(parts[-1].replace(",", ""))
+        except (ValueError, IndexError):
+            pass
+
+
+def _extract_speedup(kernel: dict, stripped: str) -> None:
+    """Extract estimated speedup from recommendation line."""
+    import re
+    assert kernel is not None
+
+    for pattern in [r"Est\. Speedup:\s*([\d.]+)%", r"Est\. Local Speedup:\s*([\d.]+)%"]:
+        match = re.search(pattern, stripped)
+        if match:
+            try:
+                speedup = float(match.group(1))
+                if speedup > kernel["estimated_speedup_pct"]:
+                    kernel["estimated_speedup_pct"] = speedup
+            except ValueError:
+                pass
+
+
 def _parse_ncu_output(session_output: str, details_output: str) -> dict:
     """Parse NCU session and details output into structured data."""
     import re
+
+    assert isinstance(session_output, str)
+    assert isinstance(details_output, str)

     summary: dict = {
-        "gpu": "Unknown",
+        "gpu": _parse_gpu_from_session(session_output) if session_output else "Unknown",
         "kernels": [],
         "recommendations": [],
     }

-            metric_line = stripped
-
-            # Duration (in us)
-            if metric_line.startswith("Duration") and "us" in metric_line:
-                try:
-                    value = float(parts[-1].replace(",", ""))
-                    current_kernel["duration_us"] = value
-                    current_kernel["duration_ms"] = value / 1000
-                except (ValueError, IndexError):
-                    pass
-
-            # Memory Throughput (%)
-            elif "Memory Throughput" in metric_line and "%" in metric_line:
-                try:
-                    value = float(parts[-1].replace(",", ""))
-                    current_kernel["memory_throughput_pct"] = value
-                except (ValueError, IndexError):
-                    pass
-
-            # Compute (SM) Throughput (%)
-            elif (
-                "Compute (SM) Throughput" in metric_line
-                or "Compute Throughput" in metric_line
-            ):
-                try:
-                    value = float(parts[-1].replace(",", ""))
-                    current_kernel["compute_throughput_pct"] = value
-                except (ValueError, IndexError):
-                    pass
-
-            # Achieved Occupancy (%)
-            elif "Achieved Occupancy" in metric_line and "%" in metric_line:
-                try:
-                    value = float(parts[-1].replace(",", ""))
-                    current_kernel["achieved_occupancy_pct"] = value
-                except (ValueError, IndexError):
-                    pass
-
-            # Registers Per Thread
-            elif "Registers Per Thread" in metric_line:
-                try:
-                    value = int(float(parts[-1].replace(",", "")))
-                    current_kernel["registers_per_thread"] = value
-                except (ValueError, IndexError):
-                    pass
-
-            # Block Size
-            elif (
-                metric_line.startswith("Block Size")
-                and current_section == "Launch Statistics"
-            ):
-                try:
-                    value = int(float(parts[-1].replace(",", "")))
-                    current_kernel["block_size"] = value
-                except (ValueError, IndexError):
-                    pass
-
-            # Grid Size
-            elif (
-                metric_line.startswith("Grid Size")
-                and current_section == "Launch Statistics"
-            ):
-                try:
-                    value = int(float(parts[-1].replace(",", "")))
-                    current_kernel["grid_size"] = value
-                except (ValueError, IndexError):
-                    pass
-
-        # Parse recommendations (OPT and INF markers)
-        if stripped.startswith("OPT") or stripped.startswith("INF"):
-            in_recommendation = True
-            recommendation_lines = [stripped]
-
-            # Extract estimated speedup
-            if current_kernel and "Est. Speedup:" in stripped:
-                speedup_match = re.search(r"Est\. Speedup:\s*([\d.]+)%", stripped)
-                if speedup_match:
-                    try:
-                        speedup = float(speedup_match.group(1))
-                        if speedup > current_kernel["estimated_speedup_pct"]:
-                            current_kernel["estimated_speedup_pct"] = speedup
-                    except ValueError:
-                        pass
-
-            if current_kernel and "Est. Local Speedup:" in stripped:
-                speedup_match = re.search(r"Est\. Local Speedup:\s*([\d.]+)%", stripped)
-                if speedup_match:
-                    try:
-                        speedup = float(speedup_match.group(1))
-                        if speedup > current_kernel["estimated_speedup_pct"]:
-                            current_kernel["estimated_speedup_pct"] = speedup
-                    except ValueError:
-                        pass
-        elif in_recommendation:
-            if line.startswith(" ") and stripped:
-                recommendation_lines.append(stripped)
-            elif (
-                stripped.startswith("Section:")
-                or stripped.startswith("---")
-                or (stripped and not line.startswith(" "))
-            ):
-                if recommendation_lines:
-                    full_rec = " ".join(recommendation_lines)
-                    if full_rec not in summary["recommendations"]:
-                        summary["recommendations"].append(full_rec)
-                    if current_kernel and full_rec not in current_kernel["recommendations"]:
-                        current_kernel["recommendations"].append(full_rec)
-                in_recommendation = False
-                recommendation_lines = []
-
-        i += 1
-
-    # Capture last recommendation if any
-    if recommendation_lines:
-        full_rec = " ".join(recommendation_lines)
-        if full_rec not in summary["recommendations"]:
-            summary["recommendations"].append(full_rec)
-        if current_kernel and full_rec not in current_kernel["recommendations"]:
-            current_kernel["recommendations"].append(full_rec)
+    if not details_output:
+        return summary
+
+    lines = details_output.split("\n")
+    current_kernel: dict | None = None
+    current_section: str | None = None
+    in_recommendation = False
+    recommendation_lines: list[str] = []
+
+    for line in lines:
+        stripped = line.strip()
+
+        # Detect kernel header
+        if line.startswith("  ") and not line.startswith("    ") and "Context" in line and "Device" in line:
+            match = re.match(r"^  (.+?)\s+\(\d+,\s*\d+,\s*\d+\)x\(\d+,\s*\d+,\s*\d+\)", line)
+            if match:
+                current_kernel = _create_kernel_entry(match.group(1).strip())
+                summary["kernels"].append(current_kernel)
+
+        # Detect section headers
+        if stripped.startswith("Section:"):
+            current_section = stripped.replace("Section:", "").strip()
+
+        # Parse metrics from table rows
+        if current_kernel and " " in line:
+            parts = line.split()
+            if len(parts) >= 2:
+                _parse_metric_line(current_kernel, stripped, parts, current_section)
+
+        # Parse recommendations (OPT and INF markers)
+        if stripped.startswith("OPT") or stripped.startswith("INF"):
+            in_recommendation = True
+            recommendation_lines = [stripped]
+            if current_kernel:
+                _extract_speedup(current_kernel, stripped)
+        elif in_recommendation:
+            if line.startswith(" ") and stripped:
+                recommendation_lines.append(stripped)
+            elif stripped.startswith("Section:") or stripped.startswith("---") or (stripped and not line.startswith(" ")):
+                if recommendation_lines:
+                    full_rec = " ".join(recommendation_lines)
+                    if full_rec not in summary["recommendations"]:
+                        summary["recommendations"].append(full_rec)
+                    if current_kernel and full_rec not in current_kernel["recommendations"]:
+                        current_kernel["recommendations"].append(full_rec)
+                in_recommendation = False
+                recommendation_lines = []
+
+    # Capture last recommendation if any
+    if recommendation_lines:
+        full_rec = " ".join(recommendation_lines)
+        if full_rec not in summary["recommendations"]:
+            summary["recommendations"].append(full_rec)
+        if current_kernel and full_rec not in current_kernel["recommendations"]:
+            current_kernel["recommendations"].append(full_rec)

     return summary


+def _classify_underfill(
+    waves_per_sm: float, grid_size: int, num_sms: int
+) -> tuple[str | None, str | None]:
+    """Classify underfill type and severity based on metrics.
+
+    Returns:
+        (underfill_type, severity) where:
+        - underfill_type: "launch" | "resource" | None
+        - severity: "severe" | "moderate" | None
+    """
+    assert waves_per_sm >= 0, f"waves_per_sm must be non-negative, got {waves_per_sm}"
+    assert grid_size >= 0, f"grid_size must be non-negative, got {grid_size}"
+    assert num_sms > 0, f"num_sms must be positive, got {num_sms}"
+
+    is_grid_small = grid_size > 0 and grid_size < num_sms
+
+    if waves_per_sm > 0 and waves_per_sm < 1.0:
+        return ("launch" if is_grid_small else "resource", "severe")
+    if waves_per_sm > 0 and waves_per_sm < 2.0:
+        return ("launch" if is_grid_small else "resource", "moderate")
+    if is_grid_small:
+        return ("launch", "severe")
+    return (None, None)
+
+
+def _classify_occupancy(
+    achieved_occ: float, theoretical_occ: float
+) -> tuple[bool, str | None]:
+    """Classify occupancy issue.
+
+    Returns:
+        (is_low_occupancy, analysis_type) where:
+        - is_low_occupancy: True if achieved < 50%
+        - analysis_type: "runtime_issue" | "resource_limited" | None
+    """
+    assert achieved_occ >= 0, f"achieved_occ must be non-negative, got {achieved_occ}"
+    assert theoretical_occ >= 0, f"theoretical_occ must be non-negative, got {theoretical_occ}"
+
+    if achieved_occ <= 0 or achieved_occ >= 50:
+        return (False, None)
+
+    if theoretical_occ <= 0:
+        return (True, None)
+
+    occ_gap = theoretical_occ - achieved_occ
+    if theoretical_occ >= 50 and occ_gap > 20:
+        return (True, "runtime_issue")
+    if theoretical_occ < 50:
+        return (True, "resource_limited")
+    return (True, None)
+
+
+def _classify_throughput(
+    memory_tp: float, compute_tp: float, achieved_occ: float
+) -> tuple[bool, bool, bool]:
+    """Classify throughput observations.
+
+    Returns:
+        (has_high_memory, has_high_compute, has_both_low)
+    """
+    assert memory_tp >= 0, f"memory_tp must be non-negative, got {memory_tp}"
+    assert compute_tp >= 0, f"compute_tp must be non-negative, got {compute_tp}"
+    assert achieved_occ >= 0, f"achieved_occ must be non-negative, got {achieved_occ}"
+
+    has_high_memory = memory_tp > 60
+    has_high_compute = compute_tp > 60
+    has_both_low = memory_tp < 30 and compute_tp < 30 and achieved_occ >= 50
+    return (has_high_memory, has_high_compute, has_both_low)
+
+
+def _format_underfill_diagnosis(
+    underfill_type: str,
+    underfill_severity: str,
+    waves_per_sm: float,
+    grid_size: int,
+    num_sms: int,
+    achieved_occ: float,
+    theoretical_occ: float,
+    compute_tp: float,
+    memory_tp: float,
+    estimated_speedup: float,
+) -> list[str]:
+    """Format diagnosis lines for underfill issues. Returns early from _generate_diagnosis."""
+    assert underfill_type in ("launch", "resource")
+    assert underfill_severity in ("severe", "moderate")
+
+    severity_label = "UNDERFILL" if underfill_severity == "severe" else "LIMITED CONCURRENCY"
+    blocks_per_sm = grid_size / num_sms if grid_size > 0 else 0
+
+    lines = [f"**Primary Issue: {severity_label}**"]
+
+    if waves_per_sm > 0:
+        lines.append(f"- Waves per SM: {waves_per_sm:.2f} (often benefits from >2 to hide latency)")
+    if grid_size > 0:
+        lines.append(f"- Grid: {grid_size} blocks for {num_sms} SMs ({blocks_per_sm:.2f} blocks/SM)")
+    lines.append("- ⚠️ Compute/memory throughput % not reliable for global bottleneck; underfill dominates")
+    lines.append("")
+
+    if underfill_type == "launch":
+        lines.extend([
+            "**Type: LAUNCH-LIMITED** (grid smaller than SM count)",
+            "",
+            "**What WON'T help:**",
+            "- Reducing registers/shared memory (can't create more blocks than launched)",
+            "",
+            "**What MAY help:**",
+            "- Increase batch size or problem dimensions",
+            "- Split work into more blocks (e.g., tile over batch/head/rows; sequence tiling only if algorithm permits)",
+            "- Use persistent CTAs / work queue: launch ~k×SM blocks that pull tasks",
+            "- If inherently sequential, focus on per-block latency optimization",
+        ])
+    else:
+        lines.extend([
+            "**Type: RESOURCE-LIMITED** (grid is adequate, but few blocks fit per SM)",
+            "",
+            "**What MAY help:**",
+            "- Reduce registers per thread (__launch_bounds__, fewer local vars)",
+            "- Reduce shared memory per block (smaller tiles, multi-stage)",
+            "- Reduce block size to fit more blocks per SM",
+            "- Check 'Block Limit' in NCU Occupancy section for the limiter",
+            "",
+            "**Note:** If kernel is very short, waves/SM may be less indicative.",
+            "Confirm with Occupancy 'Block Limit' and duration metrics.",
+        ])
+
+    lines.extend(["", "**Raw metrics (interpret with caution due to underfill):**"])
+    lines.append(f"- Achieved Occupancy: {achieved_occ:.1f}%")
+    if theoretical_occ > 0:
+        lines.append(f"- Theoretical Occupancy: {theoretical_occ:.1f}%")
+    lines.append(f"- Compute Throughput: {compute_tp:.1f}%")
+    lines.append(f"- Memory Throughput: {memory_tp:.1f}%")
+    if estimated_speedup > 0:
+        lines.append(f"- NCU Est. Speedup potential: {estimated_speedup:.1f}%")
+    lines.append("")
+    return lines
+
+
+def _format_occupancy_diagnosis(
+    achieved_occ: float,
+    theoretical_occ: float,
+    occupancy_analysis: str | None,
+) -> list[str]:
+    """Format diagnosis lines for low occupancy issues."""
+    assert achieved_occ >= 0
+
+    lines = ["**Observation: Low Achieved Occupancy**", f"- Achieved: {achieved_occ:.1f}%"]
+
+    if theoretical_occ > 0:
+        lines.append(f"- Theoretical: {theoretical_occ:.1f}%")
+
+    if occupancy_analysis == "runtime_issue":
+        lines.extend([
+            "",
+            "**Analysis: Large gap between theoretical and achieved**",
+            "- Theoretical is high, so this is NOT a resource limit (regs/shmem)",
+            "- Likely causes: load imbalance, barriers, short kernel duration, tail effects",
+            "- Check if work is evenly distributed across blocks",
+        ])
+    elif occupancy_analysis == "resource_limited":
+        lines.extend([
+            "",
+            "**Analysis: Theoretical occupancy is also low**",
+            "- This IS a resource limit (registers, shared memory, or block size)",
+            "- Check 'Block Limit' in NCU Occupancy section for the specific limiter",
+        ])
+
+    lines.extend([
+        "",
+        "**General suggestions:**",
+        "- If register-limited: try __launch_bounds__, reduce local arrays",
+        "- If shared-mem-limited: reduce tile sizes or use multi-stage",
+        "- If runtime-limited: check barriers, load balance, kernel duration",
+        "",
+    ])
+    return lines
+
+
+def _format_throughput_diagnosis(
+    has_high_memory: bool,
+    has_high_compute: bool,
+    has_both_low: bool,
+    memory_tp: float,
+    compute_tp: float,
+) -> list[str]:
+    """Format diagnosis lines for throughput observations."""
+    lines: list[str] = []
+
+    if has_high_memory or has_high_compute:
+        lines.append("**Throughput observations:**")
+        if has_high_memory:
+            lines.append(f"- Memory throughput relatively high ({memory_tp:.1f}%)")
+            lines.append("  - May benefit from: better caching, shared memory tiling, coalesced access")
+        if has_high_compute:
+            lines.append(f"- Compute throughput relatively high ({compute_tp:.1f}%)")
+            lines.append("  - May benefit from: reduced instruction count, better ILP")
+            lines.append("  - Check which pipeline is saturated (FP32/FP16/INT/SFU/TensorCore) if available")
+        lines.append("")
+    elif has_both_low:
+        lines.extend([
+            "**Observation: Both % of peak are low**",
+            "- Likely: latency-bound, sync-bound, dependency stalls, or non-peak pipelines",
+            "- This can happen with: integer-heavy, SFU-heavy, or control-flow-heavy kernels",
+            "- Check instruction mix / pipeline utilization metrics if available",
+            "- Check NCU stall reasons (smsp__warp_issue_stalled_*) for more detail",
+            "",
+        ])
+
+    return lines
+
+
+def _generate_diagnosis(kernel: dict, num_sms: int = 148) -> list[str]:
+    """Generate actionable diagnosis based on kernel metrics.
+
+    Uses a prioritized decision order:
+    1. Underfill check (waves_per_sm < 2 OR grid_size < num_sms) - overrides other diagnoses
+    2. Occupancy limiters (theoretical vs achieved gap analysis)
+    3. General observations (avoid strong "bound" labels without stall data)
+    """
+    assert isinstance(kernel, dict), "kernel must be a dict"
+    assert num_sms > 0, f"num_sms must be positive, got {num_sms}"
+
+    # Extract metrics (single assignments)
+    grid_size = kernel.get('grid_size', 0)
+    achieved_occ = kernel.get('achieved_occupancy_pct', 0)
+    theoretical_occ = kernel.get('theoretical_occupancy_pct', 0)
+    compute_tp = kernel.get('compute_throughput_pct', 0)
+    memory_tp = kernel.get('memory_throughput_pct', 0)
+    estimated_speedup = kernel.get('estimated_speedup_pct', 0)
+    waves_per_sm = kernel.get('waves_per_sm', 0)
+
+    # Skip if we don't have enough data
+    if grid_size == 0 and achieved_occ == 0 and waves_per_sm == 0:
+        return []
+
+    # Compute all classifications upfront (single assignments)
+    underfill_type, underfill_severity = _classify_underfill(waves_per_sm, grid_size, num_sms)
+    is_low_occupancy, occupancy_analysis = _classify_occupancy(achieved_occ, theoretical_occ)
+    has_high_memory, has_high_compute, has_both_low = _classify_throughput(memory_tp, compute_tp, achieved_occ)
+
+    # Derived flags (single assignments)
+    has_underfill = underfill_type is not None
+    has_throughput_obs = has_high_memory or has_high_compute or has_both_low
+
+    # Build output
+    lines = ["#### 🔍 Diagnosis", ""]
+
+    # PRIORITY 1: Underfill (overrides other diagnoses)
+    if has_underfill:
+        lines.extend(_format_underfill_diagnosis(
+            underfill_type, underfill_severity, waves_per_sm, grid_size, num_sms,
+            achieved_occ, theoretical_occ, compute_tp, memory_tp, estimated_speedup,
+        ))
+        return lines
+
+    # PRIORITY 2: Low occupancy (when NOT caused by underfill)
+    if is_low_occupancy:
+        lines.extend(_format_occupancy_diagnosis(achieved_occ, theoretical_occ, occupancy_analysis))
+
+    # PRIORITY 3: Throughput observations
+    lines.extend(_format_throughput_diagnosis(has_high_memory, has_high_compute, has_both_low, memory_tp, compute_tp))
+
+    # Show NCU's own recommendations if present
+    if estimated_speedup > 0:
+        lines.extend([f"**NCU estimated speedup potential: {estimated_speedup:.1f}%**",
+                      "- See NCU recommendations below for specific suggestions", ""])
+
+    # No major issues detected
+    if not (has_underfill or is_low_occupancy or has_throughput_obs):
+        lines.extend(["**Status: No obvious bottleneck detected**",
+                      f"- Occupancy: {achieved_occ:.1f}%, Compute: {compute_tp:.1f}%, Memory: {memory_tp:.1f}%",
+                      "- Consider profiling with --set full for stall breakdown",
+                      "- Or the kernel may already be well-optimized for its workload", ""])
+
+    return lines
+
+
 def _generate_text_output(filename: str, summary: dict) -> str:
     """Generate human-readable markdown text from summary."""
     timestamp = datetime.now().isoformat()
+    gpu_name = summary.get('gpu', 'Unknown')
+    num_sms = _get_sm_count_for_gpu(gpu_name)

     lines = [
         "# NCU Profiling Analysis",
@@ -283,7 +583,7 @@ def _generate_text_output(filename: str, summary: dict) -> str:
         f"Generated: {timestamp}",
         "",
         "## GPU Information",
-        f"- Device: {
+        f"- Device: {gpu_name}",
         "",
         "## Kernel Summary",
         "",
@@ -301,10 +601,15 @@ def _generate_text_output(filename: str, summary: dict) -> str:
             f"- Grid Size: {kernel.get('grid_size', 0)}",
             "",
         ])
+
+        # Add actionable diagnosis
+        diagnosis = _generate_diagnosis(kernel, num_sms=num_sms)
+        if diagnosis:
+            lines.extend(diagnosis)

     if summary.get("recommendations"):
         lines.extend([
-            "## Recommendations",
+            "## NCU Recommendations",
             "",
         ])
         for i, rec in enumerate(summary["recommendations"], 1):
@@ -534,6 +839,8 @@ def _analyze_remote_api(
 def _generate_ncu_api_text_output(filename: str, result: dict) -> str:
     """Generate human-readable text from NCU API result."""
     timestamp = datetime.now().isoformat()
+    gpu_name = result.get('gpu', 'Unknown')
+    num_sms = _get_sm_count_for_gpu(gpu_name)

     lines = [
         "# NCU Profiling Analysis",
@@ -542,7 +849,7 @@ def _generate_ncu_api_text_output(filename: str, result: dict) -> str:
         f"Report ID: {result.get('report_id', 'N/A')}",
         "",
         "## GPU Information",
-        f"- Device: {
+        f"- Device: {gpu_name}",
         "",
         "## Kernel Summary",
         "",
@@ -556,8 +863,23 @@ def _generate_ncu_api_text_output(filename: str, result: dict) -> str:
             f"- Achieved Occupancy: {kernel.get('achieved_occupancy_pct', kernel.get('occupancy', 0)):.1f}%",
             f"- Compute Throughput: {kernel.get('compute_throughput_pct', kernel.get('sm_throughput', 0)):.1f}%",
             f"- Memory Throughput: {kernel.get('memory_throughput_pct', kernel.get('mem_throughput', 0)):.1f}%",
+            f"- Grid Size: {kernel.get('grid_size', 0)}",
+            f"- Block Size: {kernel.get('block_size', 0)}",
             "",
         ])
+
+        # Add actionable diagnosis (normalize field names from API)
+        normalized_kernel = {
+            'grid_size': kernel.get('grid_size', 0),
+            'block_size': kernel.get('block_size', 0),
+            'achieved_occupancy_pct': kernel.get('achieved_occupancy_pct', kernel.get('occupancy', 0)),
+            'compute_throughput_pct': kernel.get('compute_throughput_pct', kernel.get('sm_throughput', 0)),
+            'memory_throughput_pct': kernel.get('memory_throughput_pct', kernel.get('mem_throughput', 0)),
+            'registers_per_thread': kernel.get('registers_per_thread', 0),
+        }
+        diagnosis = _generate_diagnosis(normalized_kernel, num_sms=num_sms)
+        if diagnosis:
+            lines.extend(diagnosis)

         # Add source correlation summary if present
         source_data = result.get("source_correlation", [])
wafer/targets.py
CHANGED
@@ -4,7 +4,7 @@ CRUD operations for GPU targets stored in ~/.wafer/targets/.
 """

 import tomllib
-from dataclasses import asdict
+from dataclasses import asdict, fields
 from pathlib import Path
 from typing import Any

@@ -18,6 +18,12 @@ from wafer_core.utils.kernel_utils.targets.config import (
     WorkspaceTarget,
 )

+
+def _filter_dataclass_fields(data: dict[str, Any], dataclass_type: type) -> dict[str, Any]:
+    """Filter dict to only include fields that exist in the dataclass."""
+    valid_fields = {f.name for f in fields(dataclass_type)}
+    return {k: v for k, v in data.items() if k in valid_fields}
+
 # Default paths
 WAFER_DIR = Path.home() / ".wafer"
 TARGETS_DIR = WAFER_DIR / "targets"
@@ -64,17 +70,17 @@ def _parse_target(data: dict[str, Any]) -> TargetConfig:
     data_copy["gpu_ids"] = tuple(data_copy["gpu_ids"])

     if target_type == "baremetal":
-        return BaremetalTarget(**data_copy)
+        return BaremetalTarget(**_filter_dataclass_fields(data_copy, BaremetalTarget))
     elif target_type == "vm":
-        return VMTarget(**data_copy)
+        return VMTarget(**_filter_dataclass_fields(data_copy, VMTarget))
     elif target_type == "modal":
-        return ModalTarget(**data_copy)
+        return ModalTarget(**_filter_dataclass_fields(data_copy, ModalTarget))
     elif target_type == "workspace":
-        return WorkspaceTarget(**data_copy)
+        return WorkspaceTarget(**_filter_dataclass_fields(data_copy, WorkspaceTarget))
     elif target_type == "runpod":
-        return RunPodTarget(**data_copy)
+        return RunPodTarget(**_filter_dataclass_fields(data_copy, RunPodTarget))
     elif target_type == "digitalocean":
-        return DigitalOceanTarget(**data_copy)
+        return DigitalOceanTarget(**_filter_dataclass_fields(data_copy, DigitalOceanTarget))
     else:
         raise ValueError(
             f"Unknown target type: {target_type}. Must be baremetal, vm, modal, workspace, runpod, or digitalocean"
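The filtering makes target parsing forward-compatible: a targets file written by a newer wafer version may carry keys an older dataclass does not define, and passing those straight to the constructor would raise. A minimal sketch with a stand-in dataclass (not the real BaremetalTarget):

```python
from dataclasses import dataclass

@dataclass
class _DemoTarget:  # stand-in; the real target types come from wafer_core
    host: str
    port: int = 22

data = {"host": "gpu-box", "port": 2222, "added_in_future_version": True}
target = _DemoTarget(**_filter_dataclass_fields(data, _DemoTarget))  # unknown key dropped
# _DemoTarget(**data) would raise TypeError: unexpected keyword argument
```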
{wafer_cli-0.2.36.dist-info → wafer_cli-0.2.38.dist-info}/RECORD
CHANGED
@@ -7,7 +7,7 @@ wafer/auth.py,sha256=dwss_se5P-FFc9IN38q4kh_dBrA6k-CguDBkivgcdj0,14003
 wafer/autotuner.py,sha256=41WYP41pTDvMijv2h42vm89bcHtDMJXObDlWmn6xpFU,44416
 wafer/baseline.py,sha256=OrGCAut_xtkH9Ogx4mMU5-94Q0oClIXqac94YRwqERY,21534
 wafer/billing.py,sha256=jbLB2lI4_9f2KD8uEFDi_ixLlowe5hasC0TIZJyIXRg,7163
-wafer/cli.py,sha256=
+wafer/cli.py,sha256=2MR2cwWN5Dn1rS8T_d-pm7f0jpLBW0K2ZxIVnJHc-VI,275779
 wafer/cli_instructions.py,sha256=bziUKDNDAXABVMvKPLEMXm-hFSD2TcFSh-FKRYa949k,4693
 wafer/config.py,sha256=h5Eo9_yfWqWGoPNdVQikI9GoZVUeysunSYiixf1mKcw,3411
 wafer/corpus.py,sha256=oQegXA43MuyRvYxOsWhmqeP5vMb5IKFHOvM-1RcahPA,22301
@@ -16,7 +16,7 @@ wafer/global_config.py,sha256=fhaR_RU3ufMksDmOohH1OLeQ0JT0SDW1hEip_zaP75k,11345
 wafer/gpu_run.py,sha256=TwqXy72T7f2I7e6n5WWod3xgxCPnDhU0BgLsB4CUoQY,9716
 wafer/inference.py,sha256=tZCO5i05FKY27ewis3CSBHFBeFbXY3xwj0DSjdoMY9s,4314
 wafer/kernel_scope.py,sha256=YtnxknAChkJoeU_vIdxiqWsAITGBeabp9OGIK-X32i0,20796
-wafer/ncu_analyze.py,sha256=
+wafer/ncu_analyze.py,sha256=8id2eJRuBabxINnUF0M6SQtS1YbAWBM3pzIN8xkxMCE,37139
 wafer/nsys_analyze.py,sha256=AhNcjPaapB0QCbqiHRXvyy-ccjevvVwEyxes84D28JU,36124
 wafer/nsys_profile.py,sha256=QFBl8pkr8r4uRNdNUO9gY-obj9slqpOgVYFZ_sXu6Nw,15478
 wafer/output.py,sha256=8jw5ifvIMK8ldyBMGW4NhrKvJPl66TV2Y2fJ5Tlhh1I,8293
@@ -27,7 +27,7 @@ wafer/rocprof_systems.py,sha256=4IWbMcbYk1x_8iS7P3FC_u5sgH6EXADCtR2lV9id80M,1862
 wafer/specs_cli.py,sha256=frMEKwMflxVNpFlAuxprmr33ZZ1Oeh2lB0KWZ4oZWzw,4360
 wafer/ssh_keys.py,sha256=9kSdhV_dg9T6pQu2JmNQptarkkwGtN9rLyRkI1bW4i4,8094
 wafer/target_lock.py,sha256=SDKhNzv2N7gsphGflcNni9FE5YYuAMuEthngAJEo4Gs,7809
-wafer/targets.py,sha256=
+wafer/targets.py,sha256=XeEZeOykNBnjJLnCqpoXAnzeqbp6MWZRIW9A26BKqdU,27469
 wafer/targets_cli.py,sha256=Oe3e02rSXeNrMbe_Qv9DNfQ8dEOKodtU7BbQQWxlNwA,16348
 wafer/targets_ops.py,sha256=jN1oIBx0mutxRNE9xpIc7SaBxPkVmOyus2eqn0kEKNI,21475
 wafer/trace_compare.py,sha256=IBVSGI8u5A10haDzL4eQ0R24fM1G_dd1F3-4iEkG1EQ,6349
@@ -43,8 +43,8 @@ wafer/templates/optimize_kernelbench.py,sha256=aoOA13zWEl89r6QW03xF9NKxQ7j4mWe9r
 wafer/templates/optimize_vllm.py,sha256=_D1rDP9wHA8CCvmoUrdLEW94MiaK4nAYJ-jbnpAvq7A,6154
 wafer/templates/trace_analyze.py,sha256=B7CiRlsokERzBjLL-k49kGjpU2zlJZqzTE05xbRS1WI,2878
 wafer/tests/test_eval_cli_parity.py,sha256=SGmaj2NGBZ7GdDF53bXsECvQbV21iHZw8YeL_MJOLk0,7206
-wafer_cli-0.2.
-wafer_cli-0.2.
-wafer_cli-0.2.
-wafer_cli-0.2.
-wafer_cli-0.2.
+wafer_cli-0.2.38.dist-info/METADATA,sha256=JwKh4o8KfbCYVzy5hvYIjSyn5c1MiOfV-H4_q9fdtHI,6461
+wafer_cli-0.2.38.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+wafer_cli-0.2.38.dist-info/entry_points.txt,sha256=WqB7hB__WhtPY8y1cO2sZiUz7fCq6Ik-usAigpeFvWE,41
+wafer_cli-0.2.38.dist-info/top_level.txt,sha256=2MK1IVMWfpLL8BZCQ3E9aG6L6L666gSA_teYlwan4fs,6
+wafer_cli-0.2.38.dist-info/RECORD,,
{wafer_cli-0.2.36.dist-info → wafer_cli-0.2.38.dist-info}/WHEEL
File without changes
{wafer_cli-0.2.36.dist-info → wafer_cli-0.2.38.dist-info}/entry_points.txt
File without changes
{wafer_cli-0.2.36.dist-info → wafer_cli-0.2.38.dist-info}/top_level.txt
File without changes