wafer-cli 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/GUIDE.md +118 -0
- wafer/__init__.py +3 -0
- wafer/analytics.py +306 -0
- wafer/api_client.py +195 -0
- wafer/auth.py +432 -0
- wafer/autotuner.py +1080 -0
- wafer/billing.py +233 -0
- wafer/cli.py +7289 -0
- wafer/config.py +105 -0
- wafer/corpus.py +366 -0
- wafer/evaluate.py +4593 -0
- wafer/global_config.py +350 -0
- wafer/gpu_run.py +307 -0
- wafer/inference.py +148 -0
- wafer/kernel_scope.py +552 -0
- wafer/ncu_analyze.py +651 -0
- wafer/nsys_analyze.py +1042 -0
- wafer/nsys_profile.py +510 -0
- wafer/output.py +248 -0
- wafer/problems.py +357 -0
- wafer/rocprof_compute.py +490 -0
- wafer/rocprof_sdk.py +274 -0
- wafer/rocprof_systems.py +520 -0
- wafer/skills/wafer-guide/SKILL.md +129 -0
- wafer/ssh_keys.py +261 -0
- wafer/target_lock.py +270 -0
- wafer/targets.py +842 -0
- wafer/targets_ops.py +717 -0
- wafer/templates/__init__.py +0 -0
- wafer/templates/ask_docs.py +61 -0
- wafer/templates/optimize_kernel.py +71 -0
- wafer/templates/optimize_kernelbench.py +137 -0
- wafer/templates/trace_analyze.py +74 -0
- wafer/tracelens.py +218 -0
- wafer/wevin_cli.py +577 -0
- wafer/workspaces.py +852 -0
- wafer_cli-0.2.14.dist-info/METADATA +16 -0
- wafer_cli-0.2.14.dist-info/RECORD +41 -0
- wafer_cli-0.2.14.dist-info/WHEEL +5 -0
- wafer_cli-0.2.14.dist-info/entry_points.txt +2 -0
- wafer_cli-0.2.14.dist-info/top_level.txt +1 -0
wafer/ncu_analyze.py
ADDED
|
@@ -0,0 +1,651 @@
|
|
|
1
|
+
"""NCU Analyze - Parse and analyze .ncu-rep profile files.
|
|
2
|
+
|
|
3
|
+
This module provides the implementation for the `wafer nvidia ncu analyze` command.
|
|
4
|
+
It reuses the parsing logic from services/ncu-tool/ncu_tool.py.
|
|
5
|
+
|
|
6
|
+
TODO(Wafer-326): Migrate this to wafer-core architecture.
|
|
7
|
+
The NCU parsing logic should be consolidated into wafer_core/tools/ncu_parser.py,
|
|
8
|
+
similar to how compiler_explorer_tool.py was migrated to wafer_core/tools/compiler.py.
|
|
9
|
+
This will:
|
|
10
|
+
1. Eliminate duplicate code between this file and extension's ncu_tool.py
|
|
11
|
+
2. Enable automatic telemetry via @with_telemetry decorator
|
|
12
|
+
3. Allow both CLI and extension to use the same implementation
|
|
13
|
+
See wafer_core/tools/compiler.py for the migration pattern.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
import platform
|
|
19
|
+
import shutil
|
|
20
|
+
import subprocess
|
|
21
|
+
from datetime import datetime
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
# Known NCU installation paths by platform
|
|
25
|
+
NCU_PATHS: dict[str, list[str]] = {
    # Keys are the normalized platform names returned by _get_platform().
    # _find_ncu() falls back to these candidates, in list order, only after a
    # PATH lookup for "ncu" has failed.
    "linux": [
        "/usr/local/cuda/bin/ncu",
        "/opt/nvidia/nsight-compute/ncu",
        "/usr/bin/ncu",
        "/usr/local/bin/ncu",
    ],
    "darwin": [
        "/Applications/NVIDIA Nsight Compute.app/Contents/MacOS/ncu",
        "/usr/local/cuda/bin/ncu",
    ],
    # NOTE: the CUDA toolkit entries below are pinned to specific versions
    # (v12.0, v11.8); other toolkit versions are only found via PATH.
    "windows": [
        r"C:\Program Files\NVIDIA Corporation\Nsight Compute\ncu.exe",
        r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.0\bin\ncu.exe",
        r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin\ncu.exe",
    ],
}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _get_platform() -> str:
    """Return the normalized platform key: "darwin", "windows" or "linux".

    Any system that is not reported as Darwin or Windows is treated as Linux.
    """
    detected = platform.system().lower()
    return detected if detected in ("darwin", "windows") else "linux"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _find_ncu() -> str | None:
    """Locate the NCU executable.

    PATH is consulted first; if that fails, the well-known install locations
    listed in NCU_PATHS for the current platform are tried in order.

    Returns:
        Path to an executable ``ncu``, or None when none is found.
    """
    on_path = shutil.which("ncu")
    if on_path:
        return on_path

    candidates = NCU_PATHS.get(_get_platform(), [])
    return next(
        (
            candidate
            for candidate in candidates
            if os.path.isfile(candidate) and os.access(candidate, os.X_OK)
        ),
        None,
    )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _get_install_command() -> str:
    """Return an install hint for NCU suited to the current machine.

    On Linux the first package manager found on PATH decides the command;
    otherwise (or if none is found) conda is suggested when available, and a
    download link is the final fallback.
    """
    if _get_platform() == "linux":
        # (binaries that identify the package manager, command to suggest)
        linux_options = (
            (("apt-get", "apt"), "sudo apt install nvidia-cuda-toolkit"),
            (("dnf",), "sudo dnf install cuda-nsight-compute"),
            (("yum",), "sudo yum install cuda-nsight-compute"),
            (("pacman",), "sudo pacman -S cuda-tools"),
        )
        for binaries, command in linux_options:
            if any(shutil.which(binary) for binary in binaries):
                return command

    if shutil.which("conda"):
        return "conda install -c nvidia nsight-compute"

    return "Download from https://developer.nvidia.com/nsight-compute"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _parse_ncu_output(session_output: str, details_output: str) -> dict:
|
|
89
|
+
"""Parse NCU session and details output into structured data."""
|
|
90
|
+
import re
|
|
91
|
+
|
|
92
|
+
summary: dict = {
|
|
93
|
+
"gpu": "Unknown",
|
|
94
|
+
"kernels": [],
|
|
95
|
+
"recommendations": [],
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
# Parse session output for GPU name
|
|
99
|
+
if session_output:
|
|
100
|
+
for line in session_output.split("\n"):
|
|
101
|
+
if "display_name" in line:
|
|
102
|
+
parts = line.split()
|
|
103
|
+
if len(parts) >= 2:
|
|
104
|
+
summary["gpu"] = " ".join(parts[1:])
|
|
105
|
+
break
|
|
106
|
+
|
|
107
|
+
# Parse details output for kernel metrics and recommendations
|
|
108
|
+
if details_output:
|
|
109
|
+
lines = details_output.split("\n")
|
|
110
|
+
current_kernel: dict | None = None
|
|
111
|
+
current_section: str | None = None
|
|
112
|
+
in_recommendation = False
|
|
113
|
+
recommendation_lines: list[str] = []
|
|
114
|
+
|
|
115
|
+
i = 0
|
|
116
|
+
while i < len(lines):
|
|
117
|
+
line = lines[i]
|
|
118
|
+
stripped = line.strip()
|
|
119
|
+
|
|
120
|
+
# Detect kernel header
|
|
121
|
+
if (
|
|
122
|
+
line.startswith(" ")
|
|
123
|
+
and not line.startswith(" ")
|
|
124
|
+
and "Context" in line
|
|
125
|
+
and "Device" in line
|
|
126
|
+
):
|
|
127
|
+
match = re.match(r"^ (.+?)\s+\(\d+,\s*\d+,\s*\d+\)x\(\d+,\s*\d+,\s*\d+\)", line)
|
|
128
|
+
if match:
|
|
129
|
+
kernel_name = match.group(1).strip()
|
|
130
|
+
current_kernel = {
|
|
131
|
+
"name": kernel_name,
|
|
132
|
+
"duration_us": 0,
|
|
133
|
+
"duration_ms": 0,
|
|
134
|
+
"memory_throughput_pct": 0,
|
|
135
|
+
"compute_throughput_pct": 0,
|
|
136
|
+
"achieved_occupancy_pct": 0,
|
|
137
|
+
"registers_per_thread": 0,
|
|
138
|
+
"block_size": 0,
|
|
139
|
+
"grid_size": 0,
|
|
140
|
+
"estimated_speedup_pct": 0,
|
|
141
|
+
"recommendations": [],
|
|
142
|
+
}
|
|
143
|
+
summary["kernels"].append(current_kernel)
|
|
144
|
+
|
|
145
|
+
# Detect section headers
|
|
146
|
+
if stripped.startswith("Section:"):
|
|
147
|
+
current_section = stripped.replace("Section:", "").strip()
|
|
148
|
+
|
|
149
|
+
# Parse metrics from table rows
|
|
150
|
+
if current_kernel and " " in line:
|
|
151
|
+
parts = line.split()
|
|
152
|
+
if len(parts) >= 2:
|
|
153
|
+
metric_line = stripped
|
|
154
|
+
|
|
155
|
+
# Duration (in us)
|
|
156
|
+
if metric_line.startswith("Duration") and "us" in metric_line:
|
|
157
|
+
try:
|
|
158
|
+
value = float(parts[-1].replace(",", ""))
|
|
159
|
+
current_kernel["duration_us"] = value
|
|
160
|
+
current_kernel["duration_ms"] = value / 1000
|
|
161
|
+
except (ValueError, IndexError):
|
|
162
|
+
pass
|
|
163
|
+
|
|
164
|
+
# Memory Throughput (%)
|
|
165
|
+
elif "Memory Throughput" in metric_line and "%" in metric_line:
|
|
166
|
+
try:
|
|
167
|
+
value = float(parts[-1].replace(",", ""))
|
|
168
|
+
current_kernel["memory_throughput_pct"] = value
|
|
169
|
+
except (ValueError, IndexError):
|
|
170
|
+
pass
|
|
171
|
+
|
|
172
|
+
# Compute (SM) Throughput (%)
|
|
173
|
+
elif (
|
|
174
|
+
"Compute (SM) Throughput" in metric_line
|
|
175
|
+
or "Compute Throughput" in metric_line
|
|
176
|
+
):
|
|
177
|
+
try:
|
|
178
|
+
value = float(parts[-1].replace(",", ""))
|
|
179
|
+
current_kernel["compute_throughput_pct"] = value
|
|
180
|
+
except (ValueError, IndexError):
|
|
181
|
+
pass
|
|
182
|
+
|
|
183
|
+
# Achieved Occupancy (%)
|
|
184
|
+
elif "Achieved Occupancy" in metric_line and "%" in metric_line:
|
|
185
|
+
try:
|
|
186
|
+
value = float(parts[-1].replace(",", ""))
|
|
187
|
+
current_kernel["achieved_occupancy_pct"] = value
|
|
188
|
+
except (ValueError, IndexError):
|
|
189
|
+
pass
|
|
190
|
+
|
|
191
|
+
# Registers Per Thread
|
|
192
|
+
elif "Registers Per Thread" in metric_line:
|
|
193
|
+
try:
|
|
194
|
+
value = int(float(parts[-1].replace(",", "")))
|
|
195
|
+
current_kernel["registers_per_thread"] = value
|
|
196
|
+
except (ValueError, IndexError):
|
|
197
|
+
pass
|
|
198
|
+
|
|
199
|
+
# Block Size
|
|
200
|
+
elif (
|
|
201
|
+
metric_line.startswith("Block Size")
|
|
202
|
+
and current_section == "Launch Statistics"
|
|
203
|
+
):
|
|
204
|
+
try:
|
|
205
|
+
value = int(float(parts[-1].replace(",", "")))
|
|
206
|
+
current_kernel["block_size"] = value
|
|
207
|
+
except (ValueError, IndexError):
|
|
208
|
+
pass
|
|
209
|
+
|
|
210
|
+
# Grid Size
|
|
211
|
+
elif (
|
|
212
|
+
metric_line.startswith("Grid Size")
|
|
213
|
+
and current_section == "Launch Statistics"
|
|
214
|
+
):
|
|
215
|
+
try:
|
|
216
|
+
value = int(float(parts[-1].replace(",", "")))
|
|
217
|
+
current_kernel["grid_size"] = value
|
|
218
|
+
except (ValueError, IndexError):
|
|
219
|
+
pass
|
|
220
|
+
|
|
221
|
+
# Parse recommendations (OPT and INF markers)
|
|
222
|
+
if stripped.startswith("OPT") or stripped.startswith("INF"):
|
|
223
|
+
in_recommendation = True
|
|
224
|
+
recommendation_lines = [stripped]
|
|
225
|
+
|
|
226
|
+
# Extract estimated speedup
|
|
227
|
+
if current_kernel and "Est. Speedup:" in stripped:
|
|
228
|
+
speedup_match = re.search(r"Est\. Speedup:\s*([\d.]+)%", stripped)
|
|
229
|
+
if speedup_match:
|
|
230
|
+
try:
|
|
231
|
+
speedup = float(speedup_match.group(1))
|
|
232
|
+
if speedup > current_kernel["estimated_speedup_pct"]:
|
|
233
|
+
current_kernel["estimated_speedup_pct"] = speedup
|
|
234
|
+
except ValueError:
|
|
235
|
+
pass
|
|
236
|
+
|
|
237
|
+
if current_kernel and "Est. Local Speedup:" in stripped:
|
|
238
|
+
speedup_match = re.search(r"Est\. Local Speedup:\s*([\d.]+)%", stripped)
|
|
239
|
+
if speedup_match:
|
|
240
|
+
try:
|
|
241
|
+
speedup = float(speedup_match.group(1))
|
|
242
|
+
if speedup > current_kernel["estimated_speedup_pct"]:
|
|
243
|
+
current_kernel["estimated_speedup_pct"] = speedup
|
|
244
|
+
except ValueError:
|
|
245
|
+
pass
|
|
246
|
+
elif in_recommendation:
|
|
247
|
+
if line.startswith(" ") and stripped:
|
|
248
|
+
recommendation_lines.append(stripped)
|
|
249
|
+
elif (
|
|
250
|
+
stripped.startswith("Section:")
|
|
251
|
+
or stripped.startswith("---")
|
|
252
|
+
or (stripped and not line.startswith(" "))
|
|
253
|
+
):
|
|
254
|
+
if recommendation_lines:
|
|
255
|
+
full_rec = " ".join(recommendation_lines)
|
|
256
|
+
if full_rec not in summary["recommendations"]:
|
|
257
|
+
summary["recommendations"].append(full_rec)
|
|
258
|
+
if current_kernel and full_rec not in current_kernel["recommendations"]:
|
|
259
|
+
current_kernel["recommendations"].append(full_rec)
|
|
260
|
+
in_recommendation = False
|
|
261
|
+
recommendation_lines = []
|
|
262
|
+
|
|
263
|
+
i += 1
|
|
264
|
+
|
|
265
|
+
# Capture last recommendation if any
|
|
266
|
+
if recommendation_lines:
|
|
267
|
+
full_rec = " ".join(recommendation_lines)
|
|
268
|
+
if full_rec not in summary["recommendations"]:
|
|
269
|
+
summary["recommendations"].append(full_rec)
|
|
270
|
+
if current_kernel and full_rec not in current_kernel["recommendations"]:
|
|
271
|
+
current_kernel["recommendations"].append(full_rec)
|
|
272
|
+
|
|
273
|
+
return summary
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _generate_text_output(filename: str, summary: dict) -> str:
    """Render a parsed NCU summary as a markdown report.

    Args:
        filename: Name shown on the "Source:" line.
        summary: Dict with optional ``gpu``, ``kernels`` and
            ``recommendations`` entries; each kernel dict must carry ``name``,
            other metrics default to 0 when absent.

    Returns:
        The report as a single newline-joined string, stamped with the current
        local time.
    """
    generated_at = datetime.now().isoformat()

    report = [
        "# NCU Profiling Analysis",
        f"Source: {filename}",
        f"Generated: {generated_at}",
        "",
        "## GPU Information",
        f"- Device: {summary.get('gpu', 'Unknown')}",
        "",
        "## Kernel Summary",
        "",
    ]

    for entry in summary.get("kernels", []):
        report.append(f"### {entry['name']}")
        report.append(
            f"- Duration: {entry.get('duration_us', 0):.2f} us ({entry.get('duration_ms', 0):.3f} ms)"
        )
        report.append(f"- Achieved Occupancy: {entry.get('achieved_occupancy_pct', 0):.1f}%")
        report.append(f"- Compute (SM) Throughput: {entry.get('compute_throughput_pct', 0):.1f}%")
        report.append(f"- Memory Throughput: {entry.get('memory_throughput_pct', 0):.1f}%")
        report.append(f"- Registers/Thread: {entry.get('registers_per_thread', 0)}")
        report.append(f"- Block Size: {entry.get('block_size', 0)}")
        report.append(f"- Grid Size: {entry.get('grid_size', 0)}")
        report.append("")

    if summary.get("recommendations"):
        report += ["## Recommendations", ""]
        report += [
            f"{number}. {text}"
            for number, text in enumerate(summary["recommendations"], 1)
        ]
        report.append("")

    return "\n".join(report)
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _analyze_local(
    filepath: Path,
    ncu_path: str,
    output_dir: Path | None = None,
    json_output: bool = False,
) -> str:
    """Analyze NCU profile locally using installed NCU.

    Args:
        filepath: Path to the .ncu-rep file to import.
        ncu_path: NCU executable to invoke.
        output_dir: If given, the markdown report is also written there as
            ``ncu_analysis_<stem>_<timestamp>.txt`` (directory is created).
        json_output: Return the parsed summary as JSON instead of markdown.

    Returns:
        The analysis as a JSON or markdown string.

    Raises:
        RuntimeError: If an NCU invocation exceeds 120 seconds, or if the
            "details" page cannot be produced (non-zero exit status).
    """
    # Run NCU to get session and details
    pages: dict = {}
    try:
        for page in ("session", "details"):
            pages[page] = subprocess.run(
                [ncu_path, "--import", str(filepath), "--page", page],
                capture_output=True,
                text=True,
                timeout=120,
            )
    except subprocess.TimeoutExpired as e:
        raise RuntimeError("NCU command timed out (120s limit)") from e

    # The details page carries all kernel data; without it the report would be
    # silently empty, so fail loudly. A failing session page only costs the
    # GPU name and is tolerated.
    details_result = pages["details"]
    if details_result.returncode != 0:
        reason = (details_result.stderr or details_result.stdout or "").strip()
        message = f"NCU command failed with exit code {details_result.returncode}"
        raise RuntimeError(f"{message}: {reason}" if reason else message)

    # Parse the outputs
    summary = _parse_ncu_output(pages["session"].stdout, details_result.stdout)

    # Save to output directory if specified
    text_report: str | None = None
    if output_dir:
        output_dir.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
        text_report = _generate_text_output(filepath.name, summary)
        txt_path = output_dir / f"ncu_analysis_{filepath.stem}_{timestamp}.txt"
        txt_path.write_text(text_report, encoding="utf-8")

    if json_output:
        return json.dumps(summary, indent=2)
    # Reuse the report already rendered for the file so both copies match.
    if text_report is None:
        text_report = _generate_text_output(filepath.name, summary)
    return text_report
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def _analyze_remote_direct(
    filepath: Path,
    target_name: str,
    json_output: bool = False,
) -> str:
    """Analyze NCU profile remotely via direct SSH to target.

    Uploads the .ncu-rep file and runs NCU analysis on the target machine.

    Args:
        filepath: Local path to the .ncu-rep file.
        target_name: Name of the configured target to load.
        json_output: Return the parsed summary as JSON instead of markdown.

    Returns:
        The analysis as a JSON or markdown string.

    Raises:
        RuntimeError: If the target is unknown or the remote command exits
            with a non-zero status.
    """
    import shlex
    import sys
    import tempfile

    from .gpu_run import push_directory, run_command_capture
    from .targets import load_target

    # Load target
    try:
        target = load_target(target_name)
    except FileNotFoundError as e:
        raise RuntimeError(f"Target not found: {target_name}") from e

    # Stage the report alone in a directory named after it; that directory is
    # what gets pushed, and its name identifies the remote workspace.
    workspace_name = f"ncu_analyze_{filepath.stem}"
    separator = "---NCU_SEPARATOR---"

    with tempfile.TemporaryDirectory() as tmpdir:
        staging_dir = Path(tmpdir) / workspace_name
        staging_dir.mkdir()
        shutil.copy(filepath, staging_dir / filepath.name)

        # Push the file
        print(f"Uploading {filepath.name} to {target_name}...", file=sys.stderr)
        push_directory(staging_dir, target)

        # The command line is interpreted by a shell on the target (it relies
        # on `&&` and `echo`), so the file name must be quoted: names with
        # spaces or shell metacharacters would otherwise break the command or
        # be executed.
        report_arg = shlex.quote(filepath.name)
        ncu_cmd = (
            f"/usr/local/cuda/bin/ncu --import {report_arg} --page session"
            f" && echo '{separator}'"
            f" && /usr/local/cuda/bin/ncu --import {report_arg} --page details"
        )

        print("Running NCU analysis...", file=sys.stderr)
        exit_code, output = run_command_capture(ncu_cmd, workspace_name, target)

    if exit_code != 0:
        raise RuntimeError(f"NCU command failed with exit code {exit_code}")

    # Split session and details output; without the separator everything is
    # treated as details output.
    if separator in output:
        session_part, _, details_part = output.partition(separator)
        session_output = session_part.strip()
        details_output = details_part.strip()
    else:
        session_output = ""
        details_output = output

    # Parse the outputs
    summary = _parse_ncu_output(session_output, details_output)

    if json_output:
        return json.dumps(summary, indent=2)
    return _generate_text_output(filepath.name, summary)
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def _analyze_remote_api(
    filepath: Path,
    json_output: bool = False,
    include_source: bool = False,
) -> str:
    """Analyze an NCU profile through the wafer-api service.

    The report is uploaded as a binary body, the kernel list for the returned
    report id is fetched, and optionally the SASS source view of every kernel.

    Args:
        filepath: Path to .ncu-rep file
        json_output: Return JSON instead of formatted text
        include_source: If True, fetch source correlation (SASS) for each kernel

    Raises:
        RuntimeError: On HTTP error responses (401 is reported as "not
            authenticated"), on transport failures, or when the upload
            response carries no report id.
    """
    import sys

    import httpx

    from .api_client import get_api_url
    from .auth import get_auth_headers

    base_url = get_api_url()
    auth_headers = get_auth_headers()

    # Use the dedicated NCU analyze endpoint (binary upload)
    print(f"Uploading {filepath.name} for analysis...", file=sys.stderr)

    try:
        with httpx.Client(timeout=300.0, headers=auth_headers) as session:
            payload = filepath.read_bytes()
            upload_headers = dict(auth_headers)
            upload_headers["Content-Type"] = "application/octet-stream"
            upload_headers["X-Filename"] = filepath.name

            upload = session.post(
                f"{base_url}/v1/ncu/reports/binary",
                content=payload,
                headers=upload_headers,
            )
            upload.raise_for_status()
            upload_result = upload.json()

            # Accept both the snake_case and the camelCase spelling of the id.
            report_id = upload_result.get("report_id") or upload_result.get("reportId")
            if not report_id:
                raise RuntimeError("No report_id returned from upload")

            print(f"Report ID: {report_id}", file=sys.stderr)

            # Get kernel list
            print("Fetching kernel data...", file=sys.stderr)
            listing = session.get(f"{base_url}/v1/ncu/reports/{report_id}/kernels")
            listing.raise_for_status()
            listing_data = listing.json()

            # The payload is either {"kernels": [...]} or the bare list.
            if isinstance(listing_data, dict):
                kernels = listing_data.get("kernels", listing_data)
            else:
                kernels = listing_data

            result: dict = {
                "report_id": report_id,
                "gpu": upload_result.get("gpu", "Unknown"),
                "kernels": kernels,
            }

            # Fetch source correlation if requested
            if include_source:
                print("Fetching source correlation (SASS)...", file=sys.stderr)
                correlated = []
                for kernel in kernels:
                    kernel_id = kernel.get("id") or kernel.get("kernel_id")
                    if not kernel_id:
                        continue

                    source_url = f"{base_url}/v1/ncu/reports/{report_id}/kernels/{kernel_id}/source"
                    try:
                        source_reply = session.get(
                            source_url,
                            params={"view": "sass"},
                            timeout=180.0,  # Source extraction can be slow
                        )
                        source_reply.raise_for_status()
                        source_info = source_reply.json()
                    except httpx.HTTPStatusError as e:
                        # A failing kernel is reported and skipped; the rest
                        # of the analysis is still returned.
                        print(
                            f"Warning: Failed to get source for kernel {kernel_id}: {e}",
                            file=sys.stderr,
                        )
                    else:
                        correlated.append({
                            "kernel_id": kernel_id,
                            "kernel_name": kernel.get("name", "Unknown"),
                            "source": source_info,
                        })

                result["source_correlation"] = correlated

    except httpx.HTTPStatusError as e:
        if e.response.status_code == 401:
            raise RuntimeError("Not authenticated. Run: wafer login") from e
        raise RuntimeError(f"API error: {e.response.status_code} - {e.response.text}") from e
    except httpx.RequestError as e:
        raise RuntimeError(f"Could not reach API: {e}") from e

    if json_output:
        return json.dumps(result, indent=2)
    return _generate_ncu_api_text_output(filepath.name, result)
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
def _generate_ncu_api_text_output(filename: str, result: dict) -> str:
    """Render the API analysis result as a markdown report.

    Args:
        filename: Name shown on the "Source:" line.
        result: Dict with optional ``report_id``, ``gpu``, ``kernels`` and
            ``source_correlation`` entries. Kernel metrics are looked up under
            their ``*_pct`` names first, then under the shorter alternative
            keys, defaulting to 0.

    Returns:
        The report as a single newline-joined string, stamped with the current
        local time.
    """
    generated_at = datetime.now().isoformat()

    report = [
        "# NCU Profiling Analysis",
        f"Source: {filename}",
        f"Generated: {generated_at}",
        f"Report ID: {result.get('report_id', 'N/A')}",
        "",
        "## GPU Information",
        f"- Device: {result.get('gpu', 'Unknown')}",
        "",
        "## Kernel Summary",
        "",
    ]

    for kernel in result.get("kernels", []):
        title = kernel.get("name", kernel.get("function_name", "Unknown"))
        report.append(f"### {title}")
        report.append(f"- Duration: {kernel.get('duration_us', 0):.2f} us")
        report.append(
            f"- Achieved Occupancy: {kernel.get('achieved_occupancy_pct', kernel.get('occupancy', 0)):.1f}%"
        )
        report.append(
            f"- Compute Throughput: {kernel.get('compute_throughput_pct', kernel.get('sm_throughput', 0)):.1f}%"
        )
        report.append(
            f"- Memory Throughput: {kernel.get('memory_throughput_pct', kernel.get('mem_throughput', 0)):.1f}%"
        )
        report.append("")

    # Add source correlation summary if present
    correlations = result.get("source_correlation", [])
    if correlations:
        report += ["## Source Correlation", ""]
        for item in correlations:
            source = item.get("source", {})
            report += [
                f"### {item.get('kernel_name', 'Unknown')}",
                f"- View: {source.get('view', 'N/A')}",
                f"- Instructions: {len(source.get('instructions', []))}",
                f"- Regions: {len(source.get('regions', []))}",
                "",
            ]

    return "\n".join(report)
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def analyze_ncu_profile(
|
|
586
|
+
filepath: Path,
|
|
587
|
+
output_dir: Path | None = None,
|
|
588
|
+
json_output: bool = False,
|
|
589
|
+
remote: bool | None = None,
|
|
590
|
+
target: str | None = None,
|
|
591
|
+
include_source: bool = False,
|
|
592
|
+
) -> str:
|
|
593
|
+
"""Analyze an NCU profile file and return results.
|
|
594
|
+
|
|
595
|
+
Args:
|
|
596
|
+
filepath: Path to .ncu-rep file
|
|
597
|
+
output_dir: Optional directory to save analysis files
|
|
598
|
+
json_output: If True, return raw JSON; otherwise return formatted text
|
|
599
|
+
remote: If True, force remote analysis via API. If False, force local.
|
|
600
|
+
If None (default), auto-detect: use local if NCU available, else remote.
|
|
601
|
+
target: Target name for direct SSH mode (e.g., "vultr-b200"). If provided,
|
|
602
|
+
uses direct SSH instead of API for remote analysis.
|
|
603
|
+
include_source: If True, fetch source correlation (SASS) for each kernel.
|
|
604
|
+
Only supported with --remote (requires GPU for extraction).
|
|
605
|
+
|
|
606
|
+
Returns:
|
|
607
|
+
Analysis results as string (JSON or markdown)
|
|
608
|
+
|
|
609
|
+
Raises:
|
|
610
|
+
FileNotFoundError: If file doesn't exist
|
|
611
|
+
RuntimeError: If NCU parsing fails
|
|
612
|
+
"""
|
|
613
|
+
import sys
|
|
614
|
+
|
|
615
|
+
ncu_path = _find_ncu()
|
|
616
|
+
|
|
617
|
+
# include_source requires remote API (needs GPU for SASS extraction)
|
|
618
|
+
if include_source and not remote and target is None:
|
|
619
|
+
print("Note: --include-source requires remote analysis. Using --remote.", file=sys.stderr)
|
|
620
|
+
remote = True
|
|
621
|
+
|
|
622
|
+
# If target is provided, use direct SSH mode
|
|
623
|
+
if target is not None:
|
|
624
|
+
if include_source:
|
|
625
|
+
print(
|
|
626
|
+
"Warning: --include-source not supported with --target. Ignoring.", file=sys.stderr
|
|
627
|
+
)
|
|
628
|
+
if output_dir:
|
|
629
|
+
print("Warning: --output-dir not supported for remote analysis", file=sys.stderr)
|
|
630
|
+
return _analyze_remote_direct(filepath, target, json_output)
|
|
631
|
+
|
|
632
|
+
# Determine whether to use local or remote
|
|
633
|
+
use_remote = remote
|
|
634
|
+
if use_remote is None:
|
|
635
|
+
# Auto-detect: use remote if NCU not available locally
|
|
636
|
+
use_remote = ncu_path is None
|
|
637
|
+
|
|
638
|
+
if use_remote:
|
|
639
|
+
# Note: output_dir not supported for remote (would need to download results)
|
|
640
|
+
if output_dir:
|
|
641
|
+
print("Warning: --output-dir not supported for remote analysis", file=sys.stderr)
|
|
642
|
+
return _analyze_remote_api(filepath, json_output, include_source=include_source)
|
|
643
|
+
else:
|
|
644
|
+
if include_source:
|
|
645
|
+
print(
|
|
646
|
+
"Warning: --include-source only supported with --remote. Ignoring.", file=sys.stderr
|
|
647
|
+
)
|
|
648
|
+
if ncu_path is None:
|
|
649
|
+
install_cmd = _get_install_command()
|
|
650
|
+
raise FileNotFoundError(f"NCU not installed. Install with: {install_cmd}")
|
|
651
|
+
return _analyze_local(filepath, ncu_path, output_dir, json_output)
|