wafer-cli 0.2.9__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/GUIDE.md +18 -7
- wafer/api_client.py +4 -0
- wafer/cli.py +1177 -278
- wafer/corpus.py +158 -32
- wafer/evaluate.py +75 -6
- wafer/kernel_scope.py +132 -31
- wafer/nsys_analyze.py +903 -73
- wafer/nsys_profile.py +511 -0
- wafer/output.py +241 -0
- wafer/skills/wafer-guide/SKILL.md +13 -0
- wafer/ssh_keys.py +261 -0
- wafer/targets_ops.py +718 -0
- wafer/wevin_cli.py +127 -18
- wafer/workspaces.py +232 -184
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.10.dist-info}/METADATA +1 -1
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.10.dist-info}/RECORD +19 -15
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.10.dist-info}/WHEEL +0 -0
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.10.dist-info}/entry_points.txt +0 -0
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.10.dist-info}/top_level.txt +0 -0
wafer/nsys_analyze.py
CHANGED
@@ -1,47 +1,171 @@
 """NSYS Analyze - Parse and analyze .nsys-rep profile files.
 
 This module provides the implementation for the `wafer nvidia nsys analyze` command.
-Supports
+Supports local analysis (when nsys is installed), remote analysis via API,
+direct SSH analysis via targets, and workspace execution.
+
+Local analysis uses `nsys stats` and `nsys export` commands which work on any machine
+with nsys installed (no GPU required for analysis, only for profiling).
 """
 
 import json
+import os
 import platform
 import shutil
+import subprocess
+import sys
+from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
 
+# Known NSYS installation paths by platform
+# NOTE: On macOS, NVIDIA only provides the GUI viewer (nsys-ui), NOT the CLI tool.
+# The nsys CLI is only available on Linux. macOS users must use remote analysis.
+NSYS_PATHS = {
+    "linux": [
+        "/usr/bin/nsys",
+        "/usr/local/bin/nsys",
+        "/usr/local/cuda/bin/nsys",
+        "/opt/nvidia/nsight-systems/bin/nsys",
+        "/opt/nvidia/nsight-systems-cli/bin/nsys",
+    ],
+    # macOS: nsys CLI not available - only GUI viewer exists
+    # Set to empty list to always fall back to remote analysis
+    "darwin": [],
+    "windows": [
+        r"C:\Program Files\NVIDIA Corporation\Nsight Systems\bin\nsys.exe",
+        r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.0\bin\nsys.exe",
+        r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin\nsys.exe",
+    ],
+}
+
+
+@dataclass(frozen=True)
+class NSYSCheckResult:
+    """Result of checking NSYS installation."""
+
+    installed: bool
+    path: str | None = None
+    version: str | None = None
+    install_command: str | None = None
+
+
+@dataclass(frozen=True)
+class KernelInfo:
+    """Information about a CUDA kernel from NSYS profile."""
+
+    name: str
+    duration_ns: int
+    duration_ms: float
+    instances: int
+    avg_duration_ns: float
+    min_duration_ns: int
+    max_duration_ns: int
+    grid_size: str | None = None
+    block_size: str | None = None
+    registers_per_thread: int | None = None
+    shared_memory_bytes: int | None = None
+    memory_throughput_gb_s: float | None = None
+
+
+@dataclass(frozen=True)
+class MemoryTransfer:
+    """Information about a memory transfer from NSYS profile."""
+
+    operation: str  # HtoD, DtoH, DtoD, etc.
+    duration_ns: int
+    size_bytes: int
+    throughput_gb_s: float
+    instances: int
+
+
+@dataclass(frozen=True)
+class NSYSAnalysisResult:
+    """Complete NSYS analysis result."""
+
+    success: bool
+    report_id: str | None = None
+    gpu: str = "Unknown"
+    duration_ms: float = 0.0
+    kernel_count: int = 0
+    memory_transfer_count: int = 0
+    kernels: list[dict] | None = None
+    memory_transfers: list[dict] | None = None
+    timeline: list[dict] | None = None
+    diagnostics: list[dict] | None = None
+    error: str | None = None
+
+
+def _get_platform() -> str:
+    """Get normalized platform name."""
+    system = platform.system().lower()
+    if system == "darwin":
+        return "darwin"
+    elif system == "windows":
+        return "windows"
+    return "linux"
+
 
 def _find_nsys() -> str | None:
-    """Find nsys executable on the system.
+    """Find nsys executable on the system.
+
+    Searches in order:
+    1. PATH environment variable
+    2. Common installation paths for the current platform
+    """
+    # First check PATH
     nsys = shutil.which("nsys")
     if nsys:
         return nsys
 
-    #
-
-
-
-        "/opt/nvidia/nsight-systems/bin/nsys",
-    ]
-
-    for path in common_paths:
-        if Path(path).is_file():
+    # Then check known installation paths
+    plat = _get_platform()
+    for path in NSYS_PATHS.get(plat, []):
+        if os.path.isfile(path) and os.access(path, os.X_OK):
             return path
 
     return None
 
 
+def _get_nsys_version(nsys_path: str) -> str | None:
+    """Get NSYS version string."""
+    try:
+        result = subprocess.run(
+            [nsys_path, "--version"],
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
+        if result.returncode == 0:
+            # Parse version from output like "NVIDIA Nsight Systems version 2024.6.1.90-246160830v0"
+            for line in result.stdout.split("\n"):
+                if "version" in line.lower():
+                    parts = line.split("version")
+                    if len(parts) >= 2:
+                        return parts[1].strip().split()[0]
+            return result.stdout.strip().split("\n")[0]
+    except (subprocess.TimeoutExpired, OSError):
+        pass
+    return None
+
+
 def _get_install_command() -> str:
-    """Get platform-appropriate install command."""
-
+    """Get platform-appropriate install command for NSYS."""
+    plat = _get_platform()
+
+    if plat == "darwin":
+        # macOS only has GUI viewer, no CLI - user must use remote analysis
+        return "NSYS CLI not available on macOS. Use --remote flag or --target for remote analysis."
 
-    if
+    if plat == "linux":
         if shutil.which("apt-get") or shutil.which("apt"):
-            return "sudo apt install
+            return "sudo apt install nsight-systems"
         elif shutil.which("dnf"):
             return "sudo dnf install nsight-systems"
         elif shutil.which("yum"):
             return "sudo yum install nsight-systems"
+        elif shutil.which("pacman"):
+            return "sudo pacman -S nsight-systems"
 
     if shutil.which("conda"):
         return "conda install -c nvidia nsight-systems"
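The new discovery logic checks `PATH` before walking the per-platform `NSYS_PATHS` table, and the empty `darwin` list means macOS always falls through to remote analysis. A minimal sketch of that behavior (hypothetical usage, not part of the diff; only the import path is assumed):

```python
# Sketch only: exercises the lookup order added in this release.
# _find_nsys() checks shutil.which("nsys") first, then NSYS_PATHS[platform].
from wafer.nsys_analyze import _find_nsys, _get_install_command

nsys = _find_nsys()
if nsys is None:
    # On macOS this branch always triggers, since NSYS_PATHS["darwin"] is empty.
    print(_get_install_command())
else:
    print(f"Using nsys at {nsys}")
```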
@@ -49,57 +173,632 @@ def _get_install_command() -> str:
     return "Download from https://developer.nvidia.com/nsight-systems"
 
 
-def
-    """
-
-    assert isinstance(result, dict), "result must be a dictionary"
+def is_macos() -> bool:
+    """Check if running on macOS."""
+    return _get_platform() == "darwin"
 
-    timestamp = datetime.now().isoformat()
-    summary = result.get("summary", {})
-    kernels = result.get("kernels", [])
 
-
-
-
-
-
-
-
-
-
-
-
+def check_nsys_installation() -> NSYSCheckResult:
+    """Check if NSYS is installed and return details.
+
+    Returns:
+        NSYSCheckResult with installation status and details
+    """
+    nsys_path = _find_nsys()
+
+    if nsys_path is None:
+        return NSYSCheckResult(
+            installed=False,
+            install_command=_get_install_command(),
+        )
+
+    version = _get_nsys_version(nsys_path)
+
+    return NSYSCheckResult(
+        installed=True,
+        path=nsys_path,
+        version=version,
+    )
+
+
+def _run_nsys_stats(
+    nsys_path: str,
+    filepath: Path,
+    report_name: str,
+    timeout: int = 120,
+) -> tuple[bool, str]:
+    """Run nsys stats command to extract report data.
+
+    Args:
+        nsys_path: Path to nsys executable
+        filepath: Path to .nsys-rep file
+        report_name: Report type (e.g., gpukernsum, gpumemtimesum, cudaapisum)
+        timeout: Command timeout in seconds
+
+    Returns:
+        Tuple of (success, output_or_error)
+    """
+    try:
+        result = subprocess.run(
+            [
+                nsys_path,
+                "stats",
+                "--report", report_name,
+                "--format", "csv",
+                "--force-export", "true",
+                str(filepath),
+            ],
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+        )
+
+        if result.returncode != 0:
+            error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
+            return False, error_msg
+
+        return True, result.stdout
+
+    except subprocess.TimeoutExpired:
+        return False, f"Command timed out after {timeout}s"
+    except OSError as e:
+        return False, f"Failed to execute nsys: {e}"
+
+
+def _run_nsys_export(
+    nsys_path: str,
+    filepath: Path,
+    output_format: str = "sqlite",
+    timeout: int = 180,
+) -> tuple[bool, str]:
+    """Run nsys export command to export trace data.
+
+    Args:
+        nsys_path: Path to nsys executable
+        filepath: Path to .nsys-rep file
+        output_format: Export format (sqlite, json)
+        timeout: Command timeout in seconds
+
+    Returns:
+        Tuple of (success, output_path_or_error)
+    """
+    # Determine output path
+    output_path = filepath.with_suffix(f".{output_format}")
+
+    try:
+        result = subprocess.run(
+            [
+                nsys_path,
+                "export",
+                "--type", output_format,
+                "--force-overwrite", "true",
+                "--output", str(output_path),
+                str(filepath),
+            ],
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+        )
+
+        if result.returncode != 0:
+            error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
+            return False, error_msg
+
+        return True, str(output_path)
+
+    except subprocess.TimeoutExpired:
+        return False, f"Export timed out after {timeout}s"
+    except OSError as e:
+        return False, f"Failed to execute nsys: {e}"
+
+
+def _parse_csv_kernels(csv_output: str) -> list[dict]:
+    """Parse GPU kernel summary from nsys stats CSV output."""
+    kernels = []
+
+    lines = csv_output.strip().split("\n")
+    if len(lines) < 2:
+        return kernels
+
+    # Find header line - look for a line with known CSV header columns
+    # The nsys output includes informational lines before the actual CSV
+    # Header line should contain "Time" and "Name" columns
+    header_idx = -1
+    for i, line in enumerate(lines):
+        line_lower = line.lower()
+        # Skip comment lines and non-CSV lines
+        if line.startswith("#"):
+            continue
+        # Check if this looks like a CSV header with expected columns
+        if ("time" in line_lower and "name" in line_lower) or \
+           ("time (%)" in line_lower) or \
+           ("total time" in line_lower and "instances" in line_lower):
+            header_idx = i
+            break
+
+    if header_idx < 0 or header_idx >= len(lines) - 1:
+        return kernels
+
+    headers = [h.strip().strip('"') for h in lines[header_idx].split(",")]
+
+    # Map header names to indices
+    def find_col(names: list[str]) -> int | None:
+        for name in names:
+            name_lower = name.lower()
+            for i, h in enumerate(headers):
+                if name_lower in h.lower():
+                    return i
+        return None
+
+    name_col = find_col(["Name", "Kernel Name", "KernelName"])
+    time_col = find_col(["Time (%)", "Time Percent", "Time%"])
+    total_time_col = find_col(["Total Time", "TotalTime", "Duration"])
+    instances_col = find_col(["Instances", "Count", "Calls"])
+    avg_col = find_col(["Avg", "Average", "AvgTime"])
+    min_col = find_col(["Min", "Minimum", "MinTime"])
+    max_col = find_col(["Max", "Maximum", "MaxTime"])
+
+    # Parse data rows
+    for line in lines[header_idx + 1:]:
+        if not line.strip() or line.startswith("#"):
+            continue
+
+        # Handle CSV with quoted fields
+        parts = []
+        in_quote = False
+        current = ""
+        for char in line:
+            if char == '"':
+                in_quote = not in_quote
+            elif char == "," and not in_quote:
+                parts.append(current.strip().strip('"'))
+                current = ""
+            else:
+                current += char
+        parts.append(current.strip().strip('"'))
+
+        if len(parts) <= (name_col or 0):
+            continue
+
+        kernel = {
+            "name": parts[name_col] if name_col is not None else "Unknown",
+            "time_percent": 0.0,
+            "total_time_ns": 0,
+            "duration_ms": 0.0,
+            "instances": 0,
+            "avg_time_ns": 0,
+            "min_time_ns": 0,
+            "max_time_ns": 0,
+        }
+
+        try:
+            if time_col is not None and time_col < len(parts):
+                kernel["time_percent"] = float(parts[time_col].replace("%", "").strip() or 0)
+
+            if total_time_col is not None and total_time_col < len(parts):
+                # Time may be in ns, us, or ms - parse accordingly
+                time_str = parts[total_time_col].strip()
+                kernel["total_time_ns"] = _parse_time_to_ns(time_str)
+                kernel["duration_ms"] = kernel["total_time_ns"] / 1_000_000
+
+            if instances_col is not None and instances_col < len(parts):
+                kernel["instances"] = int(float(parts[instances_col].strip() or 0))
+
+            if avg_col is not None and avg_col < len(parts):
+                kernel["avg_time_ns"] = _parse_time_to_ns(parts[avg_col].strip())
+
+            if min_col is not None and min_col < len(parts):
+                kernel["min_time_ns"] = _parse_time_to_ns(parts[min_col].strip())
+
+            if max_col is not None and max_col < len(parts):
+                kernel["max_time_ns"] = _parse_time_to_ns(parts[max_col].strip())
+
+        except (ValueError, IndexError):
+            pass
+
+        if kernel["name"] and kernel["name"] != "Unknown":
+            kernels.append(kernel)
+
+    return kernels
+
+
+def _parse_time_to_ns(time_str: str) -> int:
+    """Parse time string to nanoseconds."""
+    if not time_str:
+        return 0
+
+    time_str = time_str.strip().lower()
+
+    try:
+        if "ms" in time_str:
+            return int(float(time_str.replace("ms", "").strip()) * 1_000_000)
+        elif "us" in time_str or "µs" in time_str:
+            return int(float(time_str.replace("us", "").replace("µs", "").strip()) * 1_000)
+        elif "ns" in time_str:
+            return int(float(time_str.replace("ns", "").strip()))
+        elif "s" in time_str:
+            return int(float(time_str.replace("s", "").strip()) * 1_000_000_000)
+        else:
+            # Assume nanoseconds
+            return int(float(time_str))
+    except ValueError:
+        return 0
+
+
+def _parse_csv_memory(csv_output: str) -> list[dict]:
+    """Parse memory transfer summary from nsys stats CSV output."""
+    transfers = []
+
+    lines = csv_output.strip().split("\n")
+    if len(lines) < 2:
+        return transfers
+
+    # Find header line - look for a line with known CSV header columns
+    # The nsys output includes informational lines before the actual CSV
+    header_idx = -1
+    for i, line in enumerate(lines):
+        line_lower = line.lower()
+        # Skip comment lines
+        if line.startswith("#"):
+            continue
+        # Check if this looks like a CSV header with expected columns
+        if ("time" in line_lower and ("operation" in line_lower or "total" in line_lower)) or \
+           ("time (%)" in line_lower) or \
+           ("count" in line_lower and "total" in line_lower):
+            header_idx = i
+            break
+
+    if header_idx < 0 or header_idx >= len(lines) - 1:
+        return transfers
+
+    headers = [h.strip().strip('"') for h in lines[header_idx].split(",")]
+
+    # Map header names
+    def find_col(names: list[str]) -> int | None:
+        for name in names:
+            name_lower = name.lower()
+            for i, h in enumerate(headers):
+                if name_lower in h.lower():
+                    return i
+        return None
+
+    op_col = find_col(["Operation", "Name", "MemOp"])
+    time_col = find_col(["Total Time", "TotalTime", "Duration"])
+    size_col = find_col(["Total", "Size", "Bytes"])
+    count_col = find_col(["Count", "Instances", "Calls"])
+    throughput_col = find_col(["Throughput", "Bandwidth"])
+
+    for line in lines[header_idx + 1:]:
+        if not line.strip() or line.startswith("#"):
+            continue
+
+        parts = [p.strip().strip('"') for p in line.split(",")]
+
+        if len(parts) <= (op_col or 0):
+            continue
+
+        transfer = {
+            "operation": parts[op_col] if op_col is not None else "Unknown",
+            "total_time_ns": 0,
+            "duration_ms": 0.0,
+            "size_bytes": 0,
+            "instances": 0,
+            "throughput_gb_s": 0.0,
+        }
+
+        try:
+            if time_col is not None and time_col < len(parts):
+                transfer["total_time_ns"] = _parse_time_to_ns(parts[time_col])
+                transfer["duration_ms"] = transfer["total_time_ns"] / 1_000_000
+
+            if size_col is not None and size_col < len(parts):
+                size_str = parts[size_col].strip().upper()
+                if "GB" in size_str:
+                    transfer["size_bytes"] = int(float(size_str.replace("GB", "").strip()) * 1e9)
+                elif "MB" in size_str:
+                    transfer["size_bytes"] = int(float(size_str.replace("MB", "").strip()) * 1e6)
+                elif "KB" in size_str:
+                    transfer["size_bytes"] = int(float(size_str.replace("KB", "").strip()) * 1e3)
+                else:
+                    transfer["size_bytes"] = int(float(size_str.replace("B", "").strip() or 0))
+
+            if count_col is not None and count_col < len(parts):
+                transfer["instances"] = int(float(parts[count_col].strip() or 0))
+
+            if throughput_col is not None and throughput_col < len(parts):
+                tp_str = parts[throughput_col].strip().upper()
+                if "GB" in tp_str:
+                    transfer["throughput_gb_s"] = float(tp_str.replace("GB/S", "").strip())
+                elif "MB" in tp_str:
+                    transfer["throughput_gb_s"] = float(tp_str.replace("MB/S", "").strip()) / 1000
+                else:
+                    transfer["throughput_gb_s"] = float(tp_str.replace("/S", "").strip() or 0) / 1e9
+
+        except (ValueError, IndexError):
+            pass
+
+        if transfer["operation"] and transfer["operation"] != "Unknown":
+            transfers.append(transfer)
+
+    return transfers
+
+
+def _analyze_local(
+    filepath: Path,
+    nsys_path: str,
+    output_dir: Path | None = None,
+    json_output: bool = False,
+) -> str:
+    """Analyze NSYS profile locally using installed nsys CLI.
+
+    Uses `nsys stats` commands to extract kernel and memory statistics.
+    This works on any machine with nsys installed - no GPU required for analysis.
+    """
+    if not filepath.exists():
+        raise FileNotFoundError(f"File must exist: {filepath}")
+    if filepath.suffix != ".nsys-rep":
+        raise ValueError(f"File must be .nsys-rep: {filepath}")
+
+    print(f"Analyzing {filepath.name} locally...", file=sys.stderr)
+
+    # Get GPU kernel summary
+    # Note: Report names changed in nsys 2024.x: gpukernsum -> cuda_gpu_kern_sum
+    print("Extracting kernel statistics...", file=sys.stderr)
+    success, kernel_output = _run_nsys_stats(nsys_path, filepath, "cuda_gpu_kern_sum")
+
+    # Try legacy report name if new one fails
+    if not success:
+        success, kernel_output = _run_nsys_stats(nsys_path, filepath, "gpukernsum")
+
+    kernels = []
+    if success:
+        kernels = _parse_csv_kernels(kernel_output)
+    else:
+        print(f"Warning: Could not extract kernel stats: {kernel_output}", file=sys.stderr)
+
+    # Get memory transfer summary
+    # Note: Report names changed in nsys 2024.x: gpumemtimesum -> cuda_gpu_mem_time_sum
+    print("Extracting memory statistics...", file=sys.stderr)
+    success, mem_output = _run_nsys_stats(nsys_path, filepath, "cuda_gpu_mem_time_sum")
+
+    # Try legacy report names if new one fails
+    if not success:
+        success, mem_output = _run_nsys_stats(nsys_path, filepath, "gpumemtimesum")
+
+    memory_transfers = []
+    if success:
+        memory_transfers = _parse_csv_memory(mem_output)
+    else:
+        # Try alternative report name (for very old nsys versions)
+        success, mem_output = _run_nsys_stats(nsys_path, filepath, "cudamemcpysum")
+        if success:
+            memory_transfers = _parse_csv_memory(mem_output)
+
+    # Get CUDA API summary for additional context
+    # Note: Report names changed in nsys 2024.x: cudaapisum -> cuda_api_sum
+    print("Extracting CUDA API statistics...", file=sys.stderr)
+    success, api_output = _run_nsys_stats(nsys_path, filepath, "cuda_api_sum")
+
+    # Try legacy report name if new one fails
+    if not success:
+        success, api_output = _run_nsys_stats(nsys_path, filepath, "cudaapisum")
+
+    # Build summary
+    total_kernel_time_ms = sum(k.get("duration_ms", 0) for k in kernels)
+    total_mem_time_ms = sum(m.get("duration_ms", 0) for m in memory_transfers)
+
+    # Try to get GPU info from report
+    gpu_name = "Unknown"
+
+    # Build result
+    result = {
+        "success": True,
+        "summary": {
+            "gpu": gpu_name,
+            "duration_ms": total_kernel_time_ms + total_mem_time_ms,
+            "kernel_count": len(kernels),
+            "memory_transfers": len(memory_transfers),
+            "total_kernel_time_ms": total_kernel_time_ms,
+            "total_memory_time_ms": total_mem_time_ms,
+        },
+        "kernels": kernels,
+        "memory_transfers": memory_transfers,
+    }
+
+    # Save to output directory if specified
+    if output_dir:
+        output_dir.mkdir(parents=True, exist_ok=True)
+        timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
+        output_filename = f"nsys_analysis_{filepath.stem}_{timestamp}"
+
+        if json_output:
+            json_path = output_dir / f"{output_filename}.json"
+            json_path.write_text(json.dumps(result, indent=2))
+            print(f"Saved JSON: {json_path}", file=sys.stderr)
+        else:
+            txt_path = output_dir / f"{output_filename}.txt"
+            txt_path.write_text(_generate_text_output(filepath.name, result))
+            print(f"Saved analysis: {txt_path}", file=sys.stderr)
+
+    print("Analysis complete.", file=sys.stderr)
+
+    if json_output:
+        return json.dumps(result, indent=2)
+    else:
+        return _generate_text_output(filepath.name, result)
+
+
+def _analyze_remote_direct(
+    filepath: Path,
+    target_name: str,
+    json_output: bool = False,
+) -> str:
+    """Analyze NSYS profile remotely via direct SSH to target.
+
+    Uploads the .nsys-rep file and runs nsys analysis on the target machine.
+    """
+    import tempfile
+
+    from .gpu_run import push_directory, run_command_capture
+    from .targets import load_target
+
+    # Load target
+    try:
+        target = load_target(target_name)
+    except FileNotFoundError as e:
+        raise RuntimeError(f"Target not found: {target_name}. Create with: wafer targets add {target_name}") from e
+
+    # Create temp directory with just the .nsys-rep file
+    workspace_name = f"nsys_analyze_{filepath.stem}"
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # Create a directory with the workspace name
+        tmp_path = Path(tmpdir) / workspace_name
+        tmp_path.mkdir()
+        shutil.copy(filepath, tmp_path / filepath.name)
+
+        # Push the file
+        print(f"Uploading {filepath.name} to {target_name}...", file=sys.stderr)
+        push_directory(tmp_path, target)
+
+        # Run nsys stats commands on remote
+        # First try to find nsys on the remote system
+        nsys_paths = [
+            "/usr/bin/nsys",
+            "/usr/local/cuda/bin/nsys",
+            "/opt/nvidia/nsight-systems/bin/nsys",
         ]
 
-
-
-
-
-
-
-
-            f"### {i}. {kernel.get('name', 'Unknown')}",
-            f"- Duration: {kernel.get('duration_ms', 0):.3f} ms",
-            f"- Grid Size: {kernel.get('grid_size', 'N/A')}",
-            f"- Block Size: {kernel.get('block_size', 'N/A')}",
-            f"- Memory Throughput: {kernel.get('memory_throughput_gb_s', 0):.2f} GB/s",
-            "",
-        ])
+        nsys_cmd = "nsys"  # Default to PATH
+        for path in nsys_paths:
+            check_cmd = f"test -x {path} && echo found"
+            exit_code, output = run_command_capture(check_cmd, workspace_name, target)
+            if exit_code == 0 and "found" in output:
+                nsys_cmd = path
+                break
 
-    #
-
-
-
-
-            "",
-        ])
-    for diag in diagnostics:
-        level = diag.get("level", "Info")
-        text = diag.get("text", "")
-        lines.append(f"- [{level}] {text}")
-    lines.append("")
+        # Run analysis commands
+        # Try new report name first, fall back to legacy if it fails
+        print("Running NSYS analysis...", file=sys.stderr)
+        analysis_cmd = f"{nsys_cmd} stats --report cuda_gpu_kern_sum --format csv --force-export true {filepath.name}"
+        exit_code, kernel_output = run_command_capture(analysis_cmd, workspace_name, target)
 
-
+        # Try legacy report name if new one fails
+        if exit_code != 0 or "could not be found" in kernel_output.lower():
+            analysis_cmd = f"{nsys_cmd} stats --report gpukernsum --format csv --force-export true {filepath.name}"
+            exit_code, kernel_output = run_command_capture(analysis_cmd, workspace_name, target)
+
+        if exit_code != 0:
+            raise RuntimeError(f"NSYS kernel stats failed: {kernel_output}")
+
+        # Get memory stats - try new name first, fall back to legacy
+        mem_cmd = f"{nsys_cmd} stats --report cuda_gpu_mem_time_sum --format csv --force-export true {filepath.name}"
+        exit_code, mem_output = run_command_capture(mem_cmd, workspace_name, target)
+
+        # Try legacy report name if new one fails
+        if exit_code != 0 or "could not be found" in mem_output.lower():
+            mem_cmd = f"{nsys_cmd} stats --report gpumemtimesum --format csv --force-export true {filepath.name}"
+            exit_code, mem_output = run_command_capture(mem_cmd, workspace_name, target)
+
+        # Parse outputs (memory stats may fail if no memory transfers)
+        kernels = _parse_csv_kernels(kernel_output) if kernel_output else []
+        memory_transfers = _parse_csv_memory(mem_output) if exit_code == 0 and mem_output else []
+
+        # Build result
+        total_kernel_time_ms = sum(k.get("duration_ms", 0) for k in kernels)
+        total_mem_time_ms = sum(m.get("duration_ms", 0) for m in memory_transfers)
+
+        result = {
+            "success": True,
+            "summary": {
+                "gpu": "Unknown",  # Would need additional parsing to get GPU name
+                "duration_ms": total_kernel_time_ms + total_mem_time_ms,
+                "kernel_count": len(kernels),
+                "memory_transfers": len(memory_transfers),
+            },
+            "kernels": kernels,
+            "memory_transfers": memory_transfers,
+        }
+
+        if json_output:
+            return json.dumps(result, indent=2)
+        else:
+            return _generate_text_output(filepath.name, result)
+
+
+def _analyze_workspace(
+    filepath: Path,
+    workspace_id: str,
+    json_output: bool = False,
+) -> str:
+    """Analyze NSYS profile on a Wafer workspace.
+
+    Uses workspace exec to run nsys analysis on the workspace.
+    """
+    from .workspaces import exec_command_capture
+
+    # First, check if file exists on workspace or needs upload
+    # For now, assume file is already on workspace (via sync)
+    filename = filepath.name
+
+    print(f"Running NSYS analysis on workspace {workspace_id}...", file=sys.stderr)
+
+    # Try to find nsys on the workspace
+    nsys_cmd = "nsys"
+    for path in ["/usr/bin/nsys", "/usr/local/cuda/bin/nsys", "/opt/nvidia/nsight-systems/bin/nsys"]:
+        check_cmd = f"test -x {path} && echo found"
+        exit_code, output = exec_command_capture(workspace_id, check_cmd)
+        if exit_code == 0 and "found" in output:
+            nsys_cmd = path
+            break
+
+    # Run kernel stats - try new report name first, fall back to legacy
+    print("Extracting kernel statistics...", file=sys.stderr)
+    kernel_cmd = f"{nsys_cmd} stats --report cuda_gpu_kern_sum --format csv --force-export true {filename}"
+    exit_code, kernel_output = exec_command_capture(workspace_id, kernel_cmd)
+
+    # Try legacy report name if new one fails
+    if exit_code != 0 or "could not be found" in kernel_output.lower():
+        kernel_cmd = f"{nsys_cmd} stats --report gpukernsum --format csv --force-export true {filename}"
+        exit_code, kernel_output = exec_command_capture(workspace_id, kernel_cmd)
+
+    if exit_code != 0:
+        raise RuntimeError(f"NSYS kernel stats failed on workspace: {kernel_output}")
+
+    # Run memory stats - try new report name first, fall back to legacy
+    print("Extracting memory statistics...", file=sys.stderr)
+    mem_cmd = f"{nsys_cmd} stats --report cuda_gpu_mem_time_sum --format csv --force-export true {filename}"
+    exit_code, mem_output = exec_command_capture(workspace_id, mem_cmd)
+
+    # Try legacy report name if new one fails
+    if exit_code != 0 or "could not be found" in mem_output.lower():
+        mem_cmd = f"{nsys_cmd} stats --report gpumemtimesum --format csv --force-export true {filename}"
+        exit_code, mem_output = exec_command_capture(workspace_id, mem_cmd)
+
+    # Parse outputs
+    kernels = _parse_csv_kernels(kernel_output) if kernel_output else []
+    memory_transfers = _parse_csv_memory(mem_output) if exit_code == 0 and mem_output else []
+
+    # Build result
+    total_kernel_time_ms = sum(k.get("duration_ms", 0) for k in kernels)
+    total_mem_time_ms = sum(m.get("duration_ms", 0) for m in memory_transfers)
+
+    result = {
+        "success": True,
+        "summary": {
+            "gpu": "Unknown",
+            "duration_ms": total_kernel_time_ms + total_mem_time_ms,
+            "kernel_count": len(kernels),
+            "memory_transfers": len(memory_transfers),
+        },
+        "kernels": kernels,
+        "memory_transfers": memory_transfers,
+    }
+
+    if json_output:
+        return json.dumps(result, indent=2)
+    else:
+        return _generate_text_output(filepath.name, result)
 
 
 def _analyze_remote_api(
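The new CSV parsers are tolerant of the informational preamble `nsys stats` prints before the table. A hedged sketch of what they accept (the header and row below are illustrative, not captured from a real run):

```python
# Illustrative input only; real `nsys stats --format csv` output also carries
# informational lines, which the header-detection loop above skips.
from wafer.nsys_analyze import _parse_csv_kernels, _parse_time_to_ns

csv = (
    "Time (%),Total Time,Instances,Avg,Min,Max,Name\n"
    '62.5,1.25ms,10,125us,100us,150us,"vec_add(float*, float*)"\n'
)
kernels = _parse_csv_kernels(csv)
assert kernels[0]["duration_ms"] == 1.25      # "1.25ms" -> 1_250_000 ns
assert kernels[0]["instances"] == 10
assert _parse_time_to_ns("150us") == 150_000  # unit-suffix handling
```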
@@ -110,10 +809,10 @@ def _analyze_remote_api(
 
     Uploads the .nsys-rep file and runs analysis on Modal.
     """
-
-
-
-
+    if not filepath.exists():
+        raise FileNotFoundError(f"File must exist: {filepath}")
+    if filepath.suffix != ".nsys-rep":
+        raise ValueError(f"File must be .nsys-rep: {filepath}")
 
     import httpx
 
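These guards fail fast, before anything is uploaded. A hypothetical pytest sketch (the test itself is ours, and the positional signature of `_analyze_remote_api` is inferred from the call sites later in this diff):

```python
# Hypothetical test sketch; not part of the package.
import pytest
from pathlib import Path
from wafer.nsys_analyze import _analyze_remote_api

def test_rejects_wrong_suffix(tmp_path: Path) -> None:
    bad = tmp_path / "trace.txt"
    bad.write_text("not a profile")
    # The suffix check raises before httpx is imported or any upload starts.
    with pytest.raises(ValueError):
        _analyze_remote_api(bad, False)
```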
@@ -123,7 +822,8 @@ def _analyze_remote_api(
     api_url = get_api_url()
     headers = get_auth_headers()
 
-
+    if not api_url:
+        raise ValueError("API URL must be configured")
 
     # Use multipart/form-data upload
     print(f"Uploading {filepath.name} for analysis...", file=sys.stderr)
@@ -153,7 +853,8 @@ def _analyze_remote_api(
         raise RuntimeError(f"Analysis failed: {result.get('error', 'Unknown error')}")
 
     # Validate response structure
-
+    if not isinstance(result, dict):
+        raise TypeError("API must return a dictionary")
 
     if json_output:
         return json.dumps(result, indent=2)
@@ -161,10 +862,127 @@
         return _generate_text_output(filepath.name, result)
 
 
+def _generate_text_output(filename: str, result: dict) -> str:
+    """Generate human-readable markdown text from analysis result."""
+    if not filename:
+        raise ValueError("filename must be non-empty")
+    if not isinstance(result, dict):
+        raise TypeError("result must be a dictionary")
+
+    timestamp = datetime.now().isoformat()
+    summary = result.get("summary", {})
+    kernels = result.get("kernels", [])
+    memory_transfers = result.get("memory_transfers", [])
+
+    lines = [
+        "# NSYS Profiling Analysis",
+        f"Source: {filename}",
+        f"Generated: {timestamp}",
+        "",
+        "## Summary",
+        f"- GPU: {summary.get('gpu', 'Unknown')}",
+        f"- Total Duration: {summary.get('duration_ms', 0):.2f} ms",
+        f"- Kernel Count: {summary.get('kernel_count', 0)}",
+        f"- Memory Transfers: {summary.get('memory_transfers', 0)}",
+        "",
+    ]
+
+    if kernels:
+        lines.extend([
+            "## GPU Kernels",
+            "",
+            "| Kernel | Time (ms) | Instances | Avg (ms) |",
+            "|--------|-----------|-----------|----------|",
+        ])
+
+        # Sort by duration descending
+        sorted_kernels = sorted(kernels, key=lambda k: k.get("duration_ms", 0), reverse=True)
+
+        for kernel in sorted_kernels[:20]:  # Top 20 kernels
+            name = kernel.get("name", "Unknown")
+            # Truncate long kernel names
+            if len(name) > 50:
+                name = name[:47] + "..."
+            duration = kernel.get("duration_ms", 0)
+            instances = kernel.get("instances", 0)
+            avg = kernel.get("avg_time_ns", 0) / 1_000_000 if kernel.get("avg_time_ns") else 0
+
+            lines.append(f"| {name} | {duration:.3f} | {instances} | {avg:.4f} |")
+
+        if len(kernels) > 20:
+            lines.append(f"| ... and {len(kernels) - 20} more kernels | | | |")
+
+        lines.append("")
+
+    if memory_transfers:
+        lines.extend([
+            "## Memory Transfers",
+            "",
+            "| Operation | Time (ms) | Size | Instances |",
+            "|-----------|-----------|------|-----------|",
+        ])
+
+        for transfer in memory_transfers:
+            op = transfer.get("operation", "Unknown")
+            duration = transfer.get("duration_ms", 0)
+            size_bytes = transfer.get("size_bytes", 0)
+            size_str = _format_bytes(size_bytes)
+            instances = transfer.get("instances", 0)
+
+            lines.append(f"| {op} | {duration:.3f} | {size_str} | {instances} |")
+
+        lines.append("")
+
+    # Add diagnostics if present
+    diagnostics = result.get("diagnostics", [])
+    if diagnostics:
+        lines.extend([
+            "## Diagnostics",
+            "",
+        ])
+        for diag in diagnostics:
+            level = diag.get("level", "Info")
+            text = diag.get("text", "")
+            lines.append(f"- [{level}] {text}")
+        lines.append("")
+
+    return "\n".join(lines)
+
+
+def _format_bytes(size_bytes: int) -> str:
+    """Format bytes into human-readable string."""
+    if size_bytes >= 1e9:
+        return f"{size_bytes / 1e9:.2f} GB"
+    elif size_bytes >= 1e6:
+        return f"{size_bytes / 1e6:.2f} MB"
+    elif size_bytes >= 1e3:
+        return f"{size_bytes / 1e3:.2f} KB"
+    else:
+        return f"{size_bytes} B"
+
+
+def _parse_target(target: str) -> tuple[str, str]:
+    """Parse target string into type and identifier.
+
+    Supports:
+    - "workspace:abc123" -> ("workspace", "abc123")
+    - "vultr-b200" -> ("target", "vultr-b200")
+
+    Returns:
+        Tuple of (target_type, identifier)
+    """
+    if target.startswith("workspace:"):
+        return "workspace", target[len("workspace:"):]
+    else:
+        return "target", target
+
+
 def analyze_nsys_profile(
     filepath: Path,
     json_output: bool = False,
     remote: bool | None = None,
+    target: str | None = None,
+    output_dir: Path | None = None,
 ) -> str:
     """Analyze an NSYS profile file and return results.
 
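The helpers introduced in this hunk are pure functions, so their behavior is easy to pin down. A sketch (the asserted values follow directly from the code above):

```python
from wafer.nsys_analyze import _format_bytes, _parse_target

assert _parse_target("workspace:abc123") == ("workspace", "abc123")
assert _parse_target("vultr-b200") == ("target", "vultr-b200")

assert _format_bytes(512) == "512 B"
assert _format_bytes(1_500_000) == "1.50 MB"
assert _format_bytes(2_000_000_000) == "2.00 GB"
```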
@@ -173,6 +991,8 @@ def analyze_nsys_profile(
         json_output: If True, return raw JSON; otherwise return formatted text
         remote: If True, force remote analysis via API. If False, force local.
             If None (default), auto-detect: use local if nsys available, else remote.
+        target: Remote target - either "workspace:id" or target name from ~/.wafer/targets/
+        output_dir: Optional directory to save analysis results
 
     Returns:
         Analysis results as string (JSON or markdown)
@@ -187,26 +1007,36 @@
     if filepath.suffix != ".nsys-rep":
         raise ValueError(f"Expected .nsys-rep file, got: {filepath.suffix}")
 
+    # If target is specified, use appropriate remote execution
+    if target:
+        target_type, target_id = _parse_target(target)
+
+        if target_type == "workspace":
+            return _analyze_workspace(filepath, target_id, json_output)
+        else:
+            return _analyze_remote_direct(filepath, target_id, json_output)
+
+    # Check for local nsys installation
     nsys_path = _find_nsys()
 
     # Determine whether to use local or remote
     use_remote = remote
     if use_remote is None:
-        # Auto-detect: use
+        # Auto-detect: use local if nsys available, else remote
        use_remote = nsys_path is None
 
     if use_remote:
         return _analyze_remote_api(filepath, json_output)
     else:
-        # Local analysis not yet implemented - would need to copy nsys_parser to wafer-core
-        # For now, suggest using remote
        if nsys_path is None:
+            if is_macos():
+                raise FileNotFoundError(
+                    "NSYS CLI is not available on macOS (only GUI viewer is provided). "
+                    "Use --remote flag for API-based analysis or --target for workspace/SSH analysis."
+                )
            install_cmd = _get_install_command()
            raise FileNotFoundError(
                f"NSYS not installed locally. Use --remote flag or install with: {install_cmd}"
            )
 
-
-    raise NotImplementedError(
-        "Local NSYS analysis not yet implemented. Use --remote flag to analyze via API."
-    )
+    return _analyze_local(filepath, nsys_path, output_dir, json_output)