wafer_cli-0.2.14-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wafer/nsys_analyze.py ADDED
@@ -0,0 +1,1042 @@
1
+ """NSYS Analyze - Parse and analyze .nsys-rep profile files.
2
+
3
+ This module provides the implementation for the `wafer nvidia nsys analyze` command.
4
+ Supports local analysis (when nsys is installed), remote analysis via API,
5
+ direct SSH analysis via targets, and workspace execution.
6
+
7
+ Local analysis uses `nsys stats` and `nsys export` commands, which work on any machine
8
+ with nsys installed (no GPU required for analysis, only for profiling).
9
+ """
10
+
11
+ import json
12
+ import os
13
+ import platform
14
+ import shutil
15
+ import subprocess
16
+ import sys
17
+ from dataclasses import dataclass
18
+ from datetime import datetime
19
+ from pathlib import Path
20
+
21
+ # Known NSYS installation paths by platform
22
+ # NOTE: On macOS, NVIDIA only provides the GUI viewer (nsys-ui), NOT the CLI tool.
23
+ # The nsys CLI is not provided for macOS; macOS users must use remote analysis.
24
+ NSYS_PATHS = {
25
+ "linux": [
26
+ "/usr/bin/nsys",
27
+ "/usr/local/bin/nsys",
28
+ "/usr/local/cuda/bin/nsys",
29
+ "/opt/nvidia/nsight-systems/bin/nsys",
30
+ "/opt/nvidia/nsight-systems-cli/bin/nsys",
31
+ ],
32
+ # macOS: nsys CLI not available - only GUI viewer exists
33
+ # Set to empty list to always fall back to remote analysis
34
+ "darwin": [],
35
+ "windows": [
36
+ r"C:\Program Files\NVIDIA Corporation\Nsight Systems\bin\nsys.exe",
37
+ r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.0\bin\nsys.exe",
38
+ r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin\nsys.exe",
39
+ ],
40
+ }
41
+
42
+
43
+ @dataclass(frozen=True)
44
+ class NSYSCheckResult:
45
+ """Result of checking NSYS installation."""
46
+
47
+ installed: bool
48
+ path: str | None = None
49
+ version: str | None = None
50
+ install_command: str | None = None
51
+
52
+
53
+ @dataclass(frozen=True)
54
+ class KernelInfo:
55
+ """Information about a CUDA kernel from NSYS profile."""
56
+
57
+ name: str
58
+ duration_ns: int
59
+ duration_ms: float
60
+ instances: int
61
+ avg_duration_ns: float
62
+ min_duration_ns: int
63
+ max_duration_ns: int
64
+ grid_size: str | None = None
65
+ block_size: str | None = None
66
+ registers_per_thread: int | None = None
67
+ shared_memory_bytes: int | None = None
68
+ memory_throughput_gb_s: float | None = None
69
+
70
+
71
+ @dataclass(frozen=True)
72
+ class MemoryTransfer:
73
+ """Information about a memory transfer from NSYS profile."""
74
+
75
+ operation: str # HtoD, DtoH, DtoD, etc.
76
+ duration_ns: int
77
+ size_bytes: int
78
+ throughput_gb_s: float
79
+ instances: int
80
+
81
+
82
+ @dataclass(frozen=True)
83
+ class NSYSAnalysisResult:
84
+ """Complete NSYS analysis result."""
85
+
86
+ success: bool
87
+ report_id: str | None = None
88
+ gpu: str = "Unknown"
89
+ duration_ms: float = 0.0
90
+ kernel_count: int = 0
91
+ memory_transfer_count: int = 0
92
+ kernels: list[dict] | None = None
93
+ memory_transfers: list[dict] | None = None
94
+ timeline: list[dict] | None = None
95
+ diagnostics: list[dict] | None = None
96
+ error: str | None = None
97
+
98
+
99
+ def _get_platform() -> str:
100
+ """Get normalized platform name."""
101
+ system = platform.system().lower()
102
+ if system == "darwin":
103
+ return "darwin"
104
+ elif system == "windows":
105
+ return "windows"
106
+ return "linux"
107
+
108
+
109
+ def _find_nsys() -> str | None:
110
+ """Find nsys executable on the system.
111
+
112
+ Searches in order:
113
+ 1. PATH environment variable
114
+ 2. Common installation paths for the current platform
115
+ """
116
+ # First check PATH
117
+ nsys = shutil.which("nsys")
118
+ if nsys:
119
+ return nsys
120
+
121
+ # Then check known installation paths
122
+ plat = _get_platform()
123
+ for path in NSYS_PATHS.get(plat, []):
124
+ if os.path.isfile(path) and os.access(path, os.X_OK):
125
+ return path
126
+
127
+ return None
128
+
129
+
130
+ def _get_nsys_version(nsys_path: str) -> str | None:
131
+ """Get NSYS version string."""
132
+ try:
133
+ result = subprocess.run(
134
+ [nsys_path, "--version"],
135
+ capture_output=True,
136
+ text=True,
137
+ timeout=10,
138
+ )
139
+ if result.returncode == 0:
140
+ # Parse version from output like "NVIDIA Nsight Systems version 2024.6.1.90-246160830v0"
141
+ for line in result.stdout.split("\n"):
142
+ if "version" in line.lower():
143
+ parts = line.split("version")
144
+ if len(parts) >= 2:
145
+ return parts[1].strip().split()[0]
146
+ return result.stdout.strip().split("\n")[0]
147
+ except (subprocess.TimeoutExpired, OSError):
148
+ pass
149
+ return None
150
+
151
+
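+ # Example (traced from the parsing above): for an output line such as
+ #   "NVIDIA Nsight Systems version 2024.6.1.90-246160830v0"
+ # splitting on "version" leaves " 2024.6.1.90-246160830v0", and
+ # strip().split()[0] returns "2024.6.1.90-246160830v0".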
152
+ def _get_install_command() -> str:
153
+ """Get platform-appropriate install command for NSYS."""
154
+ plat = _get_platform()
155
+
156
+ if plat == "darwin":
157
+ # macOS only has GUI viewer, no CLI - user must use remote analysis
158
+ return "NSYS CLI not available on macOS. Use --remote flag or --target for remote analysis."
159
+
160
+ if plat == "linux":
161
+ if shutil.which("apt-get") or shutil.which("apt"):
162
+ return "sudo apt install nsight-systems"
163
+ elif shutil.which("dnf"):
164
+ return "sudo dnf install nsight-systems"
165
+ elif shutil.which("yum"):
166
+ return "sudo yum install nsight-systems"
167
+ elif shutil.which("pacman"):
168
+ return "sudo pacman -S nsight-systems"
169
+
170
+ if shutil.which("conda"):
171
+ return "conda install -c nvidia nsight-systems"
172
+
173
+ return "Download from https://developer.nvidia.com/nsight-systems"
174
+
175
+
176
+ def is_macos() -> bool:
177
+ """Check if running on macOS."""
178
+ return _get_platform() == "darwin"
179
+
180
+
181
+ def check_nsys_installation() -> NSYSCheckResult:
182
+ """Check if NSYS is installed and return details.
183
+
184
+ Returns:
185
+ NSYSCheckResult with installation status and details
186
+ """
187
+ nsys_path = _find_nsys()
188
+
189
+ if nsys_path is None:
190
+ return NSYSCheckResult(
191
+ installed=False,
192
+ install_command=_get_install_command(),
193
+ )
194
+
195
+ version = _get_nsys_version(nsys_path)
196
+
197
+ return NSYSCheckResult(
198
+ installed=True,
199
+ path=nsys_path,
200
+ version=version,
201
+ )
202
+
203
+
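+ # Illustrative caller sketch (not part of the CLI wiring): how the check result
+ # might be surfaced to a user.
+ #
+ #   check = check_nsys_installation()
+ #   if check.installed:
+ #       print(f"nsys {check.version or 'unknown version'} at {check.path}")
+ #   else:
+ #       print(f"nsys not found; install with: {check.install_command}")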
204
+ def _run_nsys_stats(
205
+ nsys_path: str,
206
+ filepath: Path,
207
+ report_name: str,
208
+ timeout: int = 120,
209
+ ) -> tuple[bool, str]:
210
+ """Run nsys stats command to extract report data.
211
+
212
+ Args:
213
+ nsys_path: Path to nsys executable
214
+ filepath: Path to .nsys-rep file
215
+ report_name: Report type (e.g., gpukernsum, gpumemtimesum, cudaapisum)
216
+ timeout: Command timeout in seconds
217
+
218
+ Returns:
219
+ Tuple of (success, output_or_error)
220
+ """
221
+ try:
222
+ result = subprocess.run(
223
+ [
224
+ nsys_path,
225
+ "stats",
226
+ "--report", report_name,
227
+ "--format", "csv",
228
+ "--force-export", "true",
229
+ str(filepath),
230
+ ],
231
+ capture_output=True,
232
+ text=True,
233
+ timeout=timeout,
234
+ )
235
+
236
+ if result.returncode != 0:
237
+ error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
238
+ return False, error_msg
239
+
240
+ return True, result.stdout
241
+
242
+ except subprocess.TimeoutExpired:
243
+ return False, f"Command timed out after {timeout}s"
244
+ except OSError as e:
245
+ return False, f"Failed to execute nsys: {e}"
246
+
247
+
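+ # For reference, _run_nsys_stats("/usr/bin/nsys", Path("profile.nsys-rep"), "cuda_gpu_kern_sum")
+ # builds roughly this command line (paths illustrative):
+ #
+ #   /usr/bin/nsys stats --report cuda_gpu_kern_sum --format csv --force-export true profile.nsys-rep
+ #
+ # and returns (True, <captured CSV text>) on success or (False, <error text>) otherwise.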
248
+ def _run_nsys_export(
249
+ nsys_path: str,
250
+ filepath: Path,
251
+ output_format: str = "sqlite",
252
+ timeout: int = 180,
253
+ ) -> tuple[bool, str]:
254
+ """Run nsys export command to export trace data.
255
+
256
+ Args:
257
+ nsys_path: Path to nsys executable
258
+ filepath: Path to .nsys-rep file
259
+ output_format: Export format (sqlite, json)
260
+ timeout: Command timeout in seconds
261
+
262
+ Returns:
263
+ Tuple of (success, output_path_or_error)
264
+ """
265
+ # Determine output path
266
+ output_path = filepath.with_suffix(f".{output_format}")
267
+
268
+ try:
269
+ result = subprocess.run(
270
+ [
271
+ nsys_path,
272
+ "export",
273
+ "--type", output_format,
274
+ "--force-overwrite", "true",
275
+ "--output", str(output_path),
276
+ str(filepath),
277
+ ],
278
+ capture_output=True,
279
+ text=True,
280
+ timeout=timeout,
281
+ )
282
+
283
+ if result.returncode != 0:
284
+ error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
285
+ return False, error_msg
286
+
287
+ return True, str(output_path)
288
+
289
+ except subprocess.TimeoutExpired:
290
+ return False, f"Export timed out after {timeout}s"
291
+ except OSError as e:
292
+ return False, f"Failed to execute nsys: {e}"
293
+
294
+
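+ # For reference, _run_nsys_export("/usr/bin/nsys", Path("profile.nsys-rep")) builds roughly:
+ #
+ #   /usr/bin/nsys export --type sqlite --force-overwrite true --output profile.sqlite profile.nsys-rep
+ #
+ # and returns (True, "profile.sqlite") on success.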
295
+ def _parse_csv_kernels(csv_output: str) -> list[dict]:
296
+ """Parse GPU kernel summary from nsys stats CSV output."""
297
+ kernels = []
298
+
299
+ lines = csv_output.strip().split("\n")
300
+ if len(lines) < 2:
301
+ return kernels
302
+
303
+ # Find header line - look for a line with known CSV header columns
304
+ # The nsys output includes informational lines before the actual CSV
305
+ # Header line should contain "Time" and "Name" columns
306
+ header_idx = -1
307
+ for i, line in enumerate(lines):
308
+ line_lower = line.lower()
309
+ # Skip comment lines and non-CSV lines
310
+ if line.startswith("#"):
311
+ continue
312
+ # Check if this looks like a CSV header with expected columns
313
+ if ("time" in line_lower and "name" in line_lower) or \
314
+ ("time (%)" in line_lower) or \
315
+ ("total time" in line_lower and "instances" in line_lower):
316
+ header_idx = i
317
+ break
318
+
319
+ if header_idx < 0 or header_idx >= len(lines) - 1:
320
+ return kernels
321
+
322
+ headers = [h.strip().strip('"') for h in lines[header_idx].split(",")]
323
+
324
+ # Map header names to indices
325
+ def find_col(names: list[str]) -> int | None:
326
+ for name in names:
327
+ name_lower = name.lower()
328
+ for i, h in enumerate(headers):
329
+ if name_lower in h.lower():
330
+ return i
331
+ return None
332
+
333
+ name_col = find_col(["Name", "Kernel Name", "KernelName"])
334
+ time_col = find_col(["Time (%)", "Time Percent", "Time%"])
335
+ total_time_col = find_col(["Total Time", "TotalTime", "Duration"])
336
+ instances_col = find_col(["Instances", "Count", "Calls"])
337
+ avg_col = find_col(["Avg", "Average", "AvgTime"])
338
+ min_col = find_col(["Min", "Minimum", "MinTime"])
339
+ max_col = find_col(["Max", "Maximum", "MaxTime"])
340
+
341
+ # Parse data rows
342
+ for line in lines[header_idx + 1:]:
343
+ if not line.strip() or line.startswith("#"):
344
+ continue
345
+
346
+ # Handle CSV with quoted fields
347
+ parts = []
348
+ in_quote = False
349
+ current = ""
350
+ for char in line:
351
+ if char == '"':
352
+ in_quote = not in_quote
353
+ elif char == "," and not in_quote:
354
+ parts.append(current.strip().strip('"'))
355
+ current = ""
356
+ else:
357
+ current += char
358
+ parts.append(current.strip().strip('"'))
359
+
360
+ if len(parts) <= (name_col or 0):
361
+ continue
362
+
363
+ kernel = {
364
+ "name": parts[name_col] if name_col is not None else "Unknown",
365
+ "time_percent": 0.0,
366
+ "total_time_ns": 0,
367
+ "duration_ms": 0.0,
368
+ "instances": 0,
369
+ "avg_time_ns": 0,
370
+ "min_time_ns": 0,
371
+ "max_time_ns": 0,
372
+ }
373
+
374
+ try:
375
+ if time_col is not None and time_col < len(parts):
376
+ kernel["time_percent"] = float(parts[time_col].replace("%", "").strip() or 0)
377
+
378
+ if total_time_col is not None and total_time_col < len(parts):
379
+ # Time may be in ns, us, or ms - parse accordingly
380
+ time_str = parts[total_time_col].strip()
381
+ kernel["total_time_ns"] = _parse_time_to_ns(time_str)
382
+ kernel["duration_ms"] = kernel["total_time_ns"] / 1_000_000
383
+
384
+ if instances_col is not None and instances_col < len(parts):
385
+ kernel["instances"] = int(float(parts[instances_col].strip() or 0))
386
+
387
+ if avg_col is not None and avg_col < len(parts):
388
+ kernel["avg_time_ns"] = _parse_time_to_ns(parts[avg_col].strip())
389
+
390
+ if min_col is not None and min_col < len(parts):
391
+ kernel["min_time_ns"] = _parse_time_to_ns(parts[min_col].strip())
392
+
393
+ if max_col is not None and max_col < len(parts):
394
+ kernel["max_time_ns"] = _parse_time_to_ns(parts[max_col].strip())
395
+
396
+ except (ValueError, IndexError):
397
+ pass
398
+
399
+ if kernel["name"] and kernel["name"] != "Unknown":
400
+ kernels.append(kernel)
401
+
402
+ return kernels
403
+
404
+
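+ # Example (illustrative; the exact CSV header layout varies across nsys versions).
+ # Given output containing:
+ #
+ #   Time (%),Total Time (ns),Instances,Avg (ns),Min (ns),Max (ns),Name
+ #   45.2,12345678,100,123456.8,98000,150000,"vecAdd(float const*, float const*, float*)"
+ #
+ # the parser above yields a single entry:
+ #
+ #   {"name": "vecAdd(float const*, float const*, float*)", "time_percent": 45.2,
+ #    "total_time_ns": 12345678, "duration_ms": 12.345678, "instances": 100,
+ #    "avg_time_ns": 123456, "min_time_ns": 98000, "max_time_ns": 150000}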
405
+ def _parse_time_to_ns(time_str: str) -> int:
406
+ """Parse time string to nanoseconds."""
407
+ if not time_str:
408
+ return 0
409
+
410
+ time_str = time_str.strip().lower()
411
+
412
+ try:
413
+ if "ms" in time_str:
414
+ return int(float(time_str.replace("ms", "").strip()) * 1_000_000)
415
+ elif "us" in time_str or "µs" in time_str:
416
+ return int(float(time_str.replace("us", "").replace("µs", "").strip()) * 1_000)
417
+ elif "ns" in time_str:
418
+ return int(float(time_str.replace("ns", "").strip()))
419
+ elif "s" in time_str:
420
+ return int(float(time_str.replace("s", "").strip()) * 1_000_000_000)
421
+ else:
422
+ # Assume nanoseconds
423
+ return int(float(time_str))
424
+ except ValueError:
425
+ return 0
426
+
427
+
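+ # Examples (traced from the branches above):
+ #   _parse_time_to_ns("1.5 ms") -> 1_500_000
+ #   _parse_time_to_ns("250 us") -> 250_000
+ #   _parse_time_to_ns("3 s")    -> 3_000_000_000
+ #   _parse_time_to_ns("42")     -> 42    (bare numbers are treated as nanoseconds)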
428
+ def _parse_csv_memory(csv_output: str) -> list[dict]:
429
+ """Parse memory transfer summary from nsys stats CSV output."""
430
+ transfers = []
431
+
432
+ lines = csv_output.strip().split("\n")
433
+ if len(lines) < 2:
434
+ return transfers
435
+
436
+ # Find header line - look for a line with known CSV header columns
437
+ # The nsys output includes informational lines before the actual CSV
438
+ header_idx = -1
439
+ for i, line in enumerate(lines):
440
+ line_lower = line.lower()
441
+ # Skip comment lines
442
+ if line.startswith("#"):
443
+ continue
444
+ # Check if this looks like a CSV header with expected columns
445
+ if ("time" in line_lower and ("operation" in line_lower or "total" in line_lower)) or \
446
+ ("time (%)" in line_lower) or \
447
+ ("count" in line_lower and "total" in line_lower):
448
+ header_idx = i
449
+ break
450
+
451
+ if header_idx < 0 or header_idx >= len(lines) - 1:
452
+ return transfers
453
+
454
+ headers = [h.strip().strip('"') for h in lines[header_idx].split(",")]
455
+
456
+ # Map header names
457
+ def find_col(names: list[str]) -> int | None:
458
+ for name in names:
459
+ name_lower = name.lower()
460
+ for i, h in enumerate(headers):
461
+ if name_lower in h.lower():
462
+ return i
463
+ return None
464
+
465
+ op_col = find_col(["Operation", "Name", "MemOp"])
466
+ time_col = find_col(["Total Time", "TotalTime", "Duration"])
467
+ # Avoid "Total" matching the "Total Time" column when no size column is present
+ size_col = find_col(["Size", "Bytes", "Total"])
+ if size_col == time_col:
+ size_col = None
468
+ count_col = find_col(["Count", "Instances", "Calls"])
469
+ throughput_col = find_col(["Throughput", "Bandwidth"])
470
+
471
+ for line in lines[header_idx + 1:]:
472
+ if not line.strip() or line.startswith("#"):
473
+ continue
474
+
475
+ parts = [p.strip().strip('"') for p in line.split(",")]
476
+
477
+ if len(parts) <= (op_col or 0):
478
+ continue
479
+
480
+ transfer = {
481
+ "operation": parts[op_col] if op_col is not None else "Unknown",
482
+ "total_time_ns": 0,
483
+ "duration_ms": 0.0,
484
+ "size_bytes": 0,
485
+ "instances": 0,
486
+ "throughput_gb_s": 0.0,
487
+ }
488
+
489
+ try:
490
+ if time_col is not None and time_col < len(parts):
491
+ transfer["total_time_ns"] = _parse_time_to_ns(parts[time_col])
492
+ transfer["duration_ms"] = transfer["total_time_ns"] / 1_000_000
493
+
494
+ if size_col is not None and size_col < len(parts):
495
+ size_str = parts[size_col].strip().upper()
496
+ if "GB" in size_str:
497
+ transfer["size_bytes"] = int(float(size_str.replace("GB", "").strip()) * 1e9)
498
+ elif "MB" in size_str:
499
+ transfer["size_bytes"] = int(float(size_str.replace("MB", "").strip()) * 1e6)
500
+ elif "KB" in size_str:
501
+ transfer["size_bytes"] = int(float(size_str.replace("KB", "").strip()) * 1e3)
502
+ else:
503
+ transfer["size_bytes"] = int(float(size_str.replace("B", "").strip() or 0))
504
+
505
+ if count_col is not None and count_col < len(parts):
506
+ transfer["instances"] = int(float(parts[count_col].strip() or 0))
507
+
508
+ if throughput_col is not None and throughput_col < len(parts):
509
+ tp_str = parts[throughput_col].strip().upper()
510
+ if "GB" in tp_str:
511
+ transfer["throughput_gb_s"] = float(tp_str.replace("GB/S", "").strip())
512
+ elif "MB" in tp_str:
513
+ transfer["throughput_gb_s"] = float(tp_str.replace("MB/S", "").strip()) / 1000
514
+ else:
515
+ transfer["throughput_gb_s"] = float(tp_str.replace("/S", "").strip() or 0) / 1e9
516
+
517
+ except (ValueError, IndexError):
518
+ pass
519
+
520
+ if transfer["operation"] and transfer["operation"] != "Unknown":
521
+ transfers.append(transfer)
522
+
523
+ return transfers
524
+
525
+
526
+ def _analyze_local(
527
+ filepath: Path,
528
+ nsys_path: str,
529
+ output_dir: Path | None = None,
530
+ json_output: bool = False,
531
+ ) -> str:
532
+ """Analyze NSYS profile locally using installed nsys CLI.
533
+
534
+ Uses `nsys stats` commands to extract kernel and memory statistics.
535
+ This works on any machine with nsys installed - no GPU required for analysis.
536
+ """
537
+ if not filepath.exists():
538
+ raise FileNotFoundError(f"File not found: {filepath}")
539
+ if filepath.suffix != ".nsys-rep":
540
+ raise ValueError(f"Expected a .nsys-rep file: {filepath}")
541
+
542
+ print(f"Analyzing {filepath.name} locally...", file=sys.stderr)
543
+
544
+ # Get GPU kernel summary
545
+ # Note: Report names changed in nsys 2024.x: gpukernsum -> cuda_gpu_kern_sum
546
+ print("Extracting kernel statistics...", file=sys.stderr)
547
+ success, kernel_output = _run_nsys_stats(nsys_path, filepath, "cuda_gpu_kern_sum")
548
+
549
+ # Try legacy report name if new one fails
550
+ if not success:
551
+ success, kernel_output = _run_nsys_stats(nsys_path, filepath, "gpukernsum")
552
+
553
+ kernels = []
554
+ if success:
555
+ kernels = _parse_csv_kernels(kernel_output)
556
+ else:
557
+ print(f"Warning: Could not extract kernel stats: {kernel_output}", file=sys.stderr)
558
+
559
+ # Get memory transfer summary
560
+ # Note: Report names changed in nsys 2024.x: gpumemtimesum -> cuda_gpu_mem_time_sum
561
+ print("Extracting memory statistics...", file=sys.stderr)
562
+ success, mem_output = _run_nsys_stats(nsys_path, filepath, "cuda_gpu_mem_time_sum")
563
+
564
+ # Try legacy report names if new one fails
565
+ if not success:
566
+ success, mem_output = _run_nsys_stats(nsys_path, filepath, "gpumemtimesum")
567
+
568
+ memory_transfers = []
569
+ if success:
570
+ memory_transfers = _parse_csv_memory(mem_output)
571
+ else:
572
+ # Try alternative report name (for very old nsys versions)
573
+ success, mem_output = _run_nsys_stats(nsys_path, filepath, "cudamemcpysum")
574
+ if success:
575
+ memory_transfers = _parse_csv_memory(mem_output)
576
+
577
+ # Get CUDA API summary for additional context (fetched for completeness; not yet included in the result)
578
+ # Note: Report names changed in nsys 2024.x: cudaapisum -> cuda_api_sum
579
+ print("Extracting CUDA API statistics...", file=sys.stderr)
580
+ success, api_output = _run_nsys_stats(nsys_path, filepath, "cuda_api_sum")
581
+
582
+ # Try legacy report name if new one fails
583
+ if not success:
584
+ success, api_output = _run_nsys_stats(nsys_path, filepath, "cudaapisum")
585
+
586
+ # Build summary
587
+ total_kernel_time_ms = sum(k.get("duration_ms", 0) for k in kernels)
588
+ total_mem_time_ms = sum(m.get("duration_ms", 0) for m in memory_transfers)
589
+
590
+ # GPU name is not currently extracted from the report
591
+ gpu_name = "Unknown"
592
+
593
+ # Build result
594
+ result = {
595
+ "success": True,
596
+ "summary": {
597
+ "gpu": gpu_name,
598
+ "duration_ms": total_kernel_time_ms + total_mem_time_ms,
599
+ "kernel_count": len(kernels),
600
+ "memory_transfers": len(memory_transfers),
601
+ "total_kernel_time_ms": total_kernel_time_ms,
602
+ "total_memory_time_ms": total_mem_time_ms,
603
+ },
604
+ "kernels": kernels,
605
+ "memory_transfers": memory_transfers,
606
+ }
607
+
608
+ # Save to output directory if specified
609
+ if output_dir:
610
+ output_dir.mkdir(parents=True, exist_ok=True)
611
+ timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
612
+ output_filename = f"nsys_analysis_{filepath.stem}_{timestamp}"
613
+
614
+ if json_output:
615
+ json_path = output_dir / f"{output_filename}.json"
616
+ json_path.write_text(json.dumps(result, indent=2))
617
+ print(f"Saved JSON: {json_path}", file=sys.stderr)
618
+ else:
619
+ txt_path = output_dir / f"{output_filename}.txt"
620
+ txt_path.write_text(_generate_text_output(filepath.name, result))
621
+ print(f"Saved analysis: {txt_path}", file=sys.stderr)
622
+
623
+ print("Analysis complete.", file=sys.stderr)
624
+
625
+ if json_output:
626
+ return json.dumps(result, indent=2)
627
+ else:
628
+ return _generate_text_output(filepath.name, result)
629
+
630
+
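+ # Sketch (illustrative paths): analyzing a local report and saving a text summary.
+ #
+ #   text = _analyze_local(Path("profile.nsys-rep"), "/usr/bin/nsys",
+ #                         output_dir=Path("analysis"), json_output=False)
+ #
+ # writes analysis/nsys_analysis_profile_<timestamp>.txt and returns the same markdown text.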
631
+ def _analyze_remote_direct(
632
+ filepath: Path,
633
+ target_name: str,
634
+ json_output: bool = False,
635
+ ) -> str:
636
+ """Analyze NSYS profile remotely via direct SSH to target.
637
+
638
+ Uploads the .nsys-rep file and runs nsys analysis on the target machine.
639
+ """
640
+ import tempfile
641
+
642
+ from .gpu_run import push_directory, run_command_capture
643
+ from .targets import load_target
644
+
645
+ # Load target
646
+ try:
647
+ target = load_target(target_name)
648
+ except FileNotFoundError as e:
649
+ raise RuntimeError(f"Target not found: {target_name}. Create with: wafer targets add {target_name}") from e
650
+
651
+ # Create temp directory with just the .nsys-rep file
652
+ workspace_name = f"nsys_analyze_{filepath.stem}"
653
+
654
+ with tempfile.TemporaryDirectory() as tmpdir:
655
+ # Create a directory with the workspace name
656
+ tmp_path = Path(tmpdir) / workspace_name
657
+ tmp_path.mkdir()
658
+ shutil.copy(filepath, tmp_path / filepath.name)
659
+
660
+ # Push the file
661
+ print(f"Uploading {filepath.name} to {target_name}...", file=sys.stderr)
662
+ push_directory(tmp_path, target)
663
+
664
+ # Run nsys stats commands on remote
665
+ # First try to find nsys on the remote system
666
+ nsys_paths = [
667
+ "/usr/bin/nsys",
668
+ "/usr/local/cuda/bin/nsys",
669
+ "/opt/nvidia/nsight-systems/bin/nsys",
670
+ ]
671
+
672
+ nsys_cmd = "nsys" # Default to PATH
673
+ for path in nsys_paths:
674
+ check_cmd = f"test -x {path} && echo found"
675
+ exit_code, output = run_command_capture(check_cmd, workspace_name, target)
676
+ if exit_code == 0 and "found" in output:
677
+ nsys_cmd = path
678
+ break
679
+
680
+ # Run analysis commands
681
+ # Try new report name first, fall back to legacy if it fails
682
+ print("Running NSYS analysis...", file=sys.stderr)
683
+ analysis_cmd = f"{nsys_cmd} stats --report cuda_gpu_kern_sum --format csv --force-export true {filepath.name}"
684
+ exit_code, kernel_output = run_command_capture(analysis_cmd, workspace_name, target)
685
+
686
+ # Try legacy report name if new one fails
687
+ if exit_code != 0 or "could not be found" in kernel_output.lower():
688
+ analysis_cmd = f"{nsys_cmd} stats --report gpukernsum --format csv --force-export true {filepath.name}"
689
+ exit_code, kernel_output = run_command_capture(analysis_cmd, workspace_name, target)
690
+
691
+ if exit_code != 0:
692
+ raise RuntimeError(f"NSYS kernel stats failed: {kernel_output}")
693
+
694
+ # Get memory stats - try new name first, fall back to legacy
695
+ mem_cmd = f"{nsys_cmd} stats --report cuda_gpu_mem_time_sum --format csv --force-export true {filepath.name}"
696
+ exit_code, mem_output = run_command_capture(mem_cmd, workspace_name, target)
697
+
698
+ # Try legacy report name if new one fails
699
+ if exit_code != 0 or "could not be found" in mem_output.lower():
700
+ mem_cmd = f"{nsys_cmd} stats --report gpumemtimesum --format csv --force-export true {filepath.name}"
701
+ exit_code, mem_output = run_command_capture(mem_cmd, workspace_name, target)
702
+
703
+ # Parse outputs (memory stats may fail if no memory transfers)
704
+ kernels = _parse_csv_kernels(kernel_output) if kernel_output else []
705
+ memory_transfers = _parse_csv_memory(mem_output) if exit_code == 0 and mem_output else []
706
+
707
+ # Build result
708
+ total_kernel_time_ms = sum(k.get("duration_ms", 0) for k in kernels)
709
+ total_mem_time_ms = sum(m.get("duration_ms", 0) for m in memory_transfers)
710
+
711
+ result = {
712
+ "success": True,
713
+ "summary": {
714
+ "gpu": "Unknown", # Would need additional parsing to get GPU name
715
+ "duration_ms": total_kernel_time_ms + total_mem_time_ms,
716
+ "kernel_count": len(kernels),
717
+ "memory_transfers": len(memory_transfers),
718
+ },
719
+ "kernels": kernels,
720
+ "memory_transfers": memory_transfers,
721
+ }
722
+
723
+ if json_output:
724
+ return json.dumps(result, indent=2)
725
+ else:
726
+ return _generate_text_output(filepath.name, result)
727
+
728
+
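+ # For reference, after push_directory uploads the report, the commands run on the
+ # target are roughly (nsys path illustrative):
+ #
+ #   test -x /usr/bin/nsys && echo found
+ #   nsys stats --report cuda_gpu_kern_sum --format csv --force-export true profile.nsys-rep
+ #   nsys stats --report cuda_gpu_mem_time_sum --format csv --force-export true profile.nsys-rep
+ #
+ # with gpukernsum/gpumemtimesum used as fallbacks for older nsys releases.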
729
+ def _analyze_workspace(
730
+ filepath: Path,
731
+ workspace_id: str,
732
+ json_output: bool = False,
733
+ ) -> str:
734
+ """Analyze NSYS profile on a Wafer workspace.
735
+
736
+ Uses workspace exec to run nsys analysis on the workspace.
737
+ """
738
+ from .workspaces import exec_command_capture
739
+
740
+ # First, check if file exists on workspace or needs upload
741
+ # For now, assume file is already on workspace (via sync)
742
+ filename = filepath.name
743
+
744
+ print(f"Running NSYS analysis on workspace {workspace_id}...", file=sys.stderr)
745
+
746
+ # Try to find nsys on the workspace
747
+ nsys_cmd = "nsys"
748
+ for path in ["/usr/bin/nsys", "/usr/local/cuda/bin/nsys", "/opt/nvidia/nsight-systems/bin/nsys"]:
749
+ check_cmd = f"test -x {path} && echo found"
750
+ exit_code, output = exec_command_capture(workspace_id, check_cmd)
751
+ if exit_code == 0 and "found" in output:
752
+ nsys_cmd = path
753
+ break
754
+
755
+ # Run kernel stats - try new report name first, fall back to legacy
756
+ print("Extracting kernel statistics...", file=sys.stderr)
757
+ kernel_cmd = f"{nsys_cmd} stats --report cuda_gpu_kern_sum --format csv --force-export true {filename}"
758
+ exit_code, kernel_output = exec_command_capture(workspace_id, kernel_cmd)
759
+
760
+ # Try legacy report name if new one fails
761
+ if exit_code != 0 or "could not be found" in kernel_output.lower():
762
+ kernel_cmd = f"{nsys_cmd} stats --report gpukernsum --format csv --force-export true {filename}"
763
+ exit_code, kernel_output = exec_command_capture(workspace_id, kernel_cmd)
764
+
765
+ if exit_code != 0:
766
+ raise RuntimeError(f"NSYS kernel stats failed on workspace: {kernel_output}")
767
+
768
+ # Run memory stats - try new report name first, fall back to legacy
769
+ print("Extracting memory statistics...", file=sys.stderr)
770
+ mem_cmd = f"{nsys_cmd} stats --report cuda_gpu_mem_time_sum --format csv --force-export true {filename}"
771
+ exit_code, mem_output = exec_command_capture(workspace_id, mem_cmd)
772
+
773
+ # Try legacy report name if new one fails
774
+ if exit_code != 0 or "could not be found" in mem_output.lower():
775
+ mem_cmd = f"{nsys_cmd} stats --report gpumemtimesum --format csv --force-export true {filename}"
776
+ exit_code, mem_output = exec_command_capture(workspace_id, mem_cmd)
777
+
778
+ # Parse outputs
779
+ kernels = _parse_csv_kernels(kernel_output) if kernel_output else []
780
+ memory_transfers = _parse_csv_memory(mem_output) if exit_code == 0 and mem_output else []
781
+
782
+ # Build result
783
+ total_kernel_time_ms = sum(k.get("duration_ms", 0) for k in kernels)
784
+ total_mem_time_ms = sum(m.get("duration_ms", 0) for m in memory_transfers)
785
+
786
+ result = {
787
+ "success": True,
788
+ "summary": {
789
+ "gpu": "Unknown",
790
+ "duration_ms": total_kernel_time_ms + total_mem_time_ms,
791
+ "kernel_count": len(kernels),
792
+ "memory_transfers": len(memory_transfers),
793
+ },
794
+ "kernels": kernels,
795
+ "memory_transfers": memory_transfers,
796
+ }
797
+
798
+ if json_output:
799
+ return json.dumps(result, indent=2)
800
+ else:
801
+ return _generate_text_output(filepath.name, result)
802
+
803
+
804
+ def _analyze_remote_api(
805
+ filepath: Path,
806
+ json_output: bool = False,
807
+ ) -> str:
808
+ """Analyze NSYS profile remotely via wafer-api.
809
+
810
+ Uploads the .nsys-rep file and runs analysis on Modal.
811
+ """
812
+ if not filepath.exists():
813
+ raise FileNotFoundError(f"File not found: {filepath}")
814
+ if filepath.suffix != ".nsys-rep":
815
+ raise ValueError(f"Expected a .nsys-rep file: {filepath}")
816
+
817
+ import httpx
818
+
819
+ from .api_client import get_api_url
820
+ from .auth import get_auth_headers
821
+
822
+ api_url = get_api_url()
823
+ headers = get_auth_headers()
824
+
825
+ if not api_url:
826
+ raise ValueError("API URL must be configured")
827
+
828
+ # Use multipart/form-data upload
829
+ print(f"Uploading {filepath.name} for analysis...", file=sys.stderr)
830
+
831
+ try:
832
+ with httpx.Client(timeout=300.0, headers=headers) as client:
833
+ with open(filepath, "rb") as f:
834
+ files = {"file": (filepath.name, f, "application/octet-stream")}
835
+ data = {"filename": filepath.name}
836
+
837
+ response = client.post(
838
+ f"{api_url}/v1/nsys/tool/analyze",
839
+ files=files,
840
+ data=data,
841
+ )
842
+ response.raise_for_status()
843
+ result = response.json()
844
+
845
+ except httpx.HTTPStatusError as e:
846
+ if e.response.status_code == 401:
847
+ raise RuntimeError("Not authenticated. Run: wafer login") from e
848
+ raise RuntimeError(f"API error: {e.response.status_code} - {e.response.text}") from e
849
+ except httpx.RequestError as e:
850
+ raise RuntimeError(f"Could not reach API: {e}") from e
851
+
852
+ # Validate response structure before reading fields from it
853
+ if not isinstance(result, dict):
854
+ raise TypeError("API must return a dictionary")
855
+
856
+ if not result.get("success", True):
857
+ raise RuntimeError(f"Analysis failed: {result.get('error', 'Unknown error')}")
858
+
859
+ if json_output:
860
+ return json.dumps(result, indent=2)
861
+ else:
862
+ return _generate_text_output(filepath.name, result)
863
+
864
+
865
+ def _generate_text_output(filename: str, result: dict) -> str:
866
+ """Generate human-readable markdown text from analysis result."""
867
+ if not filename:
868
+ raise ValueError("filename must be non-empty")
869
+ if not isinstance(result, dict):
870
+ raise TypeError("result must be a dictionary")
871
+
872
+ timestamp = datetime.now().isoformat()
873
+ summary = result.get("summary", {})
874
+ kernels = result.get("kernels", [])
875
+ memory_transfers = result.get("memory_transfers", [])
876
+
877
+ lines = [
878
+ "# NSYS Profiling Analysis",
879
+ f"Source: {filename}",
880
+ f"Generated: {timestamp}",
881
+ "",
882
+ "## Summary",
883
+ f"- GPU: {summary.get('gpu', 'Unknown')}",
884
+ f"- Total Duration: {summary.get('duration_ms', 0):.2f} ms",
885
+ f"- Kernel Count: {summary.get('kernel_count', 0)}",
886
+ f"- Memory Transfers: {summary.get('memory_transfers', 0)}",
887
+ "",
888
+ ]
889
+
890
+ if kernels:
891
+ lines.extend([
892
+ "## GPU Kernels",
893
+ "",
894
+ "| Kernel | Time (ms) | Instances | Avg (ms) |",
895
+ "|--------|-----------|-----------|----------|",
896
+ ])
897
+
898
+ # Sort by duration descending
899
+ sorted_kernels = sorted(kernels, key=lambda k: k.get("duration_ms", 0), reverse=True)
900
+
901
+ for kernel in sorted_kernels[:20]: # Top 20 kernels
902
+ name = kernel.get("name", "Unknown")
903
+ # Truncate long kernel names
904
+ if len(name) > 50:
905
+ name = name[:47] + "..."
906
+ duration = kernel.get("duration_ms", 0)
907
+ instances = kernel.get("instances", 0)
908
+ avg = kernel.get("avg_time_ns", 0) / 1_000_000 if kernel.get("avg_time_ns") else 0
909
+
910
+ lines.append(f"| {name} | {duration:.3f} | {instances} | {avg:.4f} |")
911
+
912
+ if len(kernels) > 20:
913
+ lines.append(f"| ... and {len(kernels) - 20} more kernels | | | |")
914
+
915
+ lines.append("")
916
+
917
+ if memory_transfers:
918
+ lines.extend([
919
+ "## Memory Transfers",
920
+ "",
921
+ "| Operation | Time (ms) | Size | Instances |",
922
+ "|-----------|-----------|------|-----------|",
923
+ ])
924
+
925
+ for transfer in memory_transfers:
926
+ op = transfer.get("operation", "Unknown")
927
+ duration = transfer.get("duration_ms", 0)
928
+ size_bytes = transfer.get("size_bytes", 0)
929
+ size_str = _format_bytes(size_bytes)
930
+ instances = transfer.get("instances", 0)
931
+
932
+ lines.append(f"| {op} | {duration:.3f} | {size_str} | {instances} |")
933
+
934
+ lines.append("")
935
+
936
+ # Add diagnostics if present
937
+ diagnostics = result.get("diagnostics", [])
938
+ if diagnostics:
939
+ lines.extend([
940
+ "## Diagnostics",
941
+ "",
942
+ ])
943
+ for diag in diagnostics:
944
+ level = diag.get("level", "Info")
945
+ text = diag.get("text", "")
946
+ lines.append(f"- [{level}] {text}")
947
+ lines.append("")
948
+
949
+ return "\n".join(lines)
950
+
951
+
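+ # The generated text looks roughly like this (values illustrative):
+ #
+ #   # NSYS Profiling Analysis
+ #   Source: profile.nsys-rep
+ #   Generated: 2025-01-01T12:00:00
+ #
+ #   ## Summary
+ #   - GPU: Unknown
+ #   - Total Duration: 17.55 ms
+ #   - Kernel Count: 3
+ #   - Memory Transfers: 2
+ #
+ #   ## GPU Kernels
+ #   | Kernel | Time (ms) | Instances | Avg (ms) |
+ #   |--------|-----------|-----------|----------|
+ #   ...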
952
+ def _format_bytes(size_bytes: int) -> str:
953
+ """Format bytes into human-readable string."""
954
+ if size_bytes >= 1e9:
955
+ return f"{size_bytes / 1e9:.2f} GB"
956
+ elif size_bytes >= 1e6:
957
+ return f"{size_bytes / 1e6:.2f} MB"
958
+ elif size_bytes >= 1e3:
959
+ return f"{size_bytes / 1e3:.2f} KB"
960
+ else:
961
+ return f"{size_bytes} B"
962
+
963
+
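+ # Examples: _format_bytes(512) -> "512 B", _format_bytes(1536) -> "1.54 KB",
+ # _format_bytes(2_500_000_000) -> "2.50 GB".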
964
+ def _parse_target(target: str) -> tuple[str, str]:
965
+ """Parse target string into type and identifier.
966
+
967
+ Supports:
968
+ - "workspace:abc123" -> ("workspace", "abc123")
969
+ - "vultr-b200" -> ("target", "vultr-b200")
970
+
971
+ Returns:
972
+ Tuple of (target_type, identifier)
973
+ """
974
+ if target.startswith("workspace:"):
975
+ return "workspace", target[len("workspace:"):]
976
+ else:
977
+ return "target", target
978
+
979
+
980
+ def analyze_nsys_profile(
981
+ filepath: Path,
982
+ json_output: bool = False,
983
+ remote: bool | None = None,
984
+ target: str | None = None,
985
+ output_dir: Path | None = None,
986
+ ) -> str:
987
+ """Analyze an NSYS profile file and return results.
988
+
989
+ Args:
990
+ filepath: Path to .nsys-rep file
991
+ json_output: If True, return raw JSON; otherwise return formatted text
992
+ remote: If True, force remote analysis via API. If False, force local.
993
+ If None (default), auto-detect: use local if nsys available, else remote.
994
+ target: Remote target - either "workspace:id" or target name from ~/.wafer/targets/
995
+ output_dir: Optional directory to save analysis results
996
+
997
+ Returns:
998
+ Analysis results as string (JSON or markdown)
999
+
1000
+ Raises:
1001
+ FileNotFoundError: If file doesn't exist
1002
+ RuntimeError: If analysis fails
1003
+ """
1004
+ if not filepath.exists():
1005
+ raise FileNotFoundError(f"File not found: {filepath}")
1006
+
1007
+ if filepath.suffix != ".nsys-rep":
1008
+ raise ValueError(f"Expected .nsys-rep file, got: {filepath.suffix}")
1009
+
1010
+ # If target is specified, use appropriate remote execution
1011
+ if target:
1012
+ target_type, target_id = _parse_target(target)
1013
+
1014
+ if target_type == "workspace":
1015
+ return _analyze_workspace(filepath, target_id, json_output)
1016
+ else:
1017
+ return _analyze_remote_direct(filepath, target_id, json_output)
1018
+
1019
+ # Check for local nsys installation
1020
+ nsys_path = _find_nsys()
1021
+
1022
+ # Determine whether to use local or remote
1023
+ use_remote = remote
1024
+ if use_remote is None:
1025
+ # Auto-detect: use local if nsys available, else remote
1026
+ use_remote = nsys_path is None
1027
+
1028
+ if use_remote:
1029
+ return _analyze_remote_api(filepath, json_output)
1030
+ else:
1031
+ if nsys_path is None:
1032
+ if is_macos():
1033
+ raise FileNotFoundError(
1034
+ "NSYS CLI is not available on macOS (only GUI viewer is provided). "
1035
+ "Use --remote flag for API-based analysis or --target for workspace/SSH analysis."
1036
+ )
1037
+ install_cmd = _get_install_command()
1038
+ raise FileNotFoundError(
1039
+ f"NSYS not installed locally. Use --remote flag or install with: {install_cmd}"
1040
+ )
1041
+
1042
+ return _analyze_local(filepath, nsys_path, output_dir, json_output)
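+
+
+ # Illustrative end-to-end usage (paths and target names are placeholders; the real
+ # entry point is the `wafer nvidia nsys analyze` command described in the module docstring):
+ #
+ #   from pathlib import Path
+ #   from wafer.nsys_analyze import analyze_nsys_profile, check_nsys_installation
+ #
+ #   if check_nsys_installation().installed:
+ #       report = analyze_nsys_profile(Path("profile.nsys-rep"))               # local nsys
+ #   else:
+ #       report = analyze_nsys_profile(Path("profile.nsys-rep"), remote=True)  # wafer-api
+ #
+ #   # Or run the analysis on a configured SSH target or workspace:
+ #   #   analyze_nsys_profile(Path("profile.nsys-rep"), target="vultr-b200")
+ #   #   analyze_nsys_profile(Path("profile.nsys-rep"), target="workspace:abc123")
+ #
+ #   print(report)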