wafer-cli 0.2.9__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wafer/nsys_analyze.py CHANGED
@@ -1,47 +1,171 @@
  """NSYS Analyze - Parse and analyze .nsys-rep profile files.

  This module provides the implementation for the `wafer nvidia nsys analyze` command.
- Supports both local analysis (when nsys is installed) and remote analysis via API.
+ Supports local analysis (when nsys is installed), remote analysis via API,
+ direct SSH analysis via targets, and workspace execution.
+
+ Local analysis uses `nsys stats` and `nsys export` commands which work on any machine
+ with nsys installed (no GPU required for analysis, only for profiling).
  """

  import json
+ import os
  import platform
  import shutil
+ import subprocess
+ import sys
+ from dataclasses import dataclass
  from datetime import datetime
  from pathlib import Path

+ # Known NSYS installation paths by platform
+ # NOTE: On macOS, NVIDIA only provides the GUI viewer (nsys-ui), NOT the CLI tool.
+ # The nsys CLI is only available on Linux. macOS users must use remote analysis.
+ NSYS_PATHS = {
+     "linux": [
+         "/usr/bin/nsys",
+         "/usr/local/bin/nsys",
+         "/usr/local/cuda/bin/nsys",
+         "/opt/nvidia/nsight-systems/bin/nsys",
+         "/opt/nvidia/nsight-systems-cli/bin/nsys",
+     ],
+     # macOS: nsys CLI not available - only GUI viewer exists
+     # Set to empty list to always fall back to remote analysis
+     "darwin": [],
+     "windows": [
+         r"C:\Program Files\NVIDIA Corporation\Nsight Systems\bin\nsys.exe",
+         r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.0\bin\nsys.exe",
+         r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin\nsys.exe",
+     ],
+ }
+
+
+ @dataclass(frozen=True)
+ class NSYSCheckResult:
+     """Result of checking NSYS installation."""
+
+     installed: bool
+     path: str | None = None
+     version: str | None = None
+     install_command: str | None = None
+
+
+ @dataclass(frozen=True)
+ class KernelInfo:
+     """Information about a CUDA kernel from NSYS profile."""
+
+     name: str
+     duration_ns: int
+     duration_ms: float
+     instances: int
+     avg_duration_ns: float
+     min_duration_ns: int
+     max_duration_ns: int
+     grid_size: str | None = None
+     block_size: str | None = None
+     registers_per_thread: int | None = None
+     shared_memory_bytes: int | None = None
+     memory_throughput_gb_s: float | None = None
+
+
+ @dataclass(frozen=True)
+ class MemoryTransfer:
+     """Information about a memory transfer from NSYS profile."""
+
+     operation: str  # HtoD, DtoH, DtoD, etc.
+     duration_ns: int
+     size_bytes: int
+     throughput_gb_s: float
+     instances: int
+
+
+ @dataclass(frozen=True)
+ class NSYSAnalysisResult:
+     """Complete NSYS analysis result."""
+
+     success: bool
+     report_id: str | None = None
+     gpu: str = "Unknown"
+     duration_ms: float = 0.0
+     kernel_count: int = 0
+     memory_transfer_count: int = 0
+     kernels: list[dict] | None = None
+     memory_transfers: list[dict] | None = None
+     timeline: list[dict] | None = None
+     diagnostics: list[dict] | None = None
+     error: str | None = None
+
+
+ def _get_platform() -> str:
+     """Get normalized platform name."""
+     system = platform.system().lower()
+     if system == "darwin":
+         return "darwin"
+     elif system == "windows":
+         return "windows"
+     return "linux"
+

  def _find_nsys() -> str | None:
-     """Find nsys executable on the system."""
+     """Find nsys executable on the system.
+
+     Searches in order:
+     1. PATH environment variable
+     2. Common installation paths for the current platform
+     """
+     # First check PATH
      nsys = shutil.which("nsys")
      if nsys:
          return nsys

-     # Check common installation paths
-     common_paths = [
-         "/usr/bin/nsys",
-         "/usr/local/cuda/bin/nsys",
-         "/opt/nvidia/nsight-systems/bin/nsys",
-     ]
-
-     for path in common_paths:
-         if Path(path).is_file():
+     # Then check known installation paths
+     plat = _get_platform()
+     for path in NSYS_PATHS.get(plat, []):
+         if os.path.isfile(path) and os.access(path, os.X_OK):
              return path

      return None


+ def _get_nsys_version(nsys_path: str) -> str | None:
+     """Get NSYS version string."""
+     try:
+         result = subprocess.run(
+             [nsys_path, "--version"],
+             capture_output=True,
+             text=True,
+             timeout=10,
+         )
+         if result.returncode == 0:
+             # Parse version from output like "NVIDIA Nsight Systems version 2024.6.1.90-246160830v0"
+             for line in result.stdout.split("\n"):
+                 if "version" in line.lower():
+                     parts = line.split("version")
+                     if len(parts) >= 2:
+                         return parts[1].strip().split()[0]
+             return result.stdout.strip().split("\n")[0]
+     except (subprocess.TimeoutExpired, OSError):
+         pass
+     return None
+
+
  def _get_install_command() -> str:
-     """Get platform-appropriate install command."""
-     system = platform.system().lower()
+     """Get platform-appropriate install command for NSYS."""
+     plat = _get_platform()
+
+     if plat == "darwin":
+         # macOS only has GUI viewer, no CLI - user must use remote analysis
+         return "NSYS CLI not available on macOS. Use --remote flag or --target for remote analysis."

-     if system == "linux":
+     if plat == "linux":
          if shutil.which("apt-get") or shutil.which("apt"):
-             return "sudo apt install nvidia-cuda-toolkit"
+             return "sudo apt install nsight-systems"
          elif shutil.which("dnf"):
              return "sudo dnf install nsight-systems"
          elif shutil.which("yum"):
              return "sudo yum install nsight-systems"
+         elif shutil.which("pacman"):
+             return "sudo pacman -S nsight-systems"

      if shutil.which("conda"):
          return "conda install -c nvidia nsight-systems"
@@ -49,57 +173,632 @@ def _get_install_command() -> str:
      return "Download from https://developer.nvidia.com/nsight-systems"


- def _generate_text_output(filename: str, result: dict) -> str:
-     """Generate human-readable markdown text from analysis result."""
-     assert filename, "filename must be non-empty"
-     assert isinstance(result, dict), "result must be a dictionary"
+ def is_macos() -> bool:
+     """Check if running on macOS."""
+     return _get_platform() == "darwin"

-     timestamp = datetime.now().isoformat()
-     summary = result.get("summary", {})
-     kernels = result.get("kernels", [])

-     lines = [
-         "# NSYS Profiling Analysis",
-         f"Source: {filename}",
-         f"Generated: {timestamp}",
-         "",
-         "## Summary",
-         f"- GPU: {summary.get('gpu', 'Unknown')}",
-         f"- Duration: {summary.get('duration_ms', 0):.2f} ms",
-         f"- Kernel Count: {summary.get('kernel_count', 0)}",
-         f"- Memory Transfers: {summary.get('memory_transfers', 0)}",
-         "",
+ def check_nsys_installation() -> NSYSCheckResult:
+     """Check if NSYS is installed and return details.
+
+     Returns:
+         NSYSCheckResult with installation status and details
+     """
+     nsys_path = _find_nsys()
+
+     if nsys_path is None:
+         return NSYSCheckResult(
+             installed=False,
+             install_command=_get_install_command(),
+         )
+
+     version = _get_nsys_version(nsys_path)
+
+     return NSYSCheckResult(
+         installed=True,
+         path=nsys_path,
+         version=version,
+     )
+
+
205
+ nsys_path: str,
206
+ filepath: Path,
207
+ report_name: str,
208
+ timeout: int = 120,
209
+ ) -> tuple[bool, str]:
210
+ """Run nsys stats command to extract report data.
211
+
212
+ Args:
213
+ nsys_path: Path to nsys executable
214
+ filepath: Path to .nsys-rep file
215
+ report_name: Report type (e.g., gpukernsum, gpumemtimesum, cudaapisum)
216
+ timeout: Command timeout in seconds
217
+
218
+ Returns:
219
+ Tuple of (success, output_or_error)
220
+ """
221
+ try:
222
+ result = subprocess.run(
223
+ [
224
+ nsys_path,
225
+ "stats",
226
+ "--report", report_name,
227
+ "--format", "csv",
228
+ "--force-export", "true",
229
+ str(filepath),
230
+ ],
231
+ capture_output=True,
232
+ text=True,
233
+ timeout=timeout,
234
+ )
235
+
236
+ if result.returncode != 0:
237
+ error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
238
+ return False, error_msg
239
+
240
+ return True, result.stdout
241
+
242
+ except subprocess.TimeoutExpired:
243
+ return False, f"Command timed out after {timeout}s"
244
+ except OSError as e:
245
+ return False, f"Failed to execute nsys: {e}"
246
+
247
+
248
+ def _run_nsys_export(
249
+ nsys_path: str,
250
+ filepath: Path,
251
+ output_format: str = "sqlite",
252
+ timeout: int = 180,
253
+ ) -> tuple[bool, str]:
254
+ """Run nsys export command to export trace data.
255
+
256
+ Args:
257
+ nsys_path: Path to nsys executable
258
+ filepath: Path to .nsys-rep file
259
+ output_format: Export format (sqlite, json)
260
+ timeout: Command timeout in seconds
261
+
262
+ Returns:
263
+ Tuple of (success, output_path_or_error)
264
+ """
265
+ # Determine output path
266
+ output_path = filepath.with_suffix(f".{output_format}")
267
+
268
+ try:
269
+ result = subprocess.run(
270
+ [
271
+ nsys_path,
272
+ "export",
273
+ "--type", output_format,
274
+ "--force-overwrite", "true",
275
+ "--output", str(output_path),
276
+ str(filepath),
277
+ ],
278
+ capture_output=True,
279
+ text=True,
280
+ timeout=timeout,
281
+ )
282
+
283
+ if result.returncode != 0:
284
+ error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
285
+ return False, error_msg
286
+
287
+ return True, str(output_path)
288
+
289
+ except subprocess.TimeoutExpired:
290
+ return False, f"Export timed out after {timeout}s"
291
+ except OSError as e:
292
+ return False, f"Failed to execute nsys: {e}"
293
+
294
+
295
+ def _parse_csv_kernels(csv_output: str) -> list[dict]:
296
+ """Parse GPU kernel summary from nsys stats CSV output."""
297
+ kernels = []
298
+
299
+ lines = csv_output.strip().split("\n")
300
+ if len(lines) < 2:
301
+ return kernels
302
+
303
+ # Find header line - look for a line with known CSV header columns
304
+ # The nsys output includes informational lines before the actual CSV
305
+ # Header line should contain "Time" and "Name" columns
306
+ header_idx = -1
307
+ for i, line in enumerate(lines):
308
+ line_lower = line.lower()
309
+ # Skip comment lines and non-CSV lines
310
+ if line.startswith("#"):
311
+ continue
312
+ # Check if this looks like a CSV header with expected columns
313
+ if ("time" in line_lower and "name" in line_lower) or \
314
+ ("time (%)" in line_lower) or \
315
+ ("total time" in line_lower and "instances" in line_lower):
316
+ header_idx = i
317
+ break
318
+
319
+ if header_idx < 0 or header_idx >= len(lines) - 1:
320
+ return kernels
321
+
322
+ headers = [h.strip().strip('"') for h in lines[header_idx].split(",")]
323
+
324
+ # Map header names to indices
325
+ def find_col(names: list[str]) -> int | None:
326
+ for name in names:
327
+ name_lower = name.lower()
328
+ for i, h in enumerate(headers):
329
+ if name_lower in h.lower():
330
+ return i
331
+ return None
332
+
333
+ name_col = find_col(["Name", "Kernel Name", "KernelName"])
334
+ time_col = find_col(["Time (%)", "Time Percent", "Time%"])
335
+ total_time_col = find_col(["Total Time", "TotalTime", "Duration"])
336
+ instances_col = find_col(["Instances", "Count", "Calls"])
337
+ avg_col = find_col(["Avg", "Average", "AvgTime"])
338
+ min_col = find_col(["Min", "Minimum", "MinTime"])
339
+ max_col = find_col(["Max", "Maximum", "MaxTime"])
340
+
341
+ # Parse data rows
342
+ for line in lines[header_idx + 1:]:
343
+ if not line.strip() or line.startswith("#"):
344
+ continue
345
+
346
+ # Handle CSV with quoted fields
347
+ parts = []
348
+ in_quote = False
349
+ current = ""
350
+ for char in line:
351
+ if char == '"':
352
+ in_quote = not in_quote
353
+ elif char == "," and not in_quote:
354
+ parts.append(current.strip().strip('"'))
355
+ current = ""
356
+ else:
357
+ current += char
358
+ parts.append(current.strip().strip('"'))
359
+
360
+ if len(parts) <= (name_col or 0):
361
+ continue
362
+
363
+ kernel = {
364
+ "name": parts[name_col] if name_col is not None else "Unknown",
365
+ "time_percent": 0.0,
366
+ "total_time_ns": 0,
367
+ "duration_ms": 0.0,
368
+ "instances": 0,
369
+ "avg_time_ns": 0,
370
+ "min_time_ns": 0,
371
+ "max_time_ns": 0,
372
+ }
373
+
374
+ try:
375
+ if time_col is not None and time_col < len(parts):
376
+ kernel["time_percent"] = float(parts[time_col].replace("%", "").strip() or 0)
377
+
378
+ if total_time_col is not None and total_time_col < len(parts):
379
+ # Time may be in ns, us, or ms - parse accordingly
380
+ time_str = parts[total_time_col].strip()
381
+ kernel["total_time_ns"] = _parse_time_to_ns(time_str)
382
+ kernel["duration_ms"] = kernel["total_time_ns"] / 1_000_000
383
+
384
+ if instances_col is not None and instances_col < len(parts):
385
+ kernel["instances"] = int(float(parts[instances_col].strip() or 0))
386
+
387
+ if avg_col is not None and avg_col < len(parts):
388
+ kernel["avg_time_ns"] = _parse_time_to_ns(parts[avg_col].strip())
389
+
390
+ if min_col is not None and min_col < len(parts):
391
+ kernel["min_time_ns"] = _parse_time_to_ns(parts[min_col].strip())
392
+
393
+ if max_col is not None and max_col < len(parts):
394
+ kernel["max_time_ns"] = _parse_time_to_ns(parts[max_col].strip())
395
+
396
+ except (ValueError, IndexError):
397
+ pass
398
+
399
+ if kernel["name"] and kernel["name"] != "Unknown":
400
+ kernels.append(kernel)
401
+
402
+ return kernels
403
+
404
+
405
+ def _parse_time_to_ns(time_str: str) -> int:
406
+ """Parse time string to nanoseconds."""
407
+ if not time_str:
408
+ return 0
409
+
410
+ time_str = time_str.strip().lower()
411
+
412
+ try:
413
+ if "ms" in time_str:
414
+ return int(float(time_str.replace("ms", "").strip()) * 1_000_000)
415
+ elif "us" in time_str or "µs" in time_str:
416
+ return int(float(time_str.replace("us", "").replace("µs", "").strip()) * 1_000)
417
+ elif "ns" in time_str:
418
+ return int(float(time_str.replace("ns", "").strip()))
419
+ elif "s" in time_str:
420
+ return int(float(time_str.replace("s", "").strip()) * 1_000_000_000)
421
+ else:
422
+ # Assume nanoseconds
423
+ return int(float(time_str))
424
+ except ValueError:
425
+ return 0
426
+
427
+
428
+ def _parse_csv_memory(csv_output: str) -> list[dict]:
429
+ """Parse memory transfer summary from nsys stats CSV output."""
430
+ transfers = []
431
+
432
+ lines = csv_output.strip().split("\n")
433
+ if len(lines) < 2:
434
+ return transfers
435
+
436
+ # Find header line - look for a line with known CSV header columns
437
+ # The nsys output includes informational lines before the actual CSV
438
+ header_idx = -1
439
+ for i, line in enumerate(lines):
440
+ line_lower = line.lower()
441
+ # Skip comment lines
442
+ if line.startswith("#"):
443
+ continue
444
+ # Check if this looks like a CSV header with expected columns
445
+ if ("time" in line_lower and ("operation" in line_lower or "total" in line_lower)) or \
446
+ ("time (%)" in line_lower) or \
447
+ ("count" in line_lower and "total" in line_lower):
448
+ header_idx = i
449
+ break
450
+
451
+ if header_idx < 0 or header_idx >= len(lines) - 1:
452
+ return transfers
453
+
454
+ headers = [h.strip().strip('"') for h in lines[header_idx].split(",")]
455
+
456
+ # Map header names
457
+ def find_col(names: list[str]) -> int | None:
458
+ for name in names:
459
+ name_lower = name.lower()
460
+ for i, h in enumerate(headers):
461
+ if name_lower in h.lower():
462
+ return i
463
+ return None
464
+
465
+ op_col = find_col(["Operation", "Name", "MemOp"])
466
+ time_col = find_col(["Total Time", "TotalTime", "Duration"])
467
+ size_col = find_col(["Total", "Size", "Bytes"])
468
+ count_col = find_col(["Count", "Instances", "Calls"])
469
+ throughput_col = find_col(["Throughput", "Bandwidth"])
470
+
471
+ for line in lines[header_idx + 1:]:
472
+ if not line.strip() or line.startswith("#"):
473
+ continue
474
+
475
+ parts = [p.strip().strip('"') for p in line.split(",")]
476
+
477
+ if len(parts) <= (op_col or 0):
478
+ continue
479
+
480
+ transfer = {
481
+ "operation": parts[op_col] if op_col is not None else "Unknown",
482
+ "total_time_ns": 0,
483
+ "duration_ms": 0.0,
484
+ "size_bytes": 0,
485
+ "instances": 0,
486
+ "throughput_gb_s": 0.0,
487
+ }
488
+
489
+ try:
490
+ if time_col is not None and time_col < len(parts):
491
+ transfer["total_time_ns"] = _parse_time_to_ns(parts[time_col])
492
+ transfer["duration_ms"] = transfer["total_time_ns"] / 1_000_000
493
+
494
+ if size_col is not None and size_col < len(parts):
495
+ size_str = parts[size_col].strip().upper()
496
+ if "GB" in size_str:
497
+ transfer["size_bytes"] = int(float(size_str.replace("GB", "").strip()) * 1e9)
498
+ elif "MB" in size_str:
499
+ transfer["size_bytes"] = int(float(size_str.replace("MB", "").strip()) * 1e6)
500
+ elif "KB" in size_str:
501
+ transfer["size_bytes"] = int(float(size_str.replace("KB", "").strip()) * 1e3)
502
+ else:
503
+ transfer["size_bytes"] = int(float(size_str.replace("B", "").strip() or 0))
504
+
505
+ if count_col is not None and count_col < len(parts):
506
+ transfer["instances"] = int(float(parts[count_col].strip() or 0))
507
+
508
+ if throughput_col is not None and throughput_col < len(parts):
509
+ tp_str = parts[throughput_col].strip().upper()
510
+ if "GB" in tp_str:
511
+ transfer["throughput_gb_s"] = float(tp_str.replace("GB/S", "").strip())
512
+ elif "MB" in tp_str:
513
+ transfer["throughput_gb_s"] = float(tp_str.replace("MB/S", "").strip()) / 1000
514
+ else:
515
+ transfer["throughput_gb_s"] = float(tp_str.replace("/S", "").strip() or 0) / 1e9
516
+
517
+ except (ValueError, IndexError):
518
+ pass
519
+
520
+ if transfer["operation"] and transfer["operation"] != "Unknown":
521
+ transfers.append(transfer)
522
+
523
+ return transfers
524
+
525
+
526
+ def _analyze_local(
527
+ filepath: Path,
528
+ nsys_path: str,
529
+ output_dir: Path | None = None,
530
+ json_output: bool = False,
531
+ ) -> str:
532
+ """Analyze NSYS profile locally using installed nsys CLI.
533
+
534
+ Uses `nsys stats` commands to extract kernel and memory statistics.
535
+ This works on any machine with nsys installed - no GPU required for analysis.
536
+ """
537
+ if not filepath.exists():
538
+ raise FileNotFoundError(f"File must exist: {filepath}")
539
+ if filepath.suffix != ".nsys-rep":
540
+ raise ValueError(f"File must be .nsys-rep: {filepath}")
541
+
542
+ print(f"Analyzing {filepath.name} locally...", file=sys.stderr)
543
+
544
+ # Get GPU kernel summary
545
+ # Note: Report names changed in nsys 2024.x: gpukernsum -> cuda_gpu_kern_sum
546
+ print("Extracting kernel statistics...", file=sys.stderr)
547
+ success, kernel_output = _run_nsys_stats(nsys_path, filepath, "cuda_gpu_kern_sum")
548
+
549
+ # Try legacy report name if new one fails
550
+ if not success:
551
+ success, kernel_output = _run_nsys_stats(nsys_path, filepath, "gpukernsum")
552
+
553
+ kernels = []
554
+ if success:
555
+ kernels = _parse_csv_kernels(kernel_output)
556
+ else:
557
+ print(f"Warning: Could not extract kernel stats: {kernel_output}", file=sys.stderr)
558
+
559
+ # Get memory transfer summary
560
+ # Note: Report names changed in nsys 2024.x: gpumemtimesum -> cuda_gpu_mem_time_sum
561
+ print("Extracting memory statistics...", file=sys.stderr)
562
+ success, mem_output = _run_nsys_stats(nsys_path, filepath, "cuda_gpu_mem_time_sum")
563
+
564
+ # Try legacy report names if new one fails
565
+ if not success:
566
+ success, mem_output = _run_nsys_stats(nsys_path, filepath, "gpumemtimesum")
567
+
568
+ memory_transfers = []
569
+ if success:
570
+ memory_transfers = _parse_csv_memory(mem_output)
571
+ else:
572
+ # Try alternative report name (for very old nsys versions)
573
+ success, mem_output = _run_nsys_stats(nsys_path, filepath, "cudamemcpysum")
574
+ if success:
575
+ memory_transfers = _parse_csv_memory(mem_output)
576
+
577
+ # Get CUDA API summary for additional context
578
+ # Note: Report names changed in nsys 2024.x: cudaapisum -> cuda_api_sum
579
+ print("Extracting CUDA API statistics...", file=sys.stderr)
580
+ success, api_output = _run_nsys_stats(nsys_path, filepath, "cuda_api_sum")
581
+
582
+ # Try legacy report name if new one fails
583
+ if not success:
584
+ success, api_output = _run_nsys_stats(nsys_path, filepath, "cudaapisum")
585
+
586
+ # Build summary
587
+ total_kernel_time_ms = sum(k.get("duration_ms", 0) for k in kernels)
588
+ total_mem_time_ms = sum(m.get("duration_ms", 0) for m in memory_transfers)
589
+
590
+ # Try to get GPU info from report
591
+ gpu_name = "Unknown"
592
+
593
+ # Build result
594
+ result = {
595
+ "success": True,
596
+ "summary": {
597
+ "gpu": gpu_name,
598
+ "duration_ms": total_kernel_time_ms + total_mem_time_ms,
599
+ "kernel_count": len(kernels),
600
+ "memory_transfers": len(memory_transfers),
601
+ "total_kernel_time_ms": total_kernel_time_ms,
602
+ "total_memory_time_ms": total_mem_time_ms,
603
+ },
604
+ "kernels": kernels,
605
+ "memory_transfers": memory_transfers,
606
+ }
607
+
608
+ # Save to output directory if specified
609
+ if output_dir:
610
+ output_dir.mkdir(parents=True, exist_ok=True)
611
+ timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
612
+ output_filename = f"nsys_analysis_{filepath.stem}_{timestamp}"
613
+
614
+ if json_output:
615
+ json_path = output_dir / f"{output_filename}.json"
616
+ json_path.write_text(json.dumps(result, indent=2))
617
+ print(f"Saved JSON: {json_path}", file=sys.stderr)
618
+ else:
619
+ txt_path = output_dir / f"{output_filename}.txt"
620
+ txt_path.write_text(_generate_text_output(filepath.name, result))
621
+ print(f"Saved analysis: {txt_path}", file=sys.stderr)
622
+
623
+ print("Analysis complete.", file=sys.stderr)
624
+
625
+ if json_output:
626
+ return json.dumps(result, indent=2)
627
+ else:
628
+ return _generate_text_output(filepath.name, result)
629
+
630
+
631
+ def _analyze_remote_direct(
632
+ filepath: Path,
633
+ target_name: str,
634
+ json_output: bool = False,
635
+ ) -> str:
636
+ """Analyze NSYS profile remotely via direct SSH to target.
637
+
638
+ Uploads the .nsys-rep file and runs nsys analysis on the target machine.
639
+ """
640
+ import tempfile
641
+
642
+ from .gpu_run import push_directory, run_command_capture
643
+ from .targets import load_target
644
+
645
+ # Load target
646
+ try:
647
+ target = load_target(target_name)
648
+ except FileNotFoundError as e:
649
+ raise RuntimeError(f"Target not found: {target_name}. Create with: wafer targets add {target_name}") from e
650
+
651
+ # Create temp directory with just the .nsys-rep file
652
+ workspace_name = f"nsys_analyze_{filepath.stem}"
653
+
654
+ with tempfile.TemporaryDirectory() as tmpdir:
655
+ # Create a directory with the workspace name
656
+ tmp_path = Path(tmpdir) / workspace_name
657
+ tmp_path.mkdir()
658
+ shutil.copy(filepath, tmp_path / filepath.name)
659
+
660
+ # Push the file
661
+ print(f"Uploading {filepath.name} to {target_name}...", file=sys.stderr)
662
+ push_directory(tmp_path, target)
663
+
664
+ # Run nsys stats commands on remote
665
+ # First try to find nsys on the remote system
666
+ nsys_paths = [
667
+ "/usr/bin/nsys",
668
+ "/usr/local/cuda/bin/nsys",
669
+ "/opt/nvidia/nsight-systems/bin/nsys",
72
670
  ]
73
671
 
74
- if kernels:
75
- lines.extend([
76
- "## Kernels",
77
- "",
78
- ])
79
- for i, kernel in enumerate(kernels, 1):
80
- lines.extend([
81
- f"### {i}. {kernel.get('name', 'Unknown')}",
82
- f"- Duration: {kernel.get('duration_ms', 0):.3f} ms",
83
- f"- Grid Size: {kernel.get('grid_size', 'N/A')}",
84
- f"- Block Size: {kernel.get('block_size', 'N/A')}",
85
- f"- Memory Throughput: {kernel.get('memory_throughput_gb_s', 0):.2f} GB/s",
86
- "",
87
- ])
672
+ nsys_cmd = "nsys" # Default to PATH
673
+ for path in nsys_paths:
674
+ check_cmd = f"test -x {path} && echo found"
675
+ exit_code, output = run_command_capture(check_cmd, workspace_name, target)
676
+ if exit_code == 0 and "found" in output:
677
+ nsys_cmd = path
678
+ break
88
679
 
89
- # Add diagnostics if present
90
- diagnostics = result.get("diagnostics", [])
91
- if diagnostics:
92
- lines.extend([
93
- "## Diagnostics",
94
- "",
95
- ])
96
- for diag in diagnostics:
97
- level = diag.get("level", "Info")
98
- text = diag.get("text", "")
99
- lines.append(f"- [{level}] {text}")
100
- lines.append("")
680
+ # Run analysis commands
681
+ # Try new report name first, fall back to legacy if it fails
682
+ print("Running NSYS analysis...", file=sys.stderr)
683
+ analysis_cmd = f"{nsys_cmd} stats --report cuda_gpu_kern_sum --format csv --force-export true {filepath.name}"
684
+ exit_code, kernel_output = run_command_capture(analysis_cmd, workspace_name, target)
101
685
 
102
- return "\n".join(lines)
686
+ # Try legacy report name if new one fails
687
+ if exit_code != 0 or "could not be found" in kernel_output.lower():
688
+ analysis_cmd = f"{nsys_cmd} stats --report gpukernsum --format csv --force-export true {filepath.name}"
689
+ exit_code, kernel_output = run_command_capture(analysis_cmd, workspace_name, target)
690
+
691
+ if exit_code != 0:
692
+ raise RuntimeError(f"NSYS kernel stats failed: {kernel_output}")
693
+
694
+ # Get memory stats - try new name first, fall back to legacy
695
+ mem_cmd = f"{nsys_cmd} stats --report cuda_gpu_mem_time_sum --format csv --force-export true {filepath.name}"
696
+ exit_code, mem_output = run_command_capture(mem_cmd, workspace_name, target)
697
+
698
+ # Try legacy report name if new one fails
699
+ if exit_code != 0 or "could not be found" in mem_output.lower():
700
+ mem_cmd = f"{nsys_cmd} stats --report gpumemtimesum --format csv --force-export true {filepath.name}"
701
+ exit_code, mem_output = run_command_capture(mem_cmd, workspace_name, target)
702
+
703
+ # Parse outputs (memory stats may fail if no memory transfers)
704
+ kernels = _parse_csv_kernels(kernel_output) if kernel_output else []
705
+ memory_transfers = _parse_csv_memory(mem_output) if exit_code == 0 and mem_output else []
706
+
707
+ # Build result
708
+ total_kernel_time_ms = sum(k.get("duration_ms", 0) for k in kernels)
709
+ total_mem_time_ms = sum(m.get("duration_ms", 0) for m in memory_transfers)
710
+
711
+ result = {
712
+ "success": True,
713
+ "summary": {
714
+ "gpu": "Unknown", # Would need additional parsing to get GPU name
715
+ "duration_ms": total_kernel_time_ms + total_mem_time_ms,
716
+ "kernel_count": len(kernels),
717
+ "memory_transfers": len(memory_transfers),
718
+ },
719
+ "kernels": kernels,
720
+ "memory_transfers": memory_transfers,
721
+ }
722
+
723
+ if json_output:
724
+ return json.dumps(result, indent=2)
725
+ else:
726
+ return _generate_text_output(filepath.name, result)
727
+
728
+
729
+ def _analyze_workspace(
730
+ filepath: Path,
731
+ workspace_id: str,
732
+ json_output: bool = False,
733
+ ) -> str:
734
+ """Analyze NSYS profile on a Wafer workspace.
735
+
736
+ Uses workspace exec to run nsys analysis on the workspace.
737
+ """
738
+ from .workspaces import exec_command_capture
739
+
740
+ # First, check if file exists on workspace or needs upload
741
+ # For now, assume file is already on workspace (via sync)
742
+ filename = filepath.name
743
+
744
+ print(f"Running NSYS analysis on workspace {workspace_id}...", file=sys.stderr)
745
+
746
+ # Try to find nsys on the workspace
747
+ nsys_cmd = "nsys"
748
+ for path in ["/usr/bin/nsys", "/usr/local/cuda/bin/nsys", "/opt/nvidia/nsight-systems/bin/nsys"]:
749
+ check_cmd = f"test -x {path} && echo found"
750
+ exit_code, output = exec_command_capture(workspace_id, check_cmd)
751
+ if exit_code == 0 and "found" in output:
752
+ nsys_cmd = path
753
+ break
754
+
755
+ # Run kernel stats - try new report name first, fall back to legacy
756
+ print("Extracting kernel statistics...", file=sys.stderr)
757
+ kernel_cmd = f"{nsys_cmd} stats --report cuda_gpu_kern_sum --format csv --force-export true {filename}"
758
+ exit_code, kernel_output = exec_command_capture(workspace_id, kernel_cmd)
759
+
760
+ # Try legacy report name if new one fails
761
+ if exit_code != 0 or "could not be found" in kernel_output.lower():
762
+ kernel_cmd = f"{nsys_cmd} stats --report gpukernsum --format csv --force-export true {filename}"
763
+ exit_code, kernel_output = exec_command_capture(workspace_id, kernel_cmd)
764
+
765
+ if exit_code != 0:
766
+ raise RuntimeError(f"NSYS kernel stats failed on workspace: {kernel_output}")
767
+
768
+ # Run memory stats - try new report name first, fall back to legacy
769
+ print("Extracting memory statistics...", file=sys.stderr)
770
+ mem_cmd = f"{nsys_cmd} stats --report cuda_gpu_mem_time_sum --format csv --force-export true {filename}"
771
+ exit_code, mem_output = exec_command_capture(workspace_id, mem_cmd)
772
+
773
+ # Try legacy report name if new one fails
774
+ if exit_code != 0 or "could not be found" in mem_output.lower():
775
+ mem_cmd = f"{nsys_cmd} stats --report gpumemtimesum --format csv --force-export true {filename}"
776
+ exit_code, mem_output = exec_command_capture(workspace_id, mem_cmd)
777
+
778
+ # Parse outputs
779
+ kernels = _parse_csv_kernels(kernel_output) if kernel_output else []
780
+ memory_transfers = _parse_csv_memory(mem_output) if exit_code == 0 and mem_output else []
781
+
782
+ # Build result
783
+ total_kernel_time_ms = sum(k.get("duration_ms", 0) for k in kernels)
784
+ total_mem_time_ms = sum(m.get("duration_ms", 0) for m in memory_transfers)
785
+
786
+ result = {
787
+ "success": True,
788
+ "summary": {
789
+ "gpu": "Unknown",
790
+ "duration_ms": total_kernel_time_ms + total_mem_time_ms,
791
+ "kernel_count": len(kernels),
792
+ "memory_transfers": len(memory_transfers),
793
+ },
794
+ "kernels": kernels,
795
+ "memory_transfers": memory_transfers,
796
+ }
797
+
798
+ if json_output:
799
+ return json.dumps(result, indent=2)
800
+ else:
801
+ return _generate_text_output(filepath.name, result)
103
802
 
104
803
 
105
804
  def _analyze_remote_api(
@@ -110,10 +809,10 @@ def _analyze_remote_api(

      Uploads the .nsys-rep file and runs analysis on Modal.
      """
-     assert filepath.exists(), f"File must exist: {filepath}"
-     assert filepath.suffix == ".nsys-rep", f"File must be .nsys-rep: {filepath}"
-
-     import sys
+     if not filepath.exists():
+         raise FileNotFoundError(f"File must exist: {filepath}")
+     if filepath.suffix != ".nsys-rep":
+         raise ValueError(f"File must be .nsys-rep: {filepath}")

      import httpx

@@ -123,7 +822,8 @@ def _analyze_remote_api(
      api_url = get_api_url()
      headers = get_auth_headers()

-     assert api_url, "API URL must be configured"
+     if not api_url:
+         raise ValueError("API URL must be configured")

      # Use multipart/form-data upload
      print(f"Uploading {filepath.name} for analysis...", file=sys.stderr)
@@ -153,7 +853,8 @@ def _analyze_remote_api(
          raise RuntimeError(f"Analysis failed: {result.get('error', 'Unknown error')}")

      # Validate response structure
-     assert isinstance(result, dict), "API must return a dictionary"
+     if not isinstance(result, dict):
+         raise TypeError("API must return a dictionary")

      if json_output:
          return json.dumps(result, indent=2)
@@ -161,10 +862,127 @@ def _analyze_remote_api(
          return _generate_text_output(filepath.name, result)


+ def _generate_text_output(filename: str, result: dict) -> str:
+     """Generate human-readable markdown text from analysis result."""
+     if not filename:
+         raise ValueError("filename must be non-empty")
+     if not isinstance(result, dict):
+         raise TypeError("result must be a dictionary")
+
+     timestamp = datetime.now().isoformat()
+     summary = result.get("summary", {})
+     kernels = result.get("kernels", [])
+     memory_transfers = result.get("memory_transfers", [])
+
+     lines = [
+         "# NSYS Profiling Analysis",
+         f"Source: {filename}",
+         f"Generated: {timestamp}",
+         "",
+         "## Summary",
+         f"- GPU: {summary.get('gpu', 'Unknown')}",
+         f"- Total Duration: {summary.get('duration_ms', 0):.2f} ms",
+         f"- Kernel Count: {summary.get('kernel_count', 0)}",
+         f"- Memory Transfers: {summary.get('memory_transfers', 0)}",
+         "",
+     ]
+
+     if kernels:
+         lines.extend([
+             "## GPU Kernels",
+             "",
+             "| Kernel | Time (ms) | Instances | Avg (ms) |",
+             "|--------|-----------|-----------|----------|",
+         ])
+
+         # Sort by duration descending
+         sorted_kernels = sorted(kernels, key=lambda k: k.get("duration_ms", 0), reverse=True)
+
+         for kernel in sorted_kernels[:20]:  # Top 20 kernels
+             name = kernel.get("name", "Unknown")
+             # Truncate long kernel names
+             if len(name) > 50:
+                 name = name[:47] + "..."
+             duration = kernel.get("duration_ms", 0)
+             instances = kernel.get("instances", 0)
+             avg = kernel.get("avg_time_ns", 0) / 1_000_000 if kernel.get("avg_time_ns") else 0
+
+             lines.append(f"| {name} | {duration:.3f} | {instances} | {avg:.4f} |")
+
+         if len(kernels) > 20:
+             lines.append(f"| ... and {len(kernels) - 20} more kernels | | | |")
+
+         lines.append("")
+
+     if memory_transfers:
+         lines.extend([
+             "## Memory Transfers",
+             "",
+             "| Operation | Time (ms) | Size | Instances |",
+             "|-----------|-----------|------|-----------|",
+         ])
+
+         for transfer in memory_transfers:
+             op = transfer.get("operation", "Unknown")
+             duration = transfer.get("duration_ms", 0)
+             size_bytes = transfer.get("size_bytes", 0)
+             size_str = _format_bytes(size_bytes)
+             instances = transfer.get("instances", 0)
+
+             lines.append(f"| {op} | {duration:.3f} | {size_str} | {instances} |")
+
+         lines.append("")
+
+     # Add diagnostics if present
+     diagnostics = result.get("diagnostics", [])
+     if diagnostics:
+         lines.extend([
+             "## Diagnostics",
+             "",
+         ])
+         for diag in diagnostics:
+             level = diag.get("level", "Info")
+             text = diag.get("text", "")
+             lines.append(f"- [{level}] {text}")
+         lines.append("")
+
+     return "\n".join(lines)
+
+
+ def _format_bytes(size_bytes: int) -> str:
+     """Format bytes into human-readable string."""
+     if size_bytes >= 1e9:
+         return f"{size_bytes / 1e9:.2f} GB"
+     elif size_bytes >= 1e6:
+         return f"{size_bytes / 1e6:.2f} MB"
+     elif size_bytes >= 1e3:
+         return f"{size_bytes / 1e3:.2f} KB"
+     else:
+         return f"{size_bytes} B"
+
+
+ def _parse_target(target: str) -> tuple[str, str]:
+     """Parse target string into type and identifier.
+
+     Supports:
+     - "workspace:abc123" -> ("workspace", "abc123")
+     - "vultr-b200" -> ("target", "vultr-b200")
+
+     Returns:
+         Tuple of (target_type, identifier)
+     """
+     if target.startswith("workspace:"):
+         return "workspace", target[len("workspace:"):]
+     else:
+         return "target", target
+
+
  def analyze_nsys_profile(
      filepath: Path,
      json_output: bool = False,
      remote: bool | None = None,
+     target: str | None = None,
+     output_dir: Path | None = None,
  ) -> str:
      """Analyze an NSYS profile file and return results.

@@ -173,6 +991,8 @@ def analyze_nsys_profile(
          json_output: If True, return raw JSON; otherwise return formatted text
          remote: If True, force remote analysis via API. If False, force local.
              If None (default), auto-detect: use local if nsys available, else remote.
+         target: Remote target - either "workspace:id" or target name from ~/.wafer/targets/
+         output_dir: Optional directory to save analysis results

      Returns:
          Analysis results as string (JSON or markdown)
@@ -187,26 +1007,36 @@ def analyze_nsys_profile(
      if filepath.suffix != ".nsys-rep":
          raise ValueError(f"Expected .nsys-rep file, got: {filepath.suffix}")

+     # If target is specified, use appropriate remote execution
+     if target:
+         target_type, target_id = _parse_target(target)
+
+         if target_type == "workspace":
+             return _analyze_workspace(filepath, target_id, json_output)
+         else:
+             return _analyze_remote_direct(filepath, target_id, json_output)
+
+     # Check for local nsys installation
      nsys_path = _find_nsys()

      # Determine whether to use local or remote
      use_remote = remote
      if use_remote is None:
-         # Auto-detect: use remote if nsys not available locally
+         # Auto-detect: use local if nsys available, else remote
          use_remote = nsys_path is None

      if use_remote:
          return _analyze_remote_api(filepath, json_output)
      else:
-         # Local analysis not yet implemented - would need to copy nsys_parser to wafer-core
-         # For now, suggest using remote
          if nsys_path is None:
+             if is_macos():
+                 raise FileNotFoundError(
+                     "NSYS CLI is not available on macOS (only GUI viewer is provided). "
+                     "Use --remote flag for API-based analysis or --target for workspace/SSH analysis."
+                 )
              install_cmd = _get_install_command()
              raise FileNotFoundError(
                  f"NSYS not installed locally. Use --remote flag or install with: {install_cmd}"
              )

-         # TODO: Implement local parsing by moving nsys_parser to wafer-core
-         raise NotImplementedError(
-             "Local NSYS analysis not yet implemented. Use --remote flag to analyze via API."
-         )
+         return _analyze_local(filepath, nsys_path, output_dir, json_output)
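
For orientation, here is a minimal usage sketch of the entry points added in 0.2.10. It is illustrative only and not part of the published package: the profile path and output directory are placeholders, and the import path simply assumes the module location shown in this diff (wafer/nsys_analyze.py).

    from pathlib import Path

    from wafer.nsys_analyze import analyze_nsys_profile, check_nsys_installation

    # Check whether a local nsys CLI is available; on macOS it typically is not,
    # since NVIDIA only ships the GUI viewer there.
    check = check_nsys_installation()
    if not check.installed:
        print(f"nsys CLI not found; suggested install: {check.install_command}")

    # remote=None auto-detects: local nsys if installed, otherwise the API.
    report = analyze_nsys_profile(
        Path("profile.nsys-rep"),    # placeholder .nsys-rep file
        json_output=True,            # raw JSON instead of markdown
        remote=None,
        target=None,                 # or "workspace:<id>" / a target name from ~/.wafer/targets/
        output_dir=Path("analysis"), # placeholder directory for saved results
    )
    print(report)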