wafer-cli 0.2.36__py3-none-any.whl → 0.2.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wafer/ncu_analyze.py CHANGED
@@ -41,6 +41,32 @@ NCU_PATHS = {
41
41
  }
42
42
 
43
43
 
44
+ # GPU SM counts for common NVIDIA GPUs (used for underfill detection)
45
+ GPU_SM_COUNTS = {
46
+ "B200": 148,
47
+ "H100": 132,
48
+ "H200": 132,
49
+ "A100": 108,
50
+ "A10": 72,
51
+ "L4": 58,
52
+ "L40": 142,
53
+ "V100": 80,
54
+ "RTX 4090": 128,
55
+ "RTX 3090": 82,
56
+ }
57
+
58
+
59
+ def _get_sm_count_for_gpu(gpu_name: str) -> int:
60
+ """Get SM count for a GPU name. Returns 148 (B200) as default."""
61
+ if not gpu_name:
62
+ return 148
63
+ gpu_upper = gpu_name.upper()
64
+ for gpu_key, sm_count in GPU_SM_COUNTS.items():
65
+ if gpu_key.upper() in gpu_upper:
66
+ return sm_count
67
+ return 148 # Default to B200
68
+
69
+
44
70
  def _get_platform() -> str:
45
71
  """Get normalized platform name."""
46
72
  system = platform.system().lower()
@@ -85,197 +111,471 @@ def _get_install_command() -> str:
85
111
  return "Download from https://developer.nvidia.com/nsight-compute"
86
112
 
87
113
 
114
+ def _parse_gpu_from_session(session_output: str) -> str:
115
+ """Parse GPU name from NCU session output."""
116
+ assert isinstance(session_output, str)
117
+
118
+ for line in session_output.split("\n"):
119
+ if "display_name" in line:
120
+ parts = line.split()
121
+ if len(parts) >= 2:
122
+ return " ".join(parts[1:])
123
+ return "Unknown"
124
+
125
+
126
+ def _create_kernel_entry(kernel_name: str) -> dict:
127
+ """Create a new kernel metrics dict with default values."""
128
+ assert kernel_name, "kernel_name must not be empty"
129
+
130
+ return {
131
+ "name": kernel_name,
132
+ "duration_us": 0,
133
+ "duration_ms": 0,
134
+ "memory_throughput_pct": 0,
135
+ "compute_throughput_pct": 0,
136
+ "achieved_occupancy_pct": 0,
137
+ "theoretical_occupancy_pct": 0,
138
+ "registers_per_thread": 0,
139
+ "block_size": 0,
140
+ "grid_size": 0,
141
+ "waves_per_sm": 0,
142
+ "estimated_speedup_pct": 0,
143
+ "recommendations": [],
144
+ }
145
+
146
+
147
+ def _parse_metric_line(kernel: dict, metric_line: str, parts: list[str], current_section: str | None) -> None:
148
+ """Parse a metric line and update the kernel dict in place."""
149
+ assert kernel is not None
150
+ assert parts, "parts must not be empty"
151
+
152
+ # Duration (in us)
153
+ if metric_line.startswith("Duration") and "us" in metric_line:
154
+ try:
155
+ value = float(parts[-1].replace(",", ""))
156
+ kernel["duration_us"] = value
157
+ kernel["duration_ms"] = value / 1000
158
+ except (ValueError, IndexError):
159
+ pass
160
+ # Memory Throughput (%)
161
+ elif "Memory Throughput" in metric_line and "%" in metric_line:
162
+ try:
163
+ kernel["memory_throughput_pct"] = float(parts[-1].replace(",", ""))
164
+ except (ValueError, IndexError):
165
+ pass
166
+ # Compute (SM) Throughput (%)
167
+ elif "Compute (SM) Throughput" in metric_line or "Compute Throughput" in metric_line:
168
+ try:
169
+ kernel["compute_throughput_pct"] = float(parts[-1].replace(",", ""))
170
+ except (ValueError, IndexError):
171
+ pass
172
+ # Achieved Occupancy (%)
173
+ elif "Achieved Occupancy" in metric_line and "%" in metric_line:
174
+ try:
175
+ kernel["achieved_occupancy_pct"] = float(parts[-1].replace(",", ""))
176
+ except (ValueError, IndexError):
177
+ pass
178
+ # Registers Per Thread
179
+ elif "Registers Per Thread" in metric_line:
180
+ try:
181
+ kernel["registers_per_thread"] = int(float(parts[-1].replace(",", "")))
182
+ except (ValueError, IndexError):
183
+ pass
184
+ # Block Size (only from Launch Statistics section)
185
+ elif metric_line.startswith("Block Size") and current_section == "Launch Statistics":
186
+ try:
187
+ kernel["block_size"] = int(float(parts[-1].replace(",", "")))
188
+ except (ValueError, IndexError):
189
+ pass
190
+ # Grid Size (only from Launch Statistics section)
191
+ elif metric_line.startswith("Grid Size") and current_section == "Launch Statistics":
192
+ try:
193
+ kernel["grid_size"] = int(float(parts[-1].replace(",", "")))
194
+ except (ValueError, IndexError):
195
+ pass
196
+ # Waves Per SM (key metric for underfill detection)
197
+ elif "Waves Per SM" in metric_line:
198
+ try:
199
+ kernel["waves_per_sm"] = float(parts[-1].replace(",", ""))
200
+ except (ValueError, IndexError):
201
+ pass
202
+ # Theoretical Occupancy (%)
203
+ elif "Theoretical Occupancy" in metric_line and "%" in metric_line:
204
+ try:
205
+ kernel["theoretical_occupancy_pct"] = float(parts[-1].replace(",", ""))
206
+ except (ValueError, IndexError):
207
+ pass
208
+
209
+
210
+ def _extract_speedup(kernel: dict, stripped: str) -> None:
211
+ """Extract estimated speedup from recommendation line."""
212
+ import re
213
+ assert kernel is not None
214
+
215
+ for pattern in [r"Est\. Speedup:\s*([\d.]+)%", r"Est\. Local Speedup:\s*([\d.]+)%"]:
216
+ match = re.search(pattern, stripped)
217
+ if match:
218
+ try:
219
+ speedup = float(match.group(1))
220
+ if speedup > kernel["estimated_speedup_pct"]:
221
+ kernel["estimated_speedup_pct"] = speedup
222
+ except ValueError:
223
+ pass
224
+
225
+
88
226
  def _parse_ncu_output(session_output: str, details_output: str) -> dict:
89
227
  """Parse NCU session and details output into structured data."""
90
228
  import re
229
+
230
+ assert isinstance(session_output, str)
231
+ assert isinstance(details_output, str)
91
232
 
92
233
  summary: dict = {
93
- "gpu": "Unknown",
234
+ "gpu": _parse_gpu_from_session(session_output) if session_output else "Unknown",
94
235
  "kernels": [],
95
236
  "recommendations": [],
96
237
  }
97
238
 
98
- # Parse session output for GPU name
99
- if session_output:
100
- for line in session_output.split("\n"):
101
- if "display_name" in line:
102
- parts = line.split()
103
- if len(parts) >= 2:
104
- summary["gpu"] = " ".join(parts[1:])
105
- break
106
-
107
- # Parse details output for kernel metrics and recommendations
108
- if details_output:
109
- lines = details_output.split("\n")
110
- current_kernel: dict | None = None
111
- current_section: str | None = None
112
- in_recommendation = False
113
- recommendation_lines: list[str] = []
114
-
115
- i = 0
116
- while i < len(lines):
117
- line = lines[i]
118
- stripped = line.strip()
119
-
120
- # Detect kernel header
121
- if (
122
- line.startswith(" ")
123
- and not line.startswith(" ")
124
- and "Context" in line
125
- and "Device" in line
126
- ):
127
- match = re.match(r"^ (.+?)\s+\(\d+,\s*\d+,\s*\d+\)x\(\d+,\s*\d+,\s*\d+\)", line)
128
- if match:
129
- kernel_name = match.group(1).strip()
130
- current_kernel = {
131
- "name": kernel_name,
132
- "duration_us": 0,
133
- "duration_ms": 0,
134
- "memory_throughput_pct": 0,
135
- "compute_throughput_pct": 0,
136
- "achieved_occupancy_pct": 0,
137
- "registers_per_thread": 0,
138
- "block_size": 0,
139
- "grid_size": 0,
140
- "estimated_speedup_pct": 0,
141
- "recommendations": [],
142
- }
143
- summary["kernels"].append(current_kernel)
144
-
145
- # Detect section headers
146
- if stripped.startswith("Section:"):
147
- current_section = stripped.replace("Section:", "").strip()
148
-
149
- # Parse metrics from table rows
150
- if current_kernel and " " in line:
151
- parts = line.split()
152
- if len(parts) >= 2:
153
- metric_line = stripped
154
-
155
- # Duration (in us)
156
- if metric_line.startswith("Duration") and "us" in metric_line:
157
- try:
158
- value = float(parts[-1].replace(",", ""))
159
- current_kernel["duration_us"] = value
160
- current_kernel["duration_ms"] = value / 1000
161
- except (ValueError, IndexError):
162
- pass
163
-
164
- # Memory Throughput (%)
165
- elif "Memory Throughput" in metric_line and "%" in metric_line:
166
- try:
167
- value = float(parts[-1].replace(",", ""))
168
- current_kernel["memory_throughput_pct"] = value
169
- except (ValueError, IndexError):
170
- pass
171
-
172
- # Compute (SM) Throughput (%)
173
- elif (
174
- "Compute (SM) Throughput" in metric_line
175
- or "Compute Throughput" in metric_line
176
- ):
177
- try:
178
- value = float(parts[-1].replace(",", ""))
179
- current_kernel["compute_throughput_pct"] = value
180
- except (ValueError, IndexError):
181
- pass
182
-
183
- # Achieved Occupancy (%)
184
- elif "Achieved Occupancy" in metric_line and "%" in metric_line:
185
- try:
186
- value = float(parts[-1].replace(",", ""))
187
- current_kernel["achieved_occupancy_pct"] = value
188
- except (ValueError, IndexError):
189
- pass
190
-
191
- # Registers Per Thread
192
- elif "Registers Per Thread" in metric_line:
193
- try:
194
- value = int(float(parts[-1].replace(",", "")))
195
- current_kernel["registers_per_thread"] = value
196
- except (ValueError, IndexError):
197
- pass
198
-
199
- # Block Size
200
- elif (
201
- metric_line.startswith("Block Size")
202
- and current_section == "Launch Statistics"
203
- ):
204
- try:
205
- value = int(float(parts[-1].replace(",", "")))
206
- current_kernel["block_size"] = value
207
- except (ValueError, IndexError):
208
- pass
209
-
210
- # Grid Size
211
- elif (
212
- metric_line.startswith("Grid Size")
213
- and current_section == "Launch Statistics"
214
- ):
215
- try:
216
- value = int(float(parts[-1].replace(",", "")))
217
- current_kernel["grid_size"] = value
218
- except (ValueError, IndexError):
219
- pass
220
-
221
- # Parse recommendations (OPT and INF markers)
222
- if stripped.startswith("OPT") or stripped.startswith("INF"):
223
- in_recommendation = True
224
- recommendation_lines = [stripped]
225
-
226
- # Extract estimated speedup
227
- if current_kernel and "Est. Speedup:" in stripped:
228
- speedup_match = re.search(r"Est\. Speedup:\s*([\d.]+)%", stripped)
229
- if speedup_match:
230
- try:
231
- speedup = float(speedup_match.group(1))
232
- if speedup > current_kernel["estimated_speedup_pct"]:
233
- current_kernel["estimated_speedup_pct"] = speedup
234
- except ValueError:
235
- pass
236
-
237
- if current_kernel and "Est. Local Speedup:" in stripped:
238
- speedup_match = re.search(r"Est\. Local Speedup:\s*([\d.]+)%", stripped)
239
- if speedup_match:
240
- try:
241
- speedup = float(speedup_match.group(1))
242
- if speedup > current_kernel["estimated_speedup_pct"]:
243
- current_kernel["estimated_speedup_pct"] = speedup
244
- except ValueError:
245
- pass
246
- elif in_recommendation:
247
- if line.startswith(" ") and stripped:
248
- recommendation_lines.append(stripped)
249
- elif (
250
- stripped.startswith("Section:")
251
- or stripped.startswith("---")
252
- or (stripped and not line.startswith(" "))
253
- ):
254
- if recommendation_lines:
255
- full_rec = " ".join(recommendation_lines)
256
- if full_rec not in summary["recommendations"]:
257
- summary["recommendations"].append(full_rec)
258
- if current_kernel and full_rec not in current_kernel["recommendations"]:
259
- current_kernel["recommendations"].append(full_rec)
260
- in_recommendation = False
261
- recommendation_lines = []
262
-
263
- i += 1
264
-
265
- # Capture last recommendation if any
266
- if recommendation_lines:
267
- full_rec = " ".join(recommendation_lines)
268
- if full_rec not in summary["recommendations"]:
269
- summary["recommendations"].append(full_rec)
270
- if current_kernel and full_rec not in current_kernel["recommendations"]:
271
- current_kernel["recommendations"].append(full_rec)
239
+ if not details_output:
240
+ return summary
241
+
242
+ lines = details_output.split("\n")
243
+ current_kernel: dict | None = None
244
+ current_section: str | None = None
245
+ in_recommendation = False
246
+ recommendation_lines: list[str] = []
247
+
248
+ for line in lines:
249
+ stripped = line.strip()
250
+
251
+ # Detect kernel header
252
+ if line.startswith(" ") and not line.startswith(" ") and "Context" in line and "Device" in line:
253
+ match = re.match(r"^ (.+?)\s+\(\d+,\s*\d+,\s*\d+\)x\(\d+,\s*\d+,\s*\d+\)", line)
254
+ if match:
255
+ current_kernel = _create_kernel_entry(match.group(1).strip())
256
+ summary["kernels"].append(current_kernel)
257
+
258
+ # Detect section headers
259
+ if stripped.startswith("Section:"):
260
+ current_section = stripped.replace("Section:", "").strip()
261
+
262
+ # Parse metrics from table rows
263
+ if current_kernel and " " in line:
264
+ parts = line.split()
265
+ if len(parts) >= 2:
266
+ _parse_metric_line(current_kernel, stripped, parts, current_section)
267
+
268
+ # Parse recommendations (OPT and INF markers)
269
+ if stripped.startswith("OPT") or stripped.startswith("INF"):
270
+ in_recommendation = True
271
+ recommendation_lines = [stripped]
272
+ if current_kernel:
273
+ _extract_speedup(current_kernel, stripped)
274
+ elif in_recommendation:
275
+ if line.startswith(" ") and stripped:
276
+ recommendation_lines.append(stripped)
277
+ elif stripped.startswith("Section:") or stripped.startswith("---") or (stripped and not line.startswith(" ")):
278
+ if recommendation_lines:
279
+ full_rec = " ".join(recommendation_lines)
280
+ if full_rec not in summary["recommendations"]:
281
+ summary["recommendations"].append(full_rec)
282
+ if current_kernel and full_rec not in current_kernel["recommendations"]:
283
+ current_kernel["recommendations"].append(full_rec)
284
+ in_recommendation = False
285
+ recommendation_lines = []
286
+
287
+ # Capture last recommendation if any
288
+ if recommendation_lines:
289
+ full_rec = " ".join(recommendation_lines)
290
+ if full_rec not in summary["recommendations"]:
291
+ summary["recommendations"].append(full_rec)
292
+ if current_kernel and full_rec not in current_kernel["recommendations"]:
293
+ current_kernel["recommendations"].append(full_rec)
272
294
 
273
295
  return summary
274
296
 
275
297
 
298
+ def _classify_underfill(
299
+ waves_per_sm: float, grid_size: int, num_sms: int
300
+ ) -> tuple[str | None, str | None]:
301
+ """Classify underfill type and severity based on metrics.
302
+
303
+ Returns:
304
+ (underfill_type, severity) where:
305
+ - underfill_type: "launch" | "resource" | None
306
+ - severity: "severe" | "moderate" | None
307
+ """
308
+ assert waves_per_sm >= 0, f"waves_per_sm must be non-negative, got {waves_per_sm}"
309
+ assert grid_size >= 0, f"grid_size must be non-negative, got {grid_size}"
310
+ assert num_sms > 0, f"num_sms must be positive, got {num_sms}"
311
+
312
+ is_grid_small = grid_size > 0 and grid_size < num_sms
313
+
314
+ if waves_per_sm > 0 and waves_per_sm < 1.0:
315
+ return ("launch" if is_grid_small else "resource", "severe")
316
+ if waves_per_sm > 0 and waves_per_sm < 2.0:
317
+ return ("launch" if is_grid_small else "resource", "moderate")
318
+ if is_grid_small:
319
+ return ("launch", "severe")
320
+ return (None, None)
321
+
322
+
323
+ def _classify_occupancy(
324
+ achieved_occ: float, theoretical_occ: float
325
+ ) -> tuple[bool, str | None]:
326
+ """Classify occupancy issue.
327
+
328
+ Returns:
329
+ (is_low_occupancy, analysis_type) where:
330
+ - is_low_occupancy: True if achieved < 50%
331
+ - analysis_type: "runtime_issue" | "resource_limited" | None
332
+ """
333
+ assert achieved_occ >= 0, f"achieved_occ must be non-negative, got {achieved_occ}"
334
+ assert theoretical_occ >= 0, f"theoretical_occ must be non-negative, got {theoretical_occ}"
335
+
336
+ if achieved_occ <= 0 or achieved_occ >= 50:
337
+ return (False, None)
338
+
339
+ if theoretical_occ <= 0:
340
+ return (True, None)
341
+
342
+ occ_gap = theoretical_occ - achieved_occ
343
+ if theoretical_occ >= 50 and occ_gap > 20:
344
+ return (True, "runtime_issue")
345
+ if theoretical_occ < 50:
346
+ return (True, "resource_limited")
347
+ return (True, None)
348
+
349
+
350
+ def _classify_throughput(
351
+ memory_tp: float, compute_tp: float, achieved_occ: float
352
+ ) -> tuple[bool, bool, bool]:
353
+ """Classify throughput observations.
354
+
355
+ Returns:
356
+ (has_high_memory, has_high_compute, has_both_low)
357
+ """
358
+ assert memory_tp >= 0, f"memory_tp must be non-negative, got {memory_tp}"
359
+ assert compute_tp >= 0, f"compute_tp must be non-negative, got {compute_tp}"
360
+ assert achieved_occ >= 0, f"achieved_occ must be non-negative, got {achieved_occ}"
361
+
362
+ has_high_memory = memory_tp > 60
363
+ has_high_compute = compute_tp > 60
364
+ has_both_low = memory_tp < 30 and compute_tp < 30 and achieved_occ >= 50
365
+ return (has_high_memory, has_high_compute, has_both_low)
366
+
367
+
368
+ def _format_underfill_diagnosis(
369
+ underfill_type: str,
370
+ underfill_severity: str,
371
+ waves_per_sm: float,
372
+ grid_size: int,
373
+ num_sms: int,
374
+ achieved_occ: float,
375
+ theoretical_occ: float,
376
+ compute_tp: float,
377
+ memory_tp: float,
378
+ estimated_speedup: float,
379
+ ) -> list[str]:
380
+ """Format diagnosis lines for underfill issues. Returns early from _generate_diagnosis."""
381
+ assert underfill_type in ("launch", "resource")
382
+ assert underfill_severity in ("severe", "moderate")
383
+
384
+ severity_label = "UNDERFILL" if underfill_severity == "severe" else "LIMITED CONCURRENCY"
385
+ blocks_per_sm = grid_size / num_sms if grid_size > 0 else 0
386
+
387
+ lines = [f"**Primary Issue: {severity_label}**"]
388
+
389
+ if waves_per_sm > 0:
390
+ lines.append(f"- Waves per SM: {waves_per_sm:.2f} (often benefits from >2 to hide latency)")
391
+ if grid_size > 0:
392
+ lines.append(f"- Grid: {grid_size} blocks for {num_sms} SMs ({blocks_per_sm:.2f} blocks/SM)")
393
+ lines.append("- ⚠️ Compute/memory throughput % not reliable for global bottleneck; underfill dominates")
394
+ lines.append("")
395
+
396
+ if underfill_type == "launch":
397
+ lines.extend([
398
+ "**Type: LAUNCH-LIMITED** (grid smaller than SM count)",
399
+ "",
400
+ "**What WON'T help:**",
401
+ "- Reducing registers/shared memory (can't create more blocks than launched)",
402
+ "",
403
+ "**What MAY help:**",
404
+ "- Increase batch size or problem dimensions",
405
+ "- Split work into more blocks (e.g., tile over batch/head/rows; sequence tiling only if algorithm permits)",
406
+ "- Use persistent CTAs / work queue: launch ~k×SM blocks that pull tasks",
407
+ "- If inherently sequential, focus on per-block latency optimization",
408
+ ])
409
+ else:
410
+ lines.extend([
411
+ "**Type: RESOURCE-LIMITED** (grid is adequate, but few blocks fit per SM)",
412
+ "",
413
+ "**What MAY help:**",
414
+ "- Reduce registers per thread (__launch_bounds__, fewer local vars)",
415
+ "- Reduce shared memory per block (smaller tiles, multi-stage)",
416
+ "- Reduce block size to fit more blocks per SM",
417
+ "- Check 'Block Limit' in NCU Occupancy section for the limiter",
418
+ "",
419
+ "**Note:** If kernel is very short, waves/SM may be less indicative.",
420
+ "Confirm with Occupancy 'Block Limit' and duration metrics.",
421
+ ])
422
+
423
+ lines.extend(["", "**Raw metrics (interpret with caution due to underfill):**"])
424
+ lines.append(f"- Achieved Occupancy: {achieved_occ:.1f}%")
425
+ if theoretical_occ > 0:
426
+ lines.append(f"- Theoretical Occupancy: {theoretical_occ:.1f}%")
427
+ lines.append(f"- Compute Throughput: {compute_tp:.1f}%")
428
+ lines.append(f"- Memory Throughput: {memory_tp:.1f}%")
429
+ if estimated_speedup > 0:
430
+ lines.append(f"- NCU Est. Speedup potential: {estimated_speedup:.1f}%")
431
+ lines.append("")
432
+ return lines
433
+
434
+
435
+ def _format_occupancy_diagnosis(
436
+ achieved_occ: float,
437
+ theoretical_occ: float,
438
+ occupancy_analysis: str | None,
439
+ ) -> list[str]:
440
+ """Format diagnosis lines for low occupancy issues."""
441
+ assert achieved_occ >= 0
442
+
443
+ lines = ["**Observation: Low Achieved Occupancy**", f"- Achieved: {achieved_occ:.1f}%"]
444
+
445
+ if theoretical_occ > 0:
446
+ lines.append(f"- Theoretical: {theoretical_occ:.1f}%")
447
+
448
+ if occupancy_analysis == "runtime_issue":
449
+ lines.extend([
450
+ "",
451
+ "**Analysis: Large gap between theoretical and achieved**",
452
+ "- Theoretical is high, so this is NOT a resource limit (regs/shmem)",
453
+ "- Likely causes: load imbalance, barriers, short kernel duration, tail effects",
454
+ "- Check if work is evenly distributed across blocks",
455
+ ])
456
+ elif occupancy_analysis == "resource_limited":
457
+ lines.extend([
458
+ "",
459
+ "**Analysis: Theoretical occupancy is also low**",
460
+ "- This IS a resource limit (registers, shared memory, or block size)",
461
+ "- Check 'Block Limit' in NCU Occupancy section for the specific limiter",
462
+ ])
463
+
464
+ lines.extend([
465
+ "",
466
+ "**General suggestions:**",
467
+ "- If register-limited: try __launch_bounds__, reduce local arrays",
468
+ "- If shared-mem-limited: reduce tile sizes or use multi-stage",
469
+ "- If runtime-limited: check barriers, load balance, kernel duration",
470
+ "",
471
+ ])
472
+ return lines
473
+
474
+
475
+ def _format_throughput_diagnosis(
476
+ has_high_memory: bool,
477
+ has_high_compute: bool,
478
+ has_both_low: bool,
479
+ memory_tp: float,
480
+ compute_tp: float,
481
+ ) -> list[str]:
482
+ """Format diagnosis lines for throughput observations."""
483
+ lines: list[str] = []
484
+
485
+ if has_high_memory or has_high_compute:
486
+ lines.append("**Throughput observations:**")
487
+ if has_high_memory:
488
+ lines.append(f"- Memory throughput relatively high ({memory_tp:.1f}%)")
489
+ lines.append(" - May benefit from: better caching, shared memory tiling, coalesced access")
490
+ if has_high_compute:
491
+ lines.append(f"- Compute throughput relatively high ({compute_tp:.1f}%)")
492
+ lines.append(" - May benefit from: reduced instruction count, better ILP")
493
+ lines.append(" - Check which pipeline is saturated (FP32/FP16/INT/SFU/TensorCore) if available")
494
+ lines.append("")
495
+ elif has_both_low:
496
+ lines.extend([
497
+ "**Observation: Both % of peak are low**",
498
+ "- Likely: latency-bound, sync-bound, dependency stalls, or non-peak pipelines",
499
+ "- This can happen with: integer-heavy, SFU-heavy, or control-flow-heavy kernels",
500
+ "- Check instruction mix / pipeline utilization metrics if available",
501
+ "- Check NCU stall reasons (smsp__warp_issue_stalled_*) for more detail",
502
+ "",
503
+ ])
504
+
505
+ return lines
506
+
507
+
508
def _generate_diagnosis(kernel: dict, num_sms: int = 148) -> list[str]:
    """Produce the markdown "Diagnosis" section for one kernel.

    Checks are applied in priority order:
    1. Underfill (waves_per_sm < 2 OR grid_size < num_sms) - reported alone,
       overriding everything else.
    2. Low occupancy (theoretical vs achieved gap analysis).
    3. Throughput observations (no strong "bound" labels without stall data).

    Returns an empty list when grid size, achieved occupancy and waves per SM
    are all zero, i.e. there is nothing to diagnose from.
    """
    assert isinstance(kernel, dict), "kernel must be a dict"
    assert num_sms > 0, f"num_sms must be positive, got {num_sms}"

    # Missing metrics default to 0.
    metric = kernel.get
    grid_size = metric('grid_size', 0)
    achieved_occ = metric('achieved_occupancy_pct', 0)
    theoretical_occ = metric('theoretical_occupancy_pct', 0)
    compute_tp = metric('compute_throughput_pct', 0)
    memory_tp = metric('memory_throughput_pct', 0)
    estimated_speedup = metric('estimated_speedup_pct', 0)
    waves_per_sm = metric('waves_per_sm', 0)

    if grid_size == 0 and achieved_occ == 0 and waves_per_sm == 0:
        return []

    # Run every classifier before formatting anything.
    underfill_type, underfill_severity = _classify_underfill(waves_per_sm, grid_size, num_sms)
    is_low_occupancy, occupancy_analysis = _classify_occupancy(achieved_occ, theoretical_occ)
    has_high_memory, has_high_compute, has_both_low = _classify_throughput(memory_tp, compute_tp, achieved_occ)

    report = ["#### 🔍 Diagnosis", ""]

    # PRIORITY 1: underfill makes the other metrics unreliable, so stop here.
    if underfill_type is not None:
        report.extend(_format_underfill_diagnosis(
            underfill_type, underfill_severity, waves_per_sm, grid_size, num_sms,
            achieved_occ, theoretical_occ, compute_tp, memory_tp, estimated_speedup,
        ))
        return report

    # PRIORITY 2: low occupancy that underfill does not explain.
    if is_low_occupancy:
        report.extend(_format_occupancy_diagnosis(achieved_occ, theoretical_occ, occupancy_analysis))

    # PRIORITY 3: throughput observations (may contribute no lines).
    report.extend(_format_throughput_diagnosis(has_high_memory, has_high_compute, has_both_low, memory_tp, compute_tp))

    # Point at NCU's own estimate when it reported one.
    if estimated_speedup > 0:
        report.append(f"**NCU estimated speedup potential: {estimated_speedup:.1f}%**")
        report.append("- See NCU recommendations below for specific suggestions")
        report.append("")

    # Underfill is already excluded here, so only the remaining flags matter.
    anything_flagged = is_low_occupancy or has_high_memory or has_high_compute or has_both_low
    if not anything_flagged:
        report.append("**Status: No obvious bottleneck detected**")
        report.append(f"- Occupancy: {achieved_occ:.1f}%, Compute: {compute_tp:.1f}%, Memory: {memory_tp:.1f}%")
        report.append("- Consider profiling with --set full for stall breakdown")
        report.append("- Or the kernel may already be well-optimized for its workload")
        report.append("")

    return report
572
+
573
+
276
574
  def _generate_text_output(filename: str, summary: dict) -> str:
277
575
  """Generate human-readable markdown text from summary."""
278
576
  timestamp = datetime.now().isoformat()
577
+ gpu_name = summary.get('gpu', 'Unknown')
578
+ num_sms = _get_sm_count_for_gpu(gpu_name)
279
579
 
280
580
  lines = [
281
581
  "# NCU Profiling Analysis",
@@ -283,7 +583,7 @@ def _generate_text_output(filename: str, summary: dict) -> str:
283
583
  f"Generated: {timestamp}",
284
584
  "",
285
585
  "## GPU Information",
286
- f"- Device: {summary.get('gpu', 'Unknown')}",
586
+ f"- Device: {gpu_name}",
287
587
  "",
288
588
  "## Kernel Summary",
289
589
  "",
@@ -301,10 +601,15 @@ def _generate_text_output(filename: str, summary: dict) -> str:
301
601
  f"- Grid Size: {kernel.get('grid_size', 0)}",
302
602
  "",
303
603
  ])
604
+
605
+ # Add actionable diagnosis
606
+ diagnosis = _generate_diagnosis(kernel, num_sms=num_sms)
607
+ if diagnosis:
608
+ lines.extend(diagnosis)
304
609
 
305
610
  if summary.get("recommendations"):
306
611
  lines.extend([
307
- "## Recommendations",
612
+ "## NCU Recommendations",
308
613
  "",
309
614
  ])
310
615
  for i, rec in enumerate(summary["recommendations"], 1):
@@ -534,6 +839,8 @@ def _analyze_remote_api(
534
839
  def _generate_ncu_api_text_output(filename: str, result: dict) -> str:
535
840
  """Generate human-readable text from NCU API result."""
536
841
  timestamp = datetime.now().isoformat()
842
+ gpu_name = result.get('gpu', 'Unknown')
843
+ num_sms = _get_sm_count_for_gpu(gpu_name)
537
844
 
538
845
  lines = [
539
846
  "# NCU Profiling Analysis",
@@ -542,7 +849,7 @@ def _generate_ncu_api_text_output(filename: str, result: dict) -> str:
542
849
  f"Report ID: {result.get('report_id', 'N/A')}",
543
850
  "",
544
851
  "## GPU Information",
545
- f"- Device: {result.get('gpu', 'Unknown')}",
852
+ f"- Device: {gpu_name}",
546
853
  "",
547
854
  "## Kernel Summary",
548
855
  "",
@@ -556,8 +863,23 @@ def _generate_ncu_api_text_output(filename: str, result: dict) -> str:
556
863
  f"- Achieved Occupancy: {kernel.get('achieved_occupancy_pct', kernel.get('occupancy', 0)):.1f}%",
557
864
  f"- Compute Throughput: {kernel.get('compute_throughput_pct', kernel.get('sm_throughput', 0)):.1f}%",
558
865
  f"- Memory Throughput: {kernel.get('memory_throughput_pct', kernel.get('mem_throughput', 0)):.1f}%",
866
+ f"- Grid Size: {kernel.get('grid_size', 0)}",
867
+ f"- Block Size: {kernel.get('block_size', 0)}",
559
868
  "",
560
869
  ])
870
+
871
+ # Add actionable diagnosis (normalize field names from API)
872
+ normalized_kernel = {
873
+ 'grid_size': kernel.get('grid_size', 0),
874
+ 'block_size': kernel.get('block_size', 0),
875
+ 'achieved_occupancy_pct': kernel.get('achieved_occupancy_pct', kernel.get('occupancy', 0)),
876
+ 'compute_throughput_pct': kernel.get('compute_throughput_pct', kernel.get('sm_throughput', 0)),
877
+ 'memory_throughput_pct': kernel.get('memory_throughput_pct', kernel.get('mem_throughput', 0)),
878
+ 'registers_per_thread': kernel.get('registers_per_thread', 0),
879
+ }
880
+ diagnosis = _generate_diagnosis(normalized_kernel, num_sms=num_sms)
881
+ if diagnosis:
882
+ lines.extend(diagnosis)
561
883
 
562
884
  # Add source correlation summary if present
563
885
  source_data = result.get("source_correlation", [])
wafer/targets.py CHANGED
@@ -4,7 +4,7 @@ CRUD operations for GPU targets stored in ~/.wafer/targets/.
4
4
  """
5
5
 
6
6
  import tomllib
7
- from dataclasses import asdict
7
+ from dataclasses import asdict, fields
8
8
  from pathlib import Path
9
9
  from typing import Any
10
10
 
@@ -18,6 +18,12 @@ from wafer_core.utils.kernel_utils.targets.config import (
18
18
  WorkspaceTarget,
19
19
  )
20
20
 
21
+
22
+ def _filter_dataclass_fields(data: dict[str, Any], dataclass_type: type) -> dict[str, Any]:
23
+ """Filter dict to only include fields that exist in the dataclass."""
24
+ valid_fields = {f.name for f in fields(dataclass_type)}
25
+ return {k: v for k, v in data.items() if k in valid_fields}
26
+
21
27
  # Default paths
22
28
  WAFER_DIR = Path.home() / ".wafer"
23
29
  TARGETS_DIR = WAFER_DIR / "targets"
@@ -64,17 +70,17 @@ def _parse_target(data: dict[str, Any]) -> TargetConfig:
64
70
  data_copy["gpu_ids"] = tuple(data_copy["gpu_ids"])
65
71
 
66
72
  if target_type == "baremetal":
67
- return BaremetalTarget(**data_copy)
73
+ return BaremetalTarget(**_filter_dataclass_fields(data_copy, BaremetalTarget))
68
74
  elif target_type == "vm":
69
- return VMTarget(**data_copy)
75
+ return VMTarget(**_filter_dataclass_fields(data_copy, VMTarget))
70
76
  elif target_type == "modal":
71
- return ModalTarget(**data_copy)
77
+ return ModalTarget(**_filter_dataclass_fields(data_copy, ModalTarget))
72
78
  elif target_type == "workspace":
73
- return WorkspaceTarget(**data_copy)
79
+ return WorkspaceTarget(**_filter_dataclass_fields(data_copy, WorkspaceTarget))
74
80
  elif target_type == "runpod":
75
- return RunPodTarget(**data_copy)
81
+ return RunPodTarget(**_filter_dataclass_fields(data_copy, RunPodTarget))
76
82
  elif target_type == "digitalocean":
77
- return DigitalOceanTarget(**data_copy)
83
+ return DigitalOceanTarget(**_filter_dataclass_fields(data_copy, DigitalOceanTarget))
78
84
  else:
79
85
  raise ValueError(
80
86
  f"Unknown target type: {target_type}. Must be baremetal, vm, modal, workspace, runpod, or digitalocean"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wafer-cli
3
- Version: 0.2.36
3
+ Version: 0.2.37
4
4
  Summary: CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels
5
5
  Requires-Python: >=3.11
6
6
  Description-Content-Type: text/markdown
@@ -16,7 +16,7 @@ wafer/global_config.py,sha256=fhaR_RU3ufMksDmOohH1OLeQ0JT0SDW1hEip_zaP75k,11345
16
16
  wafer/gpu_run.py,sha256=TwqXy72T7f2I7e6n5WWod3xgxCPnDhU0BgLsB4CUoQY,9716
17
17
  wafer/inference.py,sha256=tZCO5i05FKY27ewis3CSBHFBeFbXY3xwj0DSjdoMY9s,4314
18
18
  wafer/kernel_scope.py,sha256=YtnxknAChkJoeU_vIdxiqWsAITGBeabp9OGIK-X32i0,20796
19
- wafer/ncu_analyze.py,sha256=rAWzKQRZEY6E_CL3gAWUaW3uZ4kvQVZskVCPDpsFJuE,24633
19
+ wafer/ncu_analyze.py,sha256=8id2eJRuBabxINnUF0M6SQtS1YbAWBM3pzIN8xkxMCE,37139
20
20
  wafer/nsys_analyze.py,sha256=AhNcjPaapB0QCbqiHRXvyy-ccjevvVwEyxes84D28JU,36124
21
21
  wafer/nsys_profile.py,sha256=QFBl8pkr8r4uRNdNUO9gY-obj9slqpOgVYFZ_sXu6Nw,15478
22
22
  wafer/output.py,sha256=8jw5ifvIMK8ldyBMGW4NhrKvJPl66TV2Y2fJ5Tlhh1I,8293
@@ -27,7 +27,7 @@ wafer/rocprof_systems.py,sha256=4IWbMcbYk1x_8iS7P3FC_u5sgH6EXADCtR2lV9id80M,1862
27
27
  wafer/specs_cli.py,sha256=frMEKwMflxVNpFlAuxprmr33ZZ1Oeh2lB0KWZ4oZWzw,4360
28
28
  wafer/ssh_keys.py,sha256=9kSdhV_dg9T6pQu2JmNQptarkkwGtN9rLyRkI1bW4i4,8094
29
29
  wafer/target_lock.py,sha256=SDKhNzv2N7gsphGflcNni9FE5YYuAMuEthngAJEo4Gs,7809
30
- wafer/targets.py,sha256=9r-iRWoKSH5cQl1LcamaX-T7cNVOg99ngIm_hlRk-qU,26922
30
+ wafer/targets.py,sha256=XeEZeOykNBnjJLnCqpoXAnzeqbp6MWZRIW9A26BKqdU,27469
31
31
  wafer/targets_cli.py,sha256=Oe3e02rSXeNrMbe_Qv9DNfQ8dEOKodtU7BbQQWxlNwA,16348
32
32
  wafer/targets_ops.py,sha256=jN1oIBx0mutxRNE9xpIc7SaBxPkVmOyus2eqn0kEKNI,21475
33
33
  wafer/trace_compare.py,sha256=IBVSGI8u5A10haDzL4eQ0R24fM1G_dd1F3-4iEkG1EQ,6349
@@ -43,8 +43,8 @@ wafer/templates/optimize_kernelbench.py,sha256=aoOA13zWEl89r6QW03xF9NKxQ7j4mWe9r
43
43
  wafer/templates/optimize_vllm.py,sha256=_D1rDP9wHA8CCvmoUrdLEW94MiaK4nAYJ-jbnpAvq7A,6154
44
44
  wafer/templates/trace_analyze.py,sha256=B7CiRlsokERzBjLL-k49kGjpU2zlJZqzTE05xbRS1WI,2878
45
45
  wafer/tests/test_eval_cli_parity.py,sha256=SGmaj2NGBZ7GdDF53bXsECvQbV21iHZw8YeL_MJOLk0,7206
46
- wafer_cli-0.2.36.dist-info/METADATA,sha256=POYE0Ub7A0rETiPPssuxv13NIqe_1yHNsAO0ddg_bxk,6461
47
- wafer_cli-0.2.36.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
48
- wafer_cli-0.2.36.dist-info/entry_points.txt,sha256=WqB7hB__WhtPY8y1cO2sZiUz7fCq6Ik-usAigpeFvWE,41
49
- wafer_cli-0.2.36.dist-info/top_level.txt,sha256=2MK1IVMWfpLL8BZCQ3E9aG6L6L666gSA_teYlwan4fs,6
50
- wafer_cli-0.2.36.dist-info/RECORD,,
46
+ wafer_cli-0.2.37.dist-info/METADATA,sha256=LOnnD6sSASC_Tf0qFMa5hBBUR6qJiMCkZysy2y4NdZw,6461
47
+ wafer_cli-0.2.37.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
48
+ wafer_cli-0.2.37.dist-info/entry_points.txt,sha256=WqB7hB__WhtPY8y1cO2sZiUz7fCq6Ik-usAigpeFvWE,41
49
+ wafer_cli-0.2.37.dist-info/top_level.txt,sha256=2MK1IVMWfpLL8BZCQ3E9aG6L6L666gSA_teYlwan4fs,6
50
+ wafer_cli-0.2.37.dist-info/RECORD,,