wafer-core 0.1.24__py3-none-any.whl → 0.1.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,951 @@
+ """Report formatting for trace comparison results.
+ 
+ Provides text, CSV, and JSON output formatters for comparison and fusion analysis.
+ """
+ 
+ import json
+ from typing import Any
+ 
+ 
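For context, the formatters in this module all consume the same results dictionary produced by the comparison step and differ only in output encoding. A minimal dispatch sketch, assuming the results dict comes from an upstream analyze_traces()-style call (the producer and the flag name are illustrative, not part of the package):

    # Illustrative wiring only; `render` and `output_format` are assumed names.
    def render(results: dict, output_format: str = "text") -> str:
        if output_format == "csv":
            return format_csv(results)
        if output_format == "json":
            return format_json(results)
        return format_text(results, show_layers=True)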
+ def format_text(results: dict[str, Any], show_layers: bool = False, show_all: bool = False, show_stack_traces: bool = False) -> str:
+     """Format comparison results as human-readable text report.
+ 
+     Args:
+         results: Analysis results from analyze_traces()
+         show_layers: Whether to include layer-wise breakdown
+         show_all: Whether to show all items without truncation
+         show_stack_traces: Whether to show Python stack traces
+ 
+     Returns:
+         Formatted text report
+     """
+     lines = []
+     meta = results["metadata"]
+ 
+     lines.append("=" * 80)
+     lines.append("VLLM TRACE COMPARISON REPORT")
+     if "phase" in meta and meta["phase"] != "all":
+         lines.append(f"Phase: {meta['phase'].upper()}")
+     lines.append("=" * 80)
+     lines.append("")
+ 
+     # Determine which trace is AMD and which is NVIDIA
+     is_trace1_amd = meta['trace1_platform'] == 'AMD'
+     if is_trace1_amd:
+         amd_gpu, nvidia_gpu = meta['trace1_gpu'], meta['trace2_gpu']
+         amd_kernels, nvidia_kernels = meta['trace1_kernels'], meta['trace2_kernels']
+         amd_total_ms, nvidia_total_ms = meta['trace1_total_ms'], meta['trace2_total_ms']
+     else:
+         amd_gpu, nvidia_gpu = meta['trace2_gpu'], meta['trace1_gpu']
+         amd_kernels, nvidia_kernels = meta['trace2_kernels'], meta['trace1_kernels']
+         amd_total_ms, nvidia_total_ms = meta['trace2_total_ms'], meta['trace1_total_ms']
+ 
+     # Get device properties
+     amd_dev = meta['trace1_device'] if is_trace1_amd else meta['trace2_device']
+     nvidia_dev = meta['trace2_device'] if is_trace1_amd else meta['trace1_device']
+ 
+     lines.append(f"AMD GPU: {amd_gpu}")
+     lines.append(f" Compute: {amd_dev['compute_capability']}")
+     lines.append(f" Memory: {amd_dev['total_memory_gb']:.1f} GB")
+     lines.append(f" SMs: {amd_dev['sm_count']}")
+     lines.append(f" Warp Size: {amd_dev['warp_size']}")
+     lines.append("")
+     lines.append(f"NVIDIA GPU: {nvidia_gpu}")
+     lines.append(f" Compute: {nvidia_dev['compute_capability']}")
+     lines.append(f" Memory: {nvidia_dev['total_memory_gb']:.1f} GB")
+     lines.append(f" SMs: {nvidia_dev['sm_count']}")
+     lines.append(f" Warp Size: {nvidia_dev['warp_size']}")
+     lines.append("")
+     lines.append(f"AMD Kernels: {amd_kernels:,}")
+     lines.append(f"NVIDIA Kernels: {nvidia_kernels:,}")
+     lines.append(f"AMD Total: {amd_total_ms:.1f} ms")
+     lines.append(f"NVIDIA Total: {nvidia_total_ms:.1f} ms")
+ 
+     # Handle division by zero for ratio
+     if nvidia_total_ms > 0:
+         ratio_str = f"{amd_total_ms / nvidia_total_ms:.2f}x"
+     elif amd_total_ms > 0:
+         ratio_str = "∞ (NVIDIA has no data)"
+     else:
+         ratio_str = "N/A (both traces empty)"
+ 
+     lines.append(f"Ratio: {ratio_str}")
+     lines.append("")
+ 
+     # Convert operations from trace1/trace2 keys to amd/nvidia keys for easier formatting
+     ops = results["operations"]
+     for op in ops:
+         if is_trace1_amd:
+             op['amd_count'] = op['trace1_count']
+             op['nvidia_count'] = op['trace2_count']
+             op['amd_avg_us'] = op['trace1_avg_us']
+             op['nvidia_avg_us'] = op['trace2_avg_us']
+             op['amd_total_ms'] = op['trace1_total_ms']
+             op['nvidia_total_ms'] = op['trace2_total_ms']
+             op['amd_cpu_op'] = op.get('trace1_cpu_op')
+             op['nvidia_cpu_op'] = op.get('trace2_cpu_op')
+             op['amd_pattern'] = op.get('trace1_pattern')
+             op['nvidia_pattern'] = op.get('trace2_pattern')
+             op['amd_kernels'] = op.get('trace1_kernels', [])
+             op['nvidia_kernels'] = op.get('trace2_kernels', [])
+         else:
+             op['amd_count'] = op['trace2_count']
+             op['nvidia_count'] = op['trace1_count']
+             op['amd_avg_us'] = op['trace2_avg_us']
+             op['nvidia_avg_us'] = op['trace1_avg_us']
+             op['amd_total_ms'] = op['trace2_total_ms']
+             op['nvidia_total_ms'] = op['trace1_total_ms']
+             op['amd_cpu_op'] = op.get('trace2_cpu_op')
+             op['nvidia_cpu_op'] = op.get('trace1_cpu_op')
+             op['amd_pattern'] = op.get('trace2_pattern')
+             op['nvidia_pattern'] = op.get('trace1_pattern')
+             op['amd_kernels'] = op.get('trace2_kernels', [])
+             op['nvidia_kernels'] = op.get('trace1_kernels', [])
+ 
+     # Convert layers from trace1/trace2 keys to amd/nvidia keys
+     layers = results.get("layers", [])
+     for layer in layers:
+         if is_trace1_amd:
+             layer['amd_kernels'] = layer['trace1_kernels']
+             layer['nvidia_kernels'] = layer['trace2_kernels']
+             layer['amd_total_ms'] = layer['trace1_total_ms']
+             layer['nvidia_total_ms'] = layer['trace2_total_ms']
+         else:
+             layer['amd_kernels'] = layer['trace2_kernels']
+             layer['nvidia_kernels'] = layer['trace1_kernels']
+             layer['amd_total_ms'] = layer['trace2_total_ms']
+             layer['nvidia_total_ms'] = layer['trace1_total_ms']
+ 
+     # Update metadata layer counts
+     if is_trace1_amd:
+         meta['amd_layers'] = meta.get('trace1_layers', 0)
+         meta['nvidia_layers'] = meta.get('trace2_layers', 0)
+     else:
+         meta['amd_layers'] = meta.get('trace2_layers', 0)
+         meta['nvidia_layers'] = meta.get('trace1_layers', 0)
+ 
+     # Summary stats
+     slower = [o for o in ops if o["status"] == "slower"]
+     faster = [o for o in ops if o["status"] == "faster"]
+     similar = [o for o in ops if o["status"] == "similar"]
+ 
+     lines.append("SUMMARY")
+     lines.append("-" * 80)
+     lines.append(f"Operations where AMD is slower: {len(slower)}")
+     lines.append(f"Operations where AMD is faster: {len(faster)}")
+     lines.append(f"Operations with similar perf: {len(similar)}")
+     lines.append("")
+ 
+     # AMD Slower
+     if slower:
+         slower_to_show = slower if show_all else slower[:10]
+         lines.append(f"🔴 AMD SLOWER THAN NVIDIA (Optimization Targets) - Showing {len(slower_to_show)}/{len(slower)}")
+         lines.append("=" * 140)
+         lines.append(
+             f"{'Operation':<22} {'AMD Count':>11} {'NV Count':>10} {'AMD Avg':>10} "
+             f"{'NV Avg':>10} {'Ratio':>8} {'AMD Total':>11} {'NV Total':>11} {'AMD Slower By':>14}"
+         )
+         lines.append("-" * 140)
+ 
+         for op in slower_to_show:
+             diff_abs = abs(op["gap_ms"])
+             lines.append(
+                 f"{op['operation']:<22} "
+                 f"{op['amd_count']:>11,} "
+                 f"{op['nvidia_count']:>10,} "
+                 f"{op['amd_avg_us']:>8.1f}µs "
+                 f"{op['nvidia_avg_us']:>8.1f}µs "
+                 f"{op['ratio']:>7.2f}x "
+                 f"{op['amd_total_ms']:>9.1f}ms "
+                 f"{op['nvidia_total_ms']:>9.1f}ms "
+                 f"{diff_abs:>13.1f}ms"
+             )
+ 
+         lines.append("")
+ 
+     # AMD Faster
+     if faster:
+         faster_to_show = faster if show_all else faster[:10]
+         lines.append(f"🟢 AMD FASTER THAN NVIDIA (Wins) - Showing {len(faster_to_show)}/{len(faster)}")
+         lines.append("=" * 140)
+         lines.append(
+             f"{'Operation':<22} {'AMD Count':>11} {'NV Count':>10} {'AMD Avg':>10} "
+             f"{'NV Avg':>10} {'Ratio':>8} {'AMD Total':>11} {'NV Total':>11} {'AMD Faster By':>14}"
+         )
+         lines.append("-" * 140)
+ 
+         for op in faster_to_show:
+             diff_abs = abs(op["gap_ms"])
+             lines.append(
+                 f"{op['operation']:<22} "
+                 f"{op['amd_count']:>11,} "
+                 f"{op['nvidia_count']:>10,} "
+                 f"{op['amd_avg_us']:>8.1f}µs "
+                 f"{op['nvidia_avg_us']:>8.1f}µs "
+                 f"{op['ratio']:>7.2f}x "
+                 f"{op['amd_total_ms']:>9.1f}ms "
+                 f"{op['nvidia_total_ms']:>9.1f}ms "
+                 f"{diff_abs:>13.1f}ms"
+             )
+ 
+         lines.append("")
+ 
+     # Similar Performance
+     if similar:
+         lines.append("⚪ SIMILAR PERFORMANCE (Within 10% difference)")
+         lines.append("=" * 140)
+         lines.append(
+             f"{'Operation':<22} {'AMD Count':>11} {'NV Count':>10} {'AMD Avg':>10} "
+             f"{'NV Avg':>10} {'Ratio':>8} {'AMD Total':>11} {'NV Total':>11} {'Winner':>14}"
+         )
+         lines.append("-" * 140)
+ 
+         for op in similar:
+             if op["gap_ms"] < 0:
+                 winner = f"AMD by {abs(op['gap_ms']):.1f}ms"
+             elif op["gap_ms"] > 0:
+                 winner = f"NV by {op['gap_ms']:.1f}ms"
+             else:
+                 winner = "Tie"
+             lines.append(
+                 f"{op['operation']:<22} "
+                 f"{op['amd_count']:>11,} "
+                 f"{op['nvidia_count']:>10,} "
+                 f"{op['amd_avg_us']:>8.1f}µs "
+                 f"{op['nvidia_avg_us']:>8.1f}µs "
+                 f"{op['ratio']:>7.2f}x "
+                 f"{op['amd_total_ms']:>9.1f}ms "
+                 f"{op['nvidia_total_ms']:>9.1f}ms "
+                 f"{winner:>14}"
+             )
+ 
+         lines.append("")
+ 
+     # CPU operator mapping with stack trace info
+     if not show_stack_traces:
+         cpu_ops_to_show = ops if show_all else ops[:15]
+         lines.append(f"CPU OPERATOR MAPPING - Showing {len(cpu_ops_to_show)}/{len(ops)}")
+         lines.append("=" * 80)
+         lines.append("Shows the most common PyTorch/vLLM call path for each GPU operation.")
+         lines.append("Use --stack-traces to see full call stacks and all variants.")
+         lines.append("")
+         lines.append(f"{'Operation':<25} {'CPU Operator (most common)':<45} {'Variants':<10}")
+         lines.append("-" * 80)
+ 
+         # Track if any operations have multiple stack traces
+         has_multiple_stacks = False
+         for op in cpu_ops_to_show:
+             cpu_op = op.get("amd_cpu_op") or op.get("nvidia_cpu_op", "N/A")
+             if cpu_op and cpu_op != "N/A":
+                 # Shorten long operator names
+                 if len(cpu_op) > 43:
+                     cpu_op = cpu_op[:40] + "..."
+ 
+                 # Count unique stack traces across both traces
+                 amd_stacks = op.get("trace1_python_stacks", []) if is_trace1_amd else op.get("trace2_python_stacks", [])
+                 nv_stacks = op.get("trace2_python_stacks", []) if is_trace1_amd else op.get("trace1_python_stacks", [])
+                 total_stacks = len(amd_stacks) + len(nv_stacks)
+ 
+                 stack_info = ""
+                 if total_stacks > 1:
+                     stack_info = f"{total_stacks} paths"
+                     has_multiple_stacks = True
+                 elif total_stacks == 1:
+                     stack_info = "1 path"
+                 else:
+                     stack_info = "-"
+ 
+                 lines.append(f"{op['operation']:<25} {cpu_op:<45} {stack_info:<10}")
+ 
+         lines.append("")
+         if has_multiple_stacks:
+             lines.append("⚠️ Multiple call paths detected. Use --stack-traces to see all variants.")
+             lines.append("")
+ 
+     # Kernel-level details for top 3 operations with biggest gaps
+     non_similar_ops = [op for op in ops if op["status"] != "similar"]
+     top_ops = non_similar_ops if show_all else non_similar_ops[:3]
+ 
+     lines.append("KERNEL-LEVEL DETAILS (Top Individual Kernels)")
+     lines.append("=" * 80)
+     lines.append("")
+     if show_all:
+         lines.append(f"Showing all kernels for {len(top_ops)} operations with performance gaps:")
+     else:
+         lines.append("Showing top 10 kernels for the 3 operations with largest performance gaps:")
+     lines.append("")
+ 
+     for op in top_ops:
+         lines.append(f"Operation: {op['operation']}")
+         lines.append("-" * 80)
+ 
+         # AMD kernels
+         all_amd_kernels = op.get("amd_kernels", [])
+         amd_kernels = all_amd_kernels if show_all else all_amd_kernels[:10]
+         if amd_kernels:
+             kernel_label = f"All {len(amd_kernels)}" if show_all else "Top 10"
+             lines.append(f"\n AMD {kernel_label} Kernels (Total: {op['amd_count']} invocations):")
+             lines.append(f" {'Kernel Name':<50} {'Total (µs)':>12} {'Count':>8} {'Avg (µs)':>10}")
+             lines.append(" " + "-" * 80)
+             for k in amd_kernels:
+                 name = k["name"][:47] + "..." if len(k["name"]) > 50 else k["name"]
+                 lines.append(
+                     f" {name:<50} {k['total_us']:>12.0f} {k['count']:>8,} {k['avg_us']:>10.1f}"
+                 )
+ 
+         # NVIDIA kernels
+         all_nv_kernels = op.get("nvidia_kernels", [])
+         nv_kernels = all_nv_kernels if show_all else all_nv_kernels[:10]
+         if nv_kernels:
+             kernel_label = f"All {len(nv_kernels)}" if show_all else "Top 10"
+             lines.append(f"\n NVIDIA {kernel_label} Kernels (Total: {op['nvidia_count']} invocations):")
+             lines.append(f" {'Kernel Name':<50} {'Total (µs)':>12} {'Count':>8} {'Avg (µs)':>10}")
+             lines.append(" " + "-" * 80)
+             for k in nv_kernels:
+                 name = k["name"][:47] + "..." if len(k["name"]) > 50 else k["name"]
+                 lines.append(
+                     f" {name:<50} {k['total_us']:>12.0f} {k['count']:>8,} {k['avg_us']:>10.1f}"
+                 )
+ 
+         lines.append("")
+ 
+     lines.append("=" * 80)
+ 
+     # Python stack traces if requested
+     if show_stack_traces:
+         stack_trace_report = _format_stack_trace_report(results, show_all=show_all)
+         lines.append("")
+         lines.extend(stack_trace_report)
+ 
+     # Layer-wise report if requested
+     if show_layers:
+         layer_report = _format_layer_report(results, show_all=show_all)
+         lines.append("")
+         lines.extend(layer_report)
+ 
+     return "\n".join(lines)
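The trace1/trace2 to AMD/NVIDIA normalization above mutates the op and layer dicts in place. A non-mutating helper along these lines would express the same key mapping (a sketch only, not part of the module; the helper name and field tuple are assumed):

    # Sketch: derive amd_*/nvidia_* keys without mutating the input record.
    def _normalize_keys(record: dict, is_trace1_amd: bool, fields: tuple[str, ...]) -> dict:
        amd_src, nv_src = ("trace1", "trace2") if is_trace1_amd else ("trace2", "trace1")
        out = dict(record)
        for field in fields:
            out[f"amd_{field}"] = record.get(f"{amd_src}_{field}")
            out[f"nvidia_{field}"] = record.get(f"{nv_src}_{field}")
        return out

    # e.g. _normalize_keys(op, is_trace1_amd, ("count", "avg_us", "total_ms"))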
+ 
+ 
+ def _format_layer_report(results: dict[str, Any], show_all: bool = False) -> list[str]:
+     """Format layer-wise performance breakdown.
+ 
+     Args:
+         results: Analysis results
+         show_all: Whether to show all layers without truncation
+ 
+     Returns:
+         List of formatted text lines
+     """
+     layers = results.get("layers", [])
+     if not layers:
+         return []
+ 
+     lines = []
+     meta = results["metadata"]
+ 
+     lines.append("=" * 80)
+     lines.append("LAYER-WISE PERFORMANCE BREAKDOWN")
+     lines.append("=" * 80)
+     lines.append("")
+     lines.append("NOTE: Layers are identified by correlation IDs in the execution graph.")
+     lines.append("Each layer represents one transformer block (Norm + Attention + FFN).")
+     lines.append("Layers may have similar timing if the workload is uniform across the model.")
+     lines.append("")
+ 
+     lines.append(f"Total Layers Detected: {meta.get('amd_layers', 0)} (AMD), {meta.get('nvidia_layers', 0)} (NVIDIA)")
+     lines.append("")
+ 
+     # Separate layers into comparable and trace-unique
+     comparable_layers = [layer for layer in layers if layer.get("in_both", True)]
+     if meta['trace1_platform'] == 'AMD':
+         amd_only_layers = [layer for layer in layers if layer["status"] == "trace1_only"]
+         nvidia_only_layers = [layer for layer in layers if layer["status"] == "trace2_only"]
+     else:
+         # Handle case where trace2 is AMD
+         amd_only_layers = [layer for layer in layers if layer["status"] == "trace2_only"]
+         nvidia_only_layers = [layer for layer in layers if layer["status"] == "trace1_only"]
+ 
+     slower_layers = [layer for layer in comparable_layers if layer["status"] == "slower"]
+     faster_layers = [layer for layer in comparable_layers if layer["status"] == "faster"]
+     similar_layers = [layer for layer in comparable_layers if layer["status"] == "similar"]
+ 
+     lines.append(f"Layers in both traces: {len(comparable_layers)}")
+     lines.append(f" - AMD is slower: {len(slower_layers)}")
+     lines.append(f" - AMD is faster: {len(faster_layers)}")
+     lines.append(f" - Similar performance: {len(similar_layers)}")
+     if amd_only_layers or nvidia_only_layers:
+         lines.append(f"Layers only in AMD trace: {len(amd_only_layers)}")
+         lines.append(f"Layers only in NVIDIA trace: {len(nvidia_only_layers)}")
+     lines.append("")
+ 
+     # Show top 20 slowest layers for AMD (or all if show_all)
+     if slower_layers:
+         slower_to_show = slower_layers if show_all else slower_layers[:20]
+         label = f"ALL {len(slower_to_show)}" if show_all else "TOP 20"
+         lines.append(f"🔴 {label} LAYERS WHERE AMD IS SLOWER")
+         lines.append("=" * 100)
+         lines.append(
+             f"{'Layer':>6} {'AMD Kernels':>13} {'NV Kernels':>12} "
+             f"{'AMD Time':>12} {'NV Time':>11} {'Ratio':>8} {'AMD Slower By':>14}"
+         )
+         lines.append("-" * 100)
+ 
+         for layer in slower_to_show:
+             lines.append(
+                 f"{layer['layer']:>6} "
+                 f"{layer['amd_kernels']:>13,} "
+                 f"{layer['nvidia_kernels']:>12,} "
+                 f"{layer['amd_total_ms']:>10.2f}ms "
+                 f"{layer['nvidia_total_ms']:>9.2f}ms "
+                 f"{layer['ratio']:>7.2f}x "
+                 f"{abs(layer['gap_ms']):>13.2f}ms"
+             )
+ 
+         lines.append("")
+ 
+     # Show top 10 fastest layers for AMD (or all if show_all)
+     if faster_layers:
+         faster_to_show = faster_layers if show_all else faster_layers[:10]
+         label = f"ALL {len(faster_to_show)}" if show_all else "TOP 10"
+         lines.append(f"🟢 {label} LAYERS WHERE AMD IS FASTER")
+         lines.append("=" * 100)
+         lines.append(
+             f"{'Layer':>6} {'AMD Kernels':>13} {'NV Kernels':>12} "
+             f"{'AMD Time':>12} {'NV Time':>11} {'Ratio':>8} {'AMD Faster By':>14}"
+         )
+         lines.append("-" * 100)
+ 
+         for layer in faster_to_show:
+             lines.append(
+                 f"{layer['layer']:>6} "
+                 f"{layer['amd_kernels']:>13,} "
+                 f"{layer['nvidia_kernels']:>12,} "
+                 f"{layer['amd_total_ms']:>10.2f}ms "
+                 f"{layer['nvidia_total_ms']:>9.2f}ms "
+                 f"{layer['ratio']:>7.2f}x "
+                 f"{abs(layer['gap_ms']):>13.2f}ms"
+             )
+ 
+         lines.append("")
+ 
+     # Show AMD-only layers (simplified display)
+     if amd_only_layers:
+         amd_to_show = amd_only_layers if show_all else amd_only_layers[:20]
+         label = f"ALL {len(amd_to_show)}" if show_all else f"{len(amd_to_show)}/{len(amd_only_layers)}"
+         lines.append(f"📊 LAYERS ONLY IN AMD TRACE ({label})")
+         lines.append("=" * 60)
+         lines.append(f"{'Layer':>6} {'Kernels':>10} {'Time':>12}")
+         lines.append("-" * 60)
+ 
+         for layer in amd_to_show:
+             lines.append(
+                 f"{layer['layer']:>6} "
+                 f"{layer['amd_kernels']:>10,} "
+                 f"{layer['amd_total_ms']:>10.2f}ms"
+             )
+ 
+         if not show_all and len(amd_only_layers) > 20:
+             lines.append(f"\n... and {len(amd_only_layers) - 20} more AMD-only layers")
+ 
+         lines.append("")
+ 
+     # Show NVIDIA-only layers (simplified display)
+     if nvidia_only_layers:
+         nvidia_to_show = nvidia_only_layers if show_all else nvidia_only_layers[:20]
+         label = f"ALL {len(nvidia_to_show)}" if show_all else f"{len(nvidia_to_show)}/{len(nvidia_only_layers)}"
+         lines.append(f"📊 LAYERS ONLY IN NVIDIA TRACE ({label})")
+         lines.append("=" * 60)
+         lines.append(f"{'Layer':>6} {'Kernels':>10} {'Time':>12}")
+         lines.append("-" * 60)
+ 
+         for layer in nvidia_to_show:
+             lines.append(
+                 f"{layer['layer']:>6} "
+                 f"{layer['nvidia_kernels']:>10,} "
+                 f"{layer['nvidia_total_ms']:>10.2f}ms"
+             )
+ 
+         if not show_all and len(nvidia_only_layers) > 20:
+             lines.append(f"\n... and {len(nvidia_only_layers) - 20} more NVIDIA-only layers")
+ 
+         lines.append("")
+ 
+     return lines
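For reference, the layer records this function reads look roughly like the following; the shape is inferred from the key accesses above and the values are made up:

    example_layer = {
        "layer": 12,
        "trace1_kernels": 48,
        "trace2_kernels": 45,
        "trace1_total_ms": 1.84,
        "trace2_total_ms": 1.52,
        "ratio": 1.21,
        "gap_ms": 0.32,
        "status": "slower",   # or "faster", "similar", "trace1_only", "trace2_only"
        "in_both": True,
    }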
+ 
+ 
+ def _format_stack_trace_report(results: dict[str, Any], show_all: bool = False) -> list[str]:
+     """Format Python stack traces for operations.
+ 
+     Args:
+         results: Analysis results
+         show_all: Whether to show all stack traces without truncation
+ 
+     Returns:
+         List of formatted text lines
+     """
+     lines = []
+     ops = results["operations"]
+     meta = results["metadata"]
+     is_trace1_amd = meta['trace1_platform'] == 'AMD'
+ 
+     lines.append("=" * 80)
+     lines.append("PYTHON STACK TRACES & CPU OPERATOR MAPPING")
+     lines.append("=" * 80)
+     lines.append("")
+     lines.append("Full call stacks showing where GPU operations are invoked from PyTorch/vLLM.")
+     lines.append("")
+ 
+     # Show stack traces for top operations by impact (or all if show_all)
+     ops_with_stacks = [
+         op for op in ops
+         if (op.get("trace1_python_stacks") or op.get("trace2_python_stacks"))
+     ]
+ 
+     if not ops_with_stacks:
+         lines.append("No stack trace information available.")
+         return lines
+ 
+     ops_to_show = ops_with_stacks if show_all else ops_with_stacks[:10]
+     lines.append(f"Showing {len(ops_to_show)}/{len(ops_with_stacks)} operations")
+     lines.append("")
+ 
+     for op in ops_to_show:
+         lines.append(f"Operation: {op['operation']}")
+ 
+         # Show CPU operator info
+         amd_cpu = op.get("trace1_cpu_op" if is_trace1_amd else "trace2_cpu_op", "N/A")
+         nv_cpu = op.get("trace2_cpu_op" if is_trace1_amd else "trace1_cpu_op", "N/A")
+ 
+         if amd_cpu != "N/A" or nv_cpu != "N/A":
+             lines.append(" Most common CPU operator:")
+             if amd_cpu != "N/A":
+                 lines.append(f" AMD: {amd_cpu}")
+             if nv_cpu != "N/A":
+                 lines.append(f" NVIDIA: {nv_cpu}")
+ 
+         lines.append("-" * 80)
+ 
+         # AMD/Trace1 stacks
+         trace1_stacks = op.get("trace1_python_stacks", [])
+         if trace1_stacks:
+             stacks_to_show = trace1_stacks if show_all else trace1_stacks[:3]
+             label = "AMD" if is_trace1_amd else "NVIDIA"
+             lines.append(f" {label} Stack Traces ({len(stacks_to_show)}/{len(trace1_stacks)} shown):")
+             for i, stack in enumerate(stacks_to_show, 1):
+                 lines.append(f" Variant {i}:")
+                 for frame in stack:
+                     lines.append(f" {frame}")
+                 if i < len(stacks_to_show):
+                     lines.append("")
+ 
+         # NVIDIA/Trace2 stacks
+         trace2_stacks = op.get("trace2_python_stacks", [])
+         if trace2_stacks:
+             stacks_to_show = trace2_stacks if show_all else trace2_stacks[:3]
+             label = "NVIDIA" if is_trace1_amd else "AMD"
+             if trace1_stacks:
+                 lines.append("")
+             lines.append(f" {label} Stack Traces ({len(stacks_to_show)}/{len(trace2_stacks)} shown):")
+             for i, stack in enumerate(stacks_to_show, 1):
+                 lines.append(f" Variant {i}:")
+                 for frame in stack:
+                     lines.append(f" {frame}")
+                 if i < len(stacks_to_show):
+                     lines.append("")
+ 
+         lines.append("")
+ 
+     return lines
+ 
+ 
+ def format_csv(results: dict[str, Any], report_type: str = "operations") -> str:
+     """Format comparison results as CSV.
+ 
+     Args:
+         results: Analysis results
+         report_type: 'operations' or 'layers'
+ 
+     Returns:
+         CSV formatted string
+     """
+     lines = []
+     meta = results["metadata"]
+     is_trace1_amd = meta['trace1_platform'] == 'AMD'
+ 
+     if report_type == "layers":
+         lines.append("layer,amd_kernels,nvidia_kernels,amd_total_ms,nvidia_total_ms,ratio,gap_ms,status,in_both")
+         for layer in results.get("layers", []):
+             # Convert trace1/trace2 to amd/nvidia
+             if is_trace1_amd:
+                 amd_kernels = layer['trace1_kernels']
+                 nvidia_kernels = layer['trace2_kernels']
+                 amd_total_ms = layer['trace1_total_ms']
+                 nvidia_total_ms = layer['trace2_total_ms']
+             else:
+                 amd_kernels = layer['trace2_kernels']
+                 nvidia_kernels = layer['trace1_kernels']
+                 amd_total_ms = layer['trace2_total_ms']
+                 nvidia_total_ms = layer['trace1_total_ms']
+ 
+             lines.append(
+                 f"{layer['layer']},"
+                 f"{amd_kernels},"
+                 f"{nvidia_kernels},"
+                 f"{amd_total_ms:.2f},"
+                 f"{nvidia_total_ms:.2f},"
+                 f"{layer['ratio']:.3f},"
+                 f"{layer['gap_ms']:.2f},"
+                 f"{layer['status']},"
+                 f"{layer.get('in_both', True)}"
+             )
+     else:
+         lines.append(
+             "operation,amd_count,nvidia_count,amd_avg_us,nvidia_avg_us,amd_total_ms,"
+             "nvidia_total_ms,ratio,gap_ms,status,amd_kernel,nvidia_kernel,amd_cpu_op,nvidia_cpu_op"
+         )
+         for op in results["operations"]:
+             # Convert trace1/trace2 to amd/nvidia
+             if is_trace1_amd:
+                 amd_count = op['trace1_count']
+                 nvidia_count = op['trace2_count']
+                 amd_avg_us = op['trace1_avg_us']
+                 nvidia_avg_us = op['trace2_avg_us']
+                 amd_total_ms = op['trace1_total_ms']
+                 nvidia_total_ms = op['trace2_total_ms']
+                 amd_kernel = op.get('trace1_kernel', '')
+                 nvidia_kernel = op.get('trace2_kernel', '')
+                 amd_cpu_op = op.get('trace1_cpu_op', '')
+                 nvidia_cpu_op = op.get('trace2_cpu_op', '')
+             else:
+                 amd_count = op['trace2_count']
+                 nvidia_count = op['trace1_count']
+                 amd_avg_us = op['trace2_avg_us']
+                 nvidia_avg_us = op['trace1_avg_us']
+                 amd_total_ms = op['trace2_total_ms']
+                 nvidia_total_ms = op['trace1_total_ms']
+                 amd_kernel = op.get('trace2_kernel', '')
+                 nvidia_kernel = op.get('trace1_kernel', '')
+                 amd_cpu_op = op.get('trace2_cpu_op', '')
+                 nvidia_cpu_op = op.get('trace1_cpu_op', '')
+ 
+             lines.append(
+                 f"{op['operation']},"
+                 f"{amd_count},"
+                 f"{nvidia_count},"
+                 f"{amd_avg_us:.2f},"
+                 f"{nvidia_avg_us:.2f},"
+                 f"{amd_total_ms:.2f},"
+                 f"{nvidia_total_ms:.2f},"
+                 f"{op['ratio']:.3f},"
+                 f"{op['gap_ms']:.2f},"
+                 f"{op['status']},"
+                 f"{amd_kernel},"
+                 f"{nvidia_kernel},"
+                 f"{amd_cpu_op},"
+                 f"{nvidia_cpu_op}"
+             )
+ 
+     return "\n".join(lines)
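Note that format_csv joins fields with bare commas and no quoting, so a kernel or CPU-operator name that itself contains a comma would corrupt its row. If that matters for a consumer, the standard csv module handles escaping; a small sketch of quote-safe row emission (helper name and sample values are illustrative):

    import csv
    import io

    # Sketch: quote-safe CSV row via the standard library.
    def _csv_row(values: list) -> str:
        buf = io.StringIO()
        csv.writer(buf).writerow(values)
        return buf.getvalue().rstrip("\r\n")

    # e.g. _csv_row(["rms_norm", 1280, 1280, 12.4, 9.8])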
+ 
+ 
+ def format_json(results: dict[str, Any]) -> str:
+     """Format comparison results as JSON.
+ 
+     Args:
+         results: Analysis results
+ 
+     Returns:
+         JSON formatted string
+     """
+     sanitized = _sanitize_for_json(results)
+     return json.dumps(sanitized, indent=2)
+ 
+ 
+ def format_fusion_text(results: dict[str, Any]) -> str:
+     """Format fusion analysis results as human-readable text.
+ 
+     Args:
+         results: Fusion analysis results
+ 
+     Returns:
+         Formatted text report
+     """
+     lines = []
+     meta = results["metadata"]
+ 
+     lines.append("=" * 80)
+     lines.append("FUSION ANALYSIS REPORT")
+     lines.append("=" * 80)
+     lines.append("")
+ 
+     # Use generic trace1/trace2 keys
+     lines.append(f"Trace 1 GPU: {meta['trace1_gpu']}")
+     lines.append(f"Trace 2 GPU: {meta['trace2_gpu']}")
+     lines.append(f"Trace 1 Kernels: {meta['trace1_total_kernels']:,}")
+     lines.append(f"Trace 2 Kernels: {meta['trace2_total_kernels']:,}")
+     lines.append("")
+ 
+     lines.append("Correlation Groups Analyzed:")
+     lines.append(f" Trace 1: {meta['trace1_correlation_groups']}")
+     lines.append(f" Trace 2: {meta['trace2_correlation_groups']}")
+     lines.append(f" Matched: {meta['matched_groups']}")
+     lines.append("")
+ 
+     # Convert global_counts from trace1/trace2 to amd/nvidia keys for display
+     # Note: fusion analyzer always uses AMD as trace1, NVIDIA as trace2
+     global_counts = results["global_counts"]
+     for ktype, counts in global_counts.items():
+         counts["amd_count"] = counts["trace1_count"]
+         counts["nv_count"] = counts["trace2_count"]
+ 
+     # Convert fusion opportunities from trace1/trace2 to amd/nvidia keys
+     for opp in results.get("fusion_opportunities", []):
+         opp["amd_total"] = opp["trace1_total"]
+         opp["nvidia_total"] = opp["trace2_total"]
+         opp["amd_avg_per_group"] = opp["trace1_avg_per_group"]
+         opp["nvidia_avg_per_group"] = opp["trace2_avg_per_group"]
+         opp["amd_time_ms"] = opp["trace1_time_ms"]
+         opp["nvidia_time_ms"] = opp["trace2_time_ms"]
+ 
+     # Global kernel type distribution
+     lines.append("GLOBAL KERNEL TYPE DISTRIBUTION")
+     lines.append("=" * 80)
+     lines.append(f"{'Kernel Type':<25} {'AMD Count':>12} {'NVIDIA Count':>15} {'Ratio':>12}")
+     lines.append("-" * 80)
+ 
+     sorted_types = sorted(
+         global_counts.items(),
+         key=lambda x: x[1]["amd_count"] + x[1]["nv_count"],
+         reverse=True,
+     )
+ 
+     for ktype, counts in sorted_types:
+         amd_c = counts["amd_count"]
+         nv_c = counts["nv_count"]
+         ratio = counts["ratio"]
+ 
+         # Mark significant differences
+         marker = ""
+         if ratio > 2.0:
+             marker = " ⚠️ AMD has more"
+         elif ratio < 0.5:
+             marker = " ⚠️ NVIDIA has more"
+         elif nv_c == 0 and amd_c > 20:
+             marker = " 🔥 AMD ONLY"
+         elif amd_c == 0 and nv_c > 20:
+             marker = " 🔥 NVIDIA ONLY"
+ 
+         ratio_str = f"{ratio:.2f}x" if ratio != float("inf") else "∞"
+         lines.append(f"{ktype:<25} {amd_c:>12,} {nv_c:>15,} {ratio_str:>12}{marker}")
+ 
+     lines.append("")
+ 
+     # Fusion opportunities
+     if results["fusion_opportunities"]:
+         lines.append("FUSION OPPORTUNITIES")
+         lines.append("=" * 80)
+         lines.append("")
+ 
+         amd_fuses = [opp for opp in results["fusion_opportunities"] if opp["fused_by"] == "AMD"]
+         nv_fuses = [opp for opp in results["fusion_opportunities"] if opp["fused_by"] == "NVIDIA"]
+ 
+         if nv_fuses:
+             lines.append("🔴 OPERATIONS AMD RUNS SEPARATELY (NVIDIA fuses them)")
+             lines.append("-" * 80)
+             lines.append("")
+ 
+             for i, opp in enumerate(nv_fuses, 1):
+                 lines.append(f"{i}. {opp['kernel_type']}")
+                 lines.append(" Kernel Launches:")
+                 lines.append(f" AMD: {opp['amd_total']:,} calls ({opp['amd_avg_per_group']:.1f} per group)")
+                 lines.append(f" NVIDIA: {opp['nvidia_total']:,} calls ({opp['nvidia_avg_per_group']:.1f} per group)")
+                 lines.append(f" Ratio: {opp['ratio']:.2f}x (AMD launches more)")
+                 lines.append(" Execution Time:")
+                 lines.append(f" AMD: {opp['amd_time_ms']:.2f} ms")
+                 lines.append(f" NVIDIA: {opp['nvidia_time_ms']:.2f} ms")
+                 time_marker = ""
+                 if opp["time_ratio"] > 1.2:
+                     time_marker = " (AMD slower ⚠️)"
+                 elif opp["time_ratio"] < 0.8:
+                     time_marker = " (AMD faster ✓)"
+                 else:
+                     time_marker = " (similar)"
+                 lines.append(f" Ratio: {opp['time_ratio']:.2f}x{time_marker}")
+                 lines.append(f" Impact: {opp['groups_affected']}/{opp['total_groups']} groups show this difference")
+ 
+                 # Provide interpretation
+                 if opp["nvidia_total"] == 0:
+                     lines.append(" → NVIDIA completely fuses this operation into another kernel")
+                 else:
+                     lines.append(f" → NVIDIA partially fuses, using {opp['ratio']:.1f}x fewer calls")
+ 
+                 lines.append("")
+ 
+         if amd_fuses:
+             lines.append("🟢 OPERATIONS NVIDIA RUNS SEPARATELY (AMD fuses them)")
+             lines.append("-" * 80)
+             lines.append("")
+ 
+             for i, opp in enumerate(amd_fuses, 1):
+                 lines.append(f"{i}. {opp['kernel_type']}")
+                 lines.append(" Kernel Launches:")
+                 lines.append(f" AMD: {opp['amd_total']:,} calls ({opp['amd_avg_per_group']:.1f} per group)")
+                 lines.append(f" NVIDIA: {opp['nvidia_total']:,} calls ({opp['nvidia_avg_per_group']:.1f} per group)")
+                 lines.append(f" Ratio: {opp['ratio']:.2f}x (NVIDIA launches more)")
+                 lines.append(" Execution Time:")
+                 lines.append(f" AMD: {opp['amd_time_ms']:.2f} ms")
+                 lines.append(f" NVIDIA: {opp['nvidia_time_ms']:.2f} ms")
+                 time_marker = ""
+                 if opp["time_ratio"] > 1.2:
+                     time_marker = " (AMD slower despite fusion ⚠️)"
+                 elif opp["time_ratio"] < 0.8:
+                     time_marker = " (AMD faster via fusion ✓)"
+                 else:
+                     time_marker = " (similar)"
+                 lines.append(f" Ratio: {opp['time_ratio']:.2f}x{time_marker}")
+                 lines.append(f" Impact: {opp['groups_affected']}/{opp['total_groups']} groups show this difference")
+                 lines.append("")
+     else:
+         lines.append("No significant fusion differences detected.")
+         lines.append("")
+ 
+     # Fusion mappings
+     fusion_mappings = results.get("fusion_mappings", [])
+     if fusion_mappings:
+         lines.append("")
+         lines.append("FUSION MAPPINGS")
+         lines.append("=" * 80)
+         lines.append("")
+ 
+         # Group by type
+         sequence_mappings = []
+         intra_type_mappings = []
+         partial_mappings = []
+ 
+         for mapping in fusion_mappings:
+             if len(mapping["unfused_sequence"]) == 2 and \
+                mapping["unfused_sequence"][0] == mapping["unfused_sequence"][1] and \
+                mapping["unfused_sequence"][0] == mapping["fused_kernel_type"]:
+                 intra_type_mappings.append(mapping)
+             elif len(mapping["unfused_sequence"]) == 1:
+                 partial_mappings.append(mapping)
+             else:
+                 sequence_mappings.append(mapping)
+ 
+         # Show sequence fusion
+         if sequence_mappings:
+             lines.append("🔗 SEQUENCE FUSION")
+             lines.append("-" * 80)
+             lines.append("")
+ 
+             # Group by evidence
+             from collections import defaultdict
+             grouped = defaultdict(list)
+             for m in sequence_mappings:
+                 grouped[m["evidence"]].append(m)
+ 
+             for evidence, group in list(grouped.items())[:5]:  # Show top 5 patterns
+                 lines.append(f"Pattern: {evidence}")
+                 lines.append(f" Occurrences: {len(group)} correlation groups")
+                 lines.append(f" Total calls: {sum(m['pattern_count'] for m in group):,}")
+                 lines.append(f" Confidence: {group[0]['pattern_confidence']*100:.0f}%")
+                 lines.append("")
+ 
+             if len(grouped) > 5:
+                 lines.append(f"... and {len(grouped) - 5} more sequence fusion patterns")
+                 lines.append("")
+ 
+         # Show intra-type fusion
+         if intra_type_mappings:
+             lines.append("⛓️ INTRA-TYPE FUSION (Chain Compression)")
+             lines.append("-" * 80)
+             lines.append("")
+ 
+             for mapping in intra_type_mappings:
+                 lines.append(f"Kernel: {mapping['fused_kernel_type']}")
+                 lines.append(f" {mapping['evidence']}")
+                 lines.append(f" Compression ratio: {mapping['pattern_count'] / max(mapping['fused_count'], 1):.1f}x")
+                 lines.append("")
+ 
+         # Show partial fusion
+         if partial_mappings:
+             lines.append("📊 PARTIAL FUSION")
+             lines.append("-" * 80)
+             lines.append("")
+ 
+             for mapping in partial_mappings[:5]:  # Show top 5
+                 lines.append(f"Kernel: {mapping['unfused_sequence'][0]}")
+                 lines.append(f" {mapping['evidence']}")
+                 lines.append("")
+ 
+     lines.append("=" * 80)
+ 
+     return "\n".join(lines)
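The fusion_opportunities entries consumed above (and by format_fusion_csv below) carry roughly these fields, judging from the key accesses; the values here are illustrative only:

    example_opportunity = {
        "kernel_type": "elementwise",
        "fused_by": "NVIDIA",           # which platform appears to fuse the work
        "trace1_total": 3200,           # surfaced as amd_total above
        "trace2_total": 800,
        "trace1_avg_per_group": 4.0,
        "trace2_avg_per_group": 1.0,
        "trace1_time_ms": 18.5,
        "trace2_time_ms": 12.1,
        "ratio": 4.0,
        "time_ratio": 1.53,
        "groups_affected": 60,
        "total_groups": 64,
    }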
+ 
+ 
+ def format_fusion_csv(results: dict[str, Any]) -> str:
+     """Format fusion analysis results as CSV.
+ 
+     Args:
+         results: Fusion analysis results
+ 
+     Returns:
+         CSV formatted string
+     """
+     lines = []
+     lines.append(
+         "kernel_type,amd_count,nvidia_count,amd_time_ms,nvidia_time_ms,"
+         "time_ratio,launch_ratio,fused_by,groups_affected,total_groups"
+     )
+ 
+     for opp in results["fusion_opportunities"]:
+         lines.append(
+             f"{opp['kernel_type']},"
+             f"{opp['amd_total']},"
+             f"{opp['nvidia_total']},"
+             f"{opp['amd_time_ms']:.3f},"
+             f"{opp['nvidia_time_ms']:.3f},"
+             f"{opp['time_ratio']:.3f},"
+             f"{opp['ratio']:.3f},"
+             f"{opp['fused_by']},"
+             f"{opp['groups_affected']},"
+             f"{opp['total_groups']}"
+         )
+ 
+     return "\n".join(lines)
+ 
+ 
+ def _sanitize_for_json(obj: Any) -> Any:
+     """Recursively sanitize data structure to handle Infinity and NaN values.
+ 
+     Args:
+         obj: Data structure to sanitize
+ 
+     Returns:
+         Sanitized data structure with Infinity/NaN converted to None
+     """
+     import math
+ 
+     if isinstance(obj, float):
+         if math.isinf(obj) or math.isnan(obj):
+             return None
+         return obj
+     elif isinstance(obj, dict):
+         return {k: _sanitize_for_json(v) for k, v in obj.items()}
+     elif isinstance(obj, list):
+         return [_sanitize_for_json(item) for item in obj]
+     else:
+         return obj
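The sanitization step matters because json.dumps emits the non-standard tokens Infinity and NaN for those float values by default, which strict JSON parsers reject; converting them to None keeps the output valid. A quick illustration:

    import json

    payload = {"ratio": float("inf"), "gap_ms": float("nan")}
    json.dumps(payload)                      # '{"ratio": Infinity, "gap_ms": NaN}', not valid JSON
    json.dumps(_sanitize_for_json(payload))  # '{"ratio": null, "gap_ms": null}'
    # Alternative: json.dumps(payload, allow_nan=False) raises ValueError instead of sanitizing.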
+ 
+ 
+ def format_fusion_json(results: dict[str, Any]) -> str:
+     """Format fusion analysis results as JSON.
+ 
+     Args:
+         results: Fusion analysis results
+ 
+     Returns:
+         JSON formatted string
+     """
+     sanitized = _sanitize_for_json(results)
+     return json.dumps(sanitized, indent=2)