wafer-core 0.1.25__py3-none-any.whl → 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer_core/lib/trace_compare/PERFORMANCE.md +148 -0
- wafer_core/lib/trace_compare/__init__.py +45 -0
- wafer_core/lib/trace_compare/aligner.py +369 -0
- wafer_core/lib/trace_compare/analyzer.py +729 -0
- wafer_core/lib/trace_compare/api.py +225 -0
- wafer_core/lib/trace_compare/architecture.py +77 -0
- wafer_core/lib/trace_compare/classifier.py +486 -0
- wafer_core/lib/trace_compare/formatter.py +951 -0
- wafer_core/lib/trace_compare/fusion_analyzer.py +356 -0
- wafer_core/lib/trace_compare/kernel_registry.yaml +349 -0
- wafer_core/lib/trace_compare/layer_segmentation.py +114 -0
- wafer_core/lib/trace_compare/loader.py +635 -0
- wafer_core/lib/trace_compare/same_kernel_analyzer.py +119 -0
- wafer_core/lib/trace_compare/warnings.py +99 -0
- wafer_core/problem_config.py +3 -3
- wafer_core/rollouts/agent_presets/rlm_01_01.py +2 -2
- wafer_core/rollouts/dtypes.py +18 -3
- wafer_core/rollouts/providers/anthropic.py +35 -3
- wafer_core/utils/kernel_utils/defense.py +10 -0
- wafer_core/utils/kernel_utils/targets/config.py +10 -0
- {wafer_core-0.1.25.dist-info → wafer_core-0.1.27.dist-info}/METADATA +3 -1
- {wafer_core-0.1.25.dist-info → wafer_core-0.1.27.dist-info}/RECORD +23 -9
- {wafer_core-0.1.25.dist-info → wafer_core-0.1.27.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,951 @@
|
|
|
1
|
+
"""Report formatting for trace comparison results.
|
|
2
|
+
|
|
3
|
+
Provides text, CSV, and JSON output formatters for comparison and fusion analysis.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def format_text(results: dict[str, Any], show_layers: bool = False, show_all: bool = False, show_stack_traces: bool = False) -> str:
    """Format comparison results as a human-readable text report.

    Args:
        results: Analysis results from analyze_traces()
        show_layers: Whether to include layer-wise breakdown
        show_all: Whether to show all items without truncation
        show_stack_traces: Whether to show Python stack traces

    Returns:
        Formatted text report

    Note:
        Mutates ``results`` in place: each operation/layer dict gains
        ``amd_*``/``nvidia_*`` aliases of its ``trace1_*``/``trace2_*`` keys,
        and ``metadata`` gains ``amd_layers``/``nvidia_layers``.
        ``_format_layer_report`` relies on these aliases being present.
    """
    lines: list[str] = []
    meta = results["metadata"]

    lines.append("=" * 80)
    lines.append("VLLM TRACE COMPARISON REPORT")
    if "phase" in meta and meta["phase"] != "all":
        lines.append(f"Phase: {meta['phase'].upper()}")
    lines.append("=" * 80)
    lines.append("")

    # Determine which trace is AMD and which is NVIDIA, then derive the
    # key prefixes once instead of branching on every field below.
    is_trace1_amd = meta['trace1_platform'] == 'AMD'
    amd_src, nv_src = ("trace1", "trace2") if is_trace1_amd else ("trace2", "trace1")

    amd_gpu, nvidia_gpu = meta[f'{amd_src}_gpu'], meta[f'{nv_src}_gpu']
    amd_kernels, nvidia_kernels = meta[f'{amd_src}_kernels'], meta[f'{nv_src}_kernels']
    amd_total_ms, nvidia_total_ms = meta[f'{amd_src}_total_ms'], meta[f'{nv_src}_total_ms']

    # Get device properties
    amd_dev = meta[f'{amd_src}_device']
    nvidia_dev = meta[f'{nv_src}_device']

    lines.append(f"AMD GPU: {amd_gpu}")
    lines.append(f"  Compute: {amd_dev['compute_capability']}")
    lines.append(f"  Memory: {amd_dev['total_memory_gb']:.1f} GB")
    lines.append(f"  SMs: {amd_dev['sm_count']}")
    lines.append(f"  Warp Size: {amd_dev['warp_size']}")
    lines.append("")
    lines.append(f"NVIDIA GPU: {nvidia_gpu}")
    lines.append(f"  Compute: {nvidia_dev['compute_capability']}")
    lines.append(f"  Memory: {nvidia_dev['total_memory_gb']:.1f} GB")
    lines.append(f"  SMs: {nvidia_dev['sm_count']}")
    lines.append(f"  Warp Size: {nvidia_dev['warp_size']}")
    lines.append("")
    lines.append(f"AMD Kernels: {amd_kernels:,}")
    lines.append(f"NVIDIA Kernels: {nvidia_kernels:,}")
    lines.append(f"AMD Total: {amd_total_ms:.1f} ms")
    lines.append(f"NVIDIA Total: {nvidia_total_ms:.1f} ms")

    # Handle division by zero for ratio (empty traces).
    if nvidia_total_ms > 0:
        ratio_str = f"{amd_total_ms / nvidia_total_ms:.2f}x"
    elif amd_total_ms > 0:
        ratio_str = "∞ (NVIDIA has no data)"
    else:
        ratio_str = "N/A (both traces empty)"

    lines.append(f"Ratio: {ratio_str}")
    lines.append("")

    # Alias trace1/trace2 keys to amd/nvidia on each operation so the rest of
    # the report (and downstream formatters) can use platform-oriented names.
    ops = results["operations"]
    for op in ops:
        for field in ("count", "avg_us", "total_ms"):
            op[f'amd_{field}'] = op[f'{amd_src}_{field}']
            op[f'nvidia_{field}'] = op[f'{nv_src}_{field}']
        for field in ("cpu_op", "pattern"):
            op[f'amd_{field}'] = op.get(f'{amd_src}_{field}')
            op[f'nvidia_{field}'] = op.get(f'{nv_src}_{field}')
        op['amd_kernels'] = op.get(f'{amd_src}_kernels', [])
        op['nvidia_kernels'] = op.get(f'{nv_src}_kernels', [])

    # Same aliasing for per-layer entries.
    layers = results.get("layers", [])
    for layer in layers:
        layer['amd_kernels'] = layer[f'{amd_src}_kernels']
        layer['nvidia_kernels'] = layer[f'{nv_src}_kernels']
        layer['amd_total_ms'] = layer[f'{amd_src}_total_ms']
        layer['nvidia_total_ms'] = layer[f'{nv_src}_total_ms']

    # Update metadata layer counts.
    meta['amd_layers'] = meta.get(f'{amd_src}_layers', 0)
    meta['nvidia_layers'] = meta.get(f'{nv_src}_layers', 0)

    # Summary stats
    slower = [o for o in ops if o["status"] == "slower"]
    faster = [o for o in ops if o["status"] == "faster"]
    similar = [o for o in ops if o["status"] == "similar"]

    lines.append("SUMMARY")
    lines.append("-" * 80)
    lines.append(f"Operations where AMD is slower: {len(slower)}")
    lines.append(f"Operations where AMD is faster: {len(faster)}")
    lines.append(f"Operations with similar perf: {len(similar)}")
    lines.append("")

    def _op_header(last_col: str) -> None:
        # Shared 9-column table header; only the last column label differs
        # between the slower/faster/similar tables.
        lines.append("=" * 140)
        lines.append(
            f"{'Operation':<22} {'AMD Count':>11} {'NV Count':>10} {'AMD Avg':>10} "
            f"{'NV Avg':>10} {'Ratio':>8} {'AMD Total':>11} {'NV Total':>11} {last_col:>14}"
        )
        lines.append("-" * 140)

    def _op_row(op: dict[str, Any], last_field: str) -> None:
        # Shared 9-column data row; `last_field` is pre-formatted by the caller
        # so the three tables keep their original last-column widths.
        lines.append(
            f"{op['operation']:<22} "
            f"{op['amd_count']:>11,} "
            f"{op['nvidia_count']:>10,} "
            f"{op['amd_avg_us']:>8.1f}µs "
            f"{op['nvidia_avg_us']:>8.1f}µs "
            f"{op['ratio']:>7.2f}x "
            f"{op['amd_total_ms']:>9.1f}ms "
            f"{op['nvidia_total_ms']:>9.1f}ms "
            f"{last_field}"
        )

    # AMD Slower
    if slower:
        slower_to_show = slower if show_all else slower[:10]
        lines.append(f"🔴 AMD SLOWER THAN NVIDIA (Optimization Targets) - Showing {len(slower_to_show)}/{len(slower)}")
        _op_header('AMD Slower By')
        for op in slower_to_show:
            _op_row(op, f"{abs(op['gap_ms']):>13.1f}ms")
        lines.append("")

    # AMD Faster
    if faster:
        faster_to_show = faster if show_all else faster[:10]
        lines.append(f"🟢 AMD FASTER THAN NVIDIA (Wins) - Showing {len(faster_to_show)}/{len(faster)}")
        _op_header('AMD Faster By')
        for op in faster_to_show:
            _op_row(op, f"{abs(op['gap_ms']):>13.1f}ms")
        lines.append("")

    # Similar Performance
    if similar:
        lines.append("⚪ SIMILAR PERFORMANCE (Within 10% difference)")
        _op_header('Winner')
        for op in similar:
            if op["gap_ms"] < 0:
                winner = f"AMD by {abs(op['gap_ms']):.1f}ms"
            elif op["gap_ms"] > 0:
                winner = f"NV by {op['gap_ms']:.1f}ms"
            else:
                winner = "Tie"
            _op_row(op, f"{winner:>14}")
        lines.append("")

    # CPU operator mapping with stack trace info (suppressed when the full
    # stack-trace report is requested, which shows the same data in detail).
    if not show_stack_traces:
        cpu_ops_to_show = ops if show_all else ops[:15]
        lines.append(f"CPU OPERATOR MAPPING - Showing {len(cpu_ops_to_show)}/{len(ops)}")
        lines.append("=" * 80)
        lines.append("Shows the most common PyTorch/vLLM call path for each GPU operation.")
        lines.append("Use --stack-traces to see full call stacks and all variants.")
        lines.append("")
        lines.append(f"{'Operation':<25} {'CPU Operator (most common)':<45} {'Variants':<10}")
        lines.append("-" * 80)

        # Track if any operations have multiple stack traces.
        has_multiple_stacks = False
        for op in cpu_ops_to_show:
            cpu_op = op.get("amd_cpu_op") or op.get("nvidia_cpu_op", "N/A")
            if cpu_op and cpu_op != "N/A":
                # Shorten long operator names to fit the 45-char column.
                if len(cpu_op) > 43:
                    cpu_op = cpu_op[:40] + "..."

                # Count unique stack traces across both traces.
                amd_stacks = op.get(f"{amd_src}_python_stacks", [])
                nv_stacks = op.get(f"{nv_src}_python_stacks", [])
                total_stacks = len(amd_stacks) + len(nv_stacks)

                if total_stacks > 1:
                    stack_info = f"{total_stacks} paths"
                    has_multiple_stacks = True
                elif total_stacks == 1:
                    stack_info = "1 path"
                else:
                    stack_info = "-"

                lines.append(f"{op['operation']:<25} {cpu_op:<45} {stack_info:<10}")

        lines.append("")
        if has_multiple_stacks:
            lines.append("⚠️ Multiple call paths detected. Use --stack-traces to see all variants.")
            lines.append("")

    # Kernel-level details for the operations with the biggest gaps.
    non_similar_ops = [op for op in ops if op["status"] != "similar"]
    top_ops = non_similar_ops if show_all else non_similar_ops[:3]

    lines.append("KERNEL-LEVEL DETAILS (Top Individual Kernels)")
    lines.append("=" * 80)
    lines.append("")
    if show_all:
        lines.append(f"Showing all kernels for {len(top_ops)} operations with performance gaps:")
    else:
        # NOTE: plain string (was an f-string with no placeholders).
        lines.append("Showing top 10 kernels for the 3 operations with largest performance gaps:")
    lines.append("")

    def _kernel_table(label: str, kernels: list[dict[str, Any]], total_count: Any) -> None:
        # Render one per-platform kernel table (AMD or NVIDIA).
        kernel_label = f"All {len(kernels)}" if show_all else "Top 10"
        lines.append(f"\n  {label} {kernel_label} Kernels (Total: {total_count} invocations):")
        lines.append(f"  {'Kernel Name':<50} {'Total (µs)':>12} {'Count':>8} {'Avg (µs)':>10}")
        lines.append("  " + "-" * 80)
        for k in kernels:
            name = k["name"][:47] + "..." if len(k["name"]) > 50 else k["name"]
            lines.append(
                f"  {name:<50} {k['total_us']:>12.0f} {k['count']:>8,} {k['avg_us']:>10.1f}"
            )

    for op in top_ops:
        lines.append(f"Operation: {op['operation']}")
        lines.append("-" * 80)

        all_amd_kernels = op.get("amd_kernels", [])
        amd_kernel_rows = all_amd_kernels if show_all else all_amd_kernels[:10]
        if amd_kernel_rows:
            _kernel_table("AMD", amd_kernel_rows, op['amd_count'])

        all_nv_kernels = op.get("nvidia_kernels", [])
        nv_kernel_rows = all_nv_kernels if show_all else all_nv_kernels[:10]
        if nv_kernel_rows:
            _kernel_table("NVIDIA", nv_kernel_rows, op['nvidia_count'])

        lines.append("")

    lines.append("=" * 80)

    # Python stack traces if requested
    if show_stack_traces:
        stack_trace_report = _format_stack_trace_report(results, show_all=show_all)
        lines.append("")
        lines.extend(stack_trace_report)

    # Layer-wise report if requested
    if show_layers:
        layer_report = _format_layer_report(results, show_all=show_all)
        lines.append("")
        lines.extend(layer_report)

    return "\n".join(lines)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def _format_layer_report(results: dict[str, Any], show_all: bool = False) -> list[str]:
|
|
330
|
+
"""Format layer-wise performance breakdown.
|
|
331
|
+
|
|
332
|
+
Args:
|
|
333
|
+
results: Analysis results
|
|
334
|
+
show_all: Whether to show all layers without truncation
|
|
335
|
+
|
|
336
|
+
Returns:
|
|
337
|
+
List of formatted text lines
|
|
338
|
+
"""
|
|
339
|
+
layers = results.get("layers", [])
|
|
340
|
+
if not layers:
|
|
341
|
+
return []
|
|
342
|
+
|
|
343
|
+
lines = []
|
|
344
|
+
meta = results["metadata"]
|
|
345
|
+
|
|
346
|
+
lines.append("=" * 80)
|
|
347
|
+
lines.append("LAYER-WISE PERFORMANCE BREAKDOWN")
|
|
348
|
+
lines.append("=" * 80)
|
|
349
|
+
lines.append("")
|
|
350
|
+
lines.append("NOTE: Layers are identified by correlation IDs in the execution graph.")
|
|
351
|
+
lines.append("Each layer represents one transformer block (Norm + Attention + FFN).")
|
|
352
|
+
lines.append("Layers may have similar timing if the workload is uniform across the model.")
|
|
353
|
+
lines.append("")
|
|
354
|
+
|
|
355
|
+
lines.append(f"Total Layers Detected: {meta.get('amd_layers', 0)} (AMD), {meta.get('nvidia_layers', 0)} (NVIDIA)")
|
|
356
|
+
lines.append("")
|
|
357
|
+
|
|
358
|
+
# Separate layers into comparable and trace-unique
|
|
359
|
+
comparable_layers = [layer for layer in layers if layer.get("in_both", True)]
|
|
360
|
+
amd_only_layers = [layer for layer in layers if layer["status"] == "trace1_only" and meta['trace1_platform'] == 'AMD']
|
|
361
|
+
nvidia_only_layers = [layer for layer in layers if layer["status"] == "trace2_only" and meta['trace1_platform'] == 'AMD']
|
|
362
|
+
|
|
363
|
+
# Handle case where trace2 is AMD
|
|
364
|
+
if meta['trace1_platform'] != 'AMD':
|
|
365
|
+
amd_only_layers = [layer for layer in layers if layer["status"] == "trace2_only"]
|
|
366
|
+
nvidia_only_layers = [layer for layer in layers if layer["status"] == "trace1_only"]
|
|
367
|
+
|
|
368
|
+
slower_layers = [layer for layer in comparable_layers if layer["status"] == "slower"]
|
|
369
|
+
faster_layers = [layer for layer in comparable_layers if layer["status"] == "faster"]
|
|
370
|
+
similar_layers = [layer for layer in comparable_layers if layer["status"] == "similar"]
|
|
371
|
+
|
|
372
|
+
lines.append(f"Layers in both traces: {len(comparable_layers)}")
|
|
373
|
+
lines.append(f" - AMD is slower: {len(slower_layers)}")
|
|
374
|
+
lines.append(f" - AMD is faster: {len(faster_layers)}")
|
|
375
|
+
lines.append(f" - Similar performance: {len(similar_layers)}")
|
|
376
|
+
if amd_only_layers or nvidia_only_layers:
|
|
377
|
+
lines.append(f"Layers only in AMD trace: {len(amd_only_layers)}")
|
|
378
|
+
lines.append(f"Layers only in NVIDIA trace: {len(nvidia_only_layers)}")
|
|
379
|
+
lines.append("")
|
|
380
|
+
|
|
381
|
+
# Show top 20 slowest layers for AMD (or all if show_all)
|
|
382
|
+
if slower_layers:
|
|
383
|
+
slower_to_show = slower_layers if show_all else slower_layers[:20]
|
|
384
|
+
label = f"ALL {len(slower_to_show)}" if show_all else "TOP 20"
|
|
385
|
+
lines.append(f"🔴 {label} LAYERS WHERE AMD IS SLOWER")
|
|
386
|
+
lines.append("=" * 100)
|
|
387
|
+
lines.append(
|
|
388
|
+
f"{'Layer':>6} {'AMD Kernels':>13} {'NV Kernels':>12} "
|
|
389
|
+
f"{'AMD Time':>12} {'NV Time':>11} {'Ratio':>8} {'AMD Slower By':>14}"
|
|
390
|
+
)
|
|
391
|
+
lines.append("-" * 100)
|
|
392
|
+
|
|
393
|
+
for layer in slower_to_show:
|
|
394
|
+
lines.append(
|
|
395
|
+
f"{layer['layer']:>6} "
|
|
396
|
+
f"{layer['amd_kernels']:>13,} "
|
|
397
|
+
f"{layer['nvidia_kernels']:>12,} "
|
|
398
|
+
f"{layer['amd_total_ms']:>10.2f}ms "
|
|
399
|
+
f"{layer['nvidia_total_ms']:>9.2f}ms "
|
|
400
|
+
f"{layer['ratio']:>7.2f}x "
|
|
401
|
+
f"{abs(layer['gap_ms']):>13.2f}ms"
|
|
402
|
+
)
|
|
403
|
+
|
|
404
|
+
lines.append("")
|
|
405
|
+
|
|
406
|
+
# Show top 10 fastest layers for AMD (or all if show_all)
|
|
407
|
+
if faster_layers:
|
|
408
|
+
faster_to_show = faster_layers if show_all else faster_layers[:10]
|
|
409
|
+
label = f"ALL {len(faster_to_show)}" if show_all else "TOP 10"
|
|
410
|
+
lines.append(f"🟢 {label} LAYERS WHERE AMD IS FASTER")
|
|
411
|
+
lines.append("=" * 100)
|
|
412
|
+
lines.append(
|
|
413
|
+
f"{'Layer':>6} {'AMD Kernels':>13} {'NV Kernels':>12} "
|
|
414
|
+
f"{'AMD Time':>12} {'NV Time':>11} {'Ratio':>8} {'AMD Faster By':>14}"
|
|
415
|
+
)
|
|
416
|
+
lines.append("-" * 100)
|
|
417
|
+
|
|
418
|
+
for layer in faster_to_show:
|
|
419
|
+
lines.append(
|
|
420
|
+
f"{layer['layer']:>6} "
|
|
421
|
+
f"{layer['amd_kernels']:>13,} "
|
|
422
|
+
f"{layer['nvidia_kernels']:>12,} "
|
|
423
|
+
f"{layer['amd_total_ms']:>10.2f}ms "
|
|
424
|
+
f"{layer['nvidia_total_ms']:>9.2f}ms "
|
|
425
|
+
f"{layer['ratio']:>7.2f}x "
|
|
426
|
+
f"{abs(layer['gap_ms']):>13.2f}ms"
|
|
427
|
+
)
|
|
428
|
+
|
|
429
|
+
lines.append("")
|
|
430
|
+
|
|
431
|
+
# Show AMD-only layers (simplified display)
|
|
432
|
+
if amd_only_layers:
|
|
433
|
+
amd_to_show = amd_only_layers if show_all else amd_only_layers[:20]
|
|
434
|
+
label = f"ALL {len(amd_to_show)}" if show_all else f"{len(amd_to_show)}/{len(amd_only_layers)}"
|
|
435
|
+
lines.append(f"📊 LAYERS ONLY IN AMD TRACE ({label})")
|
|
436
|
+
lines.append("=" * 60)
|
|
437
|
+
lines.append(f"{'Layer':>6} {'Kernels':>10} {'Time':>12}")
|
|
438
|
+
lines.append("-" * 60)
|
|
439
|
+
|
|
440
|
+
for layer in amd_to_show:
|
|
441
|
+
lines.append(
|
|
442
|
+
f"{layer['layer']:>6} "
|
|
443
|
+
f"{layer['amd_kernels']:>10,} "
|
|
444
|
+
f"{layer['amd_total_ms']:>10.2f}ms"
|
|
445
|
+
)
|
|
446
|
+
|
|
447
|
+
if not show_all and len(amd_only_layers) > 20:
|
|
448
|
+
lines.append(f"\n... and {len(amd_only_layers) - 20} more AMD-only layers")
|
|
449
|
+
|
|
450
|
+
lines.append("")
|
|
451
|
+
|
|
452
|
+
# Show NVIDIA-only layers (simplified display)
|
|
453
|
+
if nvidia_only_layers:
|
|
454
|
+
nvidia_to_show = nvidia_only_layers if show_all else nvidia_only_layers[:20]
|
|
455
|
+
label = f"ALL {len(nvidia_to_show)}" if show_all else f"{len(nvidia_to_show)}/{len(nvidia_only_layers)}"
|
|
456
|
+
lines.append(f"📊 LAYERS ONLY IN NVIDIA TRACE ({label})")
|
|
457
|
+
lines.append("=" * 60)
|
|
458
|
+
lines.append(f"{'Layer':>6} {'Kernels':>10} {'Time':>12}")
|
|
459
|
+
lines.append("-" * 60)
|
|
460
|
+
|
|
461
|
+
for layer in nvidia_to_show:
|
|
462
|
+
lines.append(
|
|
463
|
+
f"{layer['layer']:>6} "
|
|
464
|
+
f"{layer['nvidia_kernels']:>10,} "
|
|
465
|
+
f"{layer['nvidia_total_ms']:>10.2f}ms"
|
|
466
|
+
)
|
|
467
|
+
|
|
468
|
+
if not show_all and len(nvidia_only_layers) > 20:
|
|
469
|
+
lines.append(f"\n... and {len(nvidia_only_layers) - 20} more NVIDIA-only layers")
|
|
470
|
+
|
|
471
|
+
lines.append("")
|
|
472
|
+
|
|
473
|
+
return lines
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def _format_stack_trace_report(results: dict[str, Any], show_all: bool = False) -> list[str]:
|
|
477
|
+
"""Format Python stack traces for operations.
|
|
478
|
+
|
|
479
|
+
Args:
|
|
480
|
+
results: Analysis results
|
|
481
|
+
show_all: Whether to show all stack traces without truncation
|
|
482
|
+
|
|
483
|
+
Returns:
|
|
484
|
+
List of formatted text lines
|
|
485
|
+
"""
|
|
486
|
+
lines = []
|
|
487
|
+
ops = results["operations"]
|
|
488
|
+
meta = results["metadata"]
|
|
489
|
+
is_trace1_amd = meta['trace1_platform'] == 'AMD'
|
|
490
|
+
|
|
491
|
+
lines.append("=" * 80)
|
|
492
|
+
lines.append("PYTHON STACK TRACES & CPU OPERATOR MAPPING")
|
|
493
|
+
lines.append("=" * 80)
|
|
494
|
+
lines.append("")
|
|
495
|
+
lines.append("Full call stacks showing where GPU operations are invoked from PyTorch/vLLM.")
|
|
496
|
+
lines.append("")
|
|
497
|
+
|
|
498
|
+
# Show stack traces for top operations by impact (or all if show_all)
|
|
499
|
+
ops_with_stacks = [
|
|
500
|
+
op for op in ops
|
|
501
|
+
if (op.get("trace1_python_stacks") or op.get("trace2_python_stacks"))
|
|
502
|
+
]
|
|
503
|
+
|
|
504
|
+
if not ops_with_stacks:
|
|
505
|
+
lines.append("No stack trace information available.")
|
|
506
|
+
return lines
|
|
507
|
+
|
|
508
|
+
ops_to_show = ops_with_stacks if show_all else ops_with_stacks[:10]
|
|
509
|
+
lines.append(f"Showing {len(ops_to_show)}/{len(ops_with_stacks)} operations")
|
|
510
|
+
lines.append("")
|
|
511
|
+
|
|
512
|
+
for op in ops_to_show:
|
|
513
|
+
lines.append(f"Operation: {op['operation']}")
|
|
514
|
+
|
|
515
|
+
# Show CPU operator info
|
|
516
|
+
amd_cpu = op.get("trace1_cpu_op" if is_trace1_amd else "trace2_cpu_op", "N/A")
|
|
517
|
+
nv_cpu = op.get("trace2_cpu_op" if is_trace1_amd else "trace1_cpu_op", "N/A")
|
|
518
|
+
|
|
519
|
+
if amd_cpu != "N/A" or nv_cpu != "N/A":
|
|
520
|
+
lines.append(f" Most common CPU operator:")
|
|
521
|
+
if amd_cpu != "N/A":
|
|
522
|
+
lines.append(f" AMD: {amd_cpu}")
|
|
523
|
+
if nv_cpu != "N/A":
|
|
524
|
+
lines.append(f" NVIDIA: {nv_cpu}")
|
|
525
|
+
|
|
526
|
+
lines.append("-" * 80)
|
|
527
|
+
|
|
528
|
+
# AMD/Trace1 stacks
|
|
529
|
+
trace1_stacks = op.get("trace1_python_stacks", [])
|
|
530
|
+
if trace1_stacks:
|
|
531
|
+
stacks_to_show = trace1_stacks if show_all else trace1_stacks[:3]
|
|
532
|
+
label = "AMD" if is_trace1_amd else "NVIDIA"
|
|
533
|
+
lines.append(f" {label} Stack Traces ({len(stacks_to_show)}/{len(trace1_stacks)} shown):")
|
|
534
|
+
for i, stack in enumerate(stacks_to_show, 1):
|
|
535
|
+
lines.append(f" Variant {i}:")
|
|
536
|
+
for frame in stack:
|
|
537
|
+
lines.append(f" {frame}")
|
|
538
|
+
if i < len(stacks_to_show):
|
|
539
|
+
lines.append("")
|
|
540
|
+
|
|
541
|
+
# NVIDIA/Trace2 stacks
|
|
542
|
+
trace2_stacks = op.get("trace2_python_stacks", [])
|
|
543
|
+
if trace2_stacks:
|
|
544
|
+
stacks_to_show = trace2_stacks if show_all else trace2_stacks[:3]
|
|
545
|
+
label = "NVIDIA" if is_trace1_amd else "AMD"
|
|
546
|
+
if trace1_stacks:
|
|
547
|
+
lines.append("")
|
|
548
|
+
lines.append(f" {label} Stack Traces ({len(stacks_to_show)}/{len(trace2_stacks)} shown):")
|
|
549
|
+
for i, stack in enumerate(stacks_to_show, 1):
|
|
550
|
+
lines.append(f" Variant {i}:")
|
|
551
|
+
for frame in stack:
|
|
552
|
+
lines.append(f" {frame}")
|
|
553
|
+
if i < len(stacks_to_show):
|
|
554
|
+
lines.append("")
|
|
555
|
+
|
|
556
|
+
lines.append("")
|
|
557
|
+
|
|
558
|
+
return lines
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
def format_csv(results: dict[str, Any], report_type: str = "operations") -> str:
    """Format comparison results as CSV.

    Fields are serialized through the stdlib ``csv`` writer so values that
    contain commas, quotes, or newlines (e.g. templated kernel names or
    operator signatures) are quoted correctly instead of silently corrupting
    the row layout. Comma-free fields are emitted unquoted, so output is
    unchanged for the common case.

    Args:
        results: Analysis results
        report_type: 'operations' or 'layers'

    Returns:
        CSV formatted string (no trailing newline)
    """
    import csv
    import io

    def _row(fields: list[Any]) -> str:
        # Serialize one row with minimal quoting; drop the terminator so the
        # final join matches the original newline-delimited format.
        buf = io.StringIO()
        csv.writer(buf, lineterminator="\n").writerow(fields)
        return buf.getvalue()[:-1]

    lines: list[str] = []
    meta = results["metadata"]
    # Derive the trace1/trace2 -> amd/nvidia key prefixes once.
    is_trace1_amd = meta['trace1_platform'] == 'AMD'
    amd_src, nv_src = ("trace1", "trace2") if is_trace1_amd else ("trace2", "trace1")

    if report_type == "layers":
        lines.append("layer,amd_kernels,nvidia_kernels,amd_total_ms,nvidia_total_ms,ratio,gap_ms,status,in_both")
        for layer in results.get("layers", []):
            lines.append(_row([
                layer['layer'],
                layer[f'{amd_src}_kernels'],
                layer[f'{nv_src}_kernels'],
                f"{layer[f'{amd_src}_total_ms']:.2f}",
                f"{layer[f'{nv_src}_total_ms']:.2f}",
                f"{layer['ratio']:.3f}",
                f"{layer['gap_ms']:.2f}",
                layer['status'],
                layer.get('in_both', True),
            ]))
    else:
        lines.append(
            "operation,amd_count,nvidia_count,amd_avg_us,nvidia_avg_us,amd_total_ms,"
            "nvidia_total_ms,ratio,gap_ms,status,amd_kernel,nvidia_kernel,amd_cpu_op,nvidia_cpu_op"
        )
        for op in results["operations"]:
            lines.append(_row([
                op['operation'],
                op[f'{amd_src}_count'],
                op[f'{nv_src}_count'],
                f"{op[f'{amd_src}_avg_us']:.2f}",
                f"{op[f'{nv_src}_avg_us']:.2f}",
                f"{op[f'{amd_src}_total_ms']:.2f}",
                f"{op[f'{nv_src}_total_ms']:.2f}",
                f"{op['ratio']:.3f}",
                f"{op['gap_ms']:.2f}",
                op['status'],
                op.get(f'{amd_src}_kernel', ''),
                op.get(f'{nv_src}_kernel', ''),
                op.get(f'{amd_src}_cpu_op', ''),
                op.get(f'{nv_src}_cpu_op', ''),
            ]))

    return "\n".join(lines)
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
def format_json(results: dict[str, Any]) -> str:
    """Serialize comparison results as pretty-printed JSON.

    Args:
        results: Analysis results

    Returns:
        JSON formatted string
    """
    # Sanitize non-JSON-native values before dumping.
    return json.dumps(_sanitize_for_json(results), indent=2)
|
|
662
|
+
|
|
663
|
+
|
|
664
|
+
def format_fusion_text(results: dict[str, Any]) -> str:
    """Format fusion analysis results as human-readable text.

    Side effect: mutates ``results`` in place, adding ``amd_*``/``nvidia_*``
    display aliases for the analyzer's generic ``trace1_*``/``trace2_*`` keys
    on each ``global_counts`` entry and each fusion opportunity (the fusion
    analyzer always uses AMD as trace1, NVIDIA as trace2).

    Args:
        results: Fusion analysis results

    Returns:
        Formatted text report
    """
    from collections import defaultdict  # hoisted from mid-function conditional

    lines = []
    meta = results["metadata"]

    lines.append("=" * 80)
    lines.append("FUSION ANALYSIS REPORT")
    lines.append("=" * 80)
    lines.append("")

    # Use generic trace1/trace2 keys
    lines.append(f"Trace 1 GPU: {meta['trace1_gpu']}")
    lines.append(f"Trace 2 GPU: {meta['trace2_gpu']}")
    lines.append(f"Trace 1 Kernels: {meta['trace1_total_kernels']:,}")
    lines.append(f"Trace 2 Kernels: {meta['trace2_total_kernels']:,}")
    lines.append("")

    lines.append("Correlation Groups Analyzed:")
    lines.append(f" Trace 1: {meta['trace1_correlation_groups']}")
    lines.append(f" Trace 2: {meta['trace2_correlation_groups']}")
    lines.append(f" Matched: {meta['matched_groups']}")
    lines.append("")

    # Read once with a default: the original mixed .get() here with direct
    # subscripting below, which raised KeyError when the key was absent.
    fusion_opportunities = results.get("fusion_opportunities", [])

    # Convert global_counts from trace1/trace2 to amd/nvidia keys for display
    # Note: fusion analyzer always uses AMD as trace1, NVIDIA as trace2
    global_counts = results["global_counts"]
    for counts in global_counts.values():
        counts["amd_count"] = counts["trace1_count"]
        counts["nv_count"] = counts["trace2_count"]

    # Convert fusion opportunities from trace1/trace2 to amd/nvidia keys
    for opp in fusion_opportunities:
        opp["amd_total"] = opp["trace1_total"]
        opp["nvidia_total"] = opp["trace2_total"]
        opp["amd_avg_per_group"] = opp["trace1_avg_per_group"]
        opp["nvidia_avg_per_group"] = opp["trace2_avg_per_group"]
        opp["amd_time_ms"] = opp["trace1_time_ms"]
        opp["nvidia_time_ms"] = opp["trace2_time_ms"]

    # Global kernel type distribution
    lines.append("GLOBAL KERNEL TYPE DISTRIBUTION")
    lines.append("=" * 80)
    lines.append(f"{'Kernel Type':<25} {'AMD Count':>12} {'NVIDIA Count':>15} {'Ratio':>12}")
    lines.append("-" * 80)

    # Most active kernel types (by combined launch count) first.
    sorted_types = sorted(
        global_counts.items(),
        key=lambda x: x[1]["amd_count"] + x[1]["nv_count"],
        reverse=True,
    )

    for ktype, counts in sorted_types:
        amd_c = counts["amd_count"]
        nv_c = counts["nv_count"]
        ratio = counts["ratio"]

        # Mark significant differences
        marker = ""
        if ratio > 2.0:
            marker = " ⚠️ AMD has more"
        elif ratio < 0.5:
            marker = " ⚠️ NVIDIA has more"
        elif nv_c == 0 and amd_c > 20:
            marker = " 🔥 AMD ONLY"
        elif amd_c == 0 and nv_c > 20:
            marker = " 🔥 NVIDIA ONLY"

        ratio_str = f"{ratio:.2f}x" if ratio != float("inf") else "∞"
        lines.append(f"{ktype:<25} {amd_c:>12,} {nv_c:>15,} {ratio_str:>12}{marker}")

    lines.append("")

    # Fusion opportunities
    if fusion_opportunities:
        lines.append("FUSION OPPORTUNITIES")
        lines.append("=" * 80)
        lines.append("")

        amd_fuses = [opp for opp in fusion_opportunities if opp["fused_by"] == "AMD"]
        nv_fuses = [opp for opp in fusion_opportunities if opp["fused_by"] == "NVIDIA"]

        if nv_fuses:
            lines.append("🔴 OPERATIONS AMD RUNS SEPARATELY (NVIDIA fuses them)")
            lines.append("-" * 80)
            lines.append("")

            for i, opp in enumerate(nv_fuses, 1):
                lines.append(f"{i}. {opp['kernel_type']}")
                lines.append(" Kernel Launches:")
                lines.append(f" AMD: {opp['amd_total']:,} calls ({opp['amd_avg_per_group']:.1f} per group)")
                lines.append(f" NVIDIA: {opp['nvidia_total']:,} calls ({opp['nvidia_avg_per_group']:.1f} per group)")
                lines.append(f" Ratio: {opp['ratio']:.2f}x (AMD launches more)")
                lines.append(" Execution Time:")
                lines.append(f" AMD: {opp['amd_time_ms']:.2f} ms")
                lines.append(f" NVIDIA: {opp['nvidia_time_ms']:.2f} ms")
                time_marker = ""
                if opp["time_ratio"] > 1.2:
                    time_marker = " (AMD slower ⚠️)"
                elif opp["time_ratio"] < 0.8:
                    time_marker = " (AMD faster ✓)"
                else:
                    time_marker = " (similar)"
                lines.append(f" Ratio: {opp['time_ratio']:.2f}x{time_marker}")
                lines.append(f" Impact: {opp['groups_affected']}/{opp['total_groups']} groups show this difference")

                # Provide interpretation
                if opp["nvidia_total"] == 0:
                    lines.append(" → NVIDIA completely fuses this operation into another kernel")
                else:
                    lines.append(f" → NVIDIA partially fuses, using {opp['ratio']:.1f}x fewer calls")

                lines.append("")

        if amd_fuses:
            lines.append("🟢 OPERATIONS NVIDIA RUNS SEPARATELY (AMD fuses them)")
            lines.append("-" * 80)
            lines.append("")

            for i, opp in enumerate(amd_fuses, 1):
                lines.append(f"{i}. {opp['kernel_type']}")
                lines.append(" Kernel Launches:")
                lines.append(f" AMD: {opp['amd_total']:,} calls ({opp['amd_avg_per_group']:.1f} per group)")
                lines.append(f" NVIDIA: {opp['nvidia_total']:,} calls ({opp['nvidia_avg_per_group']:.1f} per group)")
                lines.append(f" Ratio: {opp['ratio']:.2f}x (NVIDIA launches more)")
                lines.append(" Execution Time:")
                lines.append(f" AMD: {opp['amd_time_ms']:.2f} ms")
                lines.append(f" NVIDIA: {opp['nvidia_time_ms']:.2f} ms")
                time_marker = ""
                if opp["time_ratio"] > 1.2:
                    time_marker = " (AMD slower despite fusion ⚠️)"
                elif opp["time_ratio"] < 0.8:
                    time_marker = " (AMD faster via fusion ✓)"
                else:
                    time_marker = " (similar)"
                lines.append(f" Ratio: {opp['time_ratio']:.2f}x{time_marker}")
                lines.append(f" Impact: {opp['groups_affected']}/{opp['total_groups']} groups show this difference")
                lines.append("")
    else:
        lines.append("No significant fusion differences detected.")
        lines.append("")

    # Fusion mappings
    fusion_mappings = results.get("fusion_mappings", [])
    if fusion_mappings:
        lines.append("")
        lines.append("FUSION MAPPINGS")
        lines.append("=" * 80)
        lines.append("")

        # Group by type
        sequence_mappings = []
        intra_type_mappings = []
        partial_mappings = []

        for mapping in fusion_mappings:
            seq = mapping["unfused_sequence"]
            # A pair of identical kernel types fusing into that same type is
            # chain compression; a single-element sequence is partial fusion.
            if len(seq) == 2 and seq[0] == seq[1] and seq[0] == mapping["fused_kernel_type"]:
                intra_type_mappings.append(mapping)
            elif len(seq) == 1:
                partial_mappings.append(mapping)
            else:
                sequence_mappings.append(mapping)

        # Show sequence fusion
        if sequence_mappings:
            lines.append("🔗 SEQUENCE FUSION")
            lines.append("-" * 80)
            lines.append("")

            # Group by evidence
            grouped = defaultdict(list)
            for m in sequence_mappings:
                grouped[m["evidence"]].append(m)

            for evidence, group in list(grouped.items())[:5]:  # Show top 5 patterns
                lines.append(f"Pattern: {evidence}")
                lines.append(f" Occurrences: {len(group)} correlation groups")
                lines.append(f" Total calls: {sum(m['pattern_count'] for m in group):,}")
                lines.append(f" Confidence: {group[0]['pattern_confidence']*100:.0f}%")
                lines.append("")

            if len(grouped) > 5:
                lines.append(f"... and {len(grouped) - 5} more sequence fusion patterns")
                lines.append("")

        # Show intra-type fusion
        if intra_type_mappings:
            lines.append("⛓️ INTRA-TYPE FUSION (Chain Compression)")
            lines.append("-" * 80)
            lines.append("")

            for mapping in intra_type_mappings:
                lines.append(f"Kernel: {mapping['fused_kernel_type']}")
                lines.append(f" {mapping['evidence']}")
                # max(..., 1) guards against a zero fused_count.
                lines.append(f" Compression ratio: {mapping['pattern_count'] / max(mapping['fused_count'], 1):.1f}x")
                lines.append("")

        # Show partial fusion
        if partial_mappings:
            lines.append("📊 PARTIAL FUSION")
            lines.append("-" * 80)
            lines.append("")

            for mapping in partial_mappings[:5]:  # Show top 5
                lines.append(f"Kernel: {mapping['unfused_sequence'][0]}")
                lines.append(f" {mapping['evidence']}")
                lines.append("")

    lines.append("=" * 80)

    return "\n".join(lines)
|
|
884
|
+
|
|
885
|
+
|
|
886
|
+
def format_fusion_csv(results: dict[str, Any]) -> str:
    """Format fusion analysis results as CSV.

    Accepts opportunity rows keyed either with the analyzer's generic
    ``trace1_*``/``trace2_*`` names or with the ``amd_*``/``nvidia_*``
    aliases that ``format_fusion_text`` adds in place. The original
    implementation required the aliases and so raised ``KeyError`` unless
    the text formatter had already run on the same results dict.

    Args:
        results: Fusion analysis results

    Returns:
        CSV formatted string (header row plus one row per opportunity)
    """
    lines = []
    lines.append(
        "kernel_type,amd_count,nvidia_count,amd_time_ms,nvidia_time_ms,"
        "time_ratio,launch_ratio,fused_by,groups_affected,total_groups"
    )

    for opp in results["fusion_opportunities"]:
        # Prefer the display aliases when present; fall back to the raw
        # trace1/trace2 keys (AMD is always trace1, NVIDIA trace2).
        amd_total = opp.get("amd_total", opp.get("trace1_total"))
        nvidia_total = opp.get("nvidia_total", opp.get("trace2_total"))
        amd_time_ms = opp.get("amd_time_ms", opp.get("trace1_time_ms"))
        nvidia_time_ms = opp.get("nvidia_time_ms", opp.get("trace2_time_ms"))

        lines.append(
            f"{opp['kernel_type']},"
            f"{amd_total},"
            f"{nvidia_total},"
            f"{amd_time_ms:.3f},"
            f"{nvidia_time_ms:.3f},"
            f"{opp['time_ratio']:.3f},"
            f"{opp['ratio']:.3f},"
            f"{opp['fused_by']},"
            f"{opp['groups_affected']},"
            f"{opp['total_groups']}"
        )

    return "\n".join(lines)
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
def _sanitize_for_json(obj: Any) -> Any:
|
|
919
|
+
"""Recursively sanitize data structure to handle Infinity and NaN values.
|
|
920
|
+
|
|
921
|
+
Args:
|
|
922
|
+
obj: Data structure to sanitize
|
|
923
|
+
|
|
924
|
+
Returns:
|
|
925
|
+
Sanitized data structure with Infinity/NaN converted to None
|
|
926
|
+
"""
|
|
927
|
+
import math
|
|
928
|
+
|
|
929
|
+
if isinstance(obj, float):
|
|
930
|
+
if math.isinf(obj) or math.isnan(obj):
|
|
931
|
+
return None
|
|
932
|
+
return obj
|
|
933
|
+
elif isinstance(obj, dict):
|
|
934
|
+
return {k: _sanitize_for_json(v) for k, v in obj.items()}
|
|
935
|
+
elif isinstance(obj, list):
|
|
936
|
+
return [_sanitize_for_json(item) for item in obj]
|
|
937
|
+
else:
|
|
938
|
+
return obj
|
|
939
|
+
|
|
940
|
+
|
|
941
|
+
def format_fusion_json(results: dict[str, Any]) -> str:
    """Render fusion analysis results as a strict-JSON string.

    Infinity and NaN floats come out as ``null`` rather than the
    non-standard tokens ``json.dumps`` produces by default.

    Args:
        results: Fusion analysis results

    Returns:
        JSON formatted string
    """
    import math

    def _scrub(node: Any) -> Any:
        # Depth-first replacement of non-finite floats with None.
        if isinstance(node, float):
            return None if (math.isinf(node) or math.isnan(node)) else node
        if isinstance(node, dict):
            return {k: _scrub(v) for k, v in node.items()}
        if isinstance(node, list):
            return [_scrub(v) for v in node]
        return node

    return json.dumps(_scrub(results), indent=2)
|