wafer-core 0.1.25__py3-none-any.whl → 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer_core/lib/trace_compare/PERFORMANCE.md +148 -0
- wafer_core/lib/trace_compare/__init__.py +45 -0
- wafer_core/lib/trace_compare/aligner.py +369 -0
- wafer_core/lib/trace_compare/analyzer.py +729 -0
- wafer_core/lib/trace_compare/api.py +225 -0
- wafer_core/lib/trace_compare/architecture.py +77 -0
- wafer_core/lib/trace_compare/classifier.py +486 -0
- wafer_core/lib/trace_compare/formatter.py +951 -0
- wafer_core/lib/trace_compare/fusion_analyzer.py +356 -0
- wafer_core/lib/trace_compare/kernel_registry.yaml +349 -0
- wafer_core/lib/trace_compare/layer_segmentation.py +114 -0
- wafer_core/lib/trace_compare/loader.py +635 -0
- wafer_core/lib/trace_compare/same_kernel_analyzer.py +119 -0
- wafer_core/lib/trace_compare/warnings.py +99 -0
- wafer_core/problem_config.py +3 -3
- wafer_core/rollouts/agent_presets/rlm_01_01.py +2 -2
- wafer_core/rollouts/dtypes.py +18 -3
- wafer_core/rollouts/providers/anthropic.py +35 -3
- wafer_core/utils/kernel_utils/defense.py +10 -0
- wafer_core/utils/kernel_utils/targets/config.py +10 -0
- {wafer_core-0.1.25.dist-info → wafer_core-0.1.27.dist-info}/METADATA +3 -1
- {wafer_core-0.1.25.dist-info → wafer_core-0.1.27.dist-info}/RECORD +23 -9
- {wafer_core-0.1.25.dist-info → wafer_core-0.1.27.dist-info}/WHEEL +0 -0
wafer_core/lib/trace_compare/analyzer.py
@@ -0,0 +1,729 @@
"""Main trace comparison analysis logic.

Compares GPU traces from AMD and NVIDIA platforms, identifying performance differences
at the operation level and layer level.
"""

import sys
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import Any

import pandas as pd

from .aligner import align_traces, TraceAlignment
from .fusion_analyzer import analyze_fusion_from_alignment
from .same_kernel_analyzer import analyze_same_kernels_from_alignment
from .loader import load_trace_full, LoadedTrace


def analyze_traces_from_loaded(
    trace1: LoadedTrace,
    trace2: LoadedTrace,
    phase_filter: str = "all",
    max_stacks: int = 3,
) -> dict[str, Any]:
    """Analyze two loaded traces and return comparison data.

    Args:
        trace1: First loaded trace
        trace2: Second loaded trace
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')
        max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)

    Returns:
        Dictionary containing:
        - metadata: trace info (GPUs, kernel counts, total times, etc.)
        - operations: per-operation comparison data
        - layers: per-layer comparison data (if layers detected)
    """
    df1 = trace1.df
    df2 = trace2.df

    # Apply phase filter
    if phase_filter != "all":
        df1_filtered = df1[df1["phase"] == phase_filter]
        df2_filtered = df2[df2["phase"] == phase_filter]

        if len(df1_filtered) == 0 and len(df2_filtered) == 0:
            trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
            trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
            raise ValueError(
                f"No {phase_filter} phase found. "
                f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
            )

        df1, df2 = df1_filtered, df2_filtered

    # Pre-compute aggregations for both operations and layers in single pass
    trace1_by_op = df1.groupby("op").agg({
        "dur_us": ["sum", "mean", "count"],
        "phase": lambda x: set(x.dropna().unique()),
        "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
    })
    trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]

    trace2_by_op = df2.groupby("op").agg({
        "dur_us": ["sum", "mean", "count"],
        "phase": lambda x: set(x.dropna().unique()),
        "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
    })
    trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]

    # Group by layer for layer-level analysis
    df1_layered = df1[df1["layer"].notna()]
    df2_layered = df2[df2["layer"].notna()]

    trace1_by_layer = df1_layered.groupby("layer").agg({
        "dur_us": ["sum", "count"],
    }) if len(df1_layered) > 0 else pd.DataFrame()
    if len(trace1_by_layer) > 0:
        trace1_by_layer.columns = ["total_us", "count"]

    trace2_by_layer = df2_layered.groupby("layer").agg({
        "dur_us": ["sum", "count"],
    }) if len(df2_layered) > 0 else pd.DataFrame()
    if len(trace2_by_layer) > 0:
        trace2_by_layer.columns = ["total_us", "count"]

    results: dict[str, Any] = {
        "metadata": {
            "trace1_name": f"{trace1.platform} {trace1.gpu_name}",
            "trace2_name": f"{trace2.platform} {trace2.gpu_name}",
            "trace1_platform": trace1.platform,
            "trace1_gpu": trace1.gpu_name,
            "trace1_device": trace1.device_props,
            "trace2_platform": trace2.platform,
            "trace2_gpu": trace2.gpu_name,
            "trace2_device": trace2.device_props,
            "trace1_kernels": len(df1),
            "trace2_kernels": len(df2),
            "trace1_total_ms": df1["dur_us"].sum() / 1000,
            "trace2_total_ms": df2["dur_us"].sum() / 1000,
            "phase": phase_filter,
            "trace1_layers": len(trace1.layers),
            "trace2_layers": len(trace2.layers),
        },
        "operations": [],
        "layers": [],
    }

    # Per-operation comparison
    all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
    rmsnorm_compared = False

    for op in sorted(all_ops):
        has_trace1 = op in trace1_by_op.index
        has_trace2 = op in trace2_by_op.index

        trace1_op_for_pattern = op
        trace2_op_for_pattern = op
        skip_comparison = False

        if op == "RMSNorm+GEMM" and not has_trace2:
            has_trace2 = "RMSNorm" in trace2_by_op.index
            trace2_op_for_pattern = "RMSNorm"
            rmsnorm_compared = True
        elif op == "RMSNorm" and not has_trace1:
            if rmsnorm_compared:
                skip_comparison = True
            else:
                has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
                trace1_op_for_pattern = "RMSNorm+GEMM"
                rmsnorm_compared = True

        if skip_comparison or not (has_trace1 and has_trace2):
            continue

        trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
        trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]

        trace1_avg = trace1_agg["avg_us"]
        trace2_avg = trace2_agg["avg_us"]
        trace1_total = trace1_agg["total_us"] / 1000
        trace2_total = trace2_agg["total_us"] / 1000
        trace1_count = int(trace1_agg["count"])
        trace2_count = int(trace2_agg["count"])
        ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
        gap_ms = trace1_total - trace2_total

        trace1_pattern = list(
            trace1.patterns.get(
                (trace1_op_for_pattern, "decode"),
                trace1.patterns.get((trace1_op_for_pattern, "prefill"), {"unknown"}),
            )
        )[0]
        trace2_pattern = list(
            trace2.patterns.get(
                (trace2_op_for_pattern, "decode"),
                trace2.patterns.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
            )
        )[0]

        trace1_cpu_op = trace1_agg["cpu_op"]
        trace2_cpu_op = trace2_agg["cpu_op"]

        # Get detailed kernel data and stacks only when needed
        trace1_data = df1[df1["op"] == trace1_op_for_pattern]
        trace2_data = df2[df2["op"] == trace2_op_for_pattern]

        # Collect Python stacks if available
        trace1_python_stacks = []
        trace2_python_stacks = []

        if max_stacks != 0:
            stack_limit = None if max_stacks == 0 else max_stacks
            for stack_list in trace1_data["python_stack"].head(stack_limit):
                if stack_list and len(stack_list) > 0:
                    trace1_python_stacks.append(stack_list)

            for stack_list in trace2_data["python_stack"].head(stack_limit):
                if stack_list and len(stack_list) > 0:
                    trace2_python_stacks.append(stack_list)

        # Aggregate individual kernels
        trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
        trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
        trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
        trace1_kernels_list = trace1_kernels.to_dict("records")

        trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
        trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
        trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
        trace2_kernels_list = trace2_kernels.to_dict("records")

        if gap_ms > 5.0:
            status = "slower"
        elif gap_ms < -5.0:
            status = "faster"
        else:
            status = "similar"

        phases = trace1_agg["phases"] | trace2_agg["phases"]

        results["operations"].append({
            "operation": op,
            "trace1_count": trace1_count,
            "trace2_count": trace2_count,
            "trace1_avg_us": trace1_avg,
            "trace2_avg_us": trace2_avg,
            "trace1_total_ms": trace1_total,
            "trace2_total_ms": trace2_total,
            "ratio": ratio,
            "gap_ms": gap_ms,
            "status": status,
            "trace1_kernel": trace1_pattern,
            "trace2_kernel": trace2_pattern,
            "trace1_cpu_op": trace1_cpu_op,
            "trace2_cpu_op": trace2_cpu_op,
            "trace1_python_stacks": trace1_python_stacks,
            "trace2_python_stacks": trace2_python_stacks,
            "trace1_kernels": trace1_kernels_list,
            "trace2_kernels": trace2_kernels_list,
            "phases": sorted(list(phases)) if phases else ["all"],
        })

    results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)

    # Layer-wise analysis
    if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
        all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))

        for layer_num in all_layers:
            has_trace1 = layer_num in trace1_by_layer.index
            has_trace2 = layer_num in trace2_by_layer.index

            if has_trace1 and has_trace2:
                trace1_agg = trace1_by_layer.loc[layer_num]
                trace2_agg = trace2_by_layer.loc[layer_num]

                trace1_total = trace1_agg["total_us"] / 1000
                trace2_total = trace2_agg["total_us"] / 1000
                trace1_count = int(trace1_agg["count"])
                trace2_count = int(trace2_agg["count"])
                ratio = trace1_total / trace2_total if trace2_total > 0 else 1
                gap_ms = trace1_total - trace2_total

                threshold_ms = 0.1
                threshold_ratio = 1.2
                if gap_ms > threshold_ms and ratio > threshold_ratio:
                    status = "slower"
                elif gap_ms < -threshold_ms and ratio < (1.0 / threshold_ratio):
                    status = "faster"
                else:
                    status = "similar"

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": trace1_count,
                    "trace2_kernels": trace2_count,
                    "trace1_total_ms": trace1_total,
                    "trace2_total_ms": trace2_total,
                    "ratio": ratio,
                    "gap_ms": gap_ms,
                    "status": status,
                    "in_both": True,
                })
            elif has_trace1:
                trace1_agg = trace1_by_layer.loc[layer_num]
                trace1_total = trace1_agg["total_us"] / 1000
                trace1_count = int(trace1_agg["count"])

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": trace1_count,
                    "trace2_kernels": 0,
                    "trace1_total_ms": trace1_total,
                    "trace2_total_ms": 0.0,
                    "ratio": 0.0,
                    "gap_ms": trace1_total,
                    "status": "trace1_only",
                    "in_both": False,
                })
            elif has_trace2:
                trace2_agg = trace2_by_layer.loc[layer_num]
                trace2_total = trace2_agg["total_us"] / 1000
                trace2_count = int(trace2_agg["count"])

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": 0,
                    "trace2_kernels": trace2_count,
                    "trace1_total_ms": 0.0,
                    "trace2_total_ms": trace2_total,
                    "ratio": 0.0,
                    "gap_ms": -trace2_total,
                    "status": "trace2_only",
                    "in_both": False,
                })

        results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)

    return results


def analyze_traces(
    trace1_path: str | Path,
    trace2_path: str | Path,
    phase_filter: str = "all",
    max_stacks: int = 3,
    include_stacks: bool = True,
) -> dict[str, Any]:
    """Analyze two traces and return comparison data.

    Args:
        trace1_path: Path to first trace file
        trace2_path: Path to second trace file
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')
        max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)
        include_stacks: Whether to include Python stack traces (disable for faster analysis)

    Returns:
        Dictionary containing:
        - metadata: trace info (GPUs, kernel counts, total times, etc.)
        - operations: per-operation comparison data
        - layers: per-layer comparison data (if layers detected)
    """
    # Load both traces in parallel using separate processes
    # This provides ~1.7x speedup over sequential loading
    print("Loading traces in parallel...", file=sys.stderr)

    with ProcessPoolExecutor(max_workers=2) as executor:
        future1 = executor.submit(load_trace_full, str(trace1_path), include_stacks)
        future2 = executor.submit(load_trace_full, str(trace2_path), include_stacks)
        trace1 = future1.result()
        trace2 = future2.result()

    print("Analyzing operations...", file=sys.stderr)

    result = analyze_traces_from_loaded(trace1, trace2, phase_filter, max_stacks)

    # Update metadata with file paths for backward compatibility
    result["metadata"]["trace1_name"] = str(trace1_path)
    result["metadata"]["trace2_name"] = str(trace2_path)

    return result


def analyze_traces_aligned(
    trace1: LoadedTrace,
    trace2: LoadedTrace,
    phase_filter: str = "all",
) -> dict[str, Any]:
    """Analyze traces using kernel-to-kernel alignment.

    Args:
        trace1: First loaded trace
        trace2: Second loaded trace
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')

    Returns:
        Dictionary with alignment-based comparison data
    """
    amd_phases = trace1.phases
    nvidia_phases = trace2.phases

    if phase_filter != "all":
        amd_phases = [p for p in amd_phases if p.get("type") == phase_filter]
        nvidia_phases = [p for p in nvidia_phases if p.get("type") == phase_filter]

    amd_kernels = trace1.kernel_events
    nvidia_kernels = trace2.kernel_events

    if phase_filter != "all" and amd_phases:
        phase_starts = [p["ts_start"] for p in amd_phases]
        phase_ends = [p["ts_end"] for p in amd_phases]
        amd_kernels = [
            k for k in amd_kernels
            if any(phase_starts[i] <= k.get("ts", 0) <= phase_ends[i]
                   for i in range(len(phase_starts)))
        ]

    if phase_filter != "all" and nvidia_phases:
        phase_starts = [p["ts_start"] for p in nvidia_phases]
        phase_ends = [p["ts_end"] for p in nvidia_phases]
        nvidia_kernels = [
            k for k in nvidia_kernels
            if any(phase_starts[i] <= k.get("ts", 0) <= phase_ends[i]
                   for i in range(len(phase_starts)))
        ]

    alignment = align_traces(
        amd_kernels,
        nvidia_kernels,
        amd_phases,
        nvidia_phases,
        trace1.platform,
        trace2.platform,
    )

    layer_alignments = []
    for layer_align in alignment.layer_alignments:
        kernel_pairs = []
        for pair in layer_align.kernel_pairs:
            kernel_pairs.append({
                "position": pair.position,
                "operation": pair.operation,
                "operation_detail": pair.operation_detail,
                "amd_kernel": pair.amd_kernel,
                "amd_avg_us": pair.amd_avg_us,
                "amd_count": pair.amd_count,
                "amd_total_us": pair.amd_total_us,
                "nvidia_kernel": pair.nvidia_kernel,
                "nvidia_avg_us": pair.nvidia_avg_us,
                "nvidia_count": pair.nvidia_count,
                "nvidia_total_us": pair.nvidia_total_us,
                "ratio": pair.ratio,
                "gap_us": pair.gap_us,
                "fusion_note": pair.fusion_note,
                "is_same_kernel": pair.is_same_kernel,
            })

        layer_alignments.append({
            "layer": layer_align.layer,
            "amd_total_us": layer_align.amd_total_us,
            "nvidia_total_us": layer_align.nvidia_total_us,
            "ratio": layer_align.ratio,
            "gap_us": layer_align.gap_us,
            "kernel_pairs": kernel_pairs,
        })

    fusion_result = analyze_fusion_from_alignment(alignment.layer_alignments)
    same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)

    if trace1.platform == "AMD":
        amd_trace, nvidia_trace = trace1, trace2
    else:
        amd_trace, nvidia_trace = trace2, trace1

    return {
        "metadata": {
            "amd_gpu": amd_trace.gpu_name,
            "nvidia_gpu": nvidia_trace.gpu_name,
            "amd_platform": amd_trace.platform,
            "nvidia_platform": nvidia_trace.platform,
            "model_layers": alignment.num_layers,
            "forward_passes": alignment.num_forward_passes,
            "phase_breakdown": alignment.phase_breakdown,
            "phase_filter": phase_filter,
            "trace1_platform": trace1.platform,
            "trace1_gpu": trace1.gpu_name,
            "trace1_device": trace1.device_props,
            "trace2_platform": trace2.platform,
            "trace2_gpu": trace2.gpu_name,
            "trace2_device": trace2.device_props,
            "trace1_kernels": len(amd_trace.kernel_events),
            "trace2_kernels": len(nvidia_trace.kernel_events),
            "trace1_total_ms": sum(k.get("dur", 0) for k in amd_trace.kernel_events) / 1000,
            "trace2_total_ms": sum(k.get("dur", 0) for k in nvidia_trace.kernel_events) / 1000,
            "phase": phase_filter,
            "trace1_layers": alignment.num_layers,
            "trace2_layers": alignment.num_layers,
        },
        "layer_alignments": layer_alignments,
        "fusion_analysis": fusion_result,
        "same_kernel_analysis": same_kernel_result,
    }

    # Apply phase filter
    if phase_filter != "all":
        df1_filtered = df1[df1["phase"] == phase_filter]
        df2_filtered = df2[df2["phase"] == phase_filter]

        if len(df1_filtered) == 0 and len(df2_filtered) == 0:
            trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
            trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
            raise ValueError(
                f"No {phase_filter} phase found. "
                f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
            )

        df1, df2 = df1_filtered, df2_filtered

    # Pre-compute aggregations for both operations and layers in single pass
    trace1_by_op = df1.groupby("op").agg({
        "dur_us": ["sum", "mean", "count"],
        "phase": lambda x: set(x.dropna().unique()),
        "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
    })
    trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]

    trace2_by_op = df2.groupby("op").agg({
        "dur_us": ["sum", "mean", "count"],
        "phase": lambda x: set(x.dropna().unique()),
        "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
    })
    trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]

    # Group by layer for layer-level analysis
    df1_layered = df1[df1["layer"].notna()]
    df2_layered = df2[df2["layer"].notna()]

    trace1_by_layer = df1_layered.groupby("layer").agg({
        "dur_us": ["sum", "count"],
    }) if len(df1_layered) > 0 else pd.DataFrame()
    if len(trace1_by_layer) > 0:
        trace1_by_layer.columns = ["total_us", "count"]

    trace2_by_layer = df2_layered.groupby("layer").agg({
        "dur_us": ["sum", "count"],
    }) if len(df2_layered) > 0 else pd.DataFrame()
    if len(trace2_by_layer) > 0:
        trace2_by_layer.columns = ["total_us", "count"]

    results: dict[str, Any] = {
        "metadata": {
            "trace1_name": str(trace1_path),
            "trace2_name": str(trace2_path),
            "trace1_platform": p1,
            "trace1_gpu": gpu1,
            "trace1_device": dev1,
            "trace2_platform": p2,
            "trace2_gpu": gpu2,
            "trace2_device": dev2,
            "trace1_kernels": len(df1),
            "trace2_kernels": len(df2),
            "trace1_total_ms": df1["dur_us"].sum() / 1000,
            "trace2_total_ms": df2["dur_us"].sum() / 1000,
            "phase": phase_filter,
            "trace1_layers": len(layers1),
            "trace2_layers": len(layers2),
        },
        "operations": [],
        "layers": [],
    }

    # Per-operation comparison
    all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
    rmsnorm_compared = False

    for op in sorted(all_ops):
        has_trace1 = op in trace1_by_op.index
        has_trace2 = op in trace2_by_op.index

        trace1_op_for_pattern = op
        trace2_op_for_pattern = op
        skip_comparison = False

        if op == "RMSNorm+GEMM" and not has_trace2:
            has_trace2 = "RMSNorm" in trace2_by_op.index
            trace2_op_for_pattern = "RMSNorm"
            rmsnorm_compared = True
        elif op == "RMSNorm" and not has_trace1:
            if rmsnorm_compared:
                skip_comparison = True
            else:
                has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
                trace1_op_for_pattern = "RMSNorm+GEMM"
                rmsnorm_compared = True

        if skip_comparison or not (has_trace1 and has_trace2):
            continue

        trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
        trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]

        trace1_avg = trace1_agg["avg_us"]
        trace2_avg = trace2_agg["avg_us"]
        trace1_total = trace1_agg["total_us"] / 1000
        trace2_total = trace2_agg["total_us"] / 1000
        trace1_count = int(trace1_agg["count"])
        trace2_count = int(trace2_agg["count"])
        ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
        gap_ms = trace1_total - trace2_total

        trace1_pattern = list(
            patterns1.get(
                (trace1_op_for_pattern, "decode"),
                patterns1.get((trace1_op_for_pattern, "prefill"), {"unknown"}),
            )
        )[0]
        trace2_pattern = list(
            patterns2.get(
                (trace2_op_for_pattern, "decode"),
                patterns2.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
            )
        )[0]

        trace1_cpu_op = trace1_agg["cpu_op"]
        trace2_cpu_op = trace2_agg["cpu_op"]

        # Get detailed kernel data and stacks only when needed
        trace1_data = df1[df1["op"] == trace1_op_for_pattern]
        trace2_data = df2[df2["op"] == trace2_op_for_pattern]

        # Collect Python stacks if available
        trace1_python_stacks = []
        trace2_python_stacks = []

        if include_stacks:
            stack_limit = None if max_stacks == 0 else max_stacks
            for stack_list in trace1_data["python_stack"].head(stack_limit):
                if stack_list and len(stack_list) > 0:
                    trace1_python_stacks.append(stack_list)

            for stack_list in trace2_data["python_stack"].head(stack_limit):
                if stack_list and len(stack_list) > 0:
                    trace2_python_stacks.append(stack_list)

        # Aggregate individual kernels
        trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
        trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
        trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
        trace1_kernels_list = trace1_kernels.to_dict("records")

        trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
        trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
        trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
        trace2_kernels_list = trace2_kernels.to_dict("records")

        if gap_ms > 5.0:
            status = "slower"
        elif gap_ms < -5.0:
            status = "faster"
        else:
            status = "similar"

        phases = trace1_agg["phases"] | trace2_agg["phases"]

        results["operations"].append({
            "operation": op,
            "trace1_count": trace1_count,
            "trace2_count": trace2_count,
            "trace1_avg_us": trace1_avg,
            "trace2_avg_us": trace2_avg,
            "trace1_total_ms": trace1_total,
            "trace2_total_ms": trace2_total,
            "ratio": ratio,
            "gap_ms": gap_ms,
            "status": status,
            "trace1_kernel": trace1_pattern,
            "trace2_kernel": trace2_pattern,
            "trace1_cpu_op": trace1_cpu_op,
            "trace2_cpu_op": trace2_cpu_op,
            "trace1_python_stacks": trace1_python_stacks,
            "trace2_python_stacks": trace2_python_stacks,
            "trace1_kernels": trace1_kernels_list,
            "trace2_kernels": trace2_kernels_list,
            "phases": sorted(list(phases)) if phases else ["all"],
        })

    results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)

    # Layer-wise analysis
    if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
        all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))

        for layer_num in all_layers:
            has_trace1 = layer_num in trace1_by_layer.index
            has_trace2 = layer_num in trace2_by_layer.index

            if has_trace1 and has_trace2:
                trace1_agg = trace1_by_layer.loc[layer_num]
                trace2_agg = trace2_by_layer.loc[layer_num]

                trace1_total = trace1_agg["total_us"] / 1000
                trace2_total = trace2_agg["total_us"] / 1000
                trace1_count = int(trace1_agg["count"])
                trace2_count = int(trace2_agg["count"])
                ratio = trace1_total / trace2_total if trace2_total > 0 else 1
                gap_ms = trace1_total - trace2_total

                threshold_ms = 0.1
                threshold_ratio = 1.2
                if gap_ms > threshold_ms and ratio > threshold_ratio:
                    status = "slower"
                elif gap_ms < -threshold_ms and ratio < (1.0 / threshold_ratio):
                    status = "faster"
                else:
                    status = "similar"

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": trace1_count,
                    "trace2_kernels": trace2_count,
                    "trace1_total_ms": trace1_total,
                    "trace2_total_ms": trace2_total,
                    "ratio": ratio,
                    "gap_ms": gap_ms,
                    "status": status,
                    "in_both": True,
                })
            elif has_trace1:
                trace1_agg = trace1_by_layer.loc[layer_num]
                trace1_total = trace1_agg["total_us"] / 1000
                trace1_count = int(trace1_agg["count"])

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": trace1_count,
                    "trace2_kernels": 0,
                    "trace1_total_ms": trace1_total,
                    "trace2_total_ms": 0.0,
                    "ratio": 0.0,
                    "gap_ms": trace1_total,
                    "status": "trace1_only",
                    "in_both": False,
                })
            elif has_trace2:
                trace2_agg = trace2_by_layer.loc[layer_num]
                trace2_total = trace2_agg["total_us"] / 1000
                trace2_count = int(trace2_agg["count"])

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": 0,
                    "trace2_kernels": trace2_count,
                    "trace1_total_ms": 0.0,
                    "trace2_total_ms": trace2_total,
                    "ratio": 0.0,
                    "gap_ms": -trace2_total,
                    "status": "trace2_only",
                    "in_both": False,
                })

        results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)

    print("Analysis complete.", file=sys.stderr)
    return results
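
For orientation, here is a minimal usage sketch of the new analyze_traces entry point added in this release. It relies only on the signature and result keys visible in the hunk above; the import path is inferred from the file's location in the wheel (wafer_core/lib/trace_compare/analyzer.py), and the trace file paths are placeholders rather than files shipped with the package:

# Illustrative sketch, not from the package: import path mirrors the wheel
# layout; the trace paths below are hypothetical profiler exports.
from wafer_core.lib.trace_compare.analyzer import analyze_traces

result = analyze_traces(
    "traces/amd_trace.json",      # placeholder path to an AMD GPU trace
    "traces/nvidia_trace.json",   # placeholder path to an NVIDIA GPU trace
    phase_filter="decode",        # 'all', 'prefill', or 'decode'
    max_stacks=3,                 # Python stacks kept per operation (0 = unlimited)
    include_stacks=False,         # skip stack collection for faster loading
)

meta = result["metadata"]
print(f'{meta["trace1_gpu"]} vs {meta["trace2_gpu"]}: '
      f'{meta["trace1_total_ms"]:.1f} ms vs {meta["trace2_total_ms"]:.1f} ms')

# Operations are pre-sorted by absolute gap, largest first.
for op in result["operations"][:5]:
    print(f'{op["operation"]}: {op["gap_ms"]:+.2f} ms ({op["status"]})')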