wafer-core 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer_core/lib/trace_compare/__init__.py +9 -22
- wafer_core/lib/trace_compare/analyzer.py +160 -584
- wafer_core/lib/trace_compare/classifier.py +18 -321
- wafer_core/lib/trace_compare/fusion_analyzer.py +753 -329
- wafer_core/lib/trace_compare/loader.py +220 -413
- wafer_core/targets/__init__.py +21 -47
- wafer_core/utils/kernel_utils/defense.py +1 -813
- wafer_core/utils/kernel_utils/targets/config.py +24 -8
- {wafer_core-0.1.33.dist-info → wafer_core-0.1.35.dist-info}/METADATA +1 -1
- {wafer_core-0.1.33.dist-info → wafer_core-0.1.35.dist-info}/RECORD +11 -11
- {wafer_core-0.1.33.dist-info → wafer_core-0.1.35.dist-info}/WHEEL +0 -0
|
@@ -4,313 +4,13 @@ Compares GPU traces from AMD and NVIDIA platforms, identifying performance diffe
|
|
|
4
4
|
at the operation level and layer level.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
import sys
|
|
8
7
|
from collections import defaultdict
|
|
9
|
-
from concurrent.futures import ProcessPoolExecutor
|
|
10
8
|
from pathlib import Path
|
|
11
9
|
from typing import Any
|
|
12
10
|
|
|
13
11
|
import pandas as pd
|
|
14
12
|
|
|
15
|
-
from .
|
|
16
|
-
from .fusion_analyzer import analyze_fusion_from_alignment
|
|
17
|
-
from .same_kernel_analyzer import analyze_same_kernels_from_alignment
|
|
18
|
-
from .loader import load_trace_full, LoadedTrace
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def analyze_traces_from_loaded(
|
|
22
|
-
trace1: LoadedTrace,
|
|
23
|
-
trace2: LoadedTrace,
|
|
24
|
-
phase_filter: str = "all",
|
|
25
|
-
max_stacks: int = 3,
|
|
26
|
-
) -> dict[str, Any]:
|
|
27
|
-
"""Analyze two loaded traces and return comparison data.
|
|
28
|
-
|
|
29
|
-
Args:
|
|
30
|
-
trace1: First loaded trace
|
|
31
|
-
trace2: Second loaded trace
|
|
32
|
-
phase_filter: Filter by phase ('all', 'prefill', or 'decode')
|
|
33
|
-
max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)
|
|
34
|
-
|
|
35
|
-
Returns:
|
|
36
|
-
Dictionary containing:
|
|
37
|
-
- metadata: trace info (GPUs, kernel counts, total times, etc.)
|
|
38
|
-
- operations: per-operation comparison data
|
|
39
|
-
- layers: per-layer comparison data (if layers detected)
|
|
40
|
-
"""
|
|
41
|
-
df1 = trace1.df
|
|
42
|
-
df2 = trace2.df
|
|
43
|
-
|
|
44
|
-
# Apply phase filter
|
|
45
|
-
if phase_filter != "all":
|
|
46
|
-
df1_filtered = df1[df1["phase"] == phase_filter]
|
|
47
|
-
df2_filtered = df2[df2["phase"] == phase_filter]
|
|
48
|
-
|
|
49
|
-
if len(df1_filtered) == 0 and len(df2_filtered) == 0:
|
|
50
|
-
trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
|
|
51
|
-
trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
|
|
52
|
-
raise ValueError(
|
|
53
|
-
f"No {phase_filter} phase found. "
|
|
54
|
-
f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
|
|
55
|
-
)
|
|
56
|
-
|
|
57
|
-
df1, df2 = df1_filtered, df2_filtered
|
|
58
|
-
|
|
59
|
-
# Pre-compute aggregations for both operations and layers in single pass
|
|
60
|
-
trace1_by_op = df1.groupby("op").agg({
|
|
61
|
-
"dur_us": ["sum", "mean", "count"],
|
|
62
|
-
"phase": lambda x: set(x.dropna().unique()),
|
|
63
|
-
"cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
|
|
64
|
-
})
|
|
65
|
-
trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
|
|
66
|
-
|
|
67
|
-
trace2_by_op = df2.groupby("op").agg({
|
|
68
|
-
"dur_us": ["sum", "mean", "count"],
|
|
69
|
-
"phase": lambda x: set(x.dropna().unique()),
|
|
70
|
-
"cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
|
|
71
|
-
})
|
|
72
|
-
trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
|
|
73
|
-
|
|
74
|
-
# Group by layer for layer-level analysis
|
|
75
|
-
df1_layered = df1[df1["layer"].notna()]
|
|
76
|
-
df2_layered = df2[df2["layer"].notna()]
|
|
77
|
-
|
|
78
|
-
trace1_by_layer = df1_layered.groupby("layer").agg({
|
|
79
|
-
"dur_us": ["sum", "count"],
|
|
80
|
-
}) if len(df1_layered) > 0 else pd.DataFrame()
|
|
81
|
-
if len(trace1_by_layer) > 0:
|
|
82
|
-
trace1_by_layer.columns = ["total_us", "count"]
|
|
83
|
-
|
|
84
|
-
trace2_by_layer = df2_layered.groupby("layer").agg({
|
|
85
|
-
"dur_us": ["sum", "count"],
|
|
86
|
-
}) if len(df2_layered) > 0 else pd.DataFrame()
|
|
87
|
-
if len(trace2_by_layer) > 0:
|
|
88
|
-
trace2_by_layer.columns = ["total_us", "count"]
|
|
89
|
-
|
|
90
|
-
results: dict[str, Any] = {
|
|
91
|
-
"metadata": {
|
|
92
|
-
"trace1_name": f"{trace1.platform} {trace1.gpu_name}",
|
|
93
|
-
"trace2_name": f"{trace2.platform} {trace2.gpu_name}",
|
|
94
|
-
"trace1_platform": trace1.platform,
|
|
95
|
-
"trace1_gpu": trace1.gpu_name,
|
|
96
|
-
"trace1_device": trace1.device_props,
|
|
97
|
-
"trace2_platform": trace2.platform,
|
|
98
|
-
"trace2_gpu": trace2.gpu_name,
|
|
99
|
-
"trace2_device": trace2.device_props,
|
|
100
|
-
"trace1_kernels": len(df1),
|
|
101
|
-
"trace2_kernels": len(df2),
|
|
102
|
-
"trace1_total_ms": df1["dur_us"].sum() / 1000,
|
|
103
|
-
"trace2_total_ms": df2["dur_us"].sum() / 1000,
|
|
104
|
-
"phase": phase_filter,
|
|
105
|
-
"trace1_layers": len(trace1.layers),
|
|
106
|
-
"trace2_layers": len(trace2.layers),
|
|
107
|
-
},
|
|
108
|
-
"operations": [],
|
|
109
|
-
"layers": [],
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
# Per-operation comparison
|
|
113
|
-
all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
|
|
114
|
-
rmsnorm_compared = False
|
|
115
|
-
|
|
116
|
-
for op in sorted(all_ops):
|
|
117
|
-
has_trace1 = op in trace1_by_op.index
|
|
118
|
-
has_trace2 = op in trace2_by_op.index
|
|
119
|
-
|
|
120
|
-
trace1_op_for_pattern = op
|
|
121
|
-
trace2_op_for_pattern = op
|
|
122
|
-
skip_comparison = False
|
|
123
|
-
|
|
124
|
-
if op == "RMSNorm+GEMM" and not has_trace2:
|
|
125
|
-
has_trace2 = "RMSNorm" in trace2_by_op.index
|
|
126
|
-
trace2_op_for_pattern = "RMSNorm"
|
|
127
|
-
rmsnorm_compared = True
|
|
128
|
-
elif op == "RMSNorm" and not has_trace1:
|
|
129
|
-
if rmsnorm_compared:
|
|
130
|
-
skip_comparison = True
|
|
131
|
-
else:
|
|
132
|
-
has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
|
|
133
|
-
trace1_op_for_pattern = "RMSNorm+GEMM"
|
|
134
|
-
rmsnorm_compared = True
|
|
135
|
-
|
|
136
|
-
if skip_comparison or not (has_trace1 and has_trace2):
|
|
137
|
-
continue
|
|
138
|
-
|
|
139
|
-
trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
|
|
140
|
-
trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]
|
|
141
|
-
|
|
142
|
-
trace1_avg = trace1_agg["avg_us"]
|
|
143
|
-
trace2_avg = trace2_agg["avg_us"]
|
|
144
|
-
trace1_total = trace1_agg["total_us"] / 1000
|
|
145
|
-
trace2_total = trace2_agg["total_us"] / 1000
|
|
146
|
-
trace1_count = int(trace1_agg["count"])
|
|
147
|
-
trace2_count = int(trace2_agg["count"])
|
|
148
|
-
# Speedup: ratio of total times (not per-call averages)
|
|
149
|
-
# Shows how many times faster/slower trace1 is compared to trace2
|
|
150
|
-
# > 1.0 means trace1 is slower, < 1.0 means trace1 is faster
|
|
151
|
-
# Using total time instead of avg time per call because operations may have
|
|
152
|
-
# vastly different call counts (e.g., fused vs unfused operations)
|
|
153
|
-
if trace2_total > 0:
|
|
154
|
-
ratio = trace1_total / trace2_total
|
|
155
|
-
elif trace1_total > 0:
|
|
156
|
-
ratio = float("inf") # trace2 has no time, trace1 is infinitely slower
|
|
157
|
-
else:
|
|
158
|
-
ratio = 1.0 # Both are zero
|
|
159
|
-
gap_ms = trace1_total - trace2_total
|
|
160
|
-
|
|
161
|
-
trace1_pattern = list(
|
|
162
|
-
trace1.patterns.get(
|
|
163
|
-
(trace1_op_for_pattern, "decode"),
|
|
164
|
-
trace1.patterns.get((trace1_op_for_pattern, "prefill"), {"unknown"}),
|
|
165
|
-
)
|
|
166
|
-
)[0]
|
|
167
|
-
trace2_pattern = list(
|
|
168
|
-
trace2.patterns.get(
|
|
169
|
-
(trace2_op_for_pattern, "decode"),
|
|
170
|
-
trace2.patterns.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
|
|
171
|
-
)
|
|
172
|
-
)[0]
|
|
173
|
-
|
|
174
|
-
trace1_cpu_op = trace1_agg["cpu_op"]
|
|
175
|
-
trace2_cpu_op = trace2_agg["cpu_op"]
|
|
176
|
-
|
|
177
|
-
# Get detailed kernel data and stacks only when needed
|
|
178
|
-
trace1_data = df1[df1["op"] == trace1_op_for_pattern]
|
|
179
|
-
trace2_data = df2[df2["op"] == trace2_op_for_pattern]
|
|
180
|
-
|
|
181
|
-
# Collect Python stacks if available
|
|
182
|
-
trace1_python_stacks = []
|
|
183
|
-
trace2_python_stacks = []
|
|
184
|
-
|
|
185
|
-
if max_stacks != 0:
|
|
186
|
-
stack_limit = None if max_stacks == 0 else max_stacks
|
|
187
|
-
for stack_list in trace1_data["python_stack"].head(stack_limit):
|
|
188
|
-
if stack_list and len(stack_list) > 0:
|
|
189
|
-
trace1_python_stacks.append(stack_list)
|
|
190
|
-
|
|
191
|
-
for stack_list in trace2_data["python_stack"].head(stack_limit):
|
|
192
|
-
if stack_list and len(stack_list) > 0:
|
|
193
|
-
trace2_python_stacks.append(stack_list)
|
|
194
|
-
|
|
195
|
-
# Aggregate individual kernels
|
|
196
|
-
trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
|
|
197
|
-
trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
|
|
198
|
-
trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
|
|
199
|
-
trace1_kernels_list = trace1_kernels.to_dict("records")
|
|
200
|
-
|
|
201
|
-
trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
|
|
202
|
-
trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
|
|
203
|
-
trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
|
|
204
|
-
trace2_kernels_list = trace2_kernels.to_dict("records")
|
|
205
|
-
|
|
206
|
-
if gap_ms > 5.0:
|
|
207
|
-
status = "slower"
|
|
208
|
-
elif gap_ms < -5.0:
|
|
209
|
-
status = "faster"
|
|
210
|
-
else:
|
|
211
|
-
status = "similar"
|
|
212
|
-
|
|
213
|
-
phases = trace1_agg["phases"] | trace2_agg["phases"]
|
|
214
|
-
|
|
215
|
-
results["operations"].append({
|
|
216
|
-
"operation": op,
|
|
217
|
-
"trace1_count": trace1_count,
|
|
218
|
-
"trace2_count": trace2_count,
|
|
219
|
-
"trace1_avg_us": trace1_avg,
|
|
220
|
-
"trace2_avg_us": trace2_avg,
|
|
221
|
-
"trace1_total_ms": trace1_total,
|
|
222
|
-
"trace2_total_ms": trace2_total,
|
|
223
|
-
"ratio": ratio,
|
|
224
|
-
"gap_ms": gap_ms,
|
|
225
|
-
"status": status,
|
|
226
|
-
"trace1_kernel": trace1_pattern,
|
|
227
|
-
"trace2_kernel": trace2_pattern,
|
|
228
|
-
"trace1_cpu_op": trace1_cpu_op,
|
|
229
|
-
"trace2_cpu_op": trace2_cpu_op,
|
|
230
|
-
"trace1_python_stacks": trace1_python_stacks,
|
|
231
|
-
"trace2_python_stacks": trace2_python_stacks,
|
|
232
|
-
"trace1_kernels": trace1_kernels_list,
|
|
233
|
-
"trace2_kernels": trace2_kernels_list,
|
|
234
|
-
"phases": sorted(list(phases)) if phases else ["all"],
|
|
235
|
-
})
|
|
236
|
-
|
|
237
|
-
results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)
|
|
238
|
-
|
|
239
|
-
# Layer-wise analysis
|
|
240
|
-
if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
|
|
241
|
-
all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))
|
|
242
|
-
|
|
243
|
-
for layer_num in all_layers:
|
|
244
|
-
has_trace1 = layer_num in trace1_by_layer.index
|
|
245
|
-
has_trace2 = layer_num in trace2_by_layer.index
|
|
246
|
-
|
|
247
|
-
if has_trace1 and has_trace2:
|
|
248
|
-
trace1_agg = trace1_by_layer.loc[layer_num]
|
|
249
|
-
trace2_agg = trace2_by_layer.loc[layer_num]
|
|
250
|
-
|
|
251
|
-
trace1_total = trace1_agg["total_us"] / 1000
|
|
252
|
-
trace2_total = trace2_agg["total_us"] / 1000
|
|
253
|
-
trace1_count = int(trace1_agg["count"])
|
|
254
|
-
trace2_count = int(trace2_agg["count"])
|
|
255
|
-
ratio = trace1_total / trace2_total if trace2_total > 0 else 1
|
|
256
|
-
gap_ms = trace1_total - trace2_total
|
|
257
|
-
|
|
258
|
-
threshold_ms = 0.1
|
|
259
|
-
threshold_ratio = 1.2
|
|
260
|
-
if gap_ms > threshold_ms and ratio > threshold_ratio:
|
|
261
|
-
status = "slower"
|
|
262
|
-
elif gap_ms < -threshold_ms and ratio < (1.0 / threshold_ratio):
|
|
263
|
-
status = "faster"
|
|
264
|
-
else:
|
|
265
|
-
status = "similar"
|
|
266
|
-
|
|
267
|
-
results["layers"].append({
|
|
268
|
-
"layer": int(layer_num),
|
|
269
|
-
"trace1_kernels": trace1_count,
|
|
270
|
-
"trace2_kernels": trace2_count,
|
|
271
|
-
"trace1_total_ms": trace1_total,
|
|
272
|
-
"trace2_total_ms": trace2_total,
|
|
273
|
-
"ratio": ratio,
|
|
274
|
-
"gap_ms": gap_ms,
|
|
275
|
-
"status": status,
|
|
276
|
-
"in_both": True,
|
|
277
|
-
})
|
|
278
|
-
elif has_trace1:
|
|
279
|
-
trace1_agg = trace1_by_layer.loc[layer_num]
|
|
280
|
-
trace1_total = trace1_agg["total_us"] / 1000
|
|
281
|
-
trace1_count = int(trace1_agg["count"])
|
|
282
|
-
|
|
283
|
-
results["layers"].append({
|
|
284
|
-
"layer": int(layer_num),
|
|
285
|
-
"trace1_kernels": trace1_count,
|
|
286
|
-
"trace2_kernels": 0,
|
|
287
|
-
"trace1_total_ms": trace1_total,
|
|
288
|
-
"trace2_total_ms": 0.0,
|
|
289
|
-
"ratio": 0.0,
|
|
290
|
-
"gap_ms": trace1_total,
|
|
291
|
-
"status": "trace1_only",
|
|
292
|
-
"in_both": False,
|
|
293
|
-
})
|
|
294
|
-
elif has_trace2:
|
|
295
|
-
trace2_agg = trace2_by_layer.loc[layer_num]
|
|
296
|
-
trace2_total = trace2_agg["total_us"] / 1000
|
|
297
|
-
trace2_count = int(trace2_agg["count"])
|
|
298
|
-
|
|
299
|
-
results["layers"].append({
|
|
300
|
-
"layer": int(layer_num),
|
|
301
|
-
"trace1_kernels": 0,
|
|
302
|
-
"trace2_kernels": trace2_count,
|
|
303
|
-
"trace1_total_ms": 0.0,
|
|
304
|
-
"trace2_total_ms": trace2_total,
|
|
305
|
-
"ratio": 0.0,
|
|
306
|
-
"gap_ms": -trace2_total,
|
|
307
|
-
"status": "trace2_only",
|
|
308
|
-
"in_both": False,
|
|
309
|
-
})
|
|
310
|
-
|
|
311
|
-
results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)
|
|
312
|
-
|
|
313
|
-
return results
|
|
13
|
+
from .loader import load_trace
|
|
314
14
|
|
|
315
15
|
|
|
316
16
|
def analyze_traces(
|
|
@@ -318,224 +18,76 @@ def analyze_traces(
|
|
|
318
18
|
trace2_path: str | Path,
|
|
319
19
|
phase_filter: str = "all",
|
|
320
20
|
max_stacks: int = 3,
|
|
321
|
-
include_stacks: bool = True,
|
|
322
21
|
) -> dict[str, Any]:
|
|
323
22
|
"""Analyze two traces and return comparison data.
|
|
324
|
-
|
|
23
|
+
|
|
325
24
|
Args:
|
|
326
25
|
trace1_path: Path to first trace file
|
|
327
26
|
trace2_path: Path to second trace file
|
|
328
27
|
phase_filter: Filter by phase ('all', 'prefill', or 'decode')
|
|
329
28
|
max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)
|
|
330
|
-
|
|
331
|
-
|
|
29
|
+
|
|
332
30
|
Returns:
|
|
333
31
|
Dictionary containing:
|
|
334
32
|
- metadata: trace info (GPUs, kernel counts, total times, etc.)
|
|
335
33
|
- operations: per-operation comparison data
|
|
336
34
|
- layers: per-layer comparison data (if layers detected)
|
|
337
35
|
"""
|
|
338
|
-
# Load
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
with ProcessPoolExecutor(max_workers=2) as executor:
|
|
343
|
-
future1 = executor.submit(load_trace_full, str(trace1_path), include_stacks)
|
|
344
|
-
future2 = executor.submit(load_trace_full, str(trace2_path), include_stacks)
|
|
345
|
-
trace1 = future1.result()
|
|
346
|
-
trace2 = future2.result()
|
|
347
|
-
|
|
348
|
-
print("Analyzing operations...", file=sys.stderr)
|
|
349
|
-
|
|
350
|
-
result = analyze_traces_from_loaded(trace1, trace2, phase_filter, max_stacks)
|
|
351
|
-
|
|
352
|
-
# Update metadata with file paths for backward compatibility
|
|
353
|
-
result["metadata"]["trace1_name"] = str(trace1_path)
|
|
354
|
-
result["metadata"]["trace2_name"] = str(trace2_path)
|
|
355
|
-
|
|
356
|
-
return result
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
def analyze_traces_aligned(
|
|
360
|
-
trace1: LoadedTrace,
|
|
361
|
-
trace2: LoadedTrace,
|
|
362
|
-
phase_filter: str = "all",
|
|
363
|
-
) -> dict[str, Any]:
|
|
364
|
-
"""Analyze traces using kernel-to-kernel alignment.
|
|
365
|
-
|
|
366
|
-
Args:
|
|
367
|
-
trace1: First loaded trace
|
|
368
|
-
trace2: Second loaded trace
|
|
369
|
-
phase_filter: Filter by phase ('all', 'prefill', or 'decode')
|
|
370
|
-
|
|
371
|
-
Returns:
|
|
372
|
-
Dictionary with alignment-based comparison data
|
|
373
|
-
"""
|
|
374
|
-
amd_phases = trace1.phases
|
|
375
|
-
nvidia_phases = trace2.phases
|
|
376
|
-
|
|
377
|
-
if phase_filter != "all":
|
|
378
|
-
amd_phases = [p for p in amd_phases if p.get("type") == phase_filter]
|
|
379
|
-
nvidia_phases = [p for p in nvidia_phases if p.get("type") == phase_filter]
|
|
380
|
-
|
|
381
|
-
amd_kernels = trace1.kernel_events
|
|
382
|
-
nvidia_kernels = trace2.kernel_events
|
|
383
|
-
|
|
384
|
-
if phase_filter != "all" and amd_phases:
|
|
385
|
-
phase_starts = [p["ts_start"] for p in amd_phases]
|
|
386
|
-
phase_ends = [p["ts_end"] for p in amd_phases]
|
|
387
|
-
amd_kernels = [
|
|
388
|
-
k for k in amd_kernels
|
|
389
|
-
if any(phase_starts[i] <= k.get("ts", 0) <= phase_ends[i]
|
|
390
|
-
for i in range(len(phase_starts)))
|
|
391
|
-
]
|
|
392
|
-
|
|
393
|
-
if phase_filter != "all" and nvidia_phases:
|
|
394
|
-
phase_starts = [p["ts_start"] for p in nvidia_phases]
|
|
395
|
-
phase_ends = [p["ts_end"] for p in nvidia_phases]
|
|
396
|
-
nvidia_kernels = [
|
|
397
|
-
k for k in nvidia_kernels
|
|
398
|
-
if any(phase_starts[i] <= k.get("ts", 0) <= phase_ends[i]
|
|
399
|
-
for i in range(len(phase_starts)))
|
|
400
|
-
]
|
|
401
|
-
|
|
402
|
-
alignment = align_traces(
|
|
403
|
-
amd_kernels,
|
|
404
|
-
nvidia_kernels,
|
|
405
|
-
amd_phases,
|
|
406
|
-
nvidia_phases,
|
|
407
|
-
trace1.platform,
|
|
408
|
-
trace2.platform,
|
|
409
|
-
)
|
|
410
|
-
|
|
411
|
-
layer_alignments = []
|
|
412
|
-
for layer_align in alignment.layer_alignments:
|
|
413
|
-
kernel_pairs = []
|
|
414
|
-
for pair in layer_align.kernel_pairs:
|
|
415
|
-
kernel_pairs.append({
|
|
416
|
-
"position": pair.position,
|
|
417
|
-
"operation": pair.operation,
|
|
418
|
-
"operation_detail": pair.operation_detail,
|
|
419
|
-
"amd_kernel": pair.amd_kernel,
|
|
420
|
-
"amd_avg_us": pair.amd_avg_us,
|
|
421
|
-
"amd_count": pair.amd_count,
|
|
422
|
-
"amd_total_us": pair.amd_total_us,
|
|
423
|
-
"nvidia_kernel": pair.nvidia_kernel,
|
|
424
|
-
"nvidia_avg_us": pair.nvidia_avg_us,
|
|
425
|
-
"nvidia_count": pair.nvidia_count,
|
|
426
|
-
"nvidia_total_us": pair.nvidia_total_us,
|
|
427
|
-
"ratio": pair.ratio,
|
|
428
|
-
"gap_us": pair.gap_us,
|
|
429
|
-
"fusion_note": pair.fusion_note,
|
|
430
|
-
"is_same_kernel": pair.is_same_kernel,
|
|
431
|
-
})
|
|
432
|
-
|
|
433
|
-
layer_alignments.append({
|
|
434
|
-
"layer": layer_align.layer,
|
|
435
|
-
"amd_total_us": layer_align.amd_total_us,
|
|
436
|
-
"nvidia_total_us": layer_align.nvidia_total_us,
|
|
437
|
-
"ratio": layer_align.ratio,
|
|
438
|
-
"gap_us": layer_align.gap_us,
|
|
439
|
-
"kernel_pairs": kernel_pairs,
|
|
440
|
-
})
|
|
441
|
-
|
|
442
|
-
# Determine which trace is AMD vs NVIDIA for fusion analysis
|
|
443
|
-
if trace1.platform == "AMD":
|
|
444
|
-
amd_trace, nvidia_trace = trace1, trace2
|
|
445
|
-
fusion_amd_kernels = amd_kernels
|
|
446
|
-
fusion_nvidia_kernels = nvidia_kernels
|
|
447
|
-
else:
|
|
448
|
-
amd_trace, nvidia_trace = trace2, trace1
|
|
449
|
-
fusion_amd_kernels = nvidia_kernels
|
|
450
|
-
fusion_nvidia_kernels = amd_kernels
|
|
451
|
-
|
|
452
|
-
fusion_result = analyze_fusion_from_alignment(
|
|
453
|
-
alignment.layer_alignments,
|
|
454
|
-
amd_kernels=fusion_amd_kernels,
|
|
455
|
-
nvidia_kernels=fusion_nvidia_kernels,
|
|
456
|
-
)
|
|
457
|
-
same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)
|
|
458
|
-
|
|
459
|
-
# Note: amd_kernels = trace1's kernels (filtered if phase_filter != "all")
|
|
460
|
-
# nvidia_kernels = trace2's kernels (filtered if phase_filter != "all")
|
|
461
|
-
# The variable names are misleading but trace1_* should use amd_kernels,
|
|
462
|
-
# and trace2_* should use nvidia_kernels to match the filtered kernel counts/totals.
|
|
463
|
-
|
|
464
|
-
return {
|
|
465
|
-
"metadata": {
|
|
466
|
-
"amd_gpu": amd_trace.gpu_name,
|
|
467
|
-
"nvidia_gpu": nvidia_trace.gpu_name,
|
|
468
|
-
"amd_platform": amd_trace.platform,
|
|
469
|
-
"nvidia_platform": nvidia_trace.platform,
|
|
470
|
-
"model_layers": alignment.num_layers,
|
|
471
|
-
"forward_passes": alignment.num_forward_passes,
|
|
472
|
-
"phase_breakdown": alignment.phase_breakdown,
|
|
473
|
-
"phase_filter": phase_filter,
|
|
474
|
-
"trace1_platform": trace1.platform,
|
|
475
|
-
"trace1_gpu": trace1.gpu_name,
|
|
476
|
-
"trace1_device": trace1.device_props,
|
|
477
|
-
"trace2_platform": trace2.platform,
|
|
478
|
-
"trace2_gpu": trace2.gpu_name,
|
|
479
|
-
"trace2_device": trace2.device_props,
|
|
480
|
-
"trace1_kernels": len(amd_kernels),
|
|
481
|
-
"trace2_kernels": len(nvidia_kernels),
|
|
482
|
-
"trace1_total_ms": sum(k.get("dur", 0) for k in amd_kernels) / 1000,
|
|
483
|
-
"trace2_total_ms": sum(k.get("dur", 0) for k in nvidia_kernels) / 1000,
|
|
484
|
-
"phase": phase_filter,
|
|
485
|
-
"trace1_layers": alignment.num_layers,
|
|
486
|
-
"trace2_layers": alignment.num_layers,
|
|
487
|
-
},
|
|
488
|
-
"layer_alignments": layer_alignments,
|
|
489
|
-
"fusion_analysis": fusion_result,
|
|
490
|
-
"same_kernel_analysis": same_kernel_result,
|
|
491
|
-
}
|
|
492
|
-
|
|
36
|
+
# Load traces
|
|
37
|
+
p1, gpu1, dev1, df1, patterns1, layers1 = load_trace(trace1_path)
|
|
38
|
+
p2, gpu2, dev2, df2, patterns2, layers2 = load_trace(trace2_path)
|
|
39
|
+
|
|
493
40
|
# Apply phase filter
|
|
494
41
|
if phase_filter != "all":
|
|
495
42
|
df1_filtered = df1[df1["phase"] == phase_filter]
|
|
496
43
|
df2_filtered = df2[df2["phase"] == phase_filter]
|
|
497
|
-
|
|
44
|
+
|
|
498
45
|
if len(df1_filtered) == 0 and len(df2_filtered) == 0:
|
|
46
|
+
# No data in requested phase - return early with error info
|
|
499
47
|
trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
|
|
500
48
|
trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
|
|
501
49
|
raise ValueError(
|
|
502
50
|
f"No {phase_filter} phase found. "
|
|
503
51
|
f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
|
|
504
52
|
)
|
|
505
|
-
|
|
53
|
+
|
|
506
54
|
df1, df2 = df1_filtered, df2_filtered
|
|
507
|
-
|
|
55
|
+
|
|
508
56
|
# Pre-compute aggregations for both operations and layers in single pass
|
|
57
|
+
# This is much faster than iterating through filtered dataframes multiple times
|
|
58
|
+
|
|
59
|
+
# Group by operation for operation-level analysis
|
|
509
60
|
trace1_by_op = df1.groupby("op").agg({
|
|
510
61
|
"dur_us": ["sum", "mean", "count"],
|
|
511
62
|
"phase": lambda x: set(x.dropna().unique()),
|
|
512
63
|
"cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
|
|
513
64
|
})
|
|
514
65
|
trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
|
|
515
|
-
|
|
66
|
+
|
|
516
67
|
trace2_by_op = df2.groupby("op").agg({
|
|
517
68
|
"dur_us": ["sum", "mean", "count"],
|
|
518
69
|
"phase": lambda x: set(x.dropna().unique()),
|
|
519
70
|
"cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
|
|
520
71
|
})
|
|
521
72
|
trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
|
|
522
|
-
|
|
523
|
-
# Group by layer for layer-level analysis
|
|
73
|
+
|
|
74
|
+
# Group by layer for layer-level analysis (only for kernels with layer info)
|
|
524
75
|
df1_layered = df1[df1["layer"].notna()]
|
|
525
76
|
df2_layered = df2[df2["layer"].notna()]
|
|
526
|
-
|
|
77
|
+
|
|
527
78
|
trace1_by_layer = df1_layered.groupby("layer").agg({
|
|
528
79
|
"dur_us": ["sum", "count"],
|
|
529
80
|
}) if len(df1_layered) > 0 else pd.DataFrame()
|
|
530
81
|
if len(trace1_by_layer) > 0:
|
|
531
82
|
trace1_by_layer.columns = ["total_us", "count"]
|
|
532
|
-
|
|
83
|
+
|
|
533
84
|
trace2_by_layer = df2_layered.groupby("layer").agg({
|
|
534
85
|
"dur_us": ["sum", "count"],
|
|
535
86
|
}) if len(df2_layered) > 0 else pd.DataFrame()
|
|
536
87
|
if len(trace2_by_layer) > 0:
|
|
537
88
|
trace2_by_layer.columns = ["total_us", "count"]
|
|
538
|
-
|
|
89
|
+
|
|
90
|
+
# Calculate per-operation statistics
|
|
539
91
|
results: dict[str, Any] = {
|
|
540
92
|
"metadata": {
|
|
541
93
|
"trace1_name": str(trace1_path),
|
|
@@ -557,56 +109,57 @@ def analyze_traces_aligned(
|
|
|
557
109
|
"operations": [],
|
|
558
110
|
"layers": [],
|
|
559
111
|
}
|
|
560
|
-
|
|
561
|
-
# Per-operation comparison
|
|
112
|
+
|
|
113
|
+
# Per-operation comparison using pre-computed aggregations
|
|
562
114
|
all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
|
|
115
|
+
|
|
116
|
+
# Track if we've already compared RMSNorm variants to avoid duplicate comparisons
|
|
563
117
|
rmsnorm_compared = False
|
|
564
|
-
|
|
118
|
+
|
|
565
119
|
for op in sorted(all_ops):
|
|
120
|
+
# Use pre-computed aggregations instead of filtering entire dataframes
|
|
566
121
|
has_trace1 = op in trace1_by_op.index
|
|
567
122
|
has_trace2 = op in trace2_by_op.index
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
123
|
+
|
|
124
|
+
# Handle RMSNorm fusion differences: AMD does RMSNorm+GEMM, NVIDIA does separate RMSNorm
|
|
125
|
+
trace1_op_for_pattern = op # Operation name to use for AMD pattern lookup
|
|
126
|
+
trace2_op_for_pattern = op # Operation name to use for NVIDIA pattern lookup
|
|
571
127
|
skip_comparison = False
|
|
572
|
-
|
|
128
|
+
|
|
573
129
|
if op == "RMSNorm+GEMM" and not has_trace2:
|
|
130
|
+
# Compare AMD's fused version to NVIDIA's separate RMSNorm
|
|
574
131
|
has_trace2 = "RMSNorm" in trace2_by_op.index
|
|
575
|
-
trace2_op_for_pattern = "RMSNorm"
|
|
576
|
-
rmsnorm_compared = True
|
|
132
|
+
trace2_op_for_pattern = "RMSNorm" # NVIDIA kernels are stored under 'RMSNorm'
|
|
133
|
+
rmsnorm_compared = True # Mark that we've compared RMSNorm
|
|
577
134
|
elif op == "RMSNorm" and not has_trace1:
|
|
135
|
+
# Skip this comparison if we already handled it in RMSNorm+GEMM
|
|
578
136
|
if rmsnorm_compared:
|
|
579
137
|
skip_comparison = True
|
|
580
138
|
else:
|
|
139
|
+
# Compare NVIDIA's RMSNorm to AMD's fused version
|
|
581
140
|
has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
|
|
582
|
-
trace1_op_for_pattern = "RMSNorm+GEMM"
|
|
141
|
+
trace1_op_for_pattern = (
|
|
142
|
+
"RMSNorm+GEMM" # AMD kernels are stored under 'RMSNorm+GEMM'
|
|
143
|
+
)
|
|
583
144
|
rmsnorm_compared = True
|
|
584
|
-
|
|
145
|
+
|
|
585
146
|
if skip_comparison or not (has_trace1 and has_trace2):
|
|
586
147
|
continue
|
|
587
|
-
|
|
148
|
+
|
|
149
|
+
# Get pre-computed aggregations
|
|
588
150
|
trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
|
|
589
151
|
trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]
|
|
590
|
-
|
|
152
|
+
|
|
591
153
|
trace1_avg = trace1_agg["avg_us"]
|
|
592
154
|
trace2_avg = trace2_agg["avg_us"]
|
|
593
155
|
trace1_total = trace1_agg["total_us"] / 1000
|
|
594
156
|
trace2_total = trace2_agg["total_us"] / 1000
|
|
595
157
|
trace1_count = int(trace1_agg["count"])
|
|
596
158
|
trace2_count = int(trace2_agg["count"])
|
|
597
|
-
|
|
598
|
-
# Shows how many times faster/slower trace1 is compared to trace2
|
|
599
|
-
# > 1.0 means trace1 is slower, < 1.0 means trace1 is faster
|
|
600
|
-
# Using total time instead of avg time per call because operations may have
|
|
601
|
-
# vastly different call counts (e.g., fused vs unfused operations)
|
|
602
|
-
if trace2_total > 0:
|
|
603
|
-
ratio = trace1_total / trace2_total
|
|
604
|
-
elif trace1_total > 0:
|
|
605
|
-
ratio = float("inf") # trace2 has no time, trace1 is infinitely slower
|
|
606
|
-
else:
|
|
607
|
-
ratio = 1.0 # Both are zero
|
|
159
|
+
ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
|
|
608
160
|
gap_ms = trace1_total - trace2_total
|
|
609
|
-
|
|
161
|
+
|
|
162
|
+
# Get kernel patterns using the correct operation names for each platform
|
|
610
163
|
trace1_pattern = list(
|
|
611
164
|
patterns1.get(
|
|
612
165
|
(trace1_op_for_pattern, "decode"),
|
|
@@ -619,91 +172,106 @@ def analyze_traces_aligned(
|
|
|
619
172
|
patterns2.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
|
|
620
173
|
)
|
|
621
174
|
)[0]
|
|
622
|
-
|
|
175
|
+
|
|
176
|
+
# Get CPU operators from pre-computed aggregations
|
|
623
177
|
trace1_cpu_op = trace1_agg["cpu_op"]
|
|
624
178
|
trace2_cpu_op = trace2_agg["cpu_op"]
|
|
625
|
-
|
|
626
|
-
#
|
|
179
|
+
|
|
180
|
+
# For detailed kernel data and python stacks, we still need to filter (but only when needed)
|
|
627
181
|
trace1_data = df1[df1["op"] == trace1_op_for_pattern]
|
|
628
182
|
trace2_data = df2[df2["op"] == trace2_op_for_pattern]
|
|
629
|
-
|
|
630
|
-
# Collect Python stacks
|
|
183
|
+
|
|
184
|
+
# Collect example Python stacks for this operation (for JSON output)
|
|
631
185
|
trace1_python_stacks = []
|
|
186
|
+
stack_limit = None if max_stacks == 0 else max_stacks
|
|
187
|
+
for stack_list in trace1_data["python_stack"].head(stack_limit):
|
|
188
|
+
if stack_list and len(stack_list) > 0:
|
|
189
|
+
trace1_python_stacks.append(stack_list)
|
|
190
|
+
|
|
632
191
|
trace2_python_stacks = []
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
for stack_list in trace2_data["python_stack"].head(stack_limit):
|
|
641
|
-
if stack_list and len(stack_list) > 0:
|
|
642
|
-
trace2_python_stacks.append(stack_list)
|
|
643
|
-
|
|
644
|
-
# Aggregate individual kernels
|
|
192
|
+
for stack_list in trace2_data["python_stack"].head(stack_limit):
|
|
193
|
+
if stack_list and len(stack_list) > 0:
|
|
194
|
+
trace2_python_stacks.append(stack_list)
|
|
195
|
+
|
|
196
|
+
# Aggregate individual kernels by name for detailed view
|
|
197
|
+
# Group by kernel name and calculate sum/count/avg
|
|
645
198
|
trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
|
|
646
199
|
trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
|
|
647
200
|
trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
|
|
648
201
|
trace1_kernels_list = trace1_kernels.to_dict("records")
|
|
649
|
-
|
|
202
|
+
|
|
650
203
|
trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
|
|
651
204
|
trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
|
|
652
205
|
trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
|
|
653
206
|
trace2_kernels_list = trace2_kernels.to_dict("records")
|
|
654
|
-
|
|
655
|
-
|
|
207
|
+
|
|
208
|
+
# Determine status based on TOTAL TIME (gap), not per-call ratio
|
|
209
|
+
# This handles cases where AMD runs fewer operations via fusion.
|
|
210
|
+
# 5ms threshold chosen because:
|
|
211
|
+
# - Filters out measurement noise and minor variations
|
|
212
|
+
# - Represents meaningful performance impact (0.5% of typical 1s inference)
|
|
213
|
+
# - Aligns with human perception of "noticeable" difference
|
|
214
|
+
# - Too small (1ms) creates false positives from variance
|
|
215
|
+
# - Too large (20ms) misses real optimization opportunities
|
|
216
|
+
if gap_ms > 5.0: # AMD spends >5ms more total time
|
|
656
217
|
status = "slower"
|
|
657
|
-
elif gap_ms < -5.0:
|
|
218
|
+
elif gap_ms < -5.0: # AMD spends >5ms less total time
|
|
658
219
|
status = "faster"
|
|
659
220
|
else:
|
|
660
221
|
status = "similar"
|
|
661
|
-
|
|
222
|
+
|
|
223
|
+
# Get phases from pre-computed aggregations
|
|
662
224
|
phases = trace1_agg["phases"] | trace2_agg["phases"]
|
|
663
|
-
|
|
664
|
-
results["operations"].append(
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
225
|
+
|
|
226
|
+
results["operations"].append(
|
|
227
|
+
{
|
|
228
|
+
"operation": op,
|
|
229
|
+
"trace1_count": trace1_count,
|
|
230
|
+
"trace2_count": trace2_count,
|
|
231
|
+
"trace1_avg_us": trace1_avg,
|
|
232
|
+
"trace2_avg_us": trace2_avg,
|
|
233
|
+
"trace1_total_ms": trace1_total,
|
|
234
|
+
"trace2_total_ms": trace2_total,
|
|
235
|
+
"ratio": ratio,
|
|
236
|
+
"gap_ms": gap_ms,
|
|
237
|
+
"status": status,
|
|
238
|
+
"trace1_kernel": trace1_pattern,
|
|
239
|
+
"trace2_kernel": trace2_pattern,
|
|
240
|
+
"trace1_cpu_op": trace1_cpu_op,
|
|
241
|
+
"trace2_cpu_op": trace2_cpu_op,
|
|
242
|
+
"trace1_python_stacks": trace1_python_stacks, # Full stacks for JSON
|
|
243
|
+
"trace2_python_stacks": trace2_python_stacks,
|
|
244
|
+
"trace1_kernels": trace1_kernels_list, # All individual kernels for JSON
|
|
245
|
+
"trace2_kernels": trace2_kernels_list, # All individual kernels for JSON
|
|
246
|
+
"phases": sorted(list(phases)) if phases else ["all"], # For client-side filtering
|
|
247
|
+
}
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
# Sort by absolute gap
|
|
686
251
|
results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)
|
|
687
|
-
|
|
688
|
-
# Layer-wise analysis
|
|
252
|
+
|
|
253
|
+
# Layer-wise analysis using pre-computed aggregations
|
|
689
254
|
if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
|
|
255
|
+
# Get all unique layers present in either trace
|
|
690
256
|
all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))
|
|
691
|
-
|
|
257
|
+
|
|
692
258
|
for layer_num in all_layers:
|
|
693
259
|
has_trace1 = layer_num in trace1_by_layer.index
|
|
694
260
|
has_trace2 = layer_num in trace2_by_layer.index
|
|
695
|
-
|
|
261
|
+
|
|
696
262
|
if has_trace1 and has_trace2:
|
|
263
|
+
# Layer present in both traces - compare them
|
|
697
264
|
trace1_agg = trace1_by_layer.loc[layer_num]
|
|
698
265
|
trace2_agg = trace2_by_layer.loc[layer_num]
|
|
699
|
-
|
|
266
|
+
|
|
700
267
|
trace1_total = trace1_agg["total_us"] / 1000
|
|
701
268
|
trace2_total = trace2_agg["total_us"] / 1000
|
|
702
269
|
trace1_count = int(trace1_agg["count"])
|
|
703
270
|
trace2_count = int(trace2_agg["count"])
|
|
704
271
|
ratio = trace1_total / trace2_total if trace2_total > 0 else 1
|
|
705
272
|
gap_ms = trace1_total - trace2_total
|
|
706
|
-
|
|
273
|
+
|
|
274
|
+
# Determine status (use smaller thresholds for layers: the gap must exceed 0.1ms AND the ratio must differ by 20%)
|
|
707
275
|
threshold_ms = 0.1
|
|
708
276
|
threshold_ratio = 1.2
|
|
709
277
|
if gap_ms > threshold_ms and ratio > threshold_ratio:
|
|
@@ -712,52 +280,60 @@ def analyze_traces_aligned(
|
|
|
712
280
|
status = "faster"
|
|
713
281
|
else:
|
|
714
282
|
status = "similar"
|
|
715
|
-
|
|
716
|
-
results["layers"].append(
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
283
|
+
|
|
284
|
+
results["layers"].append(
|
|
285
|
+
{
|
|
286
|
+
"layer": int(layer_num),
|
|
287
|
+
"trace1_kernels": trace1_count,
|
|
288
|
+
"trace2_kernels": trace2_count,
|
|
289
|
+
"trace1_total_ms": trace1_total,
|
|
290
|
+
"trace2_total_ms": trace2_total,
|
|
291
|
+
"ratio": ratio,
|
|
292
|
+
"gap_ms": gap_ms,
|
|
293
|
+
"status": status,
|
|
294
|
+
"in_both": True,
|
|
295
|
+
}
|
|
296
|
+
)
|
|
727
297
|
elif has_trace1:
|
|
298
|
+
# Layer only in trace1
|
|
728
299
|
trace1_agg = trace1_by_layer.loc[layer_num]
|
|
729
300
|
trace1_total = trace1_agg["total_us"] / 1000
|
|
730
301
|
trace1_count = int(trace1_agg["count"])
|
|
731
|
-
|
|
732
|
-
results["layers"].append(
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
302
|
+
|
|
303
|
+
results["layers"].append(
|
|
304
|
+
{
|
|
305
|
+
"layer": int(layer_num),
|
|
306
|
+
"trace1_kernels": trace1_count,
|
|
307
|
+
"trace2_kernels": 0,
|
|
308
|
+
"trace1_total_ms": trace1_total,
|
|
309
|
+
"trace2_total_ms": 0.0,
|
|
310
|
+
"ratio": 0.0,
|
|
311
|
+
"gap_ms": trace1_total,
|
|
312
|
+
"status": "trace1_only",
|
|
313
|
+
"in_both": False,
|
|
314
|
+
}
|
|
315
|
+
)
|
|
743
316
|
elif has_trace2:
|
|
317
|
+
# Layer only in trace2
|
|
744
318
|
trace2_agg = trace2_by_layer.loc[layer_num]
|
|
745
319
|
trace2_total = trace2_agg["total_us"] / 1000
|
|
746
320
|
trace2_count = int(trace2_agg["count"])
|
|
747
|
-
|
|
748
|
-
results["layers"].append(
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
321
|
+
|
|
322
|
+
results["layers"].append(
|
|
323
|
+
{
|
|
324
|
+
"layer": int(layer_num),
|
|
325
|
+
"trace1_kernels": 0,
|
|
326
|
+
"trace2_kernels": trace2_count,
|
|
327
|
+
"trace1_total_ms": 0.0,
|
|
328
|
+
"trace2_total_ms": trace2_total,
|
|
329
|
+
"ratio": 0.0,
|
|
330
|
+
"gap_ms": -trace2_total,
|
|
331
|
+
"status": "trace2_only",
|
|
332
|
+
"in_both": False,
|
|
333
|
+
}
|
|
334
|
+
)
|
|
335
|
+
|
|
336
|
+
# Sort descending on (not in_both, abs gap): trace-unique layers come first, then layers present in both traces; largest absolute gap first within each group
|
|
760
337
|
results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)
|
|
761
|
-
|
|
762
|
-
print("Analysis complete.", file=sys.stderr)
|
|
338
|
+
|
|
763
339
|
return results
|