wafer-core 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer_core/lib/trace_compare/PERFORMANCE.md +148 -0
- wafer_core/lib/trace_compare/__init__.py +22 -9
- wafer_core/lib/trace_compare/aligner.py +376 -0
- wafer_core/lib/trace_compare/analyzer.py +558 -159
- wafer_core/lib/trace_compare/api.py +225 -0
- wafer_core/lib/trace_compare/architecture.py +77 -0
- wafer_core/lib/trace_compare/classifier.py +307 -13
- wafer_core/lib/trace_compare/fusion_analyzer.py +280 -706
- wafer_core/lib/trace_compare/kernel_registry.yaml +349 -0
- wafer_core/lib/trace_compare/layer_segmentation.py +114 -0
- wafer_core/lib/trace_compare/loader.py +526 -227
- wafer_core/lib/trace_compare/same_kernel_analyzer.py +119 -0
- wafer_core/lib/trace_compare/warnings.py +99 -0
- wafer_core/targets/__init__.py +47 -21
- wafer_core/targets/pool.py +181 -0
- wafer_core/targets/probe.py +113 -0
- wafer_core/targets/providers/__init__.py +46 -0
- wafer_core/targets/providers/baremetal.py +72 -0
- wafer_core/targets/providers/digitalocean.py +164 -0
- wafer_core/targets/providers/runpod.py +250 -0
- wafer_core/targets/reconcile.py +90 -0
- wafer_core/targets/spec_store.py +200 -0
- wafer_core/targets/state_cache.py +150 -0
- wafer_core/targets/types.py +141 -0
- wafer_core/utils/kernel_utils/targets/config.py +8 -24
- {wafer_core-0.1.26.dist-info → wafer_core-0.1.28.dist-info}/METADATA +3 -1
- {wafer_core-0.1.26.dist-info → wafer_core-0.1.28.dist-info}/RECORD +28 -10
- {wafer_core-0.1.26.dist-info → wafer_core-0.1.28.dist-info}/WHEEL +0 -0
|
@@ -4,13 +4,303 @@ Compares GPU traces from AMD and NVIDIA platforms, identifying performance diffe
|
|
|
4
4
|
at the operation level and layer level.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
+
import sys
|
|
7
8
|
from collections import defaultdict
|
|
9
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
8
10
|
from pathlib import Path
|
|
9
11
|
from typing import Any
|
|
10
12
|
|
|
11
13
|
import pandas as pd
|
|
12
14
|
|
|
13
|
-
from .
|
|
15
|
+
from .aligner import align_traces, TraceAlignment
|
|
16
|
+
from .fusion_analyzer import analyze_fusion_from_alignment
|
|
17
|
+
from .same_kernel_analyzer import analyze_same_kernels_from_alignment
|
|
18
|
+
from .loader import load_trace_full, LoadedTrace
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def analyze_traces_from_loaded(
|
|
22
|
+
trace1: LoadedTrace,
|
|
23
|
+
trace2: LoadedTrace,
|
|
24
|
+
phase_filter: str = "all",
|
|
25
|
+
max_stacks: int = 3,
|
|
26
|
+
) -> dict[str, Any]:
|
|
27
|
+
"""Analyze two loaded traces and return comparison data.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
trace1: First loaded trace
|
|
31
|
+
trace2: Second loaded trace
|
|
32
|
+
phase_filter: Filter by phase ('all', 'prefill', or 'decode')
|
|
33
|
+
max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)
|
|
34
|
+
|
|
35
|
+
Returns:
|
|
36
|
+
Dictionary containing:
|
|
37
|
+
- metadata: trace info (GPUs, kernel counts, total times, etc.)
|
|
38
|
+
- operations: per-operation comparison data
|
|
39
|
+
- layers: per-layer comparison data (if layers detected)
|
|
40
|
+
"""
|
|
41
|
+
df1 = trace1.df
|
|
42
|
+
df2 = trace2.df
|
|
43
|
+
|
|
44
|
+
# Apply phase filter
|
|
45
|
+
if phase_filter != "all":
|
|
46
|
+
df1_filtered = df1[df1["phase"] == phase_filter]
|
|
47
|
+
df2_filtered = df2[df2["phase"] == phase_filter]
|
|
48
|
+
|
|
49
|
+
if len(df1_filtered) == 0 and len(df2_filtered) == 0:
|
|
50
|
+
trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
|
|
51
|
+
trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
|
|
52
|
+
raise ValueError(
|
|
53
|
+
f"No {phase_filter} phase found. "
|
|
54
|
+
f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
df1, df2 = df1_filtered, df2_filtered
|
|
58
|
+
|
|
59
|
+
# Pre-compute aggregations for both operations and layers in single pass
|
|
60
|
+
trace1_by_op = df1.groupby("op").agg({
|
|
61
|
+
"dur_us": ["sum", "mean", "count"],
|
|
62
|
+
"phase": lambda x: set(x.dropna().unique()),
|
|
63
|
+
"cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
|
|
64
|
+
})
|
|
65
|
+
trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
|
|
66
|
+
|
|
67
|
+
trace2_by_op = df2.groupby("op").agg({
|
|
68
|
+
"dur_us": ["sum", "mean", "count"],
|
|
69
|
+
"phase": lambda x: set(x.dropna().unique()),
|
|
70
|
+
"cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
|
|
71
|
+
})
|
|
72
|
+
trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
|
|
73
|
+
|
|
74
|
+
# Group by layer for layer-level analysis
|
|
75
|
+
df1_layered = df1[df1["layer"].notna()]
|
|
76
|
+
df2_layered = df2[df2["layer"].notna()]
|
|
77
|
+
|
|
78
|
+
trace1_by_layer = df1_layered.groupby("layer").agg({
|
|
79
|
+
"dur_us": ["sum", "count"],
|
|
80
|
+
}) if len(df1_layered) > 0 else pd.DataFrame()
|
|
81
|
+
if len(trace1_by_layer) > 0:
|
|
82
|
+
trace1_by_layer.columns = ["total_us", "count"]
|
|
83
|
+
|
|
84
|
+
trace2_by_layer = df2_layered.groupby("layer").agg({
|
|
85
|
+
"dur_us": ["sum", "count"],
|
|
86
|
+
}) if len(df2_layered) > 0 else pd.DataFrame()
|
|
87
|
+
if len(trace2_by_layer) > 0:
|
|
88
|
+
trace2_by_layer.columns = ["total_us", "count"]
|
|
89
|
+
|
|
90
|
+
results: dict[str, Any] = {
|
|
91
|
+
"metadata": {
|
|
92
|
+
"trace1_name": f"{trace1.platform} {trace1.gpu_name}",
|
|
93
|
+
"trace2_name": f"{trace2.platform} {trace2.gpu_name}",
|
|
94
|
+
"trace1_platform": trace1.platform,
|
|
95
|
+
"trace1_gpu": trace1.gpu_name,
|
|
96
|
+
"trace1_device": trace1.device_props,
|
|
97
|
+
"trace2_platform": trace2.platform,
|
|
98
|
+
"trace2_gpu": trace2.gpu_name,
|
|
99
|
+
"trace2_device": trace2.device_props,
|
|
100
|
+
"trace1_kernels": len(df1),
|
|
101
|
+
"trace2_kernels": len(df2),
|
|
102
|
+
"trace1_total_ms": df1["dur_us"].sum() / 1000,
|
|
103
|
+
"trace2_total_ms": df2["dur_us"].sum() / 1000,
|
|
104
|
+
"phase": phase_filter,
|
|
105
|
+
"trace1_layers": len(trace1.layers),
|
|
106
|
+
"trace2_layers": len(trace2.layers),
|
|
107
|
+
},
|
|
108
|
+
"operations": [],
|
|
109
|
+
"layers": [],
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
# Per-operation comparison
|
|
113
|
+
all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
|
|
114
|
+
rmsnorm_compared = False
|
|
115
|
+
|
|
116
|
+
for op in sorted(all_ops):
|
|
117
|
+
has_trace1 = op in trace1_by_op.index
|
|
118
|
+
has_trace2 = op in trace2_by_op.index
|
|
119
|
+
|
|
120
|
+
trace1_op_for_pattern = op
|
|
121
|
+
trace2_op_for_pattern = op
|
|
122
|
+
skip_comparison = False
|
|
123
|
+
|
|
124
|
+
if op == "RMSNorm+GEMM" and not has_trace2:
|
|
125
|
+
has_trace2 = "RMSNorm" in trace2_by_op.index
|
|
126
|
+
trace2_op_for_pattern = "RMSNorm"
|
|
127
|
+
rmsnorm_compared = True
|
|
128
|
+
elif op == "RMSNorm" and not has_trace1:
|
|
129
|
+
if rmsnorm_compared:
|
|
130
|
+
skip_comparison = True
|
|
131
|
+
else:
|
|
132
|
+
has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
|
|
133
|
+
trace1_op_for_pattern = "RMSNorm+GEMM"
|
|
134
|
+
rmsnorm_compared = True
|
|
135
|
+
|
|
136
|
+
if skip_comparison or not (has_trace1 and has_trace2):
|
|
137
|
+
continue
|
|
138
|
+
|
|
139
|
+
trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
|
|
140
|
+
trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]
|
|
141
|
+
|
|
142
|
+
trace1_avg = trace1_agg["avg_us"]
|
|
143
|
+
trace2_avg = trace2_agg["avg_us"]
|
|
144
|
+
trace1_total = trace1_agg["total_us"] / 1000
|
|
145
|
+
trace2_total = trace2_agg["total_us"] / 1000
|
|
146
|
+
trace1_count = int(trace1_agg["count"])
|
|
147
|
+
trace2_count = int(trace2_agg["count"])
|
|
148
|
+
ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
|
|
149
|
+
gap_ms = trace1_total - trace2_total
|
|
150
|
+
|
|
151
|
+
trace1_pattern = list(
|
|
152
|
+
trace1.patterns.get(
|
|
153
|
+
(trace1_op_for_pattern, "decode"),
|
|
154
|
+
trace1.patterns.get((trace1_op_for_pattern, "prefill"), {"unknown"}),
|
|
155
|
+
)
|
|
156
|
+
)[0]
|
|
157
|
+
trace2_pattern = list(
|
|
158
|
+
trace2.patterns.get(
|
|
159
|
+
(trace2_op_for_pattern, "decode"),
|
|
160
|
+
trace2.patterns.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
|
|
161
|
+
)
|
|
162
|
+
)[0]
|
|
163
|
+
|
|
164
|
+
trace1_cpu_op = trace1_agg["cpu_op"]
|
|
165
|
+
trace2_cpu_op = trace2_agg["cpu_op"]
|
|
166
|
+
|
|
167
|
+
# Get detailed kernel data and stacks only when needed
|
|
168
|
+
trace1_data = df1[df1["op"] == trace1_op_for_pattern]
|
|
169
|
+
trace2_data = df2[df2["op"] == trace2_op_for_pattern]
|
|
170
|
+
|
|
171
|
+
# Collect Python stacks if available
|
|
172
|
+
trace1_python_stacks = []
|
|
173
|
+
trace2_python_stacks = []
|
|
174
|
+
|
|
175
|
+
if max_stacks != 0:
|
|
176
|
+
stack_limit = None if max_stacks == 0 else max_stacks
|
|
177
|
+
for stack_list in trace1_data["python_stack"].head(stack_limit):
|
|
178
|
+
if stack_list and len(stack_list) > 0:
|
|
179
|
+
trace1_python_stacks.append(stack_list)
|
|
180
|
+
|
|
181
|
+
for stack_list in trace2_data["python_stack"].head(stack_limit):
|
|
182
|
+
if stack_list and len(stack_list) > 0:
|
|
183
|
+
trace2_python_stacks.append(stack_list)
|
|
184
|
+
|
|
185
|
+
# Aggregate individual kernels
|
|
186
|
+
trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
|
|
187
|
+
trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
|
|
188
|
+
trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
|
|
189
|
+
trace1_kernels_list = trace1_kernels.to_dict("records")
|
|
190
|
+
|
|
191
|
+
trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
|
|
192
|
+
trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
|
|
193
|
+
trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
|
|
194
|
+
trace2_kernels_list = trace2_kernels.to_dict("records")
|
|
195
|
+
|
|
196
|
+
if gap_ms > 5.0:
|
|
197
|
+
status = "slower"
|
|
198
|
+
elif gap_ms < -5.0:
|
|
199
|
+
status = "faster"
|
|
200
|
+
else:
|
|
201
|
+
status = "similar"
|
|
202
|
+
|
|
203
|
+
phases = trace1_agg["phases"] | trace2_agg["phases"]
|
|
204
|
+
|
|
205
|
+
results["operations"].append({
|
|
206
|
+
"operation": op,
|
|
207
|
+
"trace1_count": trace1_count,
|
|
208
|
+
"trace2_count": trace2_count,
|
|
209
|
+
"trace1_avg_us": trace1_avg,
|
|
210
|
+
"trace2_avg_us": trace2_avg,
|
|
211
|
+
"trace1_total_ms": trace1_total,
|
|
212
|
+
"trace2_total_ms": trace2_total,
|
|
213
|
+
"ratio": ratio,
|
|
214
|
+
"gap_ms": gap_ms,
|
|
215
|
+
"status": status,
|
|
216
|
+
"trace1_kernel": trace1_pattern,
|
|
217
|
+
"trace2_kernel": trace2_pattern,
|
|
218
|
+
"trace1_cpu_op": trace1_cpu_op,
|
|
219
|
+
"trace2_cpu_op": trace2_cpu_op,
|
|
220
|
+
"trace1_python_stacks": trace1_python_stacks,
|
|
221
|
+
"trace2_python_stacks": trace2_python_stacks,
|
|
222
|
+
"trace1_kernels": trace1_kernels_list,
|
|
223
|
+
"trace2_kernels": trace2_kernels_list,
|
|
224
|
+
"phases": sorted(list(phases)) if phases else ["all"],
|
|
225
|
+
})
|
|
226
|
+
|
|
227
|
+
results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)
|
|
228
|
+
|
|
229
|
+
# Layer-wise analysis
|
|
230
|
+
if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
|
|
231
|
+
all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))
|
|
232
|
+
|
|
233
|
+
for layer_num in all_layers:
|
|
234
|
+
has_trace1 = layer_num in trace1_by_layer.index
|
|
235
|
+
has_trace2 = layer_num in trace2_by_layer.index
|
|
236
|
+
|
|
237
|
+
if has_trace1 and has_trace2:
|
|
238
|
+
trace1_agg = trace1_by_layer.loc[layer_num]
|
|
239
|
+
trace2_agg = trace2_by_layer.loc[layer_num]
|
|
240
|
+
|
|
241
|
+
trace1_total = trace1_agg["total_us"] / 1000
|
|
242
|
+
trace2_total = trace2_agg["total_us"] / 1000
|
|
243
|
+
trace1_count = int(trace1_agg["count"])
|
|
244
|
+
trace2_count = int(trace2_agg["count"])
|
|
245
|
+
ratio = trace1_total / trace2_total if trace2_total > 0 else 1
|
|
246
|
+
gap_ms = trace1_total - trace2_total
|
|
247
|
+
|
|
248
|
+
threshold_ms = 0.1
|
|
249
|
+
threshold_ratio = 1.2
|
|
250
|
+
if gap_ms > threshold_ms and ratio > threshold_ratio:
|
|
251
|
+
status = "slower"
|
|
252
|
+
elif gap_ms < -threshold_ms and ratio < (1.0 / threshold_ratio):
|
|
253
|
+
status = "faster"
|
|
254
|
+
else:
|
|
255
|
+
status = "similar"
|
|
256
|
+
|
|
257
|
+
results["layers"].append({
|
|
258
|
+
"layer": int(layer_num),
|
|
259
|
+
"trace1_kernels": trace1_count,
|
|
260
|
+
"trace2_kernels": trace2_count,
|
|
261
|
+
"trace1_total_ms": trace1_total,
|
|
262
|
+
"trace2_total_ms": trace2_total,
|
|
263
|
+
"ratio": ratio,
|
|
264
|
+
"gap_ms": gap_ms,
|
|
265
|
+
"status": status,
|
|
266
|
+
"in_both": True,
|
|
267
|
+
})
|
|
268
|
+
elif has_trace1:
|
|
269
|
+
trace1_agg = trace1_by_layer.loc[layer_num]
|
|
270
|
+
trace1_total = trace1_agg["total_us"] / 1000
|
|
271
|
+
trace1_count = int(trace1_agg["count"])
|
|
272
|
+
|
|
273
|
+
results["layers"].append({
|
|
274
|
+
"layer": int(layer_num),
|
|
275
|
+
"trace1_kernels": trace1_count,
|
|
276
|
+
"trace2_kernels": 0,
|
|
277
|
+
"trace1_total_ms": trace1_total,
|
|
278
|
+
"trace2_total_ms": 0.0,
|
|
279
|
+
"ratio": 0.0,
|
|
280
|
+
"gap_ms": trace1_total,
|
|
281
|
+
"status": "trace1_only",
|
|
282
|
+
"in_both": False,
|
|
283
|
+
})
|
|
284
|
+
elif has_trace2:
|
|
285
|
+
trace2_agg = trace2_by_layer.loc[layer_num]
|
|
286
|
+
trace2_total = trace2_agg["total_us"] / 1000
|
|
287
|
+
trace2_count = int(trace2_agg["count"])
|
|
288
|
+
|
|
289
|
+
results["layers"].append({
|
|
290
|
+
"layer": int(layer_num),
|
|
291
|
+
"trace1_kernels": 0,
|
|
292
|
+
"trace2_kernels": trace2_count,
|
|
293
|
+
"trace1_total_ms": 0.0,
|
|
294
|
+
"trace2_total_ms": trace2_total,
|
|
295
|
+
"ratio": 0.0,
|
|
296
|
+
"gap_ms": -trace2_total,
|
|
297
|
+
"status": "trace2_only",
|
|
298
|
+
"in_both": False,
|
|
299
|
+
})
|
|
300
|
+
|
|
301
|
+
results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)
|
|
302
|
+
|
|
303
|
+
return results
|
|
14
304
|
|
|
15
305
|
|
|
16
306
|
def analyze_traces(
|
|
@@ -18,76 +308,219 @@ def analyze_traces(
|
|
|
18
308
|
trace2_path: str | Path,
|
|
19
309
|
phase_filter: str = "all",
|
|
20
310
|
max_stacks: int = 3,
|
|
311
|
+
include_stacks: bool = True,
|
|
21
312
|
) -> dict[str, Any]:
|
|
22
313
|
"""Analyze two traces and return comparison data.
|
|
23
|
-
|
|
314
|
+
|
|
24
315
|
Args:
|
|
25
316
|
trace1_path: Path to first trace file
|
|
26
317
|
trace2_path: Path to second trace file
|
|
27
318
|
phase_filter: Filter by phase ('all', 'prefill', or 'decode')
|
|
28
319
|
max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)
|
|
29
|
-
|
|
320
|
+
include_stacks: Whether to include Python stack traces (disable for faster analysis)
|
|
321
|
+
|
|
30
322
|
Returns:
|
|
31
323
|
Dictionary containing:
|
|
32
324
|
- metadata: trace info (GPUs, kernel counts, total times, etc.)
|
|
33
325
|
- operations: per-operation comparison data
|
|
34
326
|
- layers: per-layer comparison data (if layers detected)
|
|
35
327
|
"""
|
|
36
|
-
# Load traces
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
328
|
+
# Load both traces in parallel using separate processes
|
|
329
|
+
# This provides ~1.7x speedup over sequential loading
|
|
330
|
+
print("Loading traces in parallel...", file=sys.stderr)
|
|
331
|
+
|
|
332
|
+
with ProcessPoolExecutor(max_workers=2) as executor:
|
|
333
|
+
future1 = executor.submit(load_trace_full, str(trace1_path), include_stacks)
|
|
334
|
+
future2 = executor.submit(load_trace_full, str(trace2_path), include_stacks)
|
|
335
|
+
trace1 = future1.result()
|
|
336
|
+
trace2 = future2.result()
|
|
337
|
+
|
|
338
|
+
print("Analyzing operations...", file=sys.stderr)
|
|
339
|
+
|
|
340
|
+
result = analyze_traces_from_loaded(trace1, trace2, phase_filter, max_stacks)
|
|
341
|
+
|
|
342
|
+
# Update metadata with file paths for backward compatibility
|
|
343
|
+
result["metadata"]["trace1_name"] = str(trace1_path)
|
|
344
|
+
result["metadata"]["trace2_name"] = str(trace2_path)
|
|
345
|
+
|
|
346
|
+
return result
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def analyze_traces_aligned(
|
|
350
|
+
trace1: LoadedTrace,
|
|
351
|
+
trace2: LoadedTrace,
|
|
352
|
+
phase_filter: str = "all",
|
|
353
|
+
) -> dict[str, Any]:
|
|
354
|
+
"""Analyze traces using kernel-to-kernel alignment.
|
|
355
|
+
|
|
356
|
+
Args:
|
|
357
|
+
trace1: First loaded trace
|
|
358
|
+
trace2: Second loaded trace
|
|
359
|
+
phase_filter: Filter by phase ('all', 'prefill', or 'decode')
|
|
360
|
+
|
|
361
|
+
Returns:
|
|
362
|
+
Dictionary with alignment-based comparison data
|
|
363
|
+
"""
|
|
364
|
+
amd_phases = trace1.phases
|
|
365
|
+
nvidia_phases = trace2.phases
|
|
366
|
+
|
|
367
|
+
if phase_filter != "all":
|
|
368
|
+
amd_phases = [p for p in amd_phases if p.get("type") == phase_filter]
|
|
369
|
+
nvidia_phases = [p for p in nvidia_phases if p.get("type") == phase_filter]
|
|
370
|
+
|
|
371
|
+
amd_kernels = trace1.kernel_events
|
|
372
|
+
nvidia_kernels = trace2.kernel_events
|
|
373
|
+
|
|
374
|
+
if phase_filter != "all" and amd_phases:
|
|
375
|
+
phase_starts = [p["ts_start"] for p in amd_phases]
|
|
376
|
+
phase_ends = [p["ts_end"] for p in amd_phases]
|
|
377
|
+
amd_kernels = [
|
|
378
|
+
k for k in amd_kernels
|
|
379
|
+
if any(phase_starts[i] <= k.get("ts", 0) <= phase_ends[i]
|
|
380
|
+
for i in range(len(phase_starts)))
|
|
381
|
+
]
|
|
382
|
+
|
|
383
|
+
if phase_filter != "all" and nvidia_phases:
|
|
384
|
+
phase_starts = [p["ts_start"] for p in nvidia_phases]
|
|
385
|
+
phase_ends = [p["ts_end"] for p in nvidia_phases]
|
|
386
|
+
nvidia_kernels = [
|
|
387
|
+
k for k in nvidia_kernels
|
|
388
|
+
if any(phase_starts[i] <= k.get("ts", 0) <= phase_ends[i]
|
|
389
|
+
for i in range(len(phase_starts)))
|
|
390
|
+
]
|
|
391
|
+
|
|
392
|
+
alignment = align_traces(
|
|
393
|
+
amd_kernels,
|
|
394
|
+
nvidia_kernels,
|
|
395
|
+
amd_phases,
|
|
396
|
+
nvidia_phases,
|
|
397
|
+
trace1.platform,
|
|
398
|
+
trace2.platform,
|
|
399
|
+
)
|
|
400
|
+
|
|
401
|
+
layer_alignments = []
|
|
402
|
+
for layer_align in alignment.layer_alignments:
|
|
403
|
+
kernel_pairs = []
|
|
404
|
+
for pair in layer_align.kernel_pairs:
|
|
405
|
+
kernel_pairs.append({
|
|
406
|
+
"position": pair.position,
|
|
407
|
+
"operation": pair.operation,
|
|
408
|
+
"operation_detail": pair.operation_detail,
|
|
409
|
+
"amd_kernel": pair.amd_kernel,
|
|
410
|
+
"amd_avg_us": pair.amd_avg_us,
|
|
411
|
+
"amd_count": pair.amd_count,
|
|
412
|
+
"amd_total_us": pair.amd_total_us,
|
|
413
|
+
"nvidia_kernel": pair.nvidia_kernel,
|
|
414
|
+
"nvidia_avg_us": pair.nvidia_avg_us,
|
|
415
|
+
"nvidia_count": pair.nvidia_count,
|
|
416
|
+
"nvidia_total_us": pair.nvidia_total_us,
|
|
417
|
+
"ratio": pair.ratio,
|
|
418
|
+
"gap_us": pair.gap_us,
|
|
419
|
+
"fusion_note": pair.fusion_note,
|
|
420
|
+
"is_same_kernel": pair.is_same_kernel,
|
|
421
|
+
})
|
|
422
|
+
|
|
423
|
+
layer_alignments.append({
|
|
424
|
+
"layer": layer_align.layer,
|
|
425
|
+
"amd_total_us": layer_align.amd_total_us,
|
|
426
|
+
"nvidia_total_us": layer_align.nvidia_total_us,
|
|
427
|
+
"ratio": layer_align.ratio,
|
|
428
|
+
"gap_us": layer_align.gap_us,
|
|
429
|
+
"kernel_pairs": kernel_pairs,
|
|
430
|
+
})
|
|
431
|
+
|
|
432
|
+
# Determine which trace is AMD vs NVIDIA for fusion analysis
|
|
433
|
+
if trace1.platform == "AMD":
|
|
434
|
+
amd_trace, nvidia_trace = trace1, trace2
|
|
435
|
+
fusion_amd_kernels = amd_kernels
|
|
436
|
+
fusion_nvidia_kernels = nvidia_kernels
|
|
437
|
+
else:
|
|
438
|
+
amd_trace, nvidia_trace = trace2, trace1
|
|
439
|
+
fusion_amd_kernels = nvidia_kernels
|
|
440
|
+
fusion_nvidia_kernels = amd_kernels
|
|
441
|
+
|
|
442
|
+
fusion_result = analyze_fusion_from_alignment(
|
|
443
|
+
alignment.layer_alignments,
|
|
444
|
+
amd_kernels=fusion_amd_kernels,
|
|
445
|
+
nvidia_kernels=fusion_nvidia_kernels,
|
|
446
|
+
)
|
|
447
|
+
same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)
|
|
448
|
+
|
|
449
|
+
return {
|
|
450
|
+
"metadata": {
|
|
451
|
+
"amd_gpu": amd_trace.gpu_name,
|
|
452
|
+
"nvidia_gpu": nvidia_trace.gpu_name,
|
|
453
|
+
"amd_platform": amd_trace.platform,
|
|
454
|
+
"nvidia_platform": nvidia_trace.platform,
|
|
455
|
+
"model_layers": alignment.num_layers,
|
|
456
|
+
"forward_passes": alignment.num_forward_passes,
|
|
457
|
+
"phase_breakdown": alignment.phase_breakdown,
|
|
458
|
+
"phase_filter": phase_filter,
|
|
459
|
+
"trace1_platform": trace1.platform,
|
|
460
|
+
"trace1_gpu": trace1.gpu_name,
|
|
461
|
+
"trace1_device": trace1.device_props,
|
|
462
|
+
"trace2_platform": trace2.platform,
|
|
463
|
+
"trace2_gpu": trace2.gpu_name,
|
|
464
|
+
"trace2_device": trace2.device_props,
|
|
465
|
+
"trace1_kernels": len(amd_trace.kernel_events),
|
|
466
|
+
"trace2_kernels": len(nvidia_trace.kernel_events),
|
|
467
|
+
"trace1_total_ms": sum(k.get("dur", 0) for k in amd_trace.kernel_events) / 1000,
|
|
468
|
+
"trace2_total_ms": sum(k.get("dur", 0) for k in nvidia_trace.kernel_events) / 1000,
|
|
469
|
+
"phase": phase_filter,
|
|
470
|
+
"trace1_layers": alignment.num_layers,
|
|
471
|
+
"trace2_layers": alignment.num_layers,
|
|
472
|
+
},
|
|
473
|
+
"layer_alignments": layer_alignments,
|
|
474
|
+
"fusion_analysis": fusion_result,
|
|
475
|
+
"same_kernel_analysis": same_kernel_result,
|
|
476
|
+
}
|
|
477
|
+
|
|
40
478
|
# Apply phase filter
|
|
41
479
|
if phase_filter != "all":
|
|
42
480
|
df1_filtered = df1[df1["phase"] == phase_filter]
|
|
43
481
|
df2_filtered = df2[df2["phase"] == phase_filter]
|
|
44
|
-
|
|
482
|
+
|
|
45
483
|
if len(df1_filtered) == 0 and len(df2_filtered) == 0:
|
|
46
|
-
# No data in requested phase - return early with error info
|
|
47
484
|
trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
|
|
48
485
|
trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
|
|
49
486
|
raise ValueError(
|
|
50
487
|
f"No {phase_filter} phase found. "
|
|
51
488
|
f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
|
|
52
489
|
)
|
|
53
|
-
|
|
490
|
+
|
|
54
491
|
df1, df2 = df1_filtered, df2_filtered
|
|
55
|
-
|
|
492
|
+
|
|
56
493
|
# Pre-compute aggregations for both operations and layers in single pass
|
|
57
|
-
# This is much faster than iterating through filtered dataframes multiple times
|
|
58
|
-
|
|
59
|
-
# Group by operation for operation-level analysis
|
|
60
494
|
trace1_by_op = df1.groupby("op").agg({
|
|
61
495
|
"dur_us": ["sum", "mean", "count"],
|
|
62
496
|
"phase": lambda x: set(x.dropna().unique()),
|
|
63
497
|
"cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
|
|
64
498
|
})
|
|
65
499
|
trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
|
|
66
|
-
|
|
500
|
+
|
|
67
501
|
trace2_by_op = df2.groupby("op").agg({
|
|
68
502
|
"dur_us": ["sum", "mean", "count"],
|
|
69
503
|
"phase": lambda x: set(x.dropna().unique()),
|
|
70
504
|
"cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
|
|
71
505
|
})
|
|
72
506
|
trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
|
|
73
|
-
|
|
74
|
-
# Group by layer for layer-level analysis
|
|
507
|
+
|
|
508
|
+
# Group by layer for layer-level analysis
|
|
75
509
|
df1_layered = df1[df1["layer"].notna()]
|
|
76
510
|
df2_layered = df2[df2["layer"].notna()]
|
|
77
|
-
|
|
511
|
+
|
|
78
512
|
trace1_by_layer = df1_layered.groupby("layer").agg({
|
|
79
513
|
"dur_us": ["sum", "count"],
|
|
80
514
|
}) if len(df1_layered) > 0 else pd.DataFrame()
|
|
81
515
|
if len(trace1_by_layer) > 0:
|
|
82
516
|
trace1_by_layer.columns = ["total_us", "count"]
|
|
83
|
-
|
|
517
|
+
|
|
84
518
|
trace2_by_layer = df2_layered.groupby("layer").agg({
|
|
85
519
|
"dur_us": ["sum", "count"],
|
|
86
520
|
}) if len(df2_layered) > 0 else pd.DataFrame()
|
|
87
521
|
if len(trace2_by_layer) > 0:
|
|
88
522
|
trace2_by_layer.columns = ["total_us", "count"]
|
|
89
|
-
|
|
90
|
-
# Calculate per-operation statistics
|
|
523
|
+
|
|
91
524
|
results: dict[str, Any] = {
|
|
92
525
|
"metadata": {
|
|
93
526
|
"trace1_name": str(trace1_path),
|
|
@@ -109,47 +542,37 @@ def analyze_traces(
|
|
|
109
542
|
"operations": [],
|
|
110
543
|
"layers": [],
|
|
111
544
|
}
|
|
112
|
-
|
|
113
|
-
# Per-operation comparison
|
|
545
|
+
|
|
546
|
+
# Per-operation comparison
|
|
114
547
|
all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
|
|
115
|
-
|
|
116
|
-
# Track if we've already compared RMSNorm variants to avoid duplicate comparisons
|
|
117
548
|
rmsnorm_compared = False
|
|
118
|
-
|
|
549
|
+
|
|
119
550
|
for op in sorted(all_ops):
|
|
120
|
-
# Use pre-computed aggregations instead of filtering entire dataframes
|
|
121
551
|
has_trace1 = op in trace1_by_op.index
|
|
122
552
|
has_trace2 = op in trace2_by_op.index
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
trace2_op_for_pattern = op # Operation name to use for NVIDIA pattern lookup
|
|
553
|
+
|
|
554
|
+
trace1_op_for_pattern = op
|
|
555
|
+
trace2_op_for_pattern = op
|
|
127
556
|
skip_comparison = False
|
|
128
|
-
|
|
557
|
+
|
|
129
558
|
if op == "RMSNorm+GEMM" and not has_trace2:
|
|
130
|
-
# Compare AMD's fused version to NVIDIA's separate RMSNorm
|
|
131
559
|
has_trace2 = "RMSNorm" in trace2_by_op.index
|
|
132
|
-
trace2_op_for_pattern = "RMSNorm"
|
|
133
|
-
rmsnorm_compared = True
|
|
560
|
+
trace2_op_for_pattern = "RMSNorm"
|
|
561
|
+
rmsnorm_compared = True
|
|
134
562
|
elif op == "RMSNorm" and not has_trace1:
|
|
135
|
-
# Skip this comparison if we already handled it in RMSNorm+GEMM
|
|
136
563
|
if rmsnorm_compared:
|
|
137
564
|
skip_comparison = True
|
|
138
565
|
else:
|
|
139
|
-
# Compare NVIDIA's RMSNorm to AMD's fused version
|
|
140
566
|
has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
|
|
141
|
-
trace1_op_for_pattern =
|
|
142
|
-
"RMSNorm+GEMM" # AMD kernels are stored under 'RMSNorm+GEMM'
|
|
143
|
-
)
|
|
567
|
+
trace1_op_for_pattern = "RMSNorm+GEMM"
|
|
144
568
|
rmsnorm_compared = True
|
|
145
|
-
|
|
569
|
+
|
|
146
570
|
if skip_comparison or not (has_trace1 and has_trace2):
|
|
147
571
|
continue
|
|
148
|
-
|
|
149
|
-
# Get pre-computed aggregations
|
|
572
|
+
|
|
150
573
|
trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
|
|
151
574
|
trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]
|
|
152
|
-
|
|
575
|
+
|
|
153
576
|
trace1_avg = trace1_agg["avg_us"]
|
|
154
577
|
trace2_avg = trace2_agg["avg_us"]
|
|
155
578
|
trace1_total = trace1_agg["total_us"] / 1000
|
|
@@ -158,8 +581,7 @@ def analyze_traces(
|
|
|
158
581
|
trace2_count = int(trace2_agg["count"])
|
|
159
582
|
ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
|
|
160
583
|
gap_ms = trace1_total - trace2_total
|
|
161
|
-
|
|
162
|
-
# Get kernel patterns using the correct operation names for each platform
|
|
584
|
+
|
|
163
585
|
trace1_pattern = list(
|
|
164
586
|
patterns1.get(
|
|
165
587
|
(trace1_op_for_pattern, "decode"),
|
|
@@ -172,106 +594,91 @@ def analyze_traces(
|
|
|
172
594
|
patterns2.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
|
|
173
595
|
)
|
|
174
596
|
)[0]
|
|
175
|
-
|
|
176
|
-
# Get CPU operators from pre-computed aggregations
|
|
597
|
+
|
|
177
598
|
trace1_cpu_op = trace1_agg["cpu_op"]
|
|
178
599
|
trace2_cpu_op = trace2_agg["cpu_op"]
|
|
179
|
-
|
|
180
|
-
#
|
|
600
|
+
|
|
601
|
+
# Get detailed kernel data and stacks only when needed
|
|
181
602
|
trace1_data = df1[df1["op"] == trace1_op_for_pattern]
|
|
182
603
|
trace2_data = df2[df2["op"] == trace2_op_for_pattern]
|
|
183
|
-
|
|
184
|
-
# Collect
|
|
604
|
+
|
|
605
|
+
# Collect Python stacks if available
|
|
185
606
|
trace1_python_stacks = []
|
|
186
|
-
stack_limit = None if max_stacks == 0 else max_stacks
|
|
187
|
-
for stack_list in trace1_data["python_stack"].head(stack_limit):
|
|
188
|
-
if stack_list and len(stack_list) > 0:
|
|
189
|
-
trace1_python_stacks.append(stack_list)
|
|
190
|
-
|
|
191
607
|
trace2_python_stacks = []
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
608
|
+
|
|
609
|
+
if include_stacks:
|
|
610
|
+
stack_limit = None if max_stacks == 0 else max_stacks
|
|
611
|
+
for stack_list in trace1_data["python_stack"].head(stack_limit):
|
|
612
|
+
if stack_list and len(stack_list) > 0:
|
|
613
|
+
trace1_python_stacks.append(stack_list)
|
|
614
|
+
|
|
615
|
+
for stack_list in trace2_data["python_stack"].head(stack_limit):
|
|
616
|
+
if stack_list and len(stack_list) > 0:
|
|
617
|
+
trace2_python_stacks.append(stack_list)
|
|
618
|
+
|
|
619
|
+
# Aggregate individual kernels
|
|
198
620
|
trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
|
|
199
621
|
trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
|
|
200
622
|
trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
|
|
201
623
|
trace1_kernels_list = trace1_kernels.to_dict("records")
|
|
202
|
-
|
|
624
|
+
|
|
203
625
|
trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
|
|
204
626
|
trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
|
|
205
627
|
trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
|
|
206
628
|
trace2_kernels_list = trace2_kernels.to_dict("records")
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
# This handles cases where AMD runs fewer operations via fusion.
|
|
210
|
-
# 5ms threshold chosen because:
|
|
211
|
-
# - Filters out measurement noise and minor variations
|
|
212
|
-
# - Represents meaningful performance impact (0.5% of typical 1s inference)
|
|
213
|
-
# - Aligns with human perception of "noticeable" difference
|
|
214
|
-
# - Too small (1ms) creates false positives from variance
|
|
215
|
-
# - Too large (20ms) misses real optimization opportunities
|
|
216
|
-
if gap_ms > 5.0: # AMD spends >5ms more total time
|
|
629
|
+
|
|
630
|
+
if gap_ms > 5.0:
|
|
217
631
|
status = "slower"
|
|
218
|
-
elif gap_ms < -5.0:
|
|
632
|
+
elif gap_ms < -5.0:
|
|
219
633
|
status = "faster"
|
|
220
634
|
else:
|
|
221
635
|
status = "similar"
|
|
222
|
-
|
|
223
|
-
# Get phases from pre-computed aggregations
|
|
636
|
+
|
|
224
637
|
phases = trace1_agg["phases"] | trace2_agg["phases"]
|
|
225
|
-
|
|
226
|
-
results["operations"].append(
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
)
|
|
249
|
-
|
|
250
|
-
# Sort by absolute gap
|
|
638
|
+
|
|
639
|
+
results["operations"].append({
|
|
640
|
+
"operation": op,
|
|
641
|
+
"trace1_count": trace1_count,
|
|
642
|
+
"trace2_count": trace2_count,
|
|
643
|
+
"trace1_avg_us": trace1_avg,
|
|
644
|
+
"trace2_avg_us": trace2_avg,
|
|
645
|
+
"trace1_total_ms": trace1_total,
|
|
646
|
+
"trace2_total_ms": trace2_total,
|
|
647
|
+
"ratio": ratio,
|
|
648
|
+
"gap_ms": gap_ms,
|
|
649
|
+
"status": status,
|
|
650
|
+
"trace1_kernel": trace1_pattern,
|
|
651
|
+
"trace2_kernel": trace2_pattern,
|
|
652
|
+
"trace1_cpu_op": trace1_cpu_op,
|
|
653
|
+
"trace2_cpu_op": trace2_cpu_op,
|
|
654
|
+
"trace1_python_stacks": trace1_python_stacks,
|
|
655
|
+
"trace2_python_stacks": trace2_python_stacks,
|
|
656
|
+
"trace1_kernels": trace1_kernels_list,
|
|
657
|
+
"trace2_kernels": trace2_kernels_list,
|
|
658
|
+
"phases": sorted(list(phases)) if phases else ["all"],
|
|
659
|
+
})
|
|
660
|
+
|
|
251
661
|
results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)
|
|
252
|
-
|
|
253
|
-
# Layer-wise analysis
|
|
662
|
+
|
|
663
|
+
# Layer-wise analysis
|
|
254
664
|
if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
|
|
255
|
-
# Get all unique layers present in either trace
|
|
256
665
|
all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))
|
|
257
|
-
|
|
666
|
+
|
|
258
667
|
for layer_num in all_layers:
|
|
259
668
|
has_trace1 = layer_num in trace1_by_layer.index
|
|
260
669
|
has_trace2 = layer_num in trace2_by_layer.index
|
|
261
|
-
|
|
670
|
+
|
|
262
671
|
if has_trace1 and has_trace2:
|
|
263
|
-
# Layer present in both traces - compare them
|
|
264
672
|
trace1_agg = trace1_by_layer.loc[layer_num]
|
|
265
673
|
trace2_agg = trace2_by_layer.loc[layer_num]
|
|
266
|
-
|
|
674
|
+
|
|
267
675
|
trace1_total = trace1_agg["total_us"] / 1000
|
|
268
676
|
trace2_total = trace2_agg["total_us"] / 1000
|
|
269
677
|
trace1_count = int(trace1_agg["count"])
|
|
270
678
|
trace2_count = int(trace2_agg["count"])
|
|
271
679
|
ratio = trace1_total / trace2_total if trace2_total > 0 else 1
|
|
272
680
|
gap_ms = trace1_total - trace2_total
|
|
273
|
-
|
|
274
|
-
# Determine status (use smaller threshold for layers: 0.1ms or 20% difference)
|
|
681
|
+
|
|
275
682
|
threshold_ms = 0.1
|
|
276
683
|
threshold_ratio = 1.2
|
|
277
684
|
if gap_ms > threshold_ms and ratio > threshold_ratio:
|
|
@@ -280,60 +687,52 @@ def analyze_traces(
|
|
|
280
687
|
status = "faster"
|
|
281
688
|
else:
|
|
282
689
|
status = "similar"
|
|
283
|
-
|
|
284
|
-
results["layers"].append(
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
}
|
|
296
|
-
)
|
|
690
|
+
|
|
691
|
+
results["layers"].append({
|
|
692
|
+
"layer": int(layer_num),
|
|
693
|
+
"trace1_kernels": trace1_count,
|
|
694
|
+
"trace2_kernels": trace2_count,
|
|
695
|
+
"trace1_total_ms": trace1_total,
|
|
696
|
+
"trace2_total_ms": trace2_total,
|
|
697
|
+
"ratio": ratio,
|
|
698
|
+
"gap_ms": gap_ms,
|
|
699
|
+
"status": status,
|
|
700
|
+
"in_both": True,
|
|
701
|
+
})
|
|
297
702
|
elif has_trace1:
|
|
298
|
-
# Layer only in trace1
|
|
299
703
|
trace1_agg = trace1_by_layer.loc[layer_num]
|
|
300
704
|
trace1_total = trace1_agg["total_us"] / 1000
|
|
301
705
|
trace1_count = int(trace1_agg["count"])
|
|
302
|
-
|
|
303
|
-
results["layers"].append(
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
}
|
|
315
|
-
)
|
|
706
|
+
|
|
707
|
+
results["layers"].append({
|
|
708
|
+
"layer": int(layer_num),
|
|
709
|
+
"trace1_kernels": trace1_count,
|
|
710
|
+
"trace2_kernels": 0,
|
|
711
|
+
"trace1_total_ms": trace1_total,
|
|
712
|
+
"trace2_total_ms": 0.0,
|
|
713
|
+
"ratio": 0.0,
|
|
714
|
+
"gap_ms": trace1_total,
|
|
715
|
+
"status": "trace1_only",
|
|
716
|
+
"in_both": False,
|
|
717
|
+
})
|
|
316
718
|
elif has_trace2:
|
|
317
|
-
# Layer only in trace2
|
|
318
719
|
trace2_agg = trace2_by_layer.loc[layer_num]
|
|
319
720
|
trace2_total = trace2_agg["total_us"] / 1000
|
|
320
721
|
trace2_count = int(trace2_agg["count"])
|
|
321
|
-
|
|
322
|
-
results["layers"].append(
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
)
|
|
335
|
-
|
|
336
|
-
# Sort: comparable layers first (by absolute gap), then trace-unique layers
|
|
722
|
+
|
|
723
|
+
results["layers"].append({
|
|
724
|
+
"layer": int(layer_num),
|
|
725
|
+
"trace1_kernels": 0,
|
|
726
|
+
"trace2_kernels": trace2_count,
|
|
727
|
+
"trace1_total_ms": 0.0,
|
|
728
|
+
"trace2_total_ms": trace2_total,
|
|
729
|
+
"ratio": 0.0,
|
|
730
|
+
"gap_ms": -trace2_total,
|
|
731
|
+
"status": "trace2_only",
|
|
732
|
+
"in_both": False,
|
|
733
|
+
})
|
|
734
|
+
|
|
337
735
|
results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)
|
|
338
|
-
|
|
736
|
+
|
|
737
|
+
print("Analysis complete.", file=sys.stderr)
|
|
339
738
|
return results
|