wafer-core 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,313 +4,13 @@ Compares GPU traces from AMD and NVIDIA platforms, identifying performance diffe
4
4
  at the operation level and layer level.
5
5
  """
6
6
 
7
- import sys
8
7
  from collections import defaultdict
9
- from concurrent.futures import ProcessPoolExecutor
10
8
  from pathlib import Path
11
9
  from typing import Any
12
10
 
13
11
  import pandas as pd
14
12
 
15
- from .aligner import align_traces, TraceAlignment
16
- from .fusion_analyzer import analyze_fusion_from_alignment
17
- from .same_kernel_analyzer import analyze_same_kernels_from_alignment
18
- from .loader import load_trace_full, LoadedTrace
19
-
20
-
21
- def analyze_traces_from_loaded(
22
- trace1: LoadedTrace,
23
- trace2: LoadedTrace,
24
- phase_filter: str = "all",
25
- max_stacks: int = 3,
26
- ) -> dict[str, Any]:
27
- """Analyze two loaded traces and return comparison data.
28
-
29
- Args:
30
- trace1: First loaded trace
31
- trace2: Second loaded trace
32
- phase_filter: Filter by phase ('all', 'prefill', or 'decode')
33
- max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)
34
-
35
- Returns:
36
- Dictionary containing:
37
- - metadata: trace info (GPUs, kernel counts, total times, etc.)
38
- - operations: per-operation comparison data
39
- - layers: per-layer comparison data (if layers detected)
40
- """
41
- df1 = trace1.df
42
- df2 = trace2.df
43
-
44
- # Apply phase filter
45
- if phase_filter != "all":
46
- df1_filtered = df1[df1["phase"] == phase_filter]
47
- df2_filtered = df2[df2["phase"] == phase_filter]
48
-
49
- if len(df1_filtered) == 0 and len(df2_filtered) == 0:
50
- trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
51
- trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
52
- raise ValueError(
53
- f"No {phase_filter} phase found. "
54
- f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
55
- )
56
-
57
- df1, df2 = df1_filtered, df2_filtered
58
-
59
- # Pre-compute aggregations for both operations and layers in single pass
60
- trace1_by_op = df1.groupby("op").agg({
61
- "dur_us": ["sum", "mean", "count"],
62
- "phase": lambda x: set(x.dropna().unique()),
63
- "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
64
- })
65
- trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
66
-
67
- trace2_by_op = df2.groupby("op").agg({
68
- "dur_us": ["sum", "mean", "count"],
69
- "phase": lambda x: set(x.dropna().unique()),
70
- "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
71
- })
72
- trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
73
-
74
- # Group by layer for layer-level analysis
75
- df1_layered = df1[df1["layer"].notna()]
76
- df2_layered = df2[df2["layer"].notna()]
77
-
78
- trace1_by_layer = df1_layered.groupby("layer").agg({
79
- "dur_us": ["sum", "count"],
80
- }) if len(df1_layered) > 0 else pd.DataFrame()
81
- if len(trace1_by_layer) > 0:
82
- trace1_by_layer.columns = ["total_us", "count"]
83
-
84
- trace2_by_layer = df2_layered.groupby("layer").agg({
85
- "dur_us": ["sum", "count"],
86
- }) if len(df2_layered) > 0 else pd.DataFrame()
87
- if len(trace2_by_layer) > 0:
88
- trace2_by_layer.columns = ["total_us", "count"]
89
-
90
- results: dict[str, Any] = {
91
- "metadata": {
92
- "trace1_name": f"{trace1.platform} {trace1.gpu_name}",
93
- "trace2_name": f"{trace2.platform} {trace2.gpu_name}",
94
- "trace1_platform": trace1.platform,
95
- "trace1_gpu": trace1.gpu_name,
96
- "trace1_device": trace1.device_props,
97
- "trace2_platform": trace2.platform,
98
- "trace2_gpu": trace2.gpu_name,
99
- "trace2_device": trace2.device_props,
100
- "trace1_kernels": len(df1),
101
- "trace2_kernels": len(df2),
102
- "trace1_total_ms": df1["dur_us"].sum() / 1000,
103
- "trace2_total_ms": df2["dur_us"].sum() / 1000,
104
- "phase": phase_filter,
105
- "trace1_layers": len(trace1.layers),
106
- "trace2_layers": len(trace2.layers),
107
- },
108
- "operations": [],
109
- "layers": [],
110
- }
111
-
112
- # Per-operation comparison
113
- all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
114
- rmsnorm_compared = False
115
-
116
- for op in sorted(all_ops):
117
- has_trace1 = op in trace1_by_op.index
118
- has_trace2 = op in trace2_by_op.index
119
-
120
- trace1_op_for_pattern = op
121
- trace2_op_for_pattern = op
122
- skip_comparison = False
123
-
124
- if op == "RMSNorm+GEMM" and not has_trace2:
125
- has_trace2 = "RMSNorm" in trace2_by_op.index
126
- trace2_op_for_pattern = "RMSNorm"
127
- rmsnorm_compared = True
128
- elif op == "RMSNorm" and not has_trace1:
129
- if rmsnorm_compared:
130
- skip_comparison = True
131
- else:
132
- has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
133
- trace1_op_for_pattern = "RMSNorm+GEMM"
134
- rmsnorm_compared = True
135
-
136
- if skip_comparison or not (has_trace1 and has_trace2):
137
- continue
138
-
139
- trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
140
- trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]
141
-
142
- trace1_avg = trace1_agg["avg_us"]
143
- trace2_avg = trace2_agg["avg_us"]
144
- trace1_total = trace1_agg["total_us"] / 1000
145
- trace2_total = trace2_agg["total_us"] / 1000
146
- trace1_count = int(trace1_agg["count"])
147
- trace2_count = int(trace2_agg["count"])
148
- # Speedup: ratio of total times (not per-call averages)
149
- # Shows how many times faster/slower trace1 is compared to trace2
150
- # > 1.0 means trace1 is slower, < 1.0 means trace1 is faster
151
- # Using total time instead of avg time per call because operations may have
152
- # vastly different call counts (e.g., fused vs unfused operations)
153
- if trace2_total > 0:
154
- ratio = trace1_total / trace2_total
155
- elif trace1_total > 0:
156
- ratio = float("inf") # trace2 has no time, trace1 is infinitely slower
157
- else:
158
- ratio = 1.0 # Both are zero
159
- gap_ms = trace1_total - trace2_total
160
-
161
- trace1_pattern = list(
162
- trace1.patterns.get(
163
- (trace1_op_for_pattern, "decode"),
164
- trace1.patterns.get((trace1_op_for_pattern, "prefill"), {"unknown"}),
165
- )
166
- )[0]
167
- trace2_pattern = list(
168
- trace2.patterns.get(
169
- (trace2_op_for_pattern, "decode"),
170
- trace2.patterns.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
171
- )
172
- )[0]
173
-
174
- trace1_cpu_op = trace1_agg["cpu_op"]
175
- trace2_cpu_op = trace2_agg["cpu_op"]
176
-
177
- # Get detailed kernel data and stacks only when needed
178
- trace1_data = df1[df1["op"] == trace1_op_for_pattern]
179
- trace2_data = df2[df2["op"] == trace2_op_for_pattern]
180
-
181
- # Collect Python stacks if available
182
- trace1_python_stacks = []
183
- trace2_python_stacks = []
184
-
185
- if max_stacks != 0:
186
- stack_limit = None if max_stacks == 0 else max_stacks
187
- for stack_list in trace1_data["python_stack"].head(stack_limit):
188
- if stack_list and len(stack_list) > 0:
189
- trace1_python_stacks.append(stack_list)
190
-
191
- for stack_list in trace2_data["python_stack"].head(stack_limit):
192
- if stack_list and len(stack_list) > 0:
193
- trace2_python_stacks.append(stack_list)
194
-
195
- # Aggregate individual kernels
196
- trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
197
- trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
198
- trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
199
- trace1_kernels_list = trace1_kernels.to_dict("records")
200
-
201
- trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
202
- trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
203
- trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
204
- trace2_kernels_list = trace2_kernels.to_dict("records")
205
-
206
- if gap_ms > 5.0:
207
- status = "slower"
208
- elif gap_ms < -5.0:
209
- status = "faster"
210
- else:
211
- status = "similar"
212
-
213
- phases = trace1_agg["phases"] | trace2_agg["phases"]
214
-
215
- results["operations"].append({
216
- "operation": op,
217
- "trace1_count": trace1_count,
218
- "trace2_count": trace2_count,
219
- "trace1_avg_us": trace1_avg,
220
- "trace2_avg_us": trace2_avg,
221
- "trace1_total_ms": trace1_total,
222
- "trace2_total_ms": trace2_total,
223
- "ratio": ratio,
224
- "gap_ms": gap_ms,
225
- "status": status,
226
- "trace1_kernel": trace1_pattern,
227
- "trace2_kernel": trace2_pattern,
228
- "trace1_cpu_op": trace1_cpu_op,
229
- "trace2_cpu_op": trace2_cpu_op,
230
- "trace1_python_stacks": trace1_python_stacks,
231
- "trace2_python_stacks": trace2_python_stacks,
232
- "trace1_kernels": trace1_kernels_list,
233
- "trace2_kernels": trace2_kernels_list,
234
- "phases": sorted(list(phases)) if phases else ["all"],
235
- })
236
-
237
- results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)
238
-
239
- # Layer-wise analysis
240
- if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
241
- all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))
242
-
243
- for layer_num in all_layers:
244
- has_trace1 = layer_num in trace1_by_layer.index
245
- has_trace2 = layer_num in trace2_by_layer.index
246
-
247
- if has_trace1 and has_trace2:
248
- trace1_agg = trace1_by_layer.loc[layer_num]
249
- trace2_agg = trace2_by_layer.loc[layer_num]
250
-
251
- trace1_total = trace1_agg["total_us"] / 1000
252
- trace2_total = trace2_agg["total_us"] / 1000
253
- trace1_count = int(trace1_agg["count"])
254
- trace2_count = int(trace2_agg["count"])
255
- ratio = trace1_total / trace2_total if trace2_total > 0 else 1
256
- gap_ms = trace1_total - trace2_total
257
-
258
- threshold_ms = 0.1
259
- threshold_ratio = 1.2
260
- if gap_ms > threshold_ms and ratio > threshold_ratio:
261
- status = "slower"
262
- elif gap_ms < -threshold_ms and ratio < (1.0 / threshold_ratio):
263
- status = "faster"
264
- else:
265
- status = "similar"
266
-
267
- results["layers"].append({
268
- "layer": int(layer_num),
269
- "trace1_kernels": trace1_count,
270
- "trace2_kernels": trace2_count,
271
- "trace1_total_ms": trace1_total,
272
- "trace2_total_ms": trace2_total,
273
- "ratio": ratio,
274
- "gap_ms": gap_ms,
275
- "status": status,
276
- "in_both": True,
277
- })
278
- elif has_trace1:
279
- trace1_agg = trace1_by_layer.loc[layer_num]
280
- trace1_total = trace1_agg["total_us"] / 1000
281
- trace1_count = int(trace1_agg["count"])
282
-
283
- results["layers"].append({
284
- "layer": int(layer_num),
285
- "trace1_kernels": trace1_count,
286
- "trace2_kernels": 0,
287
- "trace1_total_ms": trace1_total,
288
- "trace2_total_ms": 0.0,
289
- "ratio": 0.0,
290
- "gap_ms": trace1_total,
291
- "status": "trace1_only",
292
- "in_both": False,
293
- })
294
- elif has_trace2:
295
- trace2_agg = trace2_by_layer.loc[layer_num]
296
- trace2_total = trace2_agg["total_us"] / 1000
297
- trace2_count = int(trace2_agg["count"])
298
-
299
- results["layers"].append({
300
- "layer": int(layer_num),
301
- "trace1_kernels": 0,
302
- "trace2_kernels": trace2_count,
303
- "trace1_total_ms": 0.0,
304
- "trace2_total_ms": trace2_total,
305
- "ratio": 0.0,
306
- "gap_ms": -trace2_total,
307
- "status": "trace2_only",
308
- "in_both": False,
309
- })
310
-
311
- results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)
312
-
313
- return results
13
+ from .loader import load_trace
314
14
 
315
15
 
316
16
  def analyze_traces(
@@ -318,224 +18,76 @@ def analyze_traces(
318
18
  trace2_path: str | Path,
319
19
  phase_filter: str = "all",
320
20
  max_stacks: int = 3,
321
- include_stacks: bool = True,
322
21
  ) -> dict[str, Any]:
323
22
  """Analyze two traces and return comparison data.
324
-
23
+
325
24
  Args:
326
25
  trace1_path: Path to first trace file
327
26
  trace2_path: Path to second trace file
328
27
  phase_filter: Filter by phase ('all', 'prefill', or 'decode')
329
28
  max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)
330
- include_stacks: Whether to include Python stack traces (disable for faster analysis)
331
-
29
+
332
30
  Returns:
333
31
  Dictionary containing:
334
32
  - metadata: trace info (GPUs, kernel counts, total times, etc.)
335
33
  - operations: per-operation comparison data
336
34
  - layers: per-layer comparison data (if layers detected)
337
35
  """
338
- # Load both traces in parallel using separate processes
339
- # This provides ~1.7x speedup over sequential loading
340
- print("Loading traces in parallel...", file=sys.stderr)
341
-
342
- with ProcessPoolExecutor(max_workers=2) as executor:
343
- future1 = executor.submit(load_trace_full, str(trace1_path), include_stacks)
344
- future2 = executor.submit(load_trace_full, str(trace2_path), include_stacks)
345
- trace1 = future1.result()
346
- trace2 = future2.result()
347
-
348
- print("Analyzing operations...", file=sys.stderr)
349
-
350
- result = analyze_traces_from_loaded(trace1, trace2, phase_filter, max_stacks)
351
-
352
- # Update metadata with file paths for backward compatibility
353
- result["metadata"]["trace1_name"] = str(trace1_path)
354
- result["metadata"]["trace2_name"] = str(trace2_path)
355
-
356
- return result
357
-
358
-
359
- def analyze_traces_aligned(
360
- trace1: LoadedTrace,
361
- trace2: LoadedTrace,
362
- phase_filter: str = "all",
363
- ) -> dict[str, Any]:
364
- """Analyze traces using kernel-to-kernel alignment.
365
-
366
- Args:
367
- trace1: First loaded trace
368
- trace2: Second loaded trace
369
- phase_filter: Filter by phase ('all', 'prefill', or 'decode')
370
-
371
- Returns:
372
- Dictionary with alignment-based comparison data
373
- """
374
- amd_phases = trace1.phases
375
- nvidia_phases = trace2.phases
376
-
377
- if phase_filter != "all":
378
- amd_phases = [p for p in amd_phases if p.get("type") == phase_filter]
379
- nvidia_phases = [p for p in nvidia_phases if p.get("type") == phase_filter]
380
-
381
- amd_kernels = trace1.kernel_events
382
- nvidia_kernels = trace2.kernel_events
383
-
384
- if phase_filter != "all" and amd_phases:
385
- phase_starts = [p["ts_start"] for p in amd_phases]
386
- phase_ends = [p["ts_end"] for p in amd_phases]
387
- amd_kernels = [
388
- k for k in amd_kernels
389
- if any(phase_starts[i] <= k.get("ts", 0) <= phase_ends[i]
390
- for i in range(len(phase_starts)))
391
- ]
392
-
393
- if phase_filter != "all" and nvidia_phases:
394
- phase_starts = [p["ts_start"] for p in nvidia_phases]
395
- phase_ends = [p["ts_end"] for p in nvidia_phases]
396
- nvidia_kernels = [
397
- k for k in nvidia_kernels
398
- if any(phase_starts[i] <= k.get("ts", 0) <= phase_ends[i]
399
- for i in range(len(phase_starts)))
400
- ]
401
-
402
- alignment = align_traces(
403
- amd_kernels,
404
- nvidia_kernels,
405
- amd_phases,
406
- nvidia_phases,
407
- trace1.platform,
408
- trace2.platform,
409
- )
410
-
411
- layer_alignments = []
412
- for layer_align in alignment.layer_alignments:
413
- kernel_pairs = []
414
- for pair in layer_align.kernel_pairs:
415
- kernel_pairs.append({
416
- "position": pair.position,
417
- "operation": pair.operation,
418
- "operation_detail": pair.operation_detail,
419
- "amd_kernel": pair.amd_kernel,
420
- "amd_avg_us": pair.amd_avg_us,
421
- "amd_count": pair.amd_count,
422
- "amd_total_us": pair.amd_total_us,
423
- "nvidia_kernel": pair.nvidia_kernel,
424
- "nvidia_avg_us": pair.nvidia_avg_us,
425
- "nvidia_count": pair.nvidia_count,
426
- "nvidia_total_us": pair.nvidia_total_us,
427
- "ratio": pair.ratio,
428
- "gap_us": pair.gap_us,
429
- "fusion_note": pair.fusion_note,
430
- "is_same_kernel": pair.is_same_kernel,
431
- })
432
-
433
- layer_alignments.append({
434
- "layer": layer_align.layer,
435
- "amd_total_us": layer_align.amd_total_us,
436
- "nvidia_total_us": layer_align.nvidia_total_us,
437
- "ratio": layer_align.ratio,
438
- "gap_us": layer_align.gap_us,
439
- "kernel_pairs": kernel_pairs,
440
- })
441
-
442
- # Determine which trace is AMD vs NVIDIA for fusion analysis
443
- if trace1.platform == "AMD":
444
- amd_trace, nvidia_trace = trace1, trace2
445
- fusion_amd_kernels = amd_kernels
446
- fusion_nvidia_kernels = nvidia_kernels
447
- else:
448
- amd_trace, nvidia_trace = trace2, trace1
449
- fusion_amd_kernels = nvidia_kernels
450
- fusion_nvidia_kernels = amd_kernels
451
-
452
- fusion_result = analyze_fusion_from_alignment(
453
- alignment.layer_alignments,
454
- amd_kernels=fusion_amd_kernels,
455
- nvidia_kernels=fusion_nvidia_kernels,
456
- )
457
- same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)
458
-
459
- # Note: amd_kernels = trace1's kernels (filtered if phase_filter != "all")
460
- # nvidia_kernels = trace2's kernels (filtered if phase_filter != "all")
461
- # The variable names are misleading but trace1_* should use amd_kernels,
462
- # and trace2_* should use nvidia_kernels to match the filtered kernel counts/totals.
463
-
464
- return {
465
- "metadata": {
466
- "amd_gpu": amd_trace.gpu_name,
467
- "nvidia_gpu": nvidia_trace.gpu_name,
468
- "amd_platform": amd_trace.platform,
469
- "nvidia_platform": nvidia_trace.platform,
470
- "model_layers": alignment.num_layers,
471
- "forward_passes": alignment.num_forward_passes,
472
- "phase_breakdown": alignment.phase_breakdown,
473
- "phase_filter": phase_filter,
474
- "trace1_platform": trace1.platform,
475
- "trace1_gpu": trace1.gpu_name,
476
- "trace1_device": trace1.device_props,
477
- "trace2_platform": trace2.platform,
478
- "trace2_gpu": trace2.gpu_name,
479
- "trace2_device": trace2.device_props,
480
- "trace1_kernels": len(amd_kernels),
481
- "trace2_kernels": len(nvidia_kernels),
482
- "trace1_total_ms": sum(k.get("dur", 0) for k in amd_kernels) / 1000,
483
- "trace2_total_ms": sum(k.get("dur", 0) for k in nvidia_kernels) / 1000,
484
- "phase": phase_filter,
485
- "trace1_layers": alignment.num_layers,
486
- "trace2_layers": alignment.num_layers,
487
- },
488
- "layer_alignments": layer_alignments,
489
- "fusion_analysis": fusion_result,
490
- "same_kernel_analysis": same_kernel_result,
491
- }
492
-
36
+ # Load traces
37
+ p1, gpu1, dev1, df1, patterns1, layers1 = load_trace(trace1_path)
38
+ p2, gpu2, dev2, df2, patterns2, layers2 = load_trace(trace2_path)
39
+
493
40
  # Apply phase filter
494
41
  if phase_filter != "all":
495
42
  df1_filtered = df1[df1["phase"] == phase_filter]
496
43
  df2_filtered = df2[df2["phase"] == phase_filter]
497
-
44
+
498
45
  if len(df1_filtered) == 0 and len(df2_filtered) == 0:
46
+ # No data in requested phase - return early with error info
499
47
  trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
500
48
  trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
501
49
  raise ValueError(
502
50
  f"No {phase_filter} phase found. "
503
51
  f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
504
52
  )
505
-
53
+
506
54
  df1, df2 = df1_filtered, df2_filtered
507
-
55
+
508
56
  # Pre-compute aggregations for both operations and layers in single pass
57
+ # This is much faster than iterating through filtered dataframes multiple times
58
+
59
+ # Group by operation for operation-level analysis
509
60
  trace1_by_op = df1.groupby("op").agg({
510
61
  "dur_us": ["sum", "mean", "count"],
511
62
  "phase": lambda x: set(x.dropna().unique()),
512
63
  "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
513
64
  })
514
65
  trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
515
-
66
+
516
67
  trace2_by_op = df2.groupby("op").agg({
517
68
  "dur_us": ["sum", "mean", "count"],
518
69
  "phase": lambda x: set(x.dropna().unique()),
519
70
  "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
520
71
  })
521
72
  trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
522
-
523
- # Group by layer for layer-level analysis
73
+
74
+ # Group by layer for layer-level analysis (only for kernels with layer info)
524
75
  df1_layered = df1[df1["layer"].notna()]
525
76
  df2_layered = df2[df2["layer"].notna()]
526
-
77
+
527
78
  trace1_by_layer = df1_layered.groupby("layer").agg({
528
79
  "dur_us": ["sum", "count"],
529
80
  }) if len(df1_layered) > 0 else pd.DataFrame()
530
81
  if len(trace1_by_layer) > 0:
531
82
  trace1_by_layer.columns = ["total_us", "count"]
532
-
83
+
533
84
  trace2_by_layer = df2_layered.groupby("layer").agg({
534
85
  "dur_us": ["sum", "count"],
535
86
  }) if len(df2_layered) > 0 else pd.DataFrame()
536
87
  if len(trace2_by_layer) > 0:
537
88
  trace2_by_layer.columns = ["total_us", "count"]
538
-
89
+
90
+ # Calculate per-operation statistics
539
91
  results: dict[str, Any] = {
540
92
  "metadata": {
541
93
  "trace1_name": str(trace1_path),
@@ -557,56 +109,57 @@ def analyze_traces_aligned(
557
109
  "operations": [],
558
110
  "layers": [],
559
111
  }
560
-
561
- # Per-operation comparison
112
+
113
+ # Per-operation comparison using pre-computed aggregations
562
114
  all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
115
+
116
+ # Track if we've already compared RMSNorm variants to avoid duplicate comparisons
563
117
  rmsnorm_compared = False
564
-
118
+
565
119
  for op in sorted(all_ops):
120
+ # Use pre-computed aggregations instead of filtering entire dataframes
566
121
  has_trace1 = op in trace1_by_op.index
567
122
  has_trace2 = op in trace2_by_op.index
568
-
569
- trace1_op_for_pattern = op
570
- trace2_op_for_pattern = op
123
+
124
+ # Handle RMSNorm fusion differences: AMD does RMSNorm+GEMM, NVIDIA does separate RMSNorm
125
+ trace1_op_for_pattern = op # Operation name to use for AMD pattern lookup
126
+ trace2_op_for_pattern = op # Operation name to use for NVIDIA pattern lookup
571
127
  skip_comparison = False
572
-
128
+
573
129
  if op == "RMSNorm+GEMM" and not has_trace2:
130
+ # Compare AMD's fused version to NVIDIA's separate RMSNorm
574
131
  has_trace2 = "RMSNorm" in trace2_by_op.index
575
- trace2_op_for_pattern = "RMSNorm"
576
- rmsnorm_compared = True
132
+ trace2_op_for_pattern = "RMSNorm" # NVIDIA kernels are stored under 'RMSNorm'
133
+ rmsnorm_compared = True # Mark that we've compared RMSNorm
577
134
  elif op == "RMSNorm" and not has_trace1:
135
+ # Skip this comparison if we already handled it in RMSNorm+GEMM
578
136
  if rmsnorm_compared:
579
137
  skip_comparison = True
580
138
  else:
139
+ # Compare NVIDIA's RMSNorm to AMD's fused version
581
140
  has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
582
- trace1_op_for_pattern = "RMSNorm+GEMM"
141
+ trace1_op_for_pattern = (
142
+ "RMSNorm+GEMM" # AMD kernels are stored under 'RMSNorm+GEMM'
143
+ )
583
144
  rmsnorm_compared = True
584
-
145
+
585
146
  if skip_comparison or not (has_trace1 and has_trace2):
586
147
  continue
587
-
148
+
149
+ # Get pre-computed aggregations
588
150
  trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
589
151
  trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]
590
-
152
+
591
153
  trace1_avg = trace1_agg["avg_us"]
592
154
  trace2_avg = trace2_agg["avg_us"]
593
155
  trace1_total = trace1_agg["total_us"] / 1000
594
156
  trace2_total = trace2_agg["total_us"] / 1000
595
157
  trace1_count = int(trace1_agg["count"])
596
158
  trace2_count = int(trace2_agg["count"])
597
- # Speedup: ratio of total times (not per-call averages)
598
- # Shows how many times faster/slower trace1 is compared to trace2
599
- # > 1.0 means trace1 is slower, < 1.0 means trace1 is faster
600
- # Using total time instead of avg time per call because operations may have
601
- # vastly different call counts (e.g., fused vs unfused operations)
602
- if trace2_total > 0:
603
- ratio = trace1_total / trace2_total
604
- elif trace1_total > 0:
605
- ratio = float("inf") # trace2 has no time, trace1 is infinitely slower
606
- else:
607
- ratio = 1.0 # Both are zero
159
+ ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
608
160
  gap_ms = trace1_total - trace2_total
609
-
161
+
162
+ # Get kernel patterns using the correct operation names for each platform
610
163
  trace1_pattern = list(
611
164
  patterns1.get(
612
165
  (trace1_op_for_pattern, "decode"),
@@ -619,91 +172,106 @@ def analyze_traces_aligned(
619
172
  patterns2.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
620
173
  )
621
174
  )[0]
622
-
175
+
176
+ # Get CPU operators from pre-computed aggregations
623
177
  trace1_cpu_op = trace1_agg["cpu_op"]
624
178
  trace2_cpu_op = trace2_agg["cpu_op"]
625
-
626
- # Get detailed kernel data and stacks only when needed
179
+
180
+ # For detailed kernel data and python stacks, we still need to filter (but only when needed)
627
181
  trace1_data = df1[df1["op"] == trace1_op_for_pattern]
628
182
  trace2_data = df2[df2["op"] == trace2_op_for_pattern]
629
-
630
- # Collect Python stacks if available
183
+
184
+ # Collect example Python stacks for this operation (for JSON output)
631
185
  trace1_python_stacks = []
186
+ stack_limit = None if max_stacks == 0 else max_stacks
187
+ for stack_list in trace1_data["python_stack"].head(stack_limit):
188
+ if stack_list and len(stack_list) > 0:
189
+ trace1_python_stacks.append(stack_list)
190
+
632
191
  trace2_python_stacks = []
633
-
634
- if include_stacks:
635
- stack_limit = None if max_stacks == 0 else max_stacks
636
- for stack_list in trace1_data["python_stack"].head(stack_limit):
637
- if stack_list and len(stack_list) > 0:
638
- trace1_python_stacks.append(stack_list)
639
-
640
- for stack_list in trace2_data["python_stack"].head(stack_limit):
641
- if stack_list and len(stack_list) > 0:
642
- trace2_python_stacks.append(stack_list)
643
-
644
- # Aggregate individual kernels
192
+ for stack_list in trace2_data["python_stack"].head(stack_limit):
193
+ if stack_list and len(stack_list) > 0:
194
+ trace2_python_stacks.append(stack_list)
195
+
196
+ # Aggregate individual kernels by name for detailed view
197
+ # Group by kernel name and calculate sum/count/avg
645
198
  trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
646
199
  trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
647
200
  trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
648
201
  trace1_kernels_list = trace1_kernels.to_dict("records")
649
-
202
+
650
203
  trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
651
204
  trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
652
205
  trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
653
206
  trace2_kernels_list = trace2_kernels.to_dict("records")
654
-
655
- if gap_ms > 5.0:
207
+
208
+ # Determine status based on TOTAL TIME (gap), not per-call ratio
209
+ # This handles cases where AMD runs fewer operations via fusion.
210
+ # 5ms threshold chosen because:
211
+ # - Filters out measurement noise and minor variations
212
+ # - Represents meaningful performance impact (0.5% of typical 1s inference)
213
+ # - Aligns with human perception of "noticeable" difference
214
+ # - Too small (1ms) creates false positives from variance
215
+ # - Too large (20ms) misses real optimization opportunities
216
+ if gap_ms > 5.0: # AMD spends >5ms more total time
656
217
  status = "slower"
657
- elif gap_ms < -5.0:
218
+ elif gap_ms < -5.0: # AMD spends >5ms less total time
658
219
  status = "faster"
659
220
  else:
660
221
  status = "similar"
661
-
222
+
223
+ # Get phases from pre-computed aggregations
662
224
  phases = trace1_agg["phases"] | trace2_agg["phases"]
663
-
664
- results["operations"].append({
665
- "operation": op,
666
- "trace1_count": trace1_count,
667
- "trace2_count": trace2_count,
668
- "trace1_avg_us": trace1_avg,
669
- "trace2_avg_us": trace2_avg,
670
- "trace1_total_ms": trace1_total,
671
- "trace2_total_ms": trace2_total,
672
- "ratio": ratio,
673
- "gap_ms": gap_ms,
674
- "status": status,
675
- "trace1_kernel": trace1_pattern,
676
- "trace2_kernel": trace2_pattern,
677
- "trace1_cpu_op": trace1_cpu_op,
678
- "trace2_cpu_op": trace2_cpu_op,
679
- "trace1_python_stacks": trace1_python_stacks,
680
- "trace2_python_stacks": trace2_python_stacks,
681
- "trace1_kernels": trace1_kernels_list,
682
- "trace2_kernels": trace2_kernels_list,
683
- "phases": sorted(list(phases)) if phases else ["all"],
684
- })
685
-
225
+
226
+ results["operations"].append(
227
+ {
228
+ "operation": op,
229
+ "trace1_count": trace1_count,
230
+ "trace2_count": trace2_count,
231
+ "trace1_avg_us": trace1_avg,
232
+ "trace2_avg_us": trace2_avg,
233
+ "trace1_total_ms": trace1_total,
234
+ "trace2_total_ms": trace2_total,
235
+ "ratio": ratio,
236
+ "gap_ms": gap_ms,
237
+ "status": status,
238
+ "trace1_kernel": trace1_pattern,
239
+ "trace2_kernel": trace2_pattern,
240
+ "trace1_cpu_op": trace1_cpu_op,
241
+ "trace2_cpu_op": trace2_cpu_op,
242
+ "trace1_python_stacks": trace1_python_stacks, # Full stacks for JSON
243
+ "trace2_python_stacks": trace2_python_stacks,
244
+ "trace1_kernels": trace1_kernels_list, # All individual kernels for JSON
245
+ "trace2_kernels": trace2_kernels_list, # All individual kernels for JSON
246
+ "phases": sorted(list(phases)) if phases else ["all"], # For client-side filtering
247
+ }
248
+ )
249
+
250
+ # Sort by absolute gap
686
251
  results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)
687
-
688
- # Layer-wise analysis
252
+
253
+ # Layer-wise analysis using pre-computed aggregations
689
254
  if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
255
+ # Get all unique layers present in either trace
690
256
  all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))
691
-
257
+
692
258
  for layer_num in all_layers:
693
259
  has_trace1 = layer_num in trace1_by_layer.index
694
260
  has_trace2 = layer_num in trace2_by_layer.index
695
-
261
+
696
262
  if has_trace1 and has_trace2:
263
+ # Layer present in both traces - compare them
697
264
  trace1_agg = trace1_by_layer.loc[layer_num]
698
265
  trace2_agg = trace2_by_layer.loc[layer_num]
699
-
266
+
700
267
  trace1_total = trace1_agg["total_us"] / 1000
701
268
  trace2_total = trace2_agg["total_us"] / 1000
702
269
  trace1_count = int(trace1_agg["count"])
703
270
  trace2_count = int(trace2_agg["count"])
704
271
  ratio = trace1_total / trace2_total if trace2_total > 0 else 1
705
272
  gap_ms = trace1_total - trace2_total
706
-
273
+
274
+ # Determine status (use smaller threshold for layers: 0.1ms or 20% difference)
707
275
  threshold_ms = 0.1
708
276
  threshold_ratio = 1.2
709
277
  if gap_ms > threshold_ms and ratio > threshold_ratio:
@@ -712,52 +280,60 @@ def analyze_traces_aligned(
712
280
  status = "faster"
713
281
  else:
714
282
  status = "similar"
715
-
716
- results["layers"].append({
717
- "layer": int(layer_num),
718
- "trace1_kernels": trace1_count,
719
- "trace2_kernels": trace2_count,
720
- "trace1_total_ms": trace1_total,
721
- "trace2_total_ms": trace2_total,
722
- "ratio": ratio,
723
- "gap_ms": gap_ms,
724
- "status": status,
725
- "in_both": True,
726
- })
283
+
284
+ results["layers"].append(
285
+ {
286
+ "layer": int(layer_num),
287
+ "trace1_kernels": trace1_count,
288
+ "trace2_kernels": trace2_count,
289
+ "trace1_total_ms": trace1_total,
290
+ "trace2_total_ms": trace2_total,
291
+ "ratio": ratio,
292
+ "gap_ms": gap_ms,
293
+ "status": status,
294
+ "in_both": True,
295
+ }
296
+ )
727
297
  elif has_trace1:
298
+ # Layer only in trace1
728
299
  trace1_agg = trace1_by_layer.loc[layer_num]
729
300
  trace1_total = trace1_agg["total_us"] / 1000
730
301
  trace1_count = int(trace1_agg["count"])
731
-
732
- results["layers"].append({
733
- "layer": int(layer_num),
734
- "trace1_kernels": trace1_count,
735
- "trace2_kernels": 0,
736
- "trace1_total_ms": trace1_total,
737
- "trace2_total_ms": 0.0,
738
- "ratio": 0.0,
739
- "gap_ms": trace1_total,
740
- "status": "trace1_only",
741
- "in_both": False,
742
- })
302
+
303
+ results["layers"].append(
304
+ {
305
+ "layer": int(layer_num),
306
+ "trace1_kernels": trace1_count,
307
+ "trace2_kernels": 0,
308
+ "trace1_total_ms": trace1_total,
309
+ "trace2_total_ms": 0.0,
310
+ "ratio": 0.0,
311
+ "gap_ms": trace1_total,
312
+ "status": "trace1_only",
313
+ "in_both": False,
314
+ }
315
+ )
743
316
  elif has_trace2:
317
+ # Layer only in trace2
744
318
  trace2_agg = trace2_by_layer.loc[layer_num]
745
319
  trace2_total = trace2_agg["total_us"] / 1000
746
320
  trace2_count = int(trace2_agg["count"])
747
-
748
- results["layers"].append({
749
- "layer": int(layer_num),
750
- "trace1_kernels": 0,
751
- "trace2_kernels": trace2_count,
752
- "trace1_total_ms": 0.0,
753
- "trace2_total_ms": trace2_total,
754
- "ratio": 0.0,
755
- "gap_ms": -trace2_total,
756
- "status": "trace2_only",
757
- "in_both": False,
758
- })
759
-
321
+
322
+ results["layers"].append(
323
+ {
324
+ "layer": int(layer_num),
325
+ "trace1_kernels": 0,
326
+ "trace2_kernels": trace2_count,
327
+ "trace1_total_ms": 0.0,
328
+ "trace2_total_ms": trace2_total,
329
+ "ratio": 0.0,
330
+ "gap_ms": -trace2_total,
331
+ "status": "trace2_only",
332
+ "in_both": False,
333
+ }
334
+ )
335
+
336
+ # Sort: comparable layers first (by absolute gap), then trace-unique layers
760
337
  results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)
761
-
762
- print("Analysis complete.", file=sys.stderr)
338
+
763
339
  return results