wafer-core 0.1.26__py3-none-any.whl → 0.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,13 +4,303 @@ Compares GPU traces from AMD and NVIDIA platforms, identifying performance diffe
4
4
  at the operation level and layer level.
5
5
  """
6
6
 
7
+ import sys
7
8
  from collections import defaultdict
9
+ from concurrent.futures import ProcessPoolExecutor
8
10
  from pathlib import Path
9
11
  from typing import Any
10
12
 
11
13
  import pandas as pd
12
14
 
13
- from .loader import load_trace
15
+ from .aligner import align_traces, TraceAlignment
16
+ from .fusion_analyzer import analyze_fusion_from_alignment
17
+ from .same_kernel_analyzer import analyze_same_kernels_from_alignment
18
+ from .loader import load_trace_full, LoadedTrace
19
+
20
+
21
def analyze_traces_from_loaded(
    trace1: LoadedTrace,
    trace2: LoadedTrace,
    phase_filter: str = "all",
    max_stacks: int = 3,
) -> dict[str, Any]:
    """Analyze two loaded traces and return comparison data.

    Args:
        trace1: First loaded trace
        trace2: Second loaded trace
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')
        max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)

    Returns:
        Dictionary containing:
        - metadata: trace info (GPUs, kernel counts, total times, etc.)
        - operations: per-operation comparison data
        - layers: per-layer comparison data (if layers detected)

    Raises:
        ValueError: If phase_filter selects a phase present in neither trace.
    """
    df1 = trace1.df
    df2 = trace2.df

    # Apply phase filter
    if phase_filter != "all":
        df1_filtered = df1[df1["phase"] == phase_filter]
        df2_filtered = df2[df2["phase"] == phase_filter]

        if len(df1_filtered) == 0 and len(df2_filtered) == 0:
            trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
            trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
            raise ValueError(
                f"No {phase_filter} phase found. "
                f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
            )

        df1, df2 = df1_filtered, df2_filtered

    # Pre-compute aggregations for both operations and layers in single pass
    trace1_by_op = df1.groupby("op").agg({
        "dur_us": ["sum", "mean", "count"],
        "phase": lambda x: set(x.dropna().unique()),
        "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
    })
    trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]

    trace2_by_op = df2.groupby("op").agg({
        "dur_us": ["sum", "mean", "count"],
        "phase": lambda x: set(x.dropna().unique()),
        "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
    })
    trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]

    # Group by layer for layer-level analysis (kernels without layer info are excluded)
    df1_layered = df1[df1["layer"].notna()]
    df2_layered = df2[df2["layer"].notna()]

    trace1_by_layer = df1_layered.groupby("layer").agg({
        "dur_us": ["sum", "count"],
    }) if len(df1_layered) > 0 else pd.DataFrame()
    if len(trace1_by_layer) > 0:
        trace1_by_layer.columns = ["total_us", "count"]

    trace2_by_layer = df2_layered.groupby("layer").agg({
        "dur_us": ["sum", "count"],
    }) if len(df2_layered) > 0 else pd.DataFrame()
    if len(trace2_by_layer) > 0:
        trace2_by_layer.columns = ["total_us", "count"]

    results: dict[str, Any] = {
        "metadata": {
            "trace1_name": f"{trace1.platform} {trace1.gpu_name}",
            "trace2_name": f"{trace2.platform} {trace2.gpu_name}",
            "trace1_platform": trace1.platform,
            "trace1_gpu": trace1.gpu_name,
            "trace1_device": trace1.device_props,
            "trace2_platform": trace2.platform,
            "trace2_gpu": trace2.gpu_name,
            "trace2_device": trace2.device_props,
            "trace1_kernels": len(df1),
            "trace2_kernels": len(df2),
            "trace1_total_ms": df1["dur_us"].sum() / 1000,
            "trace2_total_ms": df2["dur_us"].sum() / 1000,
            "phase": phase_filter,
            "trace1_layers": len(trace1.layers),
            "trace2_layers": len(trace2.layers),
        },
        "operations": [],
        "layers": [],
    }

    # Per-operation comparison
    all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
    # Track whether the RMSNorm / RMSNorm+GEMM cross-comparison has already
    # been emitted, so the pair is not reported twice.
    rmsnorm_compared = False

    for op in sorted(all_ops):
        has_trace1 = op in trace1_by_op.index
        has_trace2 = op in trace2_by_op.index

        trace1_op_for_pattern = op
        trace2_op_for_pattern = op
        skip_comparison = False

        # Handle fusion differences: one platform may fuse RMSNorm with the
        # following GEMM while the other runs them separately.
        if op == "RMSNorm+GEMM" and not has_trace2:
            has_trace2 = "RMSNorm" in trace2_by_op.index
            trace2_op_for_pattern = "RMSNorm"
            rmsnorm_compared = True
        elif op == "RMSNorm" and not has_trace1:
            if rmsnorm_compared:
                skip_comparison = True
            else:
                has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
                trace1_op_for_pattern = "RMSNorm+GEMM"
                rmsnorm_compared = True

        if skip_comparison or not (has_trace1 and has_trace2):
            continue

        trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
        trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]

        trace1_avg = trace1_agg["avg_us"]
        trace2_avg = trace2_agg["avg_us"]
        trace1_total = trace1_agg["total_us"] / 1000
        trace2_total = trace2_agg["total_us"] / 1000
        trace1_count = int(trace1_agg["count"])
        trace2_count = int(trace2_agg["count"])
        ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
        gap_ms = trace1_total - trace2_total

        # Prefer the decode-phase kernel pattern; fall back to prefill, then "unknown".
        trace1_pattern = list(
            trace1.patterns.get(
                (trace1_op_for_pattern, "decode"),
                trace1.patterns.get((trace1_op_for_pattern, "prefill"), {"unknown"}),
            )
        )[0]
        trace2_pattern = list(
            trace2.patterns.get(
                (trace2_op_for_pattern, "decode"),
                trace2.patterns.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
            )
        )[0]

        trace1_cpu_op = trace1_agg["cpu_op"]
        trace2_cpu_op = trace2_agg["cpu_op"]

        # Get detailed kernel data and stacks only when needed
        trace1_data = df1[df1["op"] == trace1_op_for_pattern]
        trace2_data = df2[df2["op"] == trace2_op_for_pattern]

        # Collect Python stacks if available.
        # BUG FIX: the previous `if max_stacks != 0:` guard made max_stacks=0
        # collect NO stacks, contradicting the documented "0 for unlimited"
        # contract (and making the inner `None if max_stacks == 0` dead code).
        # head(None) returns all rows, i.e. unlimited.
        trace1_python_stacks = []
        trace2_python_stacks = []

        stack_limit = None if max_stacks == 0 else max_stacks
        for stack_list in trace1_data["python_stack"].head(stack_limit):
            if stack_list and len(stack_list) > 0:
                trace1_python_stacks.append(stack_list)

        for stack_list in trace2_data["python_stack"].head(stack_limit):
            if stack_list and len(stack_list) > 0:
                trace2_python_stacks.append(stack_list)

        # Aggregate individual kernels
        trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
        trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
        trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
        trace1_kernels_list = trace1_kernels.to_dict("records")

        trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
        trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
        trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
        trace2_kernels_list = trace2_kernels.to_dict("records")

        # Status is based on total-time gap; 5 ms filters measurement noise.
        if gap_ms > 5.0:
            status = "slower"
        elif gap_ms < -5.0:
            status = "faster"
        else:
            status = "similar"

        phases = trace1_agg["phases"] | trace2_agg["phases"]

        results["operations"].append({
            "operation": op,
            "trace1_count": trace1_count,
            "trace2_count": trace2_count,
            "trace1_avg_us": trace1_avg,
            "trace2_avg_us": trace2_avg,
            "trace1_total_ms": trace1_total,
            "trace2_total_ms": trace2_total,
            "ratio": ratio,
            "gap_ms": gap_ms,
            "status": status,
            "trace1_kernel": trace1_pattern,
            "trace2_kernel": trace2_pattern,
            "trace1_cpu_op": trace1_cpu_op,
            "trace2_cpu_op": trace2_cpu_op,
            "trace1_python_stacks": trace1_python_stacks,
            "trace2_python_stacks": trace2_python_stacks,
            "trace1_kernels": trace1_kernels_list,
            "trace2_kernels": trace2_kernels_list,
            "phases": sorted(list(phases)) if phases else ["all"],
        })

    results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)

    # Layer-wise analysis
    if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
        all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))

        for layer_num in all_layers:
            has_trace1 = layer_num in trace1_by_layer.index
            has_trace2 = layer_num in trace2_by_layer.index

            if has_trace1 and has_trace2:
                trace1_agg = trace1_by_layer.loc[layer_num]
                trace2_agg = trace2_by_layer.loc[layer_num]

                trace1_total = trace1_agg["total_us"] / 1000
                trace2_total = trace2_agg["total_us"] / 1000
                trace1_count = int(trace1_agg["count"])
                trace2_count = int(trace2_agg["count"])
                ratio = trace1_total / trace2_total if trace2_total > 0 else 1
                gap_ms = trace1_total - trace2_total

                # Layers use a tighter threshold than operations: 0.1 ms AND 20%.
                threshold_ms = 0.1
                threshold_ratio = 1.2
                if gap_ms > threshold_ms and ratio > threshold_ratio:
                    status = "slower"
                elif gap_ms < -threshold_ms and ratio < (1.0 / threshold_ratio):
                    status = "faster"
                else:
                    status = "similar"

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": trace1_count,
                    "trace2_kernels": trace2_count,
                    "trace1_total_ms": trace1_total,
                    "trace2_total_ms": trace2_total,
                    "ratio": ratio,
                    "gap_ms": gap_ms,
                    "status": status,
                    "in_both": True,
                })
            elif has_trace1:
                trace1_agg = trace1_by_layer.loc[layer_num]
                trace1_total = trace1_agg["total_us"] / 1000
                trace1_count = int(trace1_agg["count"])

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": trace1_count,
                    "trace2_kernels": 0,
                    "trace1_total_ms": trace1_total,
                    "trace2_total_ms": 0.0,
                    "ratio": 0.0,
                    "gap_ms": trace1_total,
                    "status": "trace1_only",
                    "in_both": False,
                })
            elif has_trace2:
                trace2_agg = trace2_by_layer.loc[layer_num]
                trace2_total = trace2_agg["total_us"] / 1000
                trace2_count = int(trace2_agg["count"])

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": 0,
                    "trace2_kernels": trace2_count,
                    "trace1_total_ms": 0.0,
                    "trace2_total_ms": trace2_total,
                    "ratio": 0.0,
                    "gap_ms": -trace2_total,
                    "status": "trace2_only",
                    "in_both": False,
                })

        # Comparable layers first, largest absolute gap first.
        results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)

    return results
14
304
 
15
305
 
16
306
def analyze_traces(
    trace1_path: str | Path,
    trace2_path: str | Path,
    phase_filter: str = "all",
    max_stacks: int = 3,
    include_stacks: bool = True,
) -> dict[str, Any]:
    """Analyze two traces and return comparison data.

    Args:
        trace1_path: Path to first trace file
        trace2_path: Path to second trace file
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')
        max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)
        include_stacks: Whether to include Python stack traces (disable for faster analysis)

    Returns:
        Dictionary containing:
        - metadata: trace info (GPUs, kernel counts, total times, etc.)
        - operations: per-operation comparison data
        - layers: per-layer comparison data (if layers detected)
    """
    # Parse both trace files concurrently: loading is CPU-bound, so two worker
    # processes overlap the work instead of paying for it twice sequentially.
    print("Loading traces in parallel...", file=sys.stderr)

    with ProcessPoolExecutor(max_workers=2) as pool:
        pending = [
            pool.submit(load_trace_full, str(path), include_stacks)
            for path in (trace1_path, trace2_path)
        ]
        loaded_first = pending[0].result()
        loaded_second = pending[1].result()

    print("Analyzing operations...", file=sys.stderr)

    comparison = analyze_traces_from_loaded(
        loaded_first, loaded_second, phase_filter, max_stacks
    )

    # Callers expect the original file paths (not platform/GPU labels) in the
    # name fields, so overwrite them for backward compatibility.
    comparison["metadata"]["trace1_name"] = str(trace1_path)
    comparison["metadata"]["trace2_name"] = str(trace2_path)

    return comparison
347
+
348
+
349
def analyze_traces_aligned(
    trace1: LoadedTrace,
    trace2: LoadedTrace,
    phase_filter: str = "all",
) -> dict[str, Any]:
    """Analyze traces using kernel-to-kernel alignment.

    Args:
        trace1: First loaded trace
        trace2: Second loaded trace
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')

    Returns:
        Dictionary with alignment-based comparison data
    """
    def _select_phases(phases):
        # Keep only phases of the requested type; 'all' keeps everything.
        if phase_filter == "all":
            return phases
        return [p for p in phases if p.get("type") == phase_filter]

    def _restrict_kernels(kernels, phases):
        # Keep kernels whose timestamp falls inside any selected phase window.
        if phase_filter == "all" or not phases:
            return kernels
        windows = [(p["ts_start"], p["ts_end"]) for p in phases]
        return [
            k for k in kernels
            if any(lo <= k.get("ts", 0) <= hi for lo, hi in windows)
        ]

    amd_phases = _select_phases(trace1.phases)
    nvidia_phases = _select_phases(trace2.phases)
    amd_kernels = _restrict_kernels(trace1.kernel_events, amd_phases)
    nvidia_kernels = _restrict_kernels(trace2.kernel_events, nvidia_phases)

    alignment = align_traces(
        amd_kernels,
        nvidia_kernels,
        amd_phases,
        nvidia_phases,
        trace1.platform,
        trace2.platform,
    )

    # Serialize each aligned kernel pair; field order here fixes the key
    # order of the emitted dicts.
    pair_fields = (
        "position",
        "operation",
        "operation_detail",
        "amd_kernel",
        "amd_avg_us",
        "amd_count",
        "amd_total_us",
        "nvidia_kernel",
        "nvidia_avg_us",
        "nvidia_count",
        "nvidia_total_us",
        "ratio",
        "gap_us",
        "fusion_note",
        "is_same_kernel",
    )
    layer_alignments = [
        {
            "layer": layer_align.layer,
            "amd_total_us": layer_align.amd_total_us,
            "nvidia_total_us": layer_align.nvidia_total_us,
            "ratio": layer_align.ratio,
            "gap_us": layer_align.gap_us,
            "kernel_pairs": [
                {field: getattr(pair, field) for field in pair_fields}
                for pair in layer_align.kernel_pairs
            ],
        }
        for layer_align in alignment.layer_alignments
    ]

    fusion_result = analyze_fusion_from_alignment(alignment.layer_alignments)
    same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)

    # Orient the pair so AMD-specific metadata keys refer to the AMD trace.
    if trace1.platform == "AMD":
        amd_trace, nvidia_trace = trace1, trace2
    else:
        amd_trace, nvidia_trace = trace2, trace1

    return {
        "metadata": {
            "amd_gpu": amd_trace.gpu_name,
            "nvidia_gpu": nvidia_trace.gpu_name,
            "amd_platform": amd_trace.platform,
            "nvidia_platform": nvidia_trace.platform,
            "model_layers": alignment.num_layers,
            "forward_passes": alignment.num_forward_passes,
            "phase_breakdown": alignment.phase_breakdown,
            "phase_filter": phase_filter,
            "trace1_platform": trace1.platform,
            "trace1_gpu": trace1.gpu_name,
            "trace1_device": trace1.device_props,
            "trace2_platform": trace2.platform,
            "trace2_gpu": trace2.gpu_name,
            "trace2_device": trace2.device_props,
            "trace1_kernels": len(amd_trace.kernel_events),
            "trace2_kernels": len(nvidia_trace.kernel_events),
            "trace1_total_ms": sum(k.get("dur", 0) for k in amd_trace.kernel_events) / 1000,
            "trace2_total_ms": sum(k.get("dur", 0) for k in nvidia_trace.kernel_events) / 1000,
            "phase": phase_filter,
            "trace1_layers": alignment.num_layers,
            "trace2_layers": alignment.num_layers,
        },
        "layer_alignments": layer_alignments,
        "fusion_analysis": fusion_result,
        "same_kernel_analysis": same_kernel_result,
    }
468
+
40
469
  # Apply phase filter
41
470
  if phase_filter != "all":
42
471
  df1_filtered = df1[df1["phase"] == phase_filter]
43
472
  df2_filtered = df2[df2["phase"] == phase_filter]
44
-
473
+
45
474
  if len(df1_filtered) == 0 and len(df2_filtered) == 0:
46
- # No data in requested phase - return early with error info
47
475
  trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
48
476
  trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
49
477
  raise ValueError(
50
478
  f"No {phase_filter} phase found. "
51
479
  f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
52
480
  )
53
-
481
+
54
482
  df1, df2 = df1_filtered, df2_filtered
55
-
483
+
56
484
  # Pre-compute aggregations for both operations and layers in single pass
57
- # This is much faster than iterating through filtered dataframes multiple times
58
-
59
- # Group by operation for operation-level analysis
60
485
  trace1_by_op = df1.groupby("op").agg({
61
486
  "dur_us": ["sum", "mean", "count"],
62
487
  "phase": lambda x: set(x.dropna().unique()),
63
488
  "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
64
489
  })
65
490
  trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
66
-
491
+
67
492
  trace2_by_op = df2.groupby("op").agg({
68
493
  "dur_us": ["sum", "mean", "count"],
69
494
  "phase": lambda x: set(x.dropna().unique()),
70
495
  "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
71
496
  })
72
497
  trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
73
-
74
- # Group by layer for layer-level analysis (only for kernels with layer info)
498
+
499
+ # Group by layer for layer-level analysis
75
500
  df1_layered = df1[df1["layer"].notna()]
76
501
  df2_layered = df2[df2["layer"].notna()]
77
-
502
+
78
503
  trace1_by_layer = df1_layered.groupby("layer").agg({
79
504
  "dur_us": ["sum", "count"],
80
505
  }) if len(df1_layered) > 0 else pd.DataFrame()
81
506
  if len(trace1_by_layer) > 0:
82
507
  trace1_by_layer.columns = ["total_us", "count"]
83
-
508
+
84
509
  trace2_by_layer = df2_layered.groupby("layer").agg({
85
510
  "dur_us": ["sum", "count"],
86
511
  }) if len(df2_layered) > 0 else pd.DataFrame()
87
512
  if len(trace2_by_layer) > 0:
88
513
  trace2_by_layer.columns = ["total_us", "count"]
89
-
90
- # Calculate per-operation statistics
514
+
91
515
  results: dict[str, Any] = {
92
516
  "metadata": {
93
517
  "trace1_name": str(trace1_path),
@@ -109,47 +533,37 @@ def analyze_traces(
109
533
  "operations": [],
110
534
  "layers": [],
111
535
  }
112
-
113
- # Per-operation comparison using pre-computed aggregations
536
+
537
+ # Per-operation comparison
114
538
  all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
115
-
116
- # Track if we've already compared RMSNorm variants to avoid duplicate comparisons
117
539
  rmsnorm_compared = False
118
-
540
+
119
541
  for op in sorted(all_ops):
120
- # Use pre-computed aggregations instead of filtering entire dataframes
121
542
  has_trace1 = op in trace1_by_op.index
122
543
  has_trace2 = op in trace2_by_op.index
123
-
124
- # Handle RMSNorm fusion differences: AMD does RMSNorm+GEMM, NVIDIA does separate RMSNorm
125
- trace1_op_for_pattern = op # Operation name to use for AMD pattern lookup
126
- trace2_op_for_pattern = op # Operation name to use for NVIDIA pattern lookup
544
+
545
+ trace1_op_for_pattern = op
546
+ trace2_op_for_pattern = op
127
547
  skip_comparison = False
128
-
548
+
129
549
  if op == "RMSNorm+GEMM" and not has_trace2:
130
- # Compare AMD's fused version to NVIDIA's separate RMSNorm
131
550
  has_trace2 = "RMSNorm" in trace2_by_op.index
132
- trace2_op_for_pattern = "RMSNorm" # NVIDIA kernels are stored under 'RMSNorm'
133
- rmsnorm_compared = True # Mark that we've compared RMSNorm
551
+ trace2_op_for_pattern = "RMSNorm"
552
+ rmsnorm_compared = True
134
553
  elif op == "RMSNorm" and not has_trace1:
135
- # Skip this comparison if we already handled it in RMSNorm+GEMM
136
554
  if rmsnorm_compared:
137
555
  skip_comparison = True
138
556
  else:
139
- # Compare NVIDIA's RMSNorm to AMD's fused version
140
557
  has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
141
- trace1_op_for_pattern = (
142
- "RMSNorm+GEMM" # AMD kernels are stored under 'RMSNorm+GEMM'
143
- )
558
+ trace1_op_for_pattern = "RMSNorm+GEMM"
144
559
  rmsnorm_compared = True
145
-
560
+
146
561
  if skip_comparison or not (has_trace1 and has_trace2):
147
562
  continue
148
-
149
- # Get pre-computed aggregations
563
+
150
564
  trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
151
565
  trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]
152
-
566
+
153
567
  trace1_avg = trace1_agg["avg_us"]
154
568
  trace2_avg = trace2_agg["avg_us"]
155
569
  trace1_total = trace1_agg["total_us"] / 1000
@@ -158,8 +572,7 @@ def analyze_traces(
158
572
  trace2_count = int(trace2_agg["count"])
159
573
  ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
160
574
  gap_ms = trace1_total - trace2_total
161
-
162
- # Get kernel patterns using the correct operation names for each platform
575
+
163
576
  trace1_pattern = list(
164
577
  patterns1.get(
165
578
  (trace1_op_for_pattern, "decode"),
@@ -172,106 +585,91 @@ def analyze_traces(
172
585
  patterns2.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
173
586
  )
174
587
  )[0]
175
-
176
- # Get CPU operators from pre-computed aggregations
588
+
177
589
  trace1_cpu_op = trace1_agg["cpu_op"]
178
590
  trace2_cpu_op = trace2_agg["cpu_op"]
179
-
180
- # For detailed kernel data and python stacks, we still need to filter (but only when needed)
591
+
592
+ # Get detailed kernel data and stacks only when needed
181
593
  trace1_data = df1[df1["op"] == trace1_op_for_pattern]
182
594
  trace2_data = df2[df2["op"] == trace2_op_for_pattern]
183
-
184
- # Collect example Python stacks for this operation (for JSON output)
595
+
596
+ # Collect Python stacks if available
185
597
  trace1_python_stacks = []
186
- stack_limit = None if max_stacks == 0 else max_stacks
187
- for stack_list in trace1_data["python_stack"].head(stack_limit):
188
- if stack_list and len(stack_list) > 0:
189
- trace1_python_stacks.append(stack_list)
190
-
191
598
  trace2_python_stacks = []
192
- for stack_list in trace2_data["python_stack"].head(stack_limit):
193
- if stack_list and len(stack_list) > 0:
194
- trace2_python_stacks.append(stack_list)
195
-
196
- # Aggregate individual kernels by name for detailed view
197
- # Group by kernel name and calculate sum/count/avg
599
+
600
+ if include_stacks:
601
+ stack_limit = None if max_stacks == 0 else max_stacks
602
+ for stack_list in trace1_data["python_stack"].head(stack_limit):
603
+ if stack_list and len(stack_list) > 0:
604
+ trace1_python_stacks.append(stack_list)
605
+
606
+ for stack_list in trace2_data["python_stack"].head(stack_limit):
607
+ if stack_list and len(stack_list) > 0:
608
+ trace2_python_stacks.append(stack_list)
609
+
610
+ # Aggregate individual kernels
198
611
  trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
199
612
  trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
200
613
  trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
201
614
  trace1_kernels_list = trace1_kernels.to_dict("records")
202
-
615
+
203
616
  trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
204
617
  trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
205
618
  trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
206
619
  trace2_kernels_list = trace2_kernels.to_dict("records")
207
-
208
- # Determine status based on TOTAL TIME (gap), not per-call ratio
209
- # This handles cases where AMD runs fewer operations via fusion.
210
- # 5ms threshold chosen because:
211
- # - Filters out measurement noise and minor variations
212
- # - Represents meaningful performance impact (0.5% of typical 1s inference)
213
- # - Aligns with human perception of "noticeable" difference
214
- # - Too small (1ms) creates false positives from variance
215
- # - Too large (20ms) misses real optimization opportunities
216
- if gap_ms > 5.0: # AMD spends >5ms more total time
620
+
621
+ if gap_ms > 5.0:
217
622
  status = "slower"
218
- elif gap_ms < -5.0: # AMD spends >5ms less total time
623
+ elif gap_ms < -5.0:
219
624
  status = "faster"
220
625
  else:
221
626
  status = "similar"
222
-
223
- # Get phases from pre-computed aggregations
627
+
224
628
  phases = trace1_agg["phases"] | trace2_agg["phases"]
225
-
226
- results["operations"].append(
227
- {
228
- "operation": op,
229
- "trace1_count": trace1_count,
230
- "trace2_count": trace2_count,
231
- "trace1_avg_us": trace1_avg,
232
- "trace2_avg_us": trace2_avg,
233
- "trace1_total_ms": trace1_total,
234
- "trace2_total_ms": trace2_total,
235
- "ratio": ratio,
236
- "gap_ms": gap_ms,
237
- "status": status,
238
- "trace1_kernel": trace1_pattern,
239
- "trace2_kernel": trace2_pattern,
240
- "trace1_cpu_op": trace1_cpu_op,
241
- "trace2_cpu_op": trace2_cpu_op,
242
- "trace1_python_stacks": trace1_python_stacks, # Full stacks for JSON
243
- "trace2_python_stacks": trace2_python_stacks,
244
- "trace1_kernels": trace1_kernels_list, # All individual kernels for JSON
245
- "trace2_kernels": trace2_kernels_list, # All individual kernels for JSON
246
- "phases": sorted(list(phases)) if phases else ["all"], # For client-side filtering
247
- }
248
- )
249
-
250
- # Sort by absolute gap
629
+
630
+ results["operations"].append({
631
+ "operation": op,
632
+ "trace1_count": trace1_count,
633
+ "trace2_count": trace2_count,
634
+ "trace1_avg_us": trace1_avg,
635
+ "trace2_avg_us": trace2_avg,
636
+ "trace1_total_ms": trace1_total,
637
+ "trace2_total_ms": trace2_total,
638
+ "ratio": ratio,
639
+ "gap_ms": gap_ms,
640
+ "status": status,
641
+ "trace1_kernel": trace1_pattern,
642
+ "trace2_kernel": trace2_pattern,
643
+ "trace1_cpu_op": trace1_cpu_op,
644
+ "trace2_cpu_op": trace2_cpu_op,
645
+ "trace1_python_stacks": trace1_python_stacks,
646
+ "trace2_python_stacks": trace2_python_stacks,
647
+ "trace1_kernels": trace1_kernels_list,
648
+ "trace2_kernels": trace2_kernels_list,
649
+ "phases": sorted(list(phases)) if phases else ["all"],
650
+ })
651
+
251
652
  results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)
252
-
253
- # Layer-wise analysis using pre-computed aggregations
653
+
654
+ # Layer-wise analysis
254
655
  if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
255
- # Get all unique layers present in either trace
256
656
  all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))
257
-
657
+
258
658
  for layer_num in all_layers:
259
659
  has_trace1 = layer_num in trace1_by_layer.index
260
660
  has_trace2 = layer_num in trace2_by_layer.index
261
-
661
+
262
662
  if has_trace1 and has_trace2:
263
- # Layer present in both traces - compare them
264
663
  trace1_agg = trace1_by_layer.loc[layer_num]
265
664
  trace2_agg = trace2_by_layer.loc[layer_num]
266
-
665
+
267
666
  trace1_total = trace1_agg["total_us"] / 1000
268
667
  trace2_total = trace2_agg["total_us"] / 1000
269
668
  trace1_count = int(trace1_agg["count"])
270
669
  trace2_count = int(trace2_agg["count"])
271
670
  ratio = trace1_total / trace2_total if trace2_total > 0 else 1
272
671
  gap_ms = trace1_total - trace2_total
273
-
274
- # Determine status (use smaller threshold for layers: 0.1ms or 20% difference)
672
+
275
673
  threshold_ms = 0.1
276
674
  threshold_ratio = 1.2
277
675
  if gap_ms > threshold_ms and ratio > threshold_ratio:
@@ -280,60 +678,52 @@ def analyze_traces(
280
678
  status = "faster"
281
679
  else:
282
680
  status = "similar"
283
-
284
- results["layers"].append(
285
- {
286
- "layer": int(layer_num),
287
- "trace1_kernels": trace1_count,
288
- "trace2_kernels": trace2_count,
289
- "trace1_total_ms": trace1_total,
290
- "trace2_total_ms": trace2_total,
291
- "ratio": ratio,
292
- "gap_ms": gap_ms,
293
- "status": status,
294
- "in_both": True,
295
- }
296
- )
681
+
682
+ results["layers"].append({
683
+ "layer": int(layer_num),
684
+ "trace1_kernels": trace1_count,
685
+ "trace2_kernels": trace2_count,
686
+ "trace1_total_ms": trace1_total,
687
+ "trace2_total_ms": trace2_total,
688
+ "ratio": ratio,
689
+ "gap_ms": gap_ms,
690
+ "status": status,
691
+ "in_both": True,
692
+ })
297
693
  elif has_trace1:
298
- # Layer only in trace1
299
694
  trace1_agg = trace1_by_layer.loc[layer_num]
300
695
  trace1_total = trace1_agg["total_us"] / 1000
301
696
  trace1_count = int(trace1_agg["count"])
302
-
303
- results["layers"].append(
304
- {
305
- "layer": int(layer_num),
306
- "trace1_kernels": trace1_count,
307
- "trace2_kernels": 0,
308
- "trace1_total_ms": trace1_total,
309
- "trace2_total_ms": 0.0,
310
- "ratio": 0.0,
311
- "gap_ms": trace1_total,
312
- "status": "trace1_only",
313
- "in_both": False,
314
- }
315
- )
697
+
698
+ results["layers"].append({
699
+ "layer": int(layer_num),
700
+ "trace1_kernels": trace1_count,
701
+ "trace2_kernels": 0,
702
+ "trace1_total_ms": trace1_total,
703
+ "trace2_total_ms": 0.0,
704
+ "ratio": 0.0,
705
+ "gap_ms": trace1_total,
706
+ "status": "trace1_only",
707
+ "in_both": False,
708
+ })
316
709
  elif has_trace2:
317
- # Layer only in trace2
318
710
  trace2_agg = trace2_by_layer.loc[layer_num]
319
711
  trace2_total = trace2_agg["total_us"] / 1000
320
712
  trace2_count = int(trace2_agg["count"])
321
-
322
- results["layers"].append(
323
- {
324
- "layer": int(layer_num),
325
- "trace1_kernels": 0,
326
- "trace2_kernels": trace2_count,
327
- "trace1_total_ms": 0.0,
328
- "trace2_total_ms": trace2_total,
329
- "ratio": 0.0,
330
- "gap_ms": -trace2_total,
331
- "status": "trace2_only",
332
- "in_both": False,
333
- }
334
- )
335
-
336
- # Sort: comparable layers first (by absolute gap), then trace-unique layers
713
+
714
+ results["layers"].append({
715
+ "layer": int(layer_num),
716
+ "trace1_kernels": 0,
717
+ "trace2_kernels": trace2_count,
718
+ "trace1_total_ms": 0.0,
719
+ "trace2_total_ms": trace2_total,
720
+ "ratio": 0.0,
721
+ "gap_ms": -trace2_total,
722
+ "status": "trace2_only",
723
+ "in_both": False,
724
+ })
725
+
337
726
  results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)
338
-
727
+
728
+ print("Analysis complete.", file=sys.stderr)
339
729
  return results