wafer-core 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,13 +4,303 @@ Compares GPU traces from AMD and NVIDIA platforms, identifying performance diffe
4
4
  at the operation level and layer level.
5
5
  """
6
6
 
7
+ import sys
7
8
  from collections import defaultdict
9
+ from concurrent.futures import ProcessPoolExecutor
8
10
  from pathlib import Path
9
11
  from typing import Any
10
12
 
11
13
  import pandas as pd
12
14
 
13
- from .loader import load_trace
15
+ from .aligner import align_traces, TraceAlignment
16
+ from .fusion_analyzer import analyze_fusion_from_alignment
17
+ from .same_kernel_analyzer import analyze_same_kernels_from_alignment
18
+ from .loader import load_trace_full, LoadedTrace
19
+
20
+
21
def _aggregate_by_op(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise kernel rows per operation.

    Returns a frame indexed by ``op`` with the columns ``total_us``,
    ``avg_us``, ``count``, ``phases`` (set of non-null phase labels seen for
    the operation) and ``cpu_op`` (most frequent non-null CPU operator, or
    None when there is none).
    """
    by_op = df.groupby("op").agg({
        "dur_us": ["sum", "mean", "count"],
        "phase": lambda x: set(x.dropna().unique()),
        "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
    })
    by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
    return by_op


def _aggregate_by_layer(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise kernel rows per layer, ignoring rows without a layer.

    Returns a frame indexed by ``layer`` with ``total_us`` and ``count``
    columns, or an empty frame when no row carries a layer.
    """
    layered = df[df["layer"].notna()]
    if len(layered) == 0:
        return pd.DataFrame()
    by_layer = layered.groupby("layer").agg({"dur_us": ["sum", "count"]})
    by_layer.columns = ["total_us", "count"]
    return by_layer


def _representative_kernel(patterns: Any, op: str) -> Any:
    """Pick one kernel pattern for ``op``: decode first, then prefill, else "unknown"."""
    candidates = patterns.get((op, "decode"))
    if candidates is None:
        candidates = patterns.get((op, "prefill"), {"unknown"})
    # An empty collection used to raise IndexError; fall back to "unknown".
    return next(iter(candidates), "unknown")


def _collect_stacks(op_rows: pd.DataFrame, max_stacks: int) -> list[Any]:
    """Return the non-empty Python stacks among the first ``max_stacks`` rows.

    ``max_stacks == 0`` means "no limit", as documented on the public API.
    """
    # NOTE(review): presumably a trace loaded without stacks may lack this
    # column; confirm against the loader. Returning no stacks is the safe
    # interpretation either way.
    if "python_stack" not in op_rows.columns:
        return []
    stacks = op_rows["python_stack"]
    if max_stacks != 0:
        stacks = stacks.head(max_stacks)
    return [stack for stack in stacks if stack and len(stack) > 0]


def _kernel_breakdown(op_rows: pd.DataFrame) -> list[dict[str, Any]]:
    """Aggregate ``op_rows`` per kernel name, largest total duration first."""
    per_kernel = op_rows.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
    per_kernel.columns = ["name", "total_us", "count", "avg_us"]
    return per_kernel.sort_values("total_us", ascending=False).to_dict("records")


def analyze_traces_from_loaded(
    trace1: LoadedTrace,
    trace2: LoadedTrace,
    phase_filter: str = "all",
    max_stacks: int = 3,
) -> dict[str, Any]:
    """Analyze two loaded traces and return comparison data.

    Args:
        trace1: First loaded trace
        trace2: Second loaded trace
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')
        max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)

    Returns:
        Dictionary containing:
        - metadata: trace info (GPUs, kernel counts, total times, etc.)
        - operations: per-operation comparison data, largest absolute gap first
        - layers: per-layer comparison data (if layers detected); layers present
          in both traces come first, each group ordered by absolute gap

    Raises:
        ValueError: If ``phase_filter`` is not 'all' and neither trace has any
            kernel in the requested phase.
    """
    df1 = trace1.df
    df2 = trace2.df

    # Apply phase filter
    if phase_filter != "all":
        df1_filtered = df1[df1["phase"] == phase_filter]
        df2_filtered = df2[df2["phase"] == phase_filter]

        if len(df1_filtered) == 0 and len(df2_filtered) == 0:
            trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
            trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
            raise ValueError(
                f"No {phase_filter} phase found. "
                f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
            )

        df1, df2 = df1_filtered, df2_filtered

    # Aggregate once up front; the loops below only look rows up by key.
    trace1_by_op = _aggregate_by_op(df1)
    trace2_by_op = _aggregate_by_op(df2)
    trace1_by_layer = _aggregate_by_layer(df1)
    trace2_by_layer = _aggregate_by_layer(df2)

    results: dict[str, Any] = {
        "metadata": {
            "trace1_name": f"{trace1.platform} {trace1.gpu_name}",
            "trace2_name": f"{trace2.platform} {trace2.gpu_name}",
            "trace1_platform": trace1.platform,
            "trace1_gpu": trace1.gpu_name,
            "trace1_device": trace1.device_props,
            "trace2_platform": trace2.platform,
            "trace2_gpu": trace2.gpu_name,
            "trace2_device": trace2.device_props,
            "trace1_kernels": len(df1),
            "trace2_kernels": len(df2),
            "trace1_total_ms": df1["dur_us"].sum() / 1000,
            "trace2_total_ms": df2["dur_us"].sum() / 1000,
            "phase": phase_filter,
            "trace1_layers": len(trace1.layers),
            "trace2_layers": len(trace2.layers),
        },
        "operations": [],
        "layers": [],
    }

    # Per-operation comparison
    all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
    rmsnorm_compared = False

    for op in sorted(all_ops):
        has_trace1 = op in trace1_by_op.index
        has_trace2 = op in trace2_by_op.index

        # One trace may run a fused RMSNorm+GEMM where the other runs a
        # separate RMSNorm; pair those two operations with each other.
        # NOTE(review): sorted() visits "RMSNorm" before "RMSNorm+GEMM", so
        # the rmsnorm_compared guard below never fires and such a pair is
        # reported under both names. Confirm whether that is intended.
        trace1_op = op
        trace2_op = op

        if op == "RMSNorm+GEMM" and not has_trace2:
            has_trace2 = "RMSNorm" in trace2_by_op.index
            trace2_op = "RMSNorm"
            rmsnorm_compared = True
        elif op == "RMSNorm" and not has_trace1:
            if rmsnorm_compared:
                continue
            has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
            trace1_op = "RMSNorm+GEMM"
            rmsnorm_compared = True

        # Only operations present on both sides can be compared.
        if not (has_trace1 and has_trace2):
            continue

        trace1_agg = trace1_by_op.loc[trace1_op]
        trace2_agg = trace2_by_op.loc[trace2_op]

        trace1_avg = trace1_agg["avg_us"]
        trace2_avg = trace2_agg["avg_us"]
        trace1_total = trace1_agg["total_us"] / 1000
        trace2_total = trace2_agg["total_us"] / 1000
        ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
        gap_ms = trace1_total - trace2_total

        # Per-kernel detail and stacks need the raw rows of this operation.
        trace1_data = df1[df1["op"] == trace1_op]
        trace2_data = df2[df2["op"] == trace2_op]

        # Status follows the total-time gap (5 ms threshold), not the per-call
        # ratio, so a trace that issues fewer calls is not penalised for it.
        if gap_ms > 5.0:
            status = "slower"
        elif gap_ms < -5.0:
            status = "faster"
        else:
            status = "similar"

        phases = trace1_agg["phases"] | trace2_agg["phases"]

        results["operations"].append({
            "operation": op,
            "trace1_count": int(trace1_agg["count"]),
            "trace2_count": int(trace2_agg["count"]),
            "trace1_avg_us": trace1_avg,
            "trace2_avg_us": trace2_avg,
            "trace1_total_ms": trace1_total,
            "trace2_total_ms": trace2_total,
            "ratio": ratio,
            "gap_ms": gap_ms,
            "status": status,
            "trace1_kernel": _representative_kernel(trace1.patterns, trace1_op),
            "trace2_kernel": _representative_kernel(trace2.patterns, trace2_op),
            "trace1_cpu_op": trace1_agg["cpu_op"],
            "trace2_cpu_op": trace2_agg["cpu_op"],
            "trace1_python_stacks": _collect_stacks(trace1_data, max_stacks),
            "trace2_python_stacks": _collect_stacks(trace2_data, max_stacks),
            "trace1_kernels": _kernel_breakdown(trace1_data),
            "trace2_kernels": _kernel_breakdown(trace2_data),
            "phases": sorted(phases) if phases else ["all"],
        })

    results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)

    # Layer-wise analysis (the union is empty when neither trace has layers)
    all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))

    for layer_num in all_layers:
        has_trace1 = layer_num in trace1_by_layer.index
        has_trace2 = layer_num in trace2_by_layer.index

        if has_trace1 and has_trace2:
            trace1_agg = trace1_by_layer.loc[layer_num]
            trace2_agg = trace2_by_layer.loc[layer_num]

            trace1_total = trace1_agg["total_us"] / 1000
            trace2_total = trace2_agg["total_us"] / 1000
            ratio = trace1_total / trace2_total if trace2_total > 0 else 1
            gap_ms = trace1_total - trace2_total

            # Layers use a tighter test than operations: both a 0.1 ms gap
            # and a 20% ratio difference are required.
            threshold_ms = 0.1
            threshold_ratio = 1.2
            if gap_ms > threshold_ms and ratio > threshold_ratio:
                status = "slower"
            elif gap_ms < -threshold_ms and ratio < (1.0 / threshold_ratio):
                status = "faster"
            else:
                status = "similar"

            results["layers"].append({
                "layer": int(layer_num),
                "trace1_kernels": int(trace1_agg["count"]),
                "trace2_kernels": int(trace2_agg["count"]),
                "trace1_total_ms": trace1_total,
                "trace2_total_ms": trace2_total,
                "ratio": ratio,
                "gap_ms": gap_ms,
                "status": status,
                "in_both": True,
            })
        elif has_trace1:
            trace1_agg = trace1_by_layer.loc[layer_num]
            trace1_total = trace1_agg["total_us"] / 1000

            results["layers"].append({
                "layer": int(layer_num),
                "trace1_kernels": int(trace1_agg["count"]),
                "trace2_kernels": 0,
                "trace1_total_ms": trace1_total,
                "trace2_total_ms": 0.0,
                "ratio": 0.0,
                "gap_ms": trace1_total,
                "status": "trace1_only",
                "in_both": False,
            })
        elif has_trace2:
            trace2_agg = trace2_by_layer.loc[layer_num]
            trace2_total = trace2_agg["total_us"] / 1000

            results["layers"].append({
                "layer": int(layer_num),
                "trace1_kernels": 0,
                "trace2_kernels": int(trace2_agg["count"]),
                "trace1_total_ms": 0.0,
                "trace2_total_ms": trace2_total,
                "ratio": 0.0,
                "gap_ms": -trace2_total,
                "status": "trace2_only",
                "in_both": False,
            })

    # Comparable layers first, then trace-unique ones; largest absolute gap
    # first within each group. (Keying on `not in_both` with reverse=True put
    # the trace-unique layers first.)
    results["layers"].sort(key=lambda x: (x["in_both"], abs(x["gap_ms"])), reverse=True)

    return results
14
304
 
15
305
 
16
306
  def analyze_traces(
@@ -18,76 +308,219 @@ def analyze_traces(
18
308
  trace2_path: str | Path,
19
309
  phase_filter: str = "all",
20
310
  max_stacks: int = 3,
311
+ include_stacks: bool = True,
21
312
  ) -> dict[str, Any]:
22
313
  """Analyze two traces and return comparison data.
23
-
314
+
24
315
  Args:
25
316
  trace1_path: Path to first trace file
26
317
  trace2_path: Path to second trace file
27
318
  phase_filter: Filter by phase ('all', 'prefill', or 'decode')
28
319
  max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)
29
-
320
+ include_stacks: Whether to include Python stack traces (disable for faster analysis)
321
+
30
322
  Returns:
31
323
  Dictionary containing:
32
324
  - metadata: trace info (GPUs, kernel counts, total times, etc.)
33
325
  - operations: per-operation comparison data
34
326
  - layers: per-layer comparison data (if layers detected)
35
327
  """
36
- # Load traces
37
- p1, gpu1, dev1, df1, patterns1, layers1 = load_trace(trace1_path)
38
- p2, gpu2, dev2, df2, patterns2, layers2 = load_trace(trace2_path)
39
-
328
+ # Load both traces in parallel using separate processes
329
+ # This provides ~1.7x speedup over sequential loading
330
+ print("Loading traces in parallel...", file=sys.stderr)
331
+
332
+ with ProcessPoolExecutor(max_workers=2) as executor:
333
+ future1 = executor.submit(load_trace_full, str(trace1_path), include_stacks)
334
+ future2 = executor.submit(load_trace_full, str(trace2_path), include_stacks)
335
+ trace1 = future1.result()
336
+ trace2 = future2.result()
337
+
338
+ print("Analyzing operations...", file=sys.stderr)
339
+
340
+ result = analyze_traces_from_loaded(trace1, trace2, phase_filter, max_stacks)
341
+
342
+ # Update metadata with file paths for backward compatibility
343
+ result["metadata"]["trace1_name"] = str(trace1_path)
344
+ result["metadata"]["trace2_name"] = str(trace2_path)
345
+
346
+ return result
347
+
348
+
349
def _kernels_within_phases(kernels: list[Any], phases: list[Any]) -> list[Any]:
    """Keep kernels whose ``ts`` lies inside any phase's [ts_start, ts_end] window."""
    windows = [(p["ts_start"], p["ts_end"]) for p in phases]
    return [
        k for k in kernels
        if any(start <= k.get("ts", 0) <= end for start, end in windows)
    ]


def analyze_traces_aligned(
    trace1: LoadedTrace,
    trace2: LoadedTrace,
    phase_filter: str = "all",
) -> dict[str, Any]:
    """Analyze traces using kernel-to-kernel alignment.

    Args:
        trace1: First loaded trace
        trace2: Second loaded trace
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')

    Returns:
        Dictionary with alignment-based comparison data:
        - metadata: GPU/platform info, layer and forward-pass counts, and
          per-trace kernel counts and total times
        - layer_alignments: per-layer totals with their aligned kernel pairs
        - fusion_analysis: result of analyze_fusion_from_alignment
        - same_kernel_analysis: result of analyze_same_kernels_from_alignment
    """
    trace1_phases = trace1.phases
    trace2_phases = trace2.phases
    trace1_kernels = trace1.kernel_events
    trace2_kernels = trace2.kernel_events

    if phase_filter != "all":
        trace1_phases = [p for p in trace1_phases if p.get("type") == phase_filter]
        trace2_phases = [p for p in trace2_phases if p.get("type") == phase_filter]

        # Kernels are narrowed only when the trace has at least one matching
        # phase; with none, the full kernel list is passed on unchanged.
        if trace1_phases:
            trace1_kernels = _kernels_within_phases(trace1_kernels, trace1_phases)
        if trace2_phases:
            trace2_kernels = _kernels_within_phases(trace2_kernels, trace2_phases)

    # align_traces receives the traces in call order plus their platforms.
    alignment = align_traces(
        trace1_kernels,
        trace2_kernels,
        trace1_phases,
        trace2_phases,
        trace1.platform,
        trace2.platform,
    )

    # Flatten the alignment objects into plain dicts.
    layer_alignments = [
        {
            "layer": layer_align.layer,
            "amd_total_us": layer_align.amd_total_us,
            "nvidia_total_us": layer_align.nvidia_total_us,
            "ratio": layer_align.ratio,
            "gap_us": layer_align.gap_us,
            "kernel_pairs": [
                {
                    "position": pair.position,
                    "operation": pair.operation,
                    "operation_detail": pair.operation_detail,
                    "amd_kernel": pair.amd_kernel,
                    "amd_avg_us": pair.amd_avg_us,
                    "amd_count": pair.amd_count,
                    "amd_total_us": pair.amd_total_us,
                    "nvidia_kernel": pair.nvidia_kernel,
                    "nvidia_avg_us": pair.nvidia_avg_us,
                    "nvidia_count": pair.nvidia_count,
                    "nvidia_total_us": pair.nvidia_total_us,
                    "ratio": pair.ratio,
                    "gap_us": pair.gap_us,
                    "fusion_note": pair.fusion_note,
                    "is_same_kernel": pair.is_same_kernel,
                }
                for pair in layer_align.kernel_pairs
            ],
        }
        for layer_align in alignment.layer_alignments
    ]

    # Determine which trace is AMD vs NVIDIA for fusion analysis
    if trace1.platform == "AMD":
        amd_trace, nvidia_trace = trace1, trace2
        fusion_amd_kernels, fusion_nvidia_kernels = trace1_kernels, trace2_kernels
    else:
        amd_trace, nvidia_trace = trace2, trace1
        fusion_amd_kernels, fusion_nvidia_kernels = trace2_kernels, trace1_kernels

    fusion_result = analyze_fusion_from_alignment(
        alignment.layer_alignments,
        amd_kernels=fusion_amd_kernels,
        nvidia_kernels=fusion_nvidia_kernels,
    )
    same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)

    return {
        "metadata": {
            "amd_gpu": amd_trace.gpu_name,
            "nvidia_gpu": nvidia_trace.gpu_name,
            "amd_platform": amd_trace.platform,
            "nvidia_platform": nvidia_trace.platform,
            "model_layers": alignment.num_layers,
            "forward_passes": alignment.num_forward_passes,
            "phase_breakdown": alignment.phase_breakdown,
            "phase_filter": phase_filter,
            "trace1_platform": trace1.platform,
            "trace1_gpu": trace1.gpu_name,
            "trace1_device": trace1.device_props,
            "trace2_platform": trace2.platform,
            "trace2_gpu": trace2.gpu_name,
            "trace2_device": trace2.device_props,
            # trace1_*/trace2_* follow call order, like the platform/gpu keys
            # above; deriving them from amd_trace/nvidia_trace swapped the
            # numbers whenever trace1 was not the AMD trace.
            "trace1_kernels": len(trace1.kernel_events),
            "trace2_kernels": len(trace2.kernel_events),
            "trace1_total_ms": sum(k.get("dur", 0) for k in trace1.kernel_events) / 1000,
            "trace2_total_ms": sum(k.get("dur", 0) for k in trace2.kernel_events) / 1000,
            "phase": phase_filter,
            "trace1_layers": alignment.num_layers,
            "trace2_layers": alignment.num_layers,
        },
        "layer_alignments": layer_alignments,
        "fusion_analysis": fusion_result,
        "same_kernel_analysis": same_kernel_result,
    }
477
+
40
478
  # Apply phase filter
41
479
  if phase_filter != "all":
42
480
  df1_filtered = df1[df1["phase"] == phase_filter]
43
481
  df2_filtered = df2[df2["phase"] == phase_filter]
44
-
482
+
45
483
  if len(df1_filtered) == 0 and len(df2_filtered) == 0:
46
- # No data in requested phase - return early with error info
47
484
  trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
48
485
  trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
49
486
  raise ValueError(
50
487
  f"No {phase_filter} phase found. "
51
488
  f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
52
489
  )
53
-
490
+
54
491
  df1, df2 = df1_filtered, df2_filtered
55
-
492
+
56
493
  # Pre-compute aggregations for both operations and layers in single pass
57
- # This is much faster than iterating through filtered dataframes multiple times
58
-
59
- # Group by operation for operation-level analysis
60
494
  trace1_by_op = df1.groupby("op").agg({
61
495
  "dur_us": ["sum", "mean", "count"],
62
496
  "phase": lambda x: set(x.dropna().unique()),
63
497
  "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
64
498
  })
65
499
  trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
66
-
500
+
67
501
  trace2_by_op = df2.groupby("op").agg({
68
502
  "dur_us": ["sum", "mean", "count"],
69
503
  "phase": lambda x: set(x.dropna().unique()),
70
504
  "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
71
505
  })
72
506
  trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
73
-
74
- # Group by layer for layer-level analysis (only for kernels with layer info)
507
+
508
+ # Group by layer for layer-level analysis
75
509
  df1_layered = df1[df1["layer"].notna()]
76
510
  df2_layered = df2[df2["layer"].notna()]
77
-
511
+
78
512
  trace1_by_layer = df1_layered.groupby("layer").agg({
79
513
  "dur_us": ["sum", "count"],
80
514
  }) if len(df1_layered) > 0 else pd.DataFrame()
81
515
  if len(trace1_by_layer) > 0:
82
516
  trace1_by_layer.columns = ["total_us", "count"]
83
-
517
+
84
518
  trace2_by_layer = df2_layered.groupby("layer").agg({
85
519
  "dur_us": ["sum", "count"],
86
520
  }) if len(df2_layered) > 0 else pd.DataFrame()
87
521
  if len(trace2_by_layer) > 0:
88
522
  trace2_by_layer.columns = ["total_us", "count"]
89
-
90
- # Calculate per-operation statistics
523
+
91
524
  results: dict[str, Any] = {
92
525
  "metadata": {
93
526
  "trace1_name": str(trace1_path),
@@ -109,47 +542,37 @@ def analyze_traces(
109
542
  "operations": [],
110
543
  "layers": [],
111
544
  }
112
-
113
- # Per-operation comparison using pre-computed aggregations
545
+
546
+ # Per-operation comparison
114
547
  all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
115
-
116
- # Track if we've already compared RMSNorm variants to avoid duplicate comparisons
117
548
  rmsnorm_compared = False
118
-
549
+
119
550
  for op in sorted(all_ops):
120
- # Use pre-computed aggregations instead of filtering entire dataframes
121
551
  has_trace1 = op in trace1_by_op.index
122
552
  has_trace2 = op in trace2_by_op.index
123
-
124
- # Handle RMSNorm fusion differences: AMD does RMSNorm+GEMM, NVIDIA does separate RMSNorm
125
- trace1_op_for_pattern = op # Operation name to use for AMD pattern lookup
126
- trace2_op_for_pattern = op # Operation name to use for NVIDIA pattern lookup
553
+
554
+ trace1_op_for_pattern = op
555
+ trace2_op_for_pattern = op
127
556
  skip_comparison = False
128
-
557
+
129
558
  if op == "RMSNorm+GEMM" and not has_trace2:
130
- # Compare AMD's fused version to NVIDIA's separate RMSNorm
131
559
  has_trace2 = "RMSNorm" in trace2_by_op.index
132
- trace2_op_for_pattern = "RMSNorm" # NVIDIA kernels are stored under 'RMSNorm'
133
- rmsnorm_compared = True # Mark that we've compared RMSNorm
560
+ trace2_op_for_pattern = "RMSNorm"
561
+ rmsnorm_compared = True
134
562
  elif op == "RMSNorm" and not has_trace1:
135
- # Skip this comparison if we already handled it in RMSNorm+GEMM
136
563
  if rmsnorm_compared:
137
564
  skip_comparison = True
138
565
  else:
139
- # Compare NVIDIA's RMSNorm to AMD's fused version
140
566
  has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
141
- trace1_op_for_pattern = (
142
- "RMSNorm+GEMM" # AMD kernels are stored under 'RMSNorm+GEMM'
143
- )
567
+ trace1_op_for_pattern = "RMSNorm+GEMM"
144
568
  rmsnorm_compared = True
145
-
569
+
146
570
  if skip_comparison or not (has_trace1 and has_trace2):
147
571
  continue
148
-
149
- # Get pre-computed aggregations
572
+
150
573
  trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
151
574
  trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]
152
-
575
+
153
576
  trace1_avg = trace1_agg["avg_us"]
154
577
  trace2_avg = trace2_agg["avg_us"]
155
578
  trace1_total = trace1_agg["total_us"] / 1000
@@ -158,8 +581,7 @@ def analyze_traces(
158
581
  trace2_count = int(trace2_agg["count"])
159
582
  ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
160
583
  gap_ms = trace1_total - trace2_total
161
-
162
- # Get kernel patterns using the correct operation names for each platform
584
+
163
585
  trace1_pattern = list(
164
586
  patterns1.get(
165
587
  (trace1_op_for_pattern, "decode"),
@@ -172,106 +594,91 @@ def analyze_traces(
172
594
  patterns2.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
173
595
  )
174
596
  )[0]
175
-
176
- # Get CPU operators from pre-computed aggregations
597
+
177
598
  trace1_cpu_op = trace1_agg["cpu_op"]
178
599
  trace2_cpu_op = trace2_agg["cpu_op"]
179
-
180
- # For detailed kernel data and python stacks, we still need to filter (but only when needed)
600
+
601
+ # Get detailed kernel data and stacks only when needed
181
602
  trace1_data = df1[df1["op"] == trace1_op_for_pattern]
182
603
  trace2_data = df2[df2["op"] == trace2_op_for_pattern]
183
-
184
- # Collect example Python stacks for this operation (for JSON output)
604
+
605
+ # Collect Python stacks if available
185
606
  trace1_python_stacks = []
186
- stack_limit = None if max_stacks == 0 else max_stacks
187
- for stack_list in trace1_data["python_stack"].head(stack_limit):
188
- if stack_list and len(stack_list) > 0:
189
- trace1_python_stacks.append(stack_list)
190
-
191
607
  trace2_python_stacks = []
192
- for stack_list in trace2_data["python_stack"].head(stack_limit):
193
- if stack_list and len(stack_list) > 0:
194
- trace2_python_stacks.append(stack_list)
195
-
196
- # Aggregate individual kernels by name for detailed view
197
- # Group by kernel name and calculate sum/count/avg
608
+
609
+ if include_stacks:
610
+ stack_limit = None if max_stacks == 0 else max_stacks
611
+ for stack_list in trace1_data["python_stack"].head(stack_limit):
612
+ if stack_list and len(stack_list) > 0:
613
+ trace1_python_stacks.append(stack_list)
614
+
615
+ for stack_list in trace2_data["python_stack"].head(stack_limit):
616
+ if stack_list and len(stack_list) > 0:
617
+ trace2_python_stacks.append(stack_list)
618
+
619
+ # Aggregate individual kernels
198
620
  trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
199
621
  trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
200
622
  trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
201
623
  trace1_kernels_list = trace1_kernels.to_dict("records")
202
-
624
+
203
625
  trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
204
626
  trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
205
627
  trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
206
628
  trace2_kernels_list = trace2_kernels.to_dict("records")
207
-
208
- # Determine status based on TOTAL TIME (gap), not per-call ratio
209
- # This handles cases where AMD runs fewer operations via fusion.
210
- # 5ms threshold chosen because:
211
- # - Filters out measurement noise and minor variations
212
- # - Represents meaningful performance impact (0.5% of typical 1s inference)
213
- # - Aligns with human perception of "noticeable" difference
214
- # - Too small (1ms) creates false positives from variance
215
- # - Too large (20ms) misses real optimization opportunities
216
- if gap_ms > 5.0: # AMD spends >5ms more total time
629
+
630
+ if gap_ms > 5.0:
217
631
  status = "slower"
218
- elif gap_ms < -5.0: # AMD spends >5ms less total time
632
+ elif gap_ms < -5.0:
219
633
  status = "faster"
220
634
  else:
221
635
  status = "similar"
222
-
223
- # Get phases from pre-computed aggregations
636
+
224
637
  phases = trace1_agg["phases"] | trace2_agg["phases"]
225
-
226
- results["operations"].append(
227
- {
228
- "operation": op,
229
- "trace1_count": trace1_count,
230
- "trace2_count": trace2_count,
231
- "trace1_avg_us": trace1_avg,
232
- "trace2_avg_us": trace2_avg,
233
- "trace1_total_ms": trace1_total,
234
- "trace2_total_ms": trace2_total,
235
- "ratio": ratio,
236
- "gap_ms": gap_ms,
237
- "status": status,
238
- "trace1_kernel": trace1_pattern,
239
- "trace2_kernel": trace2_pattern,
240
- "trace1_cpu_op": trace1_cpu_op,
241
- "trace2_cpu_op": trace2_cpu_op,
242
- "trace1_python_stacks": trace1_python_stacks, # Full stacks for JSON
243
- "trace2_python_stacks": trace2_python_stacks,
244
- "trace1_kernels": trace1_kernels_list, # All individual kernels for JSON
245
- "trace2_kernels": trace2_kernels_list, # All individual kernels for JSON
246
- "phases": sorted(list(phases)) if phases else ["all"], # For client-side filtering
247
- }
248
- )
249
-
250
- # Sort by absolute gap
638
+
639
+ results["operations"].append({
640
+ "operation": op,
641
+ "trace1_count": trace1_count,
642
+ "trace2_count": trace2_count,
643
+ "trace1_avg_us": trace1_avg,
644
+ "trace2_avg_us": trace2_avg,
645
+ "trace1_total_ms": trace1_total,
646
+ "trace2_total_ms": trace2_total,
647
+ "ratio": ratio,
648
+ "gap_ms": gap_ms,
649
+ "status": status,
650
+ "trace1_kernel": trace1_pattern,
651
+ "trace2_kernel": trace2_pattern,
652
+ "trace1_cpu_op": trace1_cpu_op,
653
+ "trace2_cpu_op": trace2_cpu_op,
654
+ "trace1_python_stacks": trace1_python_stacks,
655
+ "trace2_python_stacks": trace2_python_stacks,
656
+ "trace1_kernels": trace1_kernels_list,
657
+ "trace2_kernels": trace2_kernels_list,
658
+ "phases": sorted(list(phases)) if phases else ["all"],
659
+ })
660
+
251
661
  results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)
252
-
253
- # Layer-wise analysis using pre-computed aggregations
662
+
663
+ # Layer-wise analysis
254
664
  if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
255
- # Get all unique layers present in either trace
256
665
  all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))
257
-
666
+
258
667
  for layer_num in all_layers:
259
668
  has_trace1 = layer_num in trace1_by_layer.index
260
669
  has_trace2 = layer_num in trace2_by_layer.index
261
-
670
+
262
671
  if has_trace1 and has_trace2:
263
- # Layer present in both traces - compare them
264
672
  trace1_agg = trace1_by_layer.loc[layer_num]
265
673
  trace2_agg = trace2_by_layer.loc[layer_num]
266
-
674
+
267
675
  trace1_total = trace1_agg["total_us"] / 1000
268
676
  trace2_total = trace2_agg["total_us"] / 1000
269
677
  trace1_count = int(trace1_agg["count"])
270
678
  trace2_count = int(trace2_agg["count"])
271
679
  ratio = trace1_total / trace2_total if trace2_total > 0 else 1
272
680
  gap_ms = trace1_total - trace2_total
273
-
274
- # Determine status (use smaller threshold for layers: 0.1ms or 20% difference)
681
+
275
682
  threshold_ms = 0.1
276
683
  threshold_ratio = 1.2
277
684
  if gap_ms > threshold_ms and ratio > threshold_ratio:
@@ -280,60 +687,52 @@ def analyze_traces(
280
687
  status = "faster"
281
688
  else:
282
689
  status = "similar"
283
-
284
- results["layers"].append(
285
- {
286
- "layer": int(layer_num),
287
- "trace1_kernels": trace1_count,
288
- "trace2_kernels": trace2_count,
289
- "trace1_total_ms": trace1_total,
290
- "trace2_total_ms": trace2_total,
291
- "ratio": ratio,
292
- "gap_ms": gap_ms,
293
- "status": status,
294
- "in_both": True,
295
- }
296
- )
690
+
691
+ results["layers"].append({
692
+ "layer": int(layer_num),
693
+ "trace1_kernels": trace1_count,
694
+ "trace2_kernels": trace2_count,
695
+ "trace1_total_ms": trace1_total,
696
+ "trace2_total_ms": trace2_total,
697
+ "ratio": ratio,
698
+ "gap_ms": gap_ms,
699
+ "status": status,
700
+ "in_both": True,
701
+ })
297
702
  elif has_trace1:
298
- # Layer only in trace1
299
703
  trace1_agg = trace1_by_layer.loc[layer_num]
300
704
  trace1_total = trace1_agg["total_us"] / 1000
301
705
  trace1_count = int(trace1_agg["count"])
302
-
303
- results["layers"].append(
304
- {
305
- "layer": int(layer_num),
306
- "trace1_kernels": trace1_count,
307
- "trace2_kernels": 0,
308
- "trace1_total_ms": trace1_total,
309
- "trace2_total_ms": 0.0,
310
- "ratio": 0.0,
311
- "gap_ms": trace1_total,
312
- "status": "trace1_only",
313
- "in_both": False,
314
- }
315
- )
706
+
707
+ results["layers"].append({
708
+ "layer": int(layer_num),
709
+ "trace1_kernels": trace1_count,
710
+ "trace2_kernels": 0,
711
+ "trace1_total_ms": trace1_total,
712
+ "trace2_total_ms": 0.0,
713
+ "ratio": 0.0,
714
+ "gap_ms": trace1_total,
715
+ "status": "trace1_only",
716
+ "in_both": False,
717
+ })
316
718
  elif has_trace2:
317
- # Layer only in trace2
318
719
  trace2_agg = trace2_by_layer.loc[layer_num]
319
720
  trace2_total = trace2_agg["total_us"] / 1000
320
721
  trace2_count = int(trace2_agg["count"])
321
-
322
- results["layers"].append(
323
- {
324
- "layer": int(layer_num),
325
- "trace1_kernels": 0,
326
- "trace2_kernels": trace2_count,
327
- "trace1_total_ms": 0.0,
328
- "trace2_total_ms": trace2_total,
329
- "ratio": 0.0,
330
- "gap_ms": -trace2_total,
331
- "status": "trace2_only",
332
- "in_both": False,
333
- }
334
- )
335
-
336
- # Sort: comparable layers first (by absolute gap), then trace-unique layers
722
+
723
+ results["layers"].append({
724
+ "layer": int(layer_num),
725
+ "trace1_kernels": 0,
726
+ "trace2_kernels": trace2_count,
727
+ "trace1_total_ms": 0.0,
728
+ "trace2_total_ms": trace2_total,
729
+ "ratio": 0.0,
730
+ "gap_ms": -trace2_total,
731
+ "status": "trace2_only",
732
+ "in_both": False,
733
+ })
734
+
337
735
  results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)
338
-
736
+
737
+ print("Analysis complete.", file=sys.stderr)
339
738
  return results