wafer-core 0.1.25__py3-none-any.whl → 0.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,729 @@
1
+ """Main trace comparison analysis logic.
2
+
3
+ Compares GPU traces from AMD and NVIDIA platforms, identifying performance differences
4
+ at the operation level and layer level.
5
+ """
6
+
7
+ import sys
8
+ from collections import defaultdict
9
+ from concurrent.futures import ProcessPoolExecutor
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ import pandas as pd
14
+
15
+ from .aligner import align_traces, TraceAlignment
16
+ from .fusion_analyzer import analyze_fusion_from_alignment
17
+ from .same_kernel_analyzer import analyze_same_kernels_from_alignment
18
+ from .loader import load_trace_full, LoadedTrace
19
+
20
+
21
def _aggregate_by_op(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate kernel rows per operation.

    Returns a frame indexed by op name with columns:
    total_us, avg_us, count, phases (set of phase labels the op appears in),
    and cpu_op (most common originating CPU op, or None).
    """
    agg = df.groupby("op").agg({
        "dur_us": ["sum", "mean", "count"],
        "phase": lambda x: set(x.dropna().unique()),
        "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
    })
    agg.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
    return agg


def _aggregate_by_layer(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate kernel rows per layer (total_us, count).

    Returns an empty frame when no row has a layer assigned.
    """
    layered = df[df["layer"].notna()]
    if len(layered) == 0:
        return pd.DataFrame()
    agg = layered.groupby("layer").agg({"dur_us": ["sum", "count"]})
    agg.columns = ["total_us", "count"]
    return agg


def _kernel_breakdown(op_rows: pd.DataFrame) -> list[dict[str, Any]]:
    """Per-kernel-name totals for one operation, sorted by total time descending."""
    kernels = op_rows.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
    kernels.columns = ["name", "total_us", "count", "avg_us"]
    return kernels.sort_values("total_us", ascending=False).to_dict("records")


def _collect_stacks(op_rows: pd.DataFrame, limit: int | None) -> list:
    """First `limit` non-empty Python stacks for an operation (all when limit is None).

    Returns [] when the trace was loaded without stack data (no column).
    """
    if "python_stack" not in op_rows.columns:
        return []
    return [s for s in op_rows["python_stack"].head(limit) if s and len(s) > 0]


def _kernel_pattern(patterns: dict, op: str) -> str:
    """Representative kernel-name pattern for an op, preferring decode over prefill."""
    return list(
        patterns.get((op, "decode"), patterns.get((op, "prefill"), {"unknown"}))
    )[0]


def analyze_traces_from_loaded(
    trace1: LoadedTrace,
    trace2: LoadedTrace,
    phase_filter: str = "all",
    max_stacks: int = 3,
) -> dict[str, Any]:
    """Analyze two loaded traces and return comparison data.

    Args:
        trace1: First loaded trace
        trace2: Second loaded trace
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')
        max_stacks: Maximum number of Python stack traces to collect per
            operation (0 for unlimited)

    Returns:
        Dictionary containing:
        - metadata: trace info (GPUs, kernel counts, total times, etc.)
        - operations: per-operation comparison data
        - layers: per-layer comparison data (if layers detected)

    Raises:
        ValueError: if a phase filter is requested and neither trace
            contains that phase.
    """
    df1 = trace1.df
    df2 = trace2.df

    # Apply phase filter. Only fail when *neither* trace has the phase:
    # one-sided emptiness still yields a meaningful (lopsided) comparison.
    if phase_filter != "all":
        df1_filtered = df1[df1["phase"] == phase_filter]
        df2_filtered = df2[df2["phase"] == phase_filter]

        if len(df1_filtered) == 0 and len(df2_filtered) == 0:
            trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
            trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
            raise ValueError(
                f"No {phase_filter} phase found. "
                f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
            )

        df1, df2 = df1_filtered, df2_filtered

    # Pre-compute aggregations for both operations and layers in single pass
    trace1_by_op = _aggregate_by_op(df1)
    trace2_by_op = _aggregate_by_op(df2)
    trace1_by_layer = _aggregate_by_layer(df1)
    trace2_by_layer = _aggregate_by_layer(df2)

    results: dict[str, Any] = {
        "metadata": {
            "trace1_name": f"{trace1.platform} {trace1.gpu_name}",
            "trace2_name": f"{trace2.platform} {trace2.gpu_name}",
            "trace1_platform": trace1.platform,
            "trace1_gpu": trace1.gpu_name,
            "trace1_device": trace1.device_props,
            "trace2_platform": trace2.platform,
            "trace2_gpu": trace2.gpu_name,
            "trace2_device": trace2.device_props,
            "trace1_kernels": len(df1),
            "trace2_kernels": len(df2),
            "trace1_total_ms": df1["dur_us"].sum() / 1000,
            "trace2_total_ms": df2["dur_us"].sum() / 1000,
            "phase": phase_filter,
            "trace1_layers": len(trace1.layers),
            "trace2_layers": len(trace2.layers),
        },
        "operations": [],
        "layers": [],
    }

    # Per-operation comparison
    all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
    # One platform may fuse RMSNorm into the following GEMM; when only one
    # side has the fused op, compare it against the other side's plain
    # RMSNorm — but only once (rmsnorm_compared guards the mirror case).
    rmsnorm_compared = False

    for op in sorted(all_ops):
        has_trace1 = op in trace1_by_op.index
        has_trace2 = op in trace2_by_op.index

        trace1_op_for_pattern = op
        trace2_op_for_pattern = op
        skip_comparison = False

        if op == "RMSNorm+GEMM" and not has_trace2:
            has_trace2 = "RMSNorm" in trace2_by_op.index
            trace2_op_for_pattern = "RMSNorm"
            rmsnorm_compared = True
        elif op == "RMSNorm" and not has_trace1:
            if rmsnorm_compared:
                skip_comparison = True
            else:
                has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
                trace1_op_for_pattern = "RMSNorm+GEMM"
                rmsnorm_compared = True

        if skip_comparison or not (has_trace1 and has_trace2):
            continue

        trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
        trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]

        trace1_avg = trace1_agg["avg_us"]
        trace2_avg = trace2_agg["avg_us"]
        trace1_total = trace1_agg["total_us"] / 1000
        trace2_total = trace2_agg["total_us"] / 1000
        trace1_count = int(trace1_agg["count"])
        trace2_count = int(trace2_agg["count"])
        ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
        gap_ms = trace1_total - trace2_total

        trace1_pattern = _kernel_pattern(trace1.patterns, trace1_op_for_pattern)
        trace2_pattern = _kernel_pattern(trace2.patterns, trace2_op_for_pattern)

        trace1_cpu_op = trace1_agg["cpu_op"]
        trace2_cpu_op = trace2_agg["cpu_op"]

        # Get detailed kernel data and stacks only when needed
        trace1_data = df1[df1["op"] == trace1_op_for_pattern]
        trace2_data = df2[df2["op"] == trace2_op_for_pattern]

        # Collect Python stacks if available.
        # BUG FIX: the docstring promises max_stacks == 0 means "unlimited",
        # but the previous guard (`if max_stacks != 0:`) skipped collection
        # entirely for 0, leaving its own `stack_limit = None if max_stacks
        # == 0 ...` branch unreachable. Honor the documented contract.
        stack_limit = None if max_stacks == 0 else max_stacks
        trace1_python_stacks = _collect_stacks(trace1_data, stack_limit)
        trace2_python_stacks = _collect_stacks(trace2_data, stack_limit)

        # Aggregate individual kernels
        trace1_kernels_list = _kernel_breakdown(trace1_data)
        trace2_kernels_list = _kernel_breakdown(trace2_data)

        # 5 ms of total gap is the materiality threshold for op status.
        if gap_ms > 5.0:
            status = "slower"
        elif gap_ms < -5.0:
            status = "faster"
        else:
            status = "similar"

        phases = trace1_agg["phases"] | trace2_agg["phases"]

        results["operations"].append({
            "operation": op,
            "trace1_count": trace1_count,
            "trace2_count": trace2_count,
            "trace1_avg_us": trace1_avg,
            "trace2_avg_us": trace2_avg,
            "trace1_total_ms": trace1_total,
            "trace2_total_ms": trace2_total,
            "ratio": ratio,
            "gap_ms": gap_ms,
            "status": status,
            "trace1_kernel": trace1_pattern,
            "trace2_kernel": trace2_pattern,
            "trace1_cpu_op": trace1_cpu_op,
            "trace2_cpu_op": trace2_cpu_op,
            "trace1_python_stacks": trace1_python_stacks,
            "trace2_python_stacks": trace2_python_stacks,
            "trace1_kernels": trace1_kernels_list,
            "trace2_kernels": trace2_kernels_list,
            "phases": sorted(list(phases)) if phases else ["all"],
        })

    results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)

    # Layer-wise analysis
    if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
        all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))

        for layer_num in all_layers:
            has_trace1 = layer_num in trace1_by_layer.index
            has_trace2 = layer_num in trace2_by_layer.index

            if has_trace1 and has_trace2:
                trace1_agg = trace1_by_layer.loc[layer_num]
                trace2_agg = trace2_by_layer.loc[layer_num]

                trace1_total = trace1_agg["total_us"] / 1000
                trace2_total = trace2_agg["total_us"] / 1000
                trace1_count = int(trace1_agg["count"])
                trace2_count = int(trace2_agg["count"])
                ratio = trace1_total / trace2_total if trace2_total > 0 else 1
                gap_ms = trace1_total - trace2_total

                # A layer is only flagged when both the absolute gap and the
                # relative ratio are material, to avoid noise on tiny layers.
                threshold_ms = 0.1
                threshold_ratio = 1.2
                if gap_ms > threshold_ms and ratio > threshold_ratio:
                    status = "slower"
                elif gap_ms < -threshold_ms and ratio < (1.0 / threshold_ratio):
                    status = "faster"
                else:
                    status = "similar"

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": trace1_count,
                    "trace2_kernels": trace2_count,
                    "trace1_total_ms": trace1_total,
                    "trace2_total_ms": trace2_total,
                    "ratio": ratio,
                    "gap_ms": gap_ms,
                    "status": status,
                    "in_both": True,
                })
            elif has_trace1:
                trace1_agg = trace1_by_layer.loc[layer_num]
                trace1_total = trace1_agg["total_us"] / 1000
                trace1_count = int(trace1_agg["count"])

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": trace1_count,
                    "trace2_kernels": 0,
                    "trace1_total_ms": trace1_total,
                    "trace2_total_ms": 0.0,
                    "ratio": 0.0,
                    "gap_ms": trace1_total,
                    "status": "trace1_only",
                    "in_both": False,
                })
            elif has_trace2:
                trace2_agg = trace2_by_layer.loc[layer_num]
                trace2_total = trace2_agg["total_us"] / 1000
                trace2_count = int(trace2_agg["count"])

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": 0,
                    "trace2_kernels": trace2_count,
                    "trace1_total_ms": 0.0,
                    "trace2_total_ms": trace2_total,
                    "ratio": 0.0,
                    "gap_ms": -trace2_total,
                    "status": "trace2_only",
                    "in_both": False,
                })

        # Layers present in both traces first, biggest gaps first.
        results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)

    return results
304
+
305
+
306
def analyze_traces(
    trace1_path: str | Path,
    trace2_path: str | Path,
    phase_filter: str = "all",
    max_stacks: int = 3,
    include_stacks: bool = True,
) -> dict[str, Any]:
    """Analyze two trace files and return comparison data.

    Args:
        trace1_path: Path to first trace file
        trace2_path: Path to second trace file
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')
        max_stacks: Maximum number of Python stack traces to collect per
            operation (0 for unlimited)
        include_stacks: Whether to include Python stack traces (disable for
            faster analysis)

    Returns:
        Dictionary containing:
        - metadata: trace info (GPUs, kernel counts, total times, etc.)
        - operations: per-operation comparison data
        - layers: per-layer comparison data (if layers detected)
    """
    # Parse both trace files concurrently in worker processes; this is
    # roughly 1.7x faster than loading them one after the other.
    print("Loading traces in parallel...", file=sys.stderr)

    with ProcessPoolExecutor(max_workers=2) as pool:
        pending = [
            pool.submit(load_trace_full, str(path), include_stacks)
            for path in (trace1_path, trace2_path)
        ]
        loaded_first, loaded_second = (f.result() for f in pending)

    print("Analyzing operations...", file=sys.stderr)

    comparison = analyze_traces_from_loaded(
        loaded_first, loaded_second, phase_filter, max_stacks
    )

    # Callers of this path-based API expect the file paths (not the
    # "<platform> <gpu>" labels) in the name fields — keep that contract.
    comparison["metadata"]["trace1_name"] = str(trace1_path)
    comparison["metadata"]["trace2_name"] = str(trace2_path)

    return comparison
347
+
348
+
349
def _kernels_in_phases(kernels: list[dict], phases: list[dict]) -> list[dict]:
    """Keep only kernels whose start timestamp falls inside any phase window.

    A kernel with no 'ts' field is treated as starting at 0 (matching the
    original inline filter's `k.get("ts", 0)` behavior).
    """
    windows = [(p["ts_start"], p["ts_end"]) for p in phases]
    return [
        k for k in kernels
        if any(start <= k.get("ts", 0) <= end for start, end in windows)
    ]


def analyze_traces_aligned(
    trace1: LoadedTrace,
    trace2: LoadedTrace,
    phase_filter: str = "all",
) -> dict[str, Any]:
    """Analyze traces using kernel-to-kernel alignment.

    Args:
        trace1: First loaded trace
        trace2: Second loaded trace
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')

    Returns:
        Dictionary with alignment-based comparison data

    Note:
        A large block of unreachable code that previously sat after the
        return statement (a refactor leftover referencing undefined names
        such as ``trace1_path`` and ``patterns1``) has been removed;
        reachable behavior is unchanged.
    """
    # Local names assume trace1 is AMD; the platform check before the
    # metadata block swaps the roles when that is not the case.
    amd_phases = trace1.phases
    nvidia_phases = trace2.phases

    if phase_filter != "all":
        amd_phases = [p for p in amd_phases if p.get("type") == phase_filter]
        nvidia_phases = [p for p in nvidia_phases if p.get("type") == phase_filter]

    amd_kernels = trace1.kernel_events
    nvidia_kernels = trace2.kernel_events

    # When filtering, restrict kernels to the selected phase time windows.
    if phase_filter != "all" and amd_phases:
        amd_kernels = _kernels_in_phases(amd_kernels, amd_phases)
    if phase_filter != "all" and nvidia_phases:
        nvidia_kernels = _kernels_in_phases(nvidia_kernels, nvidia_phases)

    alignment = align_traces(
        amd_kernels,
        nvidia_kernels,
        amd_phases,
        nvidia_phases,
        trace1.platform,
        trace2.platform,
    )

    # Flatten the alignment objects into plain dicts for serialization.
    layer_alignments = []
    for layer_align in alignment.layer_alignments:
        kernel_pairs = [
            {
                "position": pair.position,
                "operation": pair.operation,
                "operation_detail": pair.operation_detail,
                "amd_kernel": pair.amd_kernel,
                "amd_avg_us": pair.amd_avg_us,
                "amd_count": pair.amd_count,
                "amd_total_us": pair.amd_total_us,
                "nvidia_kernel": pair.nvidia_kernel,
                "nvidia_avg_us": pair.nvidia_avg_us,
                "nvidia_count": pair.nvidia_count,
                "nvidia_total_us": pair.nvidia_total_us,
                "ratio": pair.ratio,
                "gap_us": pair.gap_us,
                "fusion_note": pair.fusion_note,
                "is_same_kernel": pair.is_same_kernel,
            }
            for pair in layer_align.kernel_pairs
        ]

        layer_alignments.append({
            "layer": layer_align.layer,
            "amd_total_us": layer_align.amd_total_us,
            "nvidia_total_us": layer_align.nvidia_total_us,
            "ratio": layer_align.ratio,
            "gap_us": layer_align.gap_us,
            "kernel_pairs": kernel_pairs,
        })

    fusion_result = analyze_fusion_from_alignment(alignment.layer_alignments)
    same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)

    # Normalize which input is the AMD trace for the metadata block.
    if trace1.platform == "AMD":
        amd_trace, nvidia_trace = trace1, trace2
    else:
        amd_trace, nvidia_trace = trace2, trace1

    return {
        "metadata": {
            "amd_gpu": amd_trace.gpu_name,
            "nvidia_gpu": nvidia_trace.gpu_name,
            "amd_platform": amd_trace.platform,
            "nvidia_platform": nvidia_trace.platform,
            "model_layers": alignment.num_layers,
            "forward_passes": alignment.num_forward_passes,
            "phase_breakdown": alignment.phase_breakdown,
            "phase_filter": phase_filter,
            "trace1_platform": trace1.platform,
            "trace1_gpu": trace1.gpu_name,
            "trace1_device": trace1.device_props,
            "trace2_platform": trace2.platform,
            "trace2_gpu": trace2.gpu_name,
            "trace2_device": trace2.device_props,
            # NOTE: kernel counts/totals are keyed trace1/trace2 but computed
            # from the AMD/NVIDIA-normalized traces (pre-filter totals), as
            # in the original implementation.
            "trace1_kernels": len(amd_trace.kernel_events),
            "trace2_kernels": len(nvidia_trace.kernel_events),
            "trace1_total_ms": sum(k.get("dur", 0) for k in amd_trace.kernel_events) / 1000,
            "trace2_total_ms": sum(k.get("dur", 0) for k in nvidia_trace.kernel_events) / 1000,
            "phase": phase_filter,
            "trace1_layers": alignment.num_layers,
            "trace2_layers": alignment.num_layers,
        },
        "layer_alignments": layer_alignments,
        "fusion_analysis": fusion_result,
        "same_kernel_analysis": same_kernel_result,
    }