wafer-core 0.1.45__py3-none-any.whl → 0.1.47__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,315 @@
1
+ """Deterministic kernel matching using CUDA graph execution order.
2
+
3
+ This module provides 98-99% deterministic matching by leveraging the fact that
4
+ both AMD and NVIDIA traces execute CUDA graphs in identical order, and kernels
5
+ within each graph execute in deterministic timestamp order.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ from typing import Any
10
+
11
+ import orjson
12
+
13
+
14
+ @dataclass
15
+ class KernelMatch:
16
+ """A matched pair of kernels from AMD and NVIDIA traces."""
17
+
18
+ graph_index: int # Which graph execution this belongs to (0-184)
19
+ position_in_graph: int # Position within the graph (0-based)
20
+
21
+ amd_kernel: dict[str, Any]
22
+ nvidia_kernel: dict[str, Any]
23
+
24
+ operation_type: str # GEMM, ATTN, RMS, etc.
25
+ confidence: float # 1.0 = perfect match, <1.0 = potential fusion difference
26
+
27
+ # For debugging/validation
28
+ amd_correlation: int
29
+ nvidia_correlation: int
30
+
31
+
32
+ @dataclass
33
+ class GraphPair:
34
+ """A pair of matched CUDA graphs from AMD and NVIDIA traces."""
35
+
36
+ graph_index: int
37
+ amd_correlation: int
38
+ nvidia_correlation: int
39
+
40
+ amd_kernels: list[dict[str, Any]]
41
+ nvidia_kernels: list[dict[str, Any]]
42
+
43
+ is_layer: bool # True if this is a transformer layer (>100 kernels)
44
+
45
+
46
+ def classify_kernel(name: str) -> str:
47
+ """Classify kernel by operation type.
48
+
49
+ This is a coarse classification for matching purposes.
50
+ """
51
+ nl = name.lower()
52
+
53
+ if 'cijk_' in nl or 'nvjet' in nl:
54
+ return 'GEMM'
55
+ elif 'attention' in nl or 'fmha' in nl:
56
+ return 'ATTN'
57
+ elif 'reshape_and_cache' in nl:
58
+ return 'KV'
59
+ elif 'triton_per' in nl and 'rsqrt' in nl:
60
+ return 'RMS'
61
+ elif 'triton_poi' in nl and 'silu' in nl:
62
+ return 'SILU'
63
+ elif 'triton_poi' in nl:
64
+ return 'POI'
65
+ elif 'triton_red' in nl:
66
+ return 'RED'
67
+ elif 'reduce_segments' in nl:
68
+ return 'RSEG'
69
+ else:
70
+ return 'OTH'
71
+
72
+
73
+ def is_platform_specific_kernel(name: str, platform: str) -> bool:
74
+ """Check if kernel is platform-specific and should be excluded from matching.
75
+
76
+ AMD runs reduce_segments after attention operations, but NVIDIA fuses this
77
+ into adjacent kernels. We need to filter these out for accurate matching.
78
+ """
79
+ if platform == "AMD":
80
+ return 'reduce_segments' in name.lower()
81
+
82
+ # Add NVIDIA-specific exclusions here if discovered
83
+ return False
84
+
85
+
86
+ def load_graph_execution_order(trace_path: str) -> list[tuple[int, int]]:
87
+ """Load CUDA graph execution order from trace.
88
+
89
+ Returns:
90
+ List of (timestamp, correlation_id) tuples in execution order
91
+ """
92
+ with open(trace_path, "rb") as f:
93
+ trace = orjson.loads(f.read())
94
+
95
+ graph_launches = []
96
+ for event in trace.get("traceEvents", []):
97
+ if event.get("cat") == "cuda_runtime":
98
+ name = event.get("name", "")
99
+ if "GraphLaunch" in name or "graphLaunch" in name.lower():
100
+ ts = event.get("ts")
101
+ corr_id = event.get("args", {}).get("correlation")
102
+ if ts is not None and corr_id is not None:
103
+ graph_launches.append((ts, corr_id))
104
+
105
+ # Sort by timestamp to get execution order
106
+ graph_launches.sort()
107
+ return graph_launches
108
+
109
+
110
+ def load_kernels_for_correlation(trace_path: str, correlation_id: int, platform: str) -> list[dict[str, Any]]:
111
+ """Load all kernels for a given correlation ID in timestamp order.
112
+
113
+ Args:
114
+ trace_path: Path to trace JSON
115
+ correlation_id: Correlation ID to filter by
116
+ platform: "AMD" or "NVIDIA" for platform-specific filtering
117
+
118
+ Returns:
119
+ List of kernel events sorted by timestamp, with platform-specific kernels removed
120
+ """
121
+ with open(trace_path, "rb") as f:
122
+ trace = orjson.loads(f.read())
123
+
124
+ kernels = []
125
+ for event in trace.get("traceEvents", []):
126
+ if event.get("cat") == "kernel":
127
+ corr_id = event.get("args", {}).get("correlation")
128
+ if corr_id == correlation_id:
129
+ name = event.get("name", "")
130
+
131
+ # Skip platform-specific kernels
132
+ if is_platform_specific_kernel(name, platform):
133
+ continue
134
+
135
+ kernels.append({
136
+ "name": name,
137
+ "ts": event.get("ts"),
138
+ "dur": event.get("dur", 0),
139
+ "correlation": corr_id,
140
+ "args": event.get("args", {}),
141
+ })
142
+
143
+ # Sort by timestamp for deterministic ordering
144
+ kernels.sort(key=lambda k: k["ts"])
145
+ return kernels
146
+
147
+
148
+ def match_traces(
149
+ amd_trace_path: str,
150
+ nvidia_trace_path: str
151
+ ) -> tuple[list[GraphPair], list[KernelMatch]]:
152
+ """Match kernels between AMD and NVIDIA traces using graph execution order.
153
+
154
+ This provides 98-99% deterministic matching by:
155
+ 1. Matching graphs by execution order (100% deterministic)
156
+ 2. Matching kernels by position within graphs (98-99% deterministic)
157
+ 3. Filtering platform-specific operations (e.g., AMD's reduce_segments)
158
+
159
+ Args:
160
+ amd_trace_path: Path to AMD trace JSON
161
+ nvidia_trace_path: Path to NVIDIA trace JSON
162
+
163
+ Returns:
164
+ Tuple of (graph_pairs, kernel_matches)
165
+ - graph_pairs: List of matched CUDA graph pairs
166
+ - kernel_matches: List of all kernel matches across all graphs
167
+ """
168
+ # Step 1: Get graph execution order from both traces
169
+ amd_graphs = load_graph_execution_order(amd_trace_path)
170
+ nvidia_graphs = load_graph_execution_order(nvidia_trace_path)
171
+
172
+ if len(amd_graphs) != len(nvidia_graphs):
173
+ raise ValueError(
174
+ f"Graph count mismatch: AMD has {len(amd_graphs)} graphs, "
175
+ f"NVIDIA has {len(nvidia_graphs)} graphs. "
176
+ "Traces may be from different workloads."
177
+ )
178
+
179
+ graph_pairs = []
180
+ kernel_matches = []
181
+
182
+ # Step 2: Match graphs by execution order
183
+ for graph_idx, ((amd_ts, amd_corr), (nv_ts, nv_corr)) in enumerate(zip(amd_graphs, nvidia_graphs)):
184
+ # Load kernels for this correlation
185
+ amd_kernels = load_kernels_for_correlation(amd_trace_path, amd_corr, "AMD")
186
+ nvidia_kernels = load_kernels_for_correlation(nvidia_trace_path, nv_corr, "NVIDIA")
187
+
188
+ is_layer = len(amd_kernels) > 100 or len(nvidia_kernels) > 100
189
+
190
+ graph_pairs.append(GraphPair(
191
+ graph_index=graph_idx,
192
+ amd_correlation=amd_corr,
193
+ nvidia_correlation=nv_corr,
194
+ amd_kernels=amd_kernels,
195
+ nvidia_kernels=nvidia_kernels,
196
+ is_layer=is_layer,
197
+ ))
198
+
199
+ # Step 3: Match kernels within this graph by position
200
+ matches = match_kernels_in_graph(
201
+ graph_idx=graph_idx,
202
+ amd_corr=amd_corr,
203
+ nvidia_corr=nv_corr,
204
+ amd_kernels=amd_kernels,
205
+ nvidia_kernels=nvidia_kernels,
206
+ )
207
+ kernel_matches.extend(matches)
208
+
209
+ return graph_pairs, kernel_matches
210
+
211
+
212
+ def match_kernels_in_graph(
213
+ graph_idx: int,
214
+ amd_corr: int,
215
+ nvidia_corr: int,
216
+ amd_kernels: list[dict[str, Any]],
217
+ nvidia_kernels: list[dict[str, Any]],
218
+ ) -> list[KernelMatch]:
219
+ """Match kernels within a single CUDA graph by position.
220
+
221
+ Args:
222
+ graph_idx: Index of this graph in execution order
223
+ amd_corr: AMD correlation ID
224
+ nvidia_corr: NVIDIA correlation ID
225
+ amd_kernels: AMD kernels (already sorted by timestamp, filtered for platform-specific ops)
226
+ nvidia_kernels: NVIDIA kernels (already sorted by timestamp, filtered for platform-specific ops)
227
+
228
+ Returns:
229
+ List of kernel matches with confidence scores
230
+ """
231
+ matches = []
232
+
233
+ # If kernel counts don't match after filtering, something unexpected happened
234
+ if len(amd_kernels) != len(nvidia_kernels):
235
+ # Handle gracefully: match what we can
236
+ min_len = min(len(amd_kernels), len(nvidia_kernels))
237
+
238
+ for i in range(min_len):
239
+ amd_k = amd_kernels[i]
240
+ nv_k = nvidia_kernels[i]
241
+
242
+ amd_type = classify_kernel(amd_k["name"])
243
+ nv_type = classify_kernel(nv_k["name"])
244
+
245
+ # Lower confidence if operation types don't match
246
+ confidence = 1.0 if amd_type == nv_type else 0.5
247
+
248
+ matches.append(KernelMatch(
249
+ graph_index=graph_idx,
250
+ position_in_graph=i,
251
+ amd_kernel=amd_k,
252
+ nvidia_kernel=nv_k,
253
+ operation_type=amd_type,
254
+ confidence=confidence,
255
+ amd_correlation=amd_corr,
256
+ nvidia_correlation=nvidia_corr,
257
+ ))
258
+
259
+ # Note: Unmatched kernels are implicitly dropped
260
+ # Could add logging here for debugging
261
+ else:
262
+ # Perfect length match - match by position
263
+ for i, (amd_k, nv_k) in enumerate(zip(amd_kernels, nvidia_kernels)):
264
+ amd_type = classify_kernel(amd_k["name"])
265
+ nv_type = classify_kernel(nv_k["name"])
266
+
267
+ # Confidence = 1.0 if operation types match, else 0.8
268
+ # (0.8 because position-based matching is still very reliable)
269
+ confidence = 1.0 if amd_type == nv_type else 0.8
270
+
271
+ matches.append(KernelMatch(
272
+ graph_index=graph_idx,
273
+ position_in_graph=i,
274
+ amd_kernel=amd_k,
275
+ nvidia_kernel=nv_k,
276
+ operation_type=amd_type if amd_type == nv_type else f"{amd_type}→{nv_type}",
277
+ confidence=confidence,
278
+ amd_correlation=amd_corr,
279
+ nvidia_correlation=nvidia_corr,
280
+ ))
281
+
282
+ return matches
283
+
284
+
285
+ def get_matching_statistics(kernel_matches: list[KernelMatch]) -> dict[str, Any]:
286
+ """Calculate statistics about matching quality.
287
+
288
+ Returns:
289
+ Dict with:
290
+ - total_matches: Total kernel pairs matched
291
+ - perfect_matches: Matches with confidence=1.0
292
+ - fuzzy_matches: Matches with confidence<1.0
293
+ - match_rate: Percentage of perfect matches
294
+ - by_operation: Breakdown by operation type
295
+ """
296
+ total = len(kernel_matches)
297
+ perfect = sum(1 for m in kernel_matches if m.confidence == 1.0)
298
+
299
+ # Breakdown by operation type
300
+ from collections import defaultdict
301
+ by_operation = defaultdict(lambda: {"total": 0, "perfect": 0})
302
+
303
+ for match in kernel_matches:
304
+ op = match.operation_type
305
+ by_operation[op]["total"] += 1
306
+ if match.confidence == 1.0:
307
+ by_operation[op]["perfect"] += 1
308
+
309
+ return {
310
+ "total_matches": total,
311
+ "perfect_matches": perfect,
312
+ "fuzzy_matches": total - perfect,
313
+ "match_rate": perfect / total if total > 0 else 0.0,
314
+ "by_operation": dict(by_operation),
315
+ }
@@ -0,0 +1,332 @@
1
+ """Improved kernel matching that preserves fusion information.
2
+
3
+ Key improvements over v1:
4
+ 1. Uses existing classifier.py instead of reimplementing
5
+ 2. Marks fusion differences instead of filtering them out
6
+ 3. Provides detailed fusion analysis
7
+ 4. Handles sequence alignment when platforms have different kernel counts
8
+ """
9
+
10
+ from dataclasses import dataclass, field
11
+ from typing import Any
12
+
13
+ import orjson
14
+
15
+ from .classifier import classify
16
+
17
+
18
+ @dataclass
19
+ class FusionDifference:
20
+ """A fusion difference between platforms."""
21
+
22
+ platform_with_kernel: str # "AMD" or "NVIDIA"
23
+ kernel_name: str
24
+ operation_type: str
25
+ position: int
26
+ likely_fused_into: str | None = None # Best guess of where this work went
27
+
28
+
29
+ @dataclass
30
+ class KernelMatch:
31
+ """A matched pair of kernels, or a fusion difference."""
32
+
33
+ graph_index: int
34
+ amd_position: int | None # None if this is NVIDIA-only
35
+ nvidia_position: int | None # None if this is AMD-only
36
+
37
+ amd_kernel: dict[str, Any] | None
38
+ nvidia_kernel: dict[str, Any] | None
39
+
40
+ operation_type: str
41
+ confidence: float # 1.0 = perfect, 0.5 = fusion difference
42
+
43
+ # If this is a fusion difference
44
+ is_fusion_difference: bool = False
45
+ fusion_info: FusionDifference | None = None
46
+
47
+ amd_correlation: int | None = None
48
+ nvidia_correlation: int | None = None
49
+
50
+
51
+ @dataclass
52
+ class GraphPair:
53
+ """Matched CUDA graph pair with fusion analysis."""
54
+
55
+ graph_index: int
56
+ amd_correlation: int
57
+ nvidia_correlation: int
58
+
59
+ amd_kernels: list[dict[str, Any]]
60
+ nvidia_kernels: list[dict[str, Any]]
61
+
62
+ is_layer: bool
63
+ fusion_differences: list[FusionDifference] = field(default_factory=list)
64
+
65
+
66
+ def load_graph_execution_order(trace_path: str) -> list[tuple[int, int]]:
67
+ """Load CUDA graph execution order."""
68
+ with open(trace_path, "rb") as f:
69
+ trace = orjson.loads(f.read())
70
+
71
+ graph_launches = []
72
+ for event in trace.get("traceEvents", []):
73
+ if event.get("cat") == "cuda_runtime":
74
+ name = event.get("name", "")
75
+ if "GraphLaunch" in name or "graphLaunch" in name.lower():
76
+ ts = event.get("ts")
77
+ corr_id = event.get("args", {}).get("correlation")
78
+ if ts is not None and corr_id is not None:
79
+ graph_launches.append((ts, corr_id))
80
+
81
+ graph_launches.sort()
82
+ return graph_launches
83
+
84
+
85
+ def load_kernels_for_correlation(
86
+ trace_path: str, correlation_id: int, platform: str
87
+ ) -> list[dict[str, Any]]:
88
+ """Load all kernels for a correlation, keeping ALL kernels including fusion differences."""
89
+ with open(trace_path, "rb") as f:
90
+ trace = orjson.loads(f.read())
91
+
92
+ kernels = []
93
+ for event in trace.get("traceEvents", []):
94
+ if event.get("cat") == "kernel":
95
+ corr_id = event.get("args", {}).get("correlation")
96
+ if corr_id == correlation_id:
97
+ kernels.append({
98
+ "name": event.get("name", ""),
99
+ "ts": event.get("ts"),
100
+ "dur": event.get("dur", 0),
101
+ "correlation": corr_id,
102
+ "args": event.get("args", {}),
103
+ })
104
+
105
+ kernels.sort(key=lambda k: k["ts"])
106
+ return kernels
107
+
108
+
109
+ def align_sequences_with_fusion(
110
+ amd_kernels: list[dict[str, Any]],
111
+ nvidia_kernels: list[dict[str, Any]],
112
+ platform_amd: str = "AMD",
113
+ platform_nvidia: str = "NVIDIA",
114
+ ) -> list[tuple[int | None, int | None, str]]:
115
+ """Align two kernel sequences, identifying fusion differences.
116
+
117
+ Returns:
118
+ List of (amd_index, nvidia_index, alignment_type) where:
119
+ - alignment_type is "match", "amd_only", or "nvidia_only"
120
+ """
121
+ # Classify all kernels
122
+ amd_ops = [classify(k["name"], platform_amd)[0].value for k in amd_kernels]
123
+ nvidia_ops = [classify(k["name"], platform_nvidia)[0].value for k in nvidia_kernels]
124
+
125
+ # Use simple sequence alignment
126
+ # For now, handle the common case: AMD has extra "reduce_segments" operations
127
+ alignments = []
128
+ amd_i = 0
129
+ nv_i = 0
130
+
131
+ while amd_i < len(amd_ops) or nv_i < len(nvidia_ops):
132
+ if amd_i >= len(amd_ops):
133
+ # Remaining NVIDIA ops
134
+ alignments.append((None, nv_i, "nvidia_only"))
135
+ nv_i += 1
136
+ elif nv_i >= len(nvidia_ops):
137
+ # Remaining AMD ops
138
+ alignments.append((amd_i, None, "amd_only"))
139
+ amd_i += 1
140
+ elif amd_ops[amd_i] == nvidia_ops[nv_i]:
141
+ # Match
142
+ alignments.append((amd_i, nv_i, "match"))
143
+ amd_i += 1
144
+ nv_i += 1
145
+ else:
146
+ # Mismatch - check if AMD has an extra operation
147
+ amd_kernel_name = amd_kernels[amd_i]["name"].lower()
148
+
149
+ # Known fusion differences
150
+ if "reduce_segments" in amd_kernel_name:
151
+ # AMD has reduce_segments, NVIDIA fuses it
152
+ alignments.append((amd_i, None, "amd_only"))
153
+ amd_i += 1
154
+ else:
155
+ # Unknown mismatch - try to match anyway
156
+ alignments.append((amd_i, nv_i, "match"))
157
+ amd_i += 1
158
+ nv_i += 1
159
+
160
+ return alignments
161
+
162
+
163
+ def match_traces(
164
+ amd_trace_path: str,
165
+ nvidia_trace_path: str,
166
+ ) -> tuple[list[GraphPair], list[KernelMatch]]:
167
+ """Match traces with fusion difference detection."""
168
+ amd_graphs = load_graph_execution_order(amd_trace_path)
169
+ nvidia_graphs = load_graph_execution_order(nvidia_trace_path)
170
+
171
+ if len(amd_graphs) != len(nvidia_graphs):
172
+ raise ValueError(
173
+ f"Graph count mismatch: AMD={len(amd_graphs)}, NVIDIA={len(nvidia_graphs)}"
174
+ )
175
+
176
+ # Detect platform from first kernel
177
+ with open(amd_trace_path, "rb") as f:
178
+ amd_trace = orjson.loads(f.read())
179
+ props = amd_trace.get("deviceProperties", [{}])[0]
180
+ platform_amd = "AMD" if amd_trace.get("roctracer_version") or props.get("warpSize") == 64 else "NVIDIA"
181
+
182
+ with open(nvidia_trace_path, "rb") as f:
183
+ nvidia_trace = orjson.loads(f.read())
184
+ props = nvidia_trace.get("deviceProperties", [{}])[0]
185
+ platform_nvidia = "AMD" if nvidia_trace.get("roctracer_version") or props.get("warpSize") == 64 else "NVIDIA"
186
+
187
+ graph_pairs = []
188
+ kernel_matches = []
189
+
190
+ for graph_idx, ((amd_ts, amd_corr), (nv_ts, nv_corr)) in enumerate(
191
+ zip(amd_graphs, nvidia_graphs)
192
+ ):
193
+ amd_kernels = load_kernels_for_correlation(amd_trace_path, amd_corr, platform_amd)
194
+ nvidia_kernels = load_kernels_for_correlation(
195
+ nvidia_trace_path, nv_corr, platform_nvidia
196
+ )
197
+
198
+ is_layer = len(amd_kernels) > 100 or len(nvidia_kernels) > 100
199
+
200
+ # Align sequences
201
+ alignments = align_sequences_with_fusion(
202
+ amd_kernels, nvidia_kernels, platform_amd, platform_nvidia
203
+ )
204
+
205
+ # Create matches and track fusion differences
206
+ fusion_diffs = []
207
+
208
+ for amd_i, nv_i, align_type in alignments:
209
+ if align_type == "match":
210
+ # Perfect match
211
+ amd_k = amd_kernels[amd_i]
212
+ nv_k = nvidia_kernels[nv_i]
213
+
214
+ amd_op, _ = classify(amd_k["name"], platform_amd)
215
+ nv_op, _ = classify(nv_k["name"], platform_nvidia)
216
+
217
+ confidence = 1.0 if amd_op == nv_op else 0.8
218
+
219
+ kernel_matches.append(
220
+ KernelMatch(
221
+ graph_index=graph_idx,
222
+ amd_position=amd_i,
223
+ nvidia_position=nv_i,
224
+ amd_kernel=amd_k,
225
+ nvidia_kernel=nv_k,
226
+ operation_type=amd_op.value,
227
+ confidence=confidence,
228
+ amd_correlation=amd_corr,
229
+ nvidia_correlation=nv_corr,
230
+ )
231
+ )
232
+ elif align_type == "amd_only":
233
+ # Fusion difference: AMD has this, NVIDIA fused it
234
+ amd_k = amd_kernels[amd_i]
235
+ amd_op, _ = classify(amd_k["name"], platform_amd)
236
+
237
+ fusion_diff = FusionDifference(
238
+ platform_with_kernel="AMD",
239
+ kernel_name=amd_k["name"],
240
+ operation_type=amd_op.value,
241
+ position=amd_i,
242
+ likely_fused_into="adjacent operation (NVIDIA fuses)",
243
+ )
244
+ fusion_diffs.append(fusion_diff)
245
+
246
+ kernel_matches.append(
247
+ KernelMatch(
248
+ graph_index=graph_idx,
249
+ amd_position=amd_i,
250
+ nvidia_position=None,
251
+ amd_kernel=amd_k,
252
+ nvidia_kernel=None,
253
+ operation_type=amd_op.value,
254
+ confidence=0.5,
255
+ is_fusion_difference=True,
256
+ fusion_info=fusion_diff,
257
+ amd_correlation=amd_corr,
258
+ nvidia_correlation=nv_corr,
259
+ )
260
+ )
261
+ elif align_type == "nvidia_only":
262
+ # Fusion difference: NVIDIA has this, AMD fused it
263
+ nv_k = nvidia_kernels[nv_i]
264
+ nv_op, _ = classify(nv_k["name"], platform_nvidia)
265
+
266
+ fusion_diff = FusionDifference(
267
+ platform_with_kernel="NVIDIA",
268
+ kernel_name=nv_k["name"],
269
+ operation_type=nv_op.value,
270
+ position=nv_i,
271
+ likely_fused_into="adjacent operation (AMD fuses)",
272
+ )
273
+ fusion_diffs.append(fusion_diff)
274
+
275
+ kernel_matches.append(
276
+ KernelMatch(
277
+ graph_index=graph_idx,
278
+ amd_position=None,
279
+ nvidia_position=nv_i,
280
+ amd_kernel=None,
281
+ nvidia_kernel=nv_k,
282
+ operation_type=nv_op.value,
283
+ confidence=0.5,
284
+ is_fusion_difference=True,
285
+ fusion_info=fusion_diff,
286
+ amd_correlation=amd_corr,
287
+ nvidia_correlation=nv_corr,
288
+ )
289
+ )
290
+
291
+ graph_pairs.append(
292
+ GraphPair(
293
+ graph_index=graph_idx,
294
+ amd_correlation=amd_corr,
295
+ nvidia_correlation=nv_corr,
296
+ amd_kernels=amd_kernels,
297
+ nvidia_kernels=nvidia_kernels,
298
+ is_layer=is_layer,
299
+ fusion_differences=fusion_diffs,
300
+ )
301
+ )
302
+
303
+ return graph_pairs, kernel_matches
304
+
305
+
306
+ def get_matching_statistics(kernel_matches: list[KernelMatch]) -> dict[str, Any]:
307
+ """Calculate statistics including fusion analysis."""
308
+ total = len(kernel_matches)
309
+ perfect = sum(1 for m in kernel_matches if m.confidence == 1.0)
310
+ fusion_diffs = sum(1 for m in kernel_matches if m.is_fusion_difference)
311
+
312
+ # Breakdown by operation type
313
+ from collections import defaultdict
314
+
315
+ by_operation = defaultdict(lambda: {"total": 0, "perfect": 0, "fusion": 0})
316
+
317
+ for match in kernel_matches:
318
+ op = match.operation_type
319
+ by_operation[op]["total"] += 1
320
+ if match.confidence == 1.0:
321
+ by_operation[op]["perfect"] += 1
322
+ if match.is_fusion_difference:
323
+ by_operation[op]["fusion"] += 1
324
+
325
+ return {
326
+ "total_matches": total,
327
+ "perfect_matches": perfect,
328
+ "fuzzy_matches": total - perfect - fusion_diffs,
329
+ "fusion_differences": fusion_diffs,
330
+ "match_rate": perfect / total if total > 0 else 0.0,
331
+ "by_operation": dict(by_operation),
332
+ }
@@ -722,7 +722,15 @@ class App:
722
722
  """Send message through update, execute resulting command."""
723
723
  if not self._running:
724
724
  return
725
+ old_model = self._model
725
726
  self._model, cmd = self._update_fn(self._model, msg)
727
+ _log(
728
+ "dispatch",
729
+ msg_type=type(msg).__name__,
730
+ msg_data=repr(msg)[:200],
731
+ cmd_kind=cmd._kind,
732
+ model_changed=self._model is not old_model,
733
+ )
726
734
  self._execute_cmd(cmd)
727
735
 
728
736
  def _execute_cmd(self, cmd: Cmd) -> None: