tt-perf-report 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tt-perf-report has been flagged as potentially problematic; review the registry's advisory details before using it.

@@ -0,0 +1,858 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
5
+
6
+ import sys
7
+ import argparse
8
+ import re
9
+ from typing import Any, Optional
10
+ from collections import defaultdict
11
+ import pandas as pd
12
+
13
# Global color preference, set once at startup via set_color_output() and
# consulted by colored() on every call.
color_output = None  # None means auto-detect, True forces color, False forces no color
15
+
16
+
17
def set_color_output(force_color, force_no_color):
    """Set the module-level color mode from the CLI flags.

    ``--no-color`` wins over ``--color``; when neither flag is given the
    mode is left as None, meaning auto-detect from the output stream.
    """
    global color_output
    if force_no_color:
        color_output = False
    else:
        color_output = True if force_color else None
25
+
26
+
27
def colored(text, color):
    """Wrap *text* in an ANSI 256-color escape sequence when enabled.

    The module-level ``color_output`` flag governs coloring: None means
    auto-detect (color only when stdout is a TTY); True/False force it.
    Passing a falsy *color* always returns the text unchanged.
    """
    enabled = sys.stdout.isatty() if color_output is None else color_output
    if not (enabled and color):
        return text

    palette = {
        "grey": "\033[38;5;8m",
        "red": "\033[38;5;9m",
        "green": "\033[38;5;10m",
        "yellow": "\033[38;5;11m",
        "blue": "\033[38;5;12m",
        "magenta": "\033[38;5;13m",
        "cyan": "\033[38;5;14m",
        "white": "\033[38;5;15m",
        "end": "\033[0m",
    }
    return f"{palette[color]}{text}{palette['end']}"
48
+
49
+
50
# Per-core peak throughput (TFLOPs) for each math fidelity.
# Source: https://tenstorrent.com/assets/one-pagers/08.01.24_Wormhole.pdf
_TFLOPS_PER_CORE = {
    "HiFi4": 74 / 72,
    "HiFi2": 148 / 72,
    "LoFi": 262 / 72,
}


def tflops_per_core(math_fidelity):
    """Return the peak TFLOPs of a single core at *math_fidelity*.

    Raises ValueError for an unknown fidelity. (The original used
    ``assert False`` for this, which is silently stripped under
    ``python -O``; raising is the reliable way to validate input.)
    """
    try:
        return _TFLOPS_PER_CORE[math_fidelity]
    except KeyError:
        raise ValueError(f"Unknown math fidelity: {math_fidelity}") from None
60
+
61
+
62
class Cell:
    """One table value plus its presentation metadata.

    Holds the raw value alongside an optional unit suffix, the number of
    decimals to display, and an optional ANSI color name.
    """

    def __init__(self, value: Any, unit: Optional[str] = None, decimals=0, color=None):
        self.raw_value = value
        self.unit = unit
        self.decimals = decimals
        self.color = color

    def format(self):
        """Render the cell as a display string (may embed ANSI escapes)."""
        value = self.raw_value
        if value is None or pd.isna(value):
            return ""

        if isinstance(value, str) and "Matmul" in value:
            # "Matmul M x K x N": color the op name, grey out the size.
            pieces = value.split(maxsplit=1)
            op_name = pieces[0]
            size = pieces[1] if len(pieces) > 1 else ""
            name_part = colored(op_name, self.color) if self.color else op_name
            text = f"{name_part} {colored(size, 'grey')}"
        else:
            # Numbers get thousands separators; anything else is stringified.
            try:
                text = f"{float(value):,.{self.decimals}f}"
            except (ValueError, TypeError):
                text = str(value)

        if self.color:
            text = colored(text, self.color)

        if self.unit:
            text += f" {colored(self.unit, 'grey')}"

        return text

    def __str__(self):
        return self.format()
94
+
95
+
96
def filter_by_signpost(df, signpost=None, ignore_signposts=False):
    """Slice the profile to the region after a signpost row.

    Selection order: if *ignore_signposts* use the whole frame; else if
    *signpost* names an existing signpost, start there; otherwise fall back
    to the last signpost found (or the whole frame if there are none).
    Prints what it decided so the user can see which slice is analyzed.
    """
    signpost_rows = df[df["OP TYPE"] == "signpost"]

    if ignore_signposts:
        print(colored("Ignoring all signposts. Using the entire file for analysis.", "cyan"))
        return df

    if signpost:
        if signpost in signpost_rows["OP CODE"].values:
            print(colored(f"Using specified signpost: {signpost}", "cyan"))
            # eq(...).cummax() yields a mask that is True from the FIRST row
            # matching the signpost onward; iloc[1:] drops the signpost row
            # itself. NOTE(review): with duplicate signpost names this starts
            # at the first occurrence — confirm that is the intent.
            return df[df["OP CODE"].eq(signpost).cummax()].iloc[1:]
        # Fall through to the last-signpost default below.
        print(colored(f"Specified signpost '{signpost}' not found. Defaulting to the last signpost.", "yellow"))

    if signpost_rows.empty:
        print(colored("No signposts found in the file. Using the entire file for analysis.", "yellow"))
        return df

    last_signpost = signpost_rows.iloc[-1]["OP CODE"]
    print(colored(f"Detected signposts: {', '.join(signpost_rows['OP CODE'])}", "cyan"))
    print(colored(f"Using last signpost: {last_signpost} for analysis.", "cyan"))
    return df[df["OP CODE"].eq(last_signpost).cummax()].iloc[1:]
117
+
118
+
119
def get_datatype_size(datatype):
    """Bytes per element, parsed from the first digit run in the datatype
    name (e.g. ``BFLOAT16`` -> 2.0). Names without a bit-width fall back
    to 4 bytes."""
    bits = re.search(r"\d+", datatype)
    if bits is None:
        return 4
    return int(bits.group()) / 8
122
+
123
+
124
# Matches ANSI escape sequences (CSI color codes and two-byte escapes).
# Compiled once at module load: visible_length() runs for every cell when
# computing column widths, so hoisting the compile out of the call is a
# free win over re.sub() recompiling/looking up the pattern each time.
_ANSI_ESCAPE_RE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")


def visible_length(s):
    """Length of *s* as displayed, i.e. with ANSI escape sequences removed."""
    return len(_ANSI_ESCAPE_RE.sub("", s))
126
+
127
+
128
def pad_string(string, length, align="left"):
    """Pad *string* with spaces to *length* display columns.

    Width is measured with visible_length(), so embedded ANSI escapes do
    not count. Strings already wider than *length* are returned unpadded.
    """
    fill = " " * (length - visible_length(string))
    if align == "right":
        return fill + string
    return string + fill
132
+
133
+
134
def evaluate_fidelity(input_0_datatype, input_1_datatype, output_datatype, math_fidelity):
    """Judge whether *math_fidelity* suits the given input/output datatypes.

    Returns a ``(verdict, advice)`` pair where verdict is one of
    "sufficient", "too_low", "too_high" or "unknown", and advice is an
    optional human-readable suggestion (None when nothing to improve).
    """
    mantissa_bits = {"BFLOAT16": 8, "BFLOAT8_B": 7, "BFLOAT4_B": 3}
    in0_bits = mantissa_bits[input_0_datatype]  # activations -> srcB (7 bits)
    in1_bits = mantissa_bits[input_1_datatype]  # weights -> srcA (5 bits)
    out_bits = mantissa_bits[output_datatype]

    # Full-precision activations, high-precision output.
    if in0_bits == 8 and out_bits >= 7:
        if math_fidelity == "HiFi4":
            return (
                "sufficient",
                "HiFi2 may also work, it discards the lowest bit of the activations and has 2x the throughput of HiFi4",
            )
        if math_fidelity == "HiFi2":
            return "too_low", "If your matmuls are not FLOP-bound use HiFi4 with BF16 activations for full accuracy"
        if math_fidelity == "LoFi":
            return "too_low", "Use HiFi2 or HiFi4 with BF16 activations for improved accuracy"
        assert False, f"Unknown math fidelity: {math_fidelity}"

    # Full-precision activations, BFP4 output.
    if in0_bits == 8 and out_bits == 3:
        if math_fidelity == "HiFi4":
            return (
                "too_high",
                "HiFi2 is very likely to work for BFP8 output; it discards the lowest bit of the activations and has 2x the throughput of HiFi4",
            )
        if math_fidelity == "HiFi2":
            return (
                "sufficient",
                "LoFi might also be sufficient with BFP4 output and has almost 2x the throughput of HiFi2",
            )
        if math_fidelity == "LoFi":
            return (
                "too_low",
                "HiFi2 may give better accuracy for large matmuls with many intermediate accumulations",
            )
        assert False, f"Unknown math fidelity: {math_fidelity}"

    # BFP8-or-better weights, high-precision output.
    if in1_bits >= 7 and out_bits >= 7:
        if math_fidelity == "HiFi4":
            return "too_high", "HiFi2 is sufficient for BFP8 multiplication and has 2x the throughput of HiFi4"
        if math_fidelity == "HiFi2":
            return "sufficient", None
        if math_fidelity == "LoFi":
            return "too_low", "HiFi2 is recommended for accuracy; LoFi discards the lowest 2 bits of the weights"
        assert False, f"Unknown math fidelity: {math_fidelity}"

    # BFP8-or-better weights, BFP4 output.
    if in1_bits >= 7 and out_bits == 3:
        if math_fidelity == "HiFi4":
            return "too_high", "HiFi2 is sufficient for BFP8 multiplication and has 2x the throughput of HiFi4"
        if math_fidelity == "HiFi2":
            return (
                "sufficient",
                "LoFi might also be sufficient with BFP4 output and has almost 2x the throughput of HiFi2",
            )
        if math_fidelity == "LoFi":
            return (
                "too_low",
                "HiFi2 may give slightly better accuracy for large matmuls with many intermediate accumulations",
            )
        assert False, f"Unknown math fidelity: {math_fidelity}"

    # BFP4 weights: LoFi is all the precision the data supports.
    if in1_bits == 3:
        if math_fidelity == "LoFi":
            return "sufficient", None
        return "too_high", "LoFi is sufficient with BFP4 weights, use it for much higher throughput"

    # No rule matched: report what we saw and punt.
    print(f"Using {math_fidelity} for {input_0_datatype}/{input_1_datatype} inputs and {output_datatype} output")
    print(f"Bits: {in0_bits}/{in1_bits}/{out_bits}")
    return (
        "unknown",
        f"Using {math_fidelity} for {input_0_datatype}/{input_1_datatype} inputs and {output_datatype} output",
    )
205
+
206
+
207
def analyze_matmul(row):
    """Compute bandwidth and FLOP utilization figures for one matmul row.

    Returns a tuple: (dram_speed_gb_s, dram_percentage, flops,
    flops_percentage, size, memory_info, math_fidelity, is_dram_sharded,
    core_count). Speeds are None when no operand touches DRAM.
    """
    # Only operands that actually live in DRAM contribute to DRAM traffic.
    input_0_from_dram = "DRAM" in row["INPUT_0_MEMORY"]
    input_1_from_dram = "DRAM" in row["INPUT_1_MEMORY"]

    total_data_size_bytes = 0
    if input_0_from_dram:
        total_data_size_bytes += (
            row["INPUT_0_W"]
            * row["INPUT_0_Y"]
            * row["INPUT_0_Z"]
            * row["INPUT_0_X"]
            * get_datatype_size(row["INPUT_0_DATATYPE"])
        )
    if input_1_from_dram:
        total_data_size_bytes += (
            row["INPUT_1_W"]
            * row["INPUT_1_Y"]
            * row["INPUT_1_Z"]
            * row["INPUT_1_X"]
            * get_datatype_size(row["INPUT_1_DATATYPE"])
        )

    # Always include output if it's written to DRAM
    if "DRAM" in row["OUTPUT_0_MEMORY"]:
        total_data_size_bytes += (
            row["OUTPUT_0_W"]
            * row["OUTPUT_0_Y"]
            * row["OUTPUT_0_Z"]
            * row["OUTPUT_0_X"]
            * get_datatype_size(row["OUTPUT_0_DATATYPE"])
        )

    # Kernel duration arrives in nanoseconds.
    duration_s = row["DEVICE KERNEL DURATION [ns]"] * 1e-9
    dram_speed_gb_s = (total_data_size_bytes / duration_s) / 1e9 if total_data_size_bytes > 0 else None

    core_count = row["CORE COUNT"]
    math_fidelity = row["MATH FIDELITY"]

    # Check for DRAM-sharded program config
    attributes = row["ATTRIBUTES"] if pd.notna(row["ATTRIBUTES"]) else ""
    is_dram_sharded = "MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig" in attributes

    # Override core count for DRAM-sharded matmuls
    # NOTE(review): 12 presumably reflects one core per DRAM bank — confirm.
    if is_dram_sharded:
        core_count = 12

    peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count

    # M x K @ K x N matmul; W and Z act as batch dimensions from input 0.
    M, K, N = int(row["INPUT_0_Y"]), int(row["INPUT_0_X"]), int(row["INPUT_1_X"])
    W, Z = int(row["INPUT_0_W"]), int(row["INPUT_0_Z"])

    # 2 FLOPs (multiply + add) per MAC.
    flops = (M * K * N * W * Z * 2) / duration_s

    size = f"{M} x {K} x {N}"
    memory_info = f"({row['INPUT_0_DATATYPE']} {row['INPUT_0_MEMORY'].replace('DEV_0_', '')} @ {row['INPUT_1_DATATYPE']} {row['INPUT_1_MEMORY'].replace('DEV_0_', '')} => {row['OUTPUT_0_DATATYPE']} {row['OUTPUT_0_MEMORY'].replace('DEV_0_', '')})"

    # NOTE(review): 288 looks like the assumed peak DRAM bandwidth in GB/s —
    # verify it matches the target device.
    dram_percentage = (dram_speed_gb_s / 288) * 100 if dram_speed_gb_s is not None else None
    flops_percentage = (flops / peak_flops_value) * 100

    return (
        dram_speed_gb_s,
        dram_percentage,
        flops,
        flops_percentage,
        size,
        memory_info,
        math_fidelity,
        is_dram_sharded,
        core_count,  # Return the potentially adjusted core count
    )
277
+
278
+
279
def analyze_op(row, prev_row):
    """Convert one profiler CSV row into a dict of display Cells.

    Returns ``(cells, op_to_op_gap_us)``. *prev_row* is the previous CSV
    row (or None for the first op) and only gates whether the op-to-op
    latency column is meaningful.
    """
    op_code = Cell(row["OP CODE"])
    cores = Cell(int(row["CORE COUNT"]) if pd.notna(row["CORE COUNT"]) else None)
    # Durations arrive in ns; everything is displayed in us.
    device_time = Cell(
        row["DEVICE FW DURATION [ns]"] / 1000 if pd.notna(row["DEVICE FW DURATION [ns]"]) else None,
        unit="us",
        decimals=0,
    )

    # The first op has no predecessor, so its latency value is meaningless.
    if prev_row is not None and pd.notna(prev_row["OP TO OP LATENCY [ns]"]):
        op_to_op_gap = Cell(
            row["OP TO OP LATENCY [ns]"] / 1000 if pd.notna(row["OP TO OP LATENCY [ns]"]) else None,
            unit="us",
            decimals=0,
        )
    else:
        op_to_op_gap = Cell(None, unit="us", decimals=0)

    output_datatype = row["OUTPUT_0_DATATYPE"]
    input_0_datatype = row["INPUT_0_DATATYPE"]
    input_1_datatype = row["INPUT_1_DATATYPE"]
    output_datatype_cell = Cell(output_datatype)
    input_0_datatype_cell = Cell(input_0_datatype)
    input_1_datatype_cell = Cell(input_1_datatype)
    # Abbreviations used in the Math Fidelity column.
    short_name = lambda n: {"BFLOAT16": "BF16", "BFLOAT8_B": "BFP8", "BFLOAT4_B": "BFP4"}.get(n, n)

    if "Matmul" in op_code.raw_value:
        # Matmuls get full bandwidth/FLOPs analysis.
        (
            dram_speed,
            dram_percentage,
            flops,
            flops_percentage,
            size,
            memory_info,
            math_fidelity,
            is_dram_sharded,
            adjusted_core_count,  # Get the potentially adjusted core count
        ) = analyze_matmul(row)
        op_code = Cell(f"{op_code.raw_value} {size}")
        dram_speed = Cell(dram_speed, unit="GB/s", decimals=0)
        dram_percentage = Cell(dram_percentage, unit="%", decimals=1)
        flops = Cell(flops / 1e12 if pd.notna(flops) else None, unit="TFLOPs", decimals=1)
        flops_percentage = Cell(flops_percentage, unit="%", decimals=1)
        cores.raw_value = adjusted_core_count

        math_fidelity_cell = Cell(
            f"{math_fidelity} {short_name(input_0_datatype)} x {short_name(input_1_datatype)} => {short_name(output_datatype)}".strip()
            if math_fidelity
            else None
        )
    else:
        # Non-matmul ops: no bandwidth/FLOPs stats, only datatype summary.
        dram_speed = Cell(None, unit="GB/s", decimals=0)
        dram_percentage = Cell(None, unit="%", decimals=1)
        flops = Cell(None, unit="TFLOPs", decimals=1)
        flops_percentage = Cell(None, unit="%", decimals=1)

        math_fidelity = ""
        math_fidelity += f"{short_name(input_0_datatype)}" if pd.notna(input_0_datatype) else ""
        math_fidelity += f", {short_name(input_1_datatype)}" if pd.notna(input_1_datatype) else ""
        math_fidelity += f" => {short_name(output_datatype)}" if pd.notna(output_datatype) else ""
        math_fidelity_cell = Cell(math_fidelity.strip())

        is_dram_sharded = False

    # Column order here defines the visible report column order.
    output = {
        "ID": None,
        "Bound": Cell(""),
        "OP Code": op_code,
        "Device Time": device_time,
        "Op-to-Op Gap": op_to_op_gap,
        "Cores": cores,
        "DRAM": dram_speed,
        "DRAM %": dram_percentage,
        "FLOPs": flops,
        "FLOPs %": flops_percentage,
        "Math Fidelity": math_fidelity_cell,
        "Output Datatype": output_datatype_cell,
        "Input 0 Datatype": input_0_datatype_cell,
        "Input 1 Datatype": input_1_datatype_cell,
        "DRAM Sharded": Cell(is_dram_sharded),
    }

    input_0_memory = Cell(row["INPUT_0_MEMORY"] if pd.notna(row["INPUT_0_MEMORY"]) else None)

    # Extract program config details
    attributes = row["ATTRIBUTES"] if pd.notna(row["ATTRIBUTES"]) else ""
    in0_block_w = Cell(None)
    out_subblock_h = Cell(None)
    out_subblock_w = Cell(None)

    if "program_config" in attributes:
        match = re.search(r"in0_block_w=(\d+)", attributes)
        if match:
            in0_block_w = Cell(int(match.group(1)))

        match = re.search(r"out_subblock_h=(\d+)", attributes)
        if match:
            out_subblock_h = Cell(int(match.group(1)))

        match = re.search(r"out_subblock_w=(\d+)", attributes)
        if match:
            out_subblock_w = Cell(int(match.group(1)))

    output["Input 0 Memory"] = input_0_memory
    output["Inner Dim Block Size"] = in0_block_w
    output["Output Subblock H"] = out_subblock_h
    output["Output Subblock W"] = out_subblock_w

    return output, op_to_op_gap.raw_value
388
+
389
+
390
def add_derived_columns(rows):
    """Annotate each row dict in *rows* (in place) with derived cells.

    Adds a "Total %" cell (the row's share of all device time plus
    op-to-op gaps) and, for matmuls, a "Bound" classification
    (BOTH/DRAM/FLOP/SLOW). Host-fallback ("(torch)") ops are marked HOST.
    """
    total_duration = sum(
        op_data["Device Time"].raw_value for op_data in rows if op_data["Device Time"].raw_value is not None
    ) + sum(op_data["Op-to-Op Gap"].raw_value for op_data in rows if op_data["Op-to-Op Gap"].raw_value is not None)

    for op_data in rows:
        device_time = op_data["Device Time"].raw_value if op_data["Device Time"].raw_value is not None else 0
        op_to_op_gap = op_data["Op-to-Op Gap"].raw_value if op_data["Op-to-Op Gap"].raw_value is not None else 0
        # Guard the division: after aggressive filtering (or a trace with no
        # measured durations) total_duration can be 0, which previously
        # raised ZeroDivisionError. Report an empty percentage instead.
        total_pct = ((device_time + op_to_op_gap) / total_duration) * 100 if total_duration else None
        op_data["Total %"] = Cell(total_pct, unit="%", decimals=1)
        if op_data["Device Time"].raw_value is None and op_data["Op-to-Op Gap"].raw_value is None:
            op_data["Total %"].raw_value = None

        if "Matmul" in op_data["OP Code"].raw_value:
            dram_percentage = op_data["DRAM %"].raw_value
            flops_percentage = op_data["FLOPs %"].raw_value
            # >=65% of peak on a resource counts as "bound" by it.
            if dram_percentage and flops_percentage:
                if dram_percentage >= 65 and flops_percentage >= 65:
                    op_data["Bound"] = Cell("BOTH")
                elif dram_percentage >= 65:
                    op_data["Bound"] = Cell("DRAM")
                elif flops_percentage >= 65:
                    op_data["Bound"] = Cell("FLOP")
                else:
                    op_data["Bound"] = Cell("SLOW")
        elif "(torch)" in op_data["OP Code"].raw_value:
            op_data["Bound"] = Cell("HOST")
415
+
416
+
417
def print_row(row, col_widths, headers):
    """Print one table row, padding every cell to its column width."""

    def render(header, cell):
        # Avoid thousand separators for ID column
        if header == "ID":
            text = colored(str(cell.raw_value), cell.color)
        else:
            text = str(cell)
        align = "left" if header == "OP Code" else "right"
        return pad_string(text, col_widths[headers.index(header)], align=align)

    print(" ".join(render(header, row[header]) for header in headers))
424
+
425
+
426
def color_row(op_data, percentage, min_percentage):
    """Assign colors to a row's cells in place and return the row.

    Rows whose total share is below *min_percentage* are greyed out
    entirely; otherwise cells are colored by op family, core count,
    boundedness, gap size and math-fidelity verdict.
    """
    if percentage is not None and percentage < min_percentage:
        # Below the significance threshold: grey the whole row.
        for v in op_data.values():
            v.color = "grey"
    else:
        # Op-family colors; first matching substring wins.
        op_colors = {
            "(torch)": "red",
            "Matmul": "magenta",
            "LayerNorm": "cyan",
            "AllGather": "cyan",
            "AllReduce": "cyan",
            "ScaledDotProductAttentionDecode": "blue",
            "ScaledDotProductAttentionGQADecode": "blue",
            "NlpCreateHeadsDeviceOperation": "blue",
            "NLPConcatHeadsDecodeDeviceOperation": "blue",
            "UpdateCache": "blue",
        }
        for op, color in op_colors.items():
            if op in op_data["OP Code"].raw_value:
                op_data["OP Code"].color = color
                break
        else:
            op_data["OP Code"].color = "white"

        # Core count: red when tiny, green at the full 64-core grid.
        num_cores = op_data["Cores"].raw_value
        if num_cores is not None:
            if num_cores < 10:
                op_data["Cores"].color = "red"
            elif num_cores == 64:
                op_data["Cores"].color = "green"
            else:
                op_data["Cores"].color = "grey"

        # Highlight the resource the op is bound by.
        if op_data["Bound"].raw_value == "DRAM":
            op_data["Bound"].color = "green"
            op_data["DRAM"].color = "green"
            op_data["DRAM %"].color = "green"
        elif op_data["Bound"].raw_value == "FLOP":
            op_data["Bound"].color = "green"
            op_data["FLOPs"].color = "green"
            op_data["FLOPs %"].color = "green"
        elif op_data["Bound"].raw_value == "SLOW":
            op_data["Bound"].color = "yellow"
            # Point at whichever utilization is closer to being the bound.
            dram_percentage = op_data["DRAM %"].raw_value
            flops_percentage = op_data["FLOPs %"].raw_value
            if dram_percentage is not None and flops_percentage is not None:
                if dram_percentage > flops_percentage:
                    op_data["DRAM"].color = "yellow"
                    op_data["DRAM %"].color = "yellow"
                else:
                    op_data["FLOPs"].color = "yellow"
                    op_data["FLOPs %"].color = "yellow"
        elif op_data["Bound"].raw_value == "HOST":
            op_data["Bound"].color = "red"

        # Gaps above 6.5us are flagged (same threshold as the advice section).
        if op_data["Op-to-Op Gap"].raw_value is not None and op_data["Op-to-Op Gap"].raw_value > 6.5:
            op_data["Op-to-Op Gap"].color = "red"

        if "Matmul" in op_data["OP Code"].raw_value and op_data["Math Fidelity"].raw_value:
            # First token of the cell is the fidelity name (e.g. "HiFi2").
            math_fidelity = op_data["Math Fidelity"].raw_value.split()[0]
            input_0_datatype = op_data["Input 0 Datatype"].raw_value
            input_1_datatype = op_data["Input 1 Datatype"].raw_value
            output_datatype = op_data["Output Datatype"].raw_value

            fidelity_evaluation, _ = evaluate_fidelity(
                input_0_datatype, input_1_datatype, output_datatype, math_fidelity
            )

            if fidelity_evaluation == "sufficient":
                op_data["Math Fidelity"].color = "green"
            elif fidelity_evaluation == "too_high":
                op_data["Math Fidelity"].color = "red"
            elif fidelity_evaluation == "too_low":
                op_data["Math Fidelity"].color = "cyan"
            else:
                op_data["Math Fidelity"].color = "white"

    return op_data
504
+
505
+
506
def print_performance_table(rows, headers, col_widths, device_ops, host_ops):
    """Print the main report table plus a greyed-out totals row."""
    print("\n🚀 Performance Report 🚀\n========================\n")

    print(" ".join(pad_string(header, col_widths[i], align="left") for i, header in enumerate(headers)))
    print("-" * sum(col_widths) + "-" * (len(headers) - 1) * 2)

    for idx, op_data in enumerate(rows):
        print_row(op_data, col_widths, headers)

    print("-" * (sum(col_widths) + (len(headers) - 1) * 2))

    total_device_time = sum(
        op_data["Device Time"].raw_value for op_data in rows if op_data["Device Time"].raw_value is not None
    )
    total_visible_gap = sum(
        op_data["Op-to-Op Gap"].raw_value for op_data in rows if op_data["Op-to-Op Gap"].raw_value is not None
    )
    # Summary row; columns not listed here are filled blank below.
    total_row = {
        "ID": Cell(""),
        "Total %": Cell(100.0, unit="%", decimals=1),
        "Bound": Cell(""),
        "OP Code": Cell(f"{device_ops} device ops, {host_ops} host ops"),
        "Device Time": Cell(total_device_time, unit="us", decimals=0),
        "Op-to-Op Gap": Cell(total_visible_gap, unit="us", decimals=0),
    }
    for header in headers:
        if header not in total_row:
            total_row[header] = Cell("")
    # Re-wrap every cell grey so the totals row is visually subdued.
    print_row(
        {k: Cell(v.raw_value, v.unit, v.decimals, color="grey") for k, v in total_row.items()}, col_widths, headers
    )
537
+
538
+
539
def print_advice_section(rows, headers, col_widths):
    """Print every advice subsection: host fallbacks, gaps, matmuls."""
    print("\n💡 Advice 💡\n============\n")

    for advise in (print_fallback_advice, print_op_to_op_gap_advice, print_matmul_advice):
        advise(rows, headers, col_widths)
545
+
546
+
547
def print_fallback_advice(rows, headers, col_widths):
    """List host ("(torch)") fallback ops, which should be ported to device."""
    fallback_rows = [r for r in rows if "(torch)" in r["OP Code"].raw_value]
    if not fallback_rows:
        return
    print("Fallback\n--------")
    for r in fallback_rows:
        print_row(r, col_widths, headers)
    print("\nThese ops should be moved to run on device.\n")
554
+
555
+
556
def print_op_to_op_gap_advice(rows, headers, col_widths):
    """Flag ops with a large (>6.5 us) gap after their predecessor and
    estimate how much time tracing could recover."""
    high_gap_ops = []
    for position, op_data in enumerate(rows):
        gap = op_data["Op-to-Op Gap"].raw_value
        if gap is not None and gap > 6.5:
            high_gap_ops.append((position + 1, op_data))

    if not high_gap_ops:
        return

    print("High Op-to-Op Gap\n----------------")
    for _, op_data in high_gap_ops:
        print_row(op_data, col_widths, headers)

    # Treat everything beyond ~6us per gap as recoverable overhead.
    max_gap_overhead = sum(op_data["Op-to-Op Gap"].raw_value - 6 for _, op_data in high_gap_ops)

    total_duration = sum(
        op_data["Device Time"].raw_value for op_data in rows if op_data["Device Time"].raw_value is not None
    ) + sum(op_data["Op-to-Op Gap"].raw_value for op_data in rows if op_data["Op-to-Op Gap"].raw_value is not None)

    percentage_saved = (max_gap_overhead / total_duration) * 100
    print(
        f"\nThese ops have a >6us gap since the previous operation. Running with tracing could save {max_gap_overhead:.0f} us ({percentage_saved:.1f}% of overall time)"
    )
    print(
        "Alternatively ensure device is not waiting for the host and use device.enable_async(True). Experts can try moving runtime args in the kernels to compile-time args.\n"
    )
580
+
581
+
582
def print_matmul_advice(rows, headers, col_widths):
    """Print per-matmul tuning advice based on boundedness, fidelity,
    core count and program-config block sizes."""
    matmul_ops = [op_data for op_data in rows if "Matmul" in op_data["OP Code"].raw_value]

    if matmul_ops:
        print("Matmul Optimization\n-------------------")
        for op_data in matmul_ops:
            print_row(op_data, col_widths, headers)
            advice = []
            # Greyed-out (insignificant) rows get grey advice too.
            color = "grey" if op_data["OP Code"].color == "grey" else "white"

            # First token of the fidelity cell is the fidelity name.
            math_fidelity = (
                op_data["Math Fidelity"].raw_value.split()[0] if op_data["Math Fidelity"].raw_value else None
            )
            output_datatype = op_data["Output Datatype"].raw_value
            input_0_datatype = op_data["Input 0 Datatype"].raw_value
            input_1_datatype = op_data["Input 1 Datatype"].raw_value
            cores = op_data["Cores"].raw_value
            fidelity_evaluation, fidelity_advice = evaluate_fidelity(
                input_0_datatype, input_1_datatype, output_datatype, math_fidelity
            )

            if op_data["Bound"].raw_value in ["DRAM", "BOTH"]:
                if not op_data["DRAM Sharded"].raw_value:
                    advice.append(
                        "- Try a DRAM-sharded program config (MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig) to improve throughput further"
                    )
                if fidelity_evaluation == "too_low" and op_data["FLOPs %"].raw_value < 40:
                    advice.append(f"- {fidelity_advice}")
                if fidelity_evaluation == "too_high":
                    advice.append(f"- {fidelity_advice}")
            elif op_data["Bound"].raw_value in ["FLOP", "BOTH"]:
                if cores < 64:
                    advice.append(f"- Increase grid size (currently using {cores})")
                if fidelity_evaluation == "too_high":
                    advice.append(f"- {fidelity_advice}")
            elif op_data["Bound"].raw_value == "SLOW":
                # Neither resource is near peak: look at placement and
                # program-config block sizes.
                input_0_memory = op_data["Input 0 Memory"].raw_value
                if input_0_memory and "L1" not in input_0_memory:
                    advice.append(f"- If possible place input 0 in L1 (currently in {input_0_memory})")

                inner_dim_block = op_data["Inner Dim Block Size"].raw_value
                out_h = op_data["Output Subblock H"].raw_value
                out_w = op_data["Output Subblock W"].raw_value

                if inner_dim_block is None and out_h is None and out_w is None:
                    advice.append(
                        "- No program_config specified, try using one to override in0_block_w and out_subblock_h/w"
                    )
                else:
                    all_good = True
                    if inner_dim_block is not None:
                        if inner_dim_block < 2:
                            advice.append(f"- in0_block_w={inner_dim_block} is small, try in0_block_w=2 or above")
                            all_good = False
                    else:
                        advice.append("- No inner dim block size found")
                        all_good = False

                    if out_h is not None and out_w is not None:
                        out_area = out_h * out_w
                        if out_area < 2:
                            advice.append(
                                f"- Output subblock {out_h}x{out_w} is small, try out_subblock_h * out_subblock_w >= 2 if possible"
                            )
                            all_good = False
                    else:
                        advice.append("- No output subblock size found")
                        all_good = False

                    if all_good:
                        advice.append(
                            f"- in0_block_w={inner_dim_block} and output subblock {out_h}x{out_w} look good 🤷"
                        )
                    if fidelity_advice:
                        advice.append(f"- {fidelity_advice}")

            if advice:
                for item in advice:
                    print(colored(item, color))
            else:
                print(colored("✅ Optimized", color))
            print()  # Add a blank line between matmuls
664
+
665
+
666
def merge_device_rows(df):
    """Collapse a multi-device trace into one row per op.

    Device rows are paired up positionally across devices. Collective ops
    (AllGather/ReduceScatter) keep the row with the minimum duration;
    every other op keeps the maximum, since the op only completes once
    the slowest device finishes.
    """
    ops_per_device = defaultdict(list)

    for _, row in df.iterrows():
        if row["OP TYPE"] == "tt_dnn_device":
            ops_per_device[int(row["DEVICE ID"])].append((row["OP CODE"], row.to_dict()))

    columns = [ops_per_device[device_id] for device_id in sorted(ops_per_device)]
    merged = []

    for blocks in zip(*columns):
        op_name = blocks[0][0]
        # min for collectives, max for everything else.
        pick = min if ("AllGather" in op_name or "ReduceScatter" in op_name) else max
        chosen = pick(blocks, key=lambda item: item[1]["DEVICE FW DURATION [ns]"])
        merged.append(chosen[1])

    return pd.DataFrame(merged)
693
+
694
+
695
def parse_id_range(id_range_str):
    """Parse an ``--id-range`` string into a ``(start, end)`` tuple.

    Accepts 'START-END', 'START-' or '-END'; either side may be omitted
    (None). Thousands separators are tolerated ('1,000-2,000'). Returns
    None when no range string was given; raises ValueError when the
    string is malformed.
    """
    if id_range_str is None:
        return None

    pieces = id_range_str.split("-")
    if len(pieces) != 2:
        raise ValueError("Invalid ID range format")

    lo, hi = (int(part.replace(",", "")) if part else None for part in pieces)
    return (lo, hi)
707
+
708
+
709
def filter_by_id_range(rows, id_range):
    """Keep only rows whose ID lies in *id_range* ``(start, end)``.

    Either bound may be None for an open-ended range. The first surviving
    row's op-to-op gap is reset, because its predecessor was filtered out
    and the recorded gap no longer refers to a visible op.
    """
    if not id_range:
        return rows

    start, end = id_range
    if start is None:
        print(colored(f"Filtering rows with IDs up to {end}", "cyan"))
        kept = [row for row in rows if row["ID"].raw_value <= end]
    elif end is None:
        print(colored(f"Filtering rows with IDs from {start} onwards", "cyan"))
        kept = [row for row in rows if row["ID"].raw_value >= start]
    else:
        print(colored(f"Filtering rows with IDs from {start} to {end}", "cyan"))
        kept = [row for row in rows if start <= row["ID"].raw_value <= end]

    # Reset the op-to-op gap for the first item in the filtered range
    if kept:
        kept[0]["Op-to-Op Gap"] = Cell(None, unit="us", decimals=0)

    return kept
728
+
729
+
730
def main():
    """CLI entry point: parse arguments, then build and print the report."""
    args, id_range = parse_args()
    generate_perf_report(
        args.csv_file,
        args.signpost,
        args.ignore_signposts,
        args.min_percentage,
        id_range,
        args.csv,
        args.no_advice,
    )
735
+
736
+
737
def parse_args():
    """Parse the command line.

    Returns ``(args, id_range)`` where *id_range* is the parsed
    ``--id-range`` tuple (or None). Also configures the global color
    mode as a side effect. Exits with status 1 on a malformed range.
    """
    parser = argparse.ArgumentParser(description="User-friendly Performance Report Analysis Tool")
    parser.add_argument("csv_file", type=str, help="Path to the performance report CSV file")
    parser.add_argument("--signpost", type=str, help="Specify a signpost to use for analysis", default=None)
    parser.add_argument(
        "--ignore-signposts", action="store_true", help="Ignore all signposts and use the entire file for analysis"
    )
    parser.add_argument(
        "--min-percentage", type=float, default=0.5, help="Minimum percentage for coloring (default: 0.5)"
    )
    parser.add_argument(
        "--id-range", type=str, help="Show only rows with IDs in the specified range (e.g., '5-10', '31-', or '-12')"
    )
    parser.add_argument("--color", action="store_true", help="Force colored output even when output is redirected")
    parser.add_argument("--no-color", action="store_true", help="Force output without color")
    parser.add_argument("--csv", type=str, help="Output filename for CSV format", metavar="OUTPUT_FILE")
    parser.add_argument("--no-advice", action="store_true", help="Only show the table section of the report")
    args = parser.parse_args()

    # Set the global color_output variable before anything is printed.
    set_color_output(args.color, args.no_color)

    # Parse id_range
    try:
        id_range = parse_id_range(args.id_range)
    except ValueError:
        print(colored("Invalid --id-range format. Please use 'START-END', 'START-', or '-END'.", "red"))
        # sys.exit rather than the bare exit() builtin: exit() is injected by
        # the site module and is not guaranteed to exist (e.g. under -S or in
        # frozen/embedded interpreters).
        sys.exit(1)

    return args, id_range
767
+
768
+
769
def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice):
    """Load a profiler CSV and emit the report (table + advice, or CSV).

    Pipeline: read -> sort by host start time -> slice by signpost ->
    merge multi-device rows -> per-op analysis -> derived columns ->
    optional ID filtering -> coloring -> output (CSV file or stdout).
    """
    df = pd.read_csv(csv_file, low_memory=False)

    # Add a column for original row numbers
    df["ORIGINAL_ROW"] = df.index + 2  # +2 to match Excel row numbers (1-based + header)

    # Sort the DataFrame by "HOST START TS" column
    if "HOST START TS" in df.columns:
        print(colored("Sorting CSV by 'HOST START TS' column...", "cyan"))
        df = df.sort_values(by="HOST START TS")
    else:
        print(colored("Warning: 'HOST START TS' column not found. CSV will not be sorted.", "yellow"))

    df = filter_by_signpost(df, signpost, ignore_signposts)

    # Check if the file contains multiple devices
    if "DEVICE ID" in df.columns and df["DEVICE ID"].nunique() > 1:
        print(colored(f"Detected data from {df['DEVICE ID'].nunique()} devices. Merging device data...", "cyan"))
        df = merge_device_rows(df)

    rows = []
    prev_row = None
    device_ops = 0
    host_ops = 0
    for _, row in df.iterrows():
        op_data, current_gap = analyze_op(row, prev_row)
        op_data["ID"] = Cell(row["ORIGINAL_ROW"])  # Use the original row number
        rows.append(op_data)
        prev_row = row

        # Count device and host ops
        if "(torch)" in op_data["OP Code"].raw_value:
            host_ops += 1
        else:
            device_ops += 1

    # Calculate total duration and add derived columns
    add_derived_columns(rows)

    # Filter rows based on id_range
    rows = filter_by_id_range(rows, id_range)

    # Recalculate derived columns after filtering
    # (percentages must be relative to the filtered totals).
    add_derived_columns(rows)

    rows = [color_row(op_data, op_data["Total %"].raw_value, min_percentage) for op_data in rows]

    visible_headers = [
        "ID",
        "Total %",
        "Bound",
        "OP Code",
        "Device Time",
        "Op-to-Op Gap",
        "Cores",
        "DRAM",
        "DRAM %",
        "FLOPs",
        "FLOPs %",
        "Math Fidelity",
    ]

    if csv_output_file:
        # CSV output includes extra columns hidden from the console table.
        all_headers = visible_headers + [
            "Output Datatype",
            "Input 0 Datatype",
            "Input 1 Datatype",
            "DRAM Sharded",
            "Input 0 Memory",
            "Inner Dim Block Size",
            "Output Subblock H",
            "Output Subblock W",
        ]
        print(colored(f"Writing CSV output to {csv_output_file}", "cyan"))
        # NOTE(review): raw values are written unquoted — fields containing
        # commas (e.g. memory/layout strings) would break the CSV; consider
        # the csv module if that ever occurs.
        with open(csv_output_file, "w") as f:
            f.write(",".join(all_headers) + "\n")
            for op_data in rows:
                f.write(",".join(str(op_data[header].raw_value) for header in all_headers) + "\n")
    else:
        # Column widths sized to the widest visible (ANSI-stripped) cell.
        col_widths = [
            max(max(visible_length(str(row[header])) for row in rows), visible_length(header))
            for header in visible_headers
        ]
        print_performance_table(rows, visible_headers, col_widths, device_ops, host_ops)
        if not no_advice:
            print_advice_section(rows, visible_headers, col_widths)
855
+
856
+
857
# Script entry point (the package also exposes main() as a console script).
if __name__ == "__main__":
    main()