tt-perf-report 1.0.6.tar.gz → 1.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tt-perf-report might be problematic.

@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: tt-perf-report
- Version: 1.0.6
+ Version: 1.1.0
  Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
  License: Apache License
  Version 2.0, January 2004
@@ -218,6 +218,8 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  License-File: LICENSE_understanding.txt
  Requires-Dist: pandas
+ Requires-Dist: matplotlib
+ Dynamic: license-file

  # Performance Report Analysis Tool

@@ -4,12 +4,12 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "tt-perf-report"
- version = "1.0.6"
+ version = "1.1.0"
  description = "This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities."
  license = {file = "LICENSE"}
  readme = "README.md"
  keywords = ["tenstorrent", "tt-metal"]
- dependencies = ["pandas"]
+ dependencies = ["pandas", "matplotlib"]

  [project.scripts]
  tt-perf-report = "tt_perf_report.perf_report:main"
@@ -2,17 +2,32 @@
  # SPDX-License-Identifier: Apache-2.0

  # SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
- import csv
- import sys
  import argparse
+ import csv
+ from collections import defaultdict
+ import os
  import re
+ import sys
  from typing import Any, Optional, Union
- from collections import defaultdict
+
+ import matplotlib.pyplot as plt
  import pandas as pd

  # Global variable to store color preference
  color_output = None # None means auto-detect, True forces color, False forces no color

+ def get_value_physical_logical(input : str, is_physical : bool = True):
+     if "[" in input and "]" in input:
+         physical_part = input.split("[")[0]
+         logical_part = input.split("[")[1].split("]")[0]
+
+         if is_physical:
+             return int(physical_part)
+         else:
+             return int(logical_part)
+     else:
+         # back compatible
+         return int(input)

  def set_color_output(force_color, force_no_color):
      global color_output
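Note: 1.1.0 expects shape columns formatted as "PHYSICAL[LOGICAL]" strings (the padded extent, with the logical extent in brackets), and get_value_physical_logical falls back to plain integers for older traces. A minimal sketch with made-up values:

    # Hypothetical "PHYSICAL[LOGICAL]" column values, per the parser above.
    get_value_physical_logical("64[56]")                     # -> 64  (physical, i.e. padded)
    get_value_physical_logical("64[56]", is_physical=False)  # -> 56  (logical)
    get_value_physical_logical("128")                        # -> 128 (old-style plain column)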
@@ -71,7 +86,7 @@ class Cell:
          if self.raw_value is None or pd.isna(self.raw_value):
              return ""

-         if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value):
+         if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value or "HaloDeviceOperation" in self.raw_value):
              parts = self.raw_value.split(maxsplit=1)
              op_name = parts[0]
              size = parts[1] if len(parts) > 1 else ""
@@ -212,28 +227,28 @@ def analyze_matmul(row):
      total_data_size_bytes = 0
      if input_0_from_dram:
          total_data_size_bytes += (
-             row["INPUT_0_W"]
-             * row["INPUT_0_Y"]
-             * row["INPUT_0_Z"]
-             * row["INPUT_0_X"]
+             get_value_physical_logical(row["INPUT_0_W_PAD[LOGICAL]"])
+             * get_value_physical_logical(row["INPUT_0_Y_PAD[LOGICAL]"])
+             * get_value_physical_logical(row["INPUT_0_Z_PAD[LOGICAL]"])
+             * get_value_physical_logical(row["INPUT_0_X_PAD[LOGICAL]"])
              * get_datatype_size(row["INPUT_0_DATATYPE"])
          )
      if input_1_from_dram:
          total_data_size_bytes += (
-             row["INPUT_1_W"]
-             * row["INPUT_1_Y"]
-             * row["INPUT_1_Z"]
-             * row["INPUT_1_X"]
+             get_value_physical_logical(row["INPUT_1_W_PAD[LOGICAL]"])
+             * get_value_physical_logical(row["INPUT_1_Y_PAD[LOGICAL]"])
+             * get_value_physical_logical(row["INPUT_1_Z_PAD[LOGICAL]"])
+             * get_value_physical_logical(row["INPUT_1_X_PAD[LOGICAL]"])
              * get_datatype_size(row["INPUT_1_DATATYPE"])
          )

      # Always include output if it's written to DRAM
      if "DRAM" in row["OUTPUT_0_MEMORY"]:
          total_data_size_bytes += (
-             row["OUTPUT_0_W"]
-             * row["OUTPUT_0_Y"]
-             * row["OUTPUT_0_Z"]
-             * row["OUTPUT_0_X"]
+             get_value_physical_logical(row["OUTPUT_0_W_PAD[LOGICAL]"])
+             * get_value_physical_logical(row["OUTPUT_0_Y_PAD[LOGICAL]"])
+             * get_value_physical_logical(row["OUTPUT_0_Z_PAD[LOGICAL]"])
+             * get_value_physical_logical(row["OUTPUT_0_X_PAD[LOGICAL]"])
              * get_datatype_size(row["OUTPUT_0_DATATYPE"])
          )

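Note: the DRAM traffic estimate is the product of the four padded tensor extents and the element size, summed over every tensor resident in DRAM; that byte total presumably feeds the GB/s column. A worked example with assumed values:

    # Hypothetical 1 x 1 x 1024 x 1024 bfloat16 input read from DRAM.
    w, z, y, x = 1, 1, 1024, 1024
    datatype_size = 2  # bytes per bfloat16 element
    input_bytes = w * z * y * x * datatype_size  # 2,097,152 bytes (~2 MiB)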
@@ -253,8 +268,8 @@ def analyze_matmul(row):

      peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count

-     M, K, N = int(row["INPUT_0_Y"]), int(row["INPUT_0_X"]), int(row["INPUT_1_X"])
-     W, Z = int(row["INPUT_0_W"]), int(row["INPUT_0_Z"])
+     M, K, N = get_value_physical_logical(row["INPUT_0_Y_PAD[LOGICAL]"]), get_value_physical_logical(row["INPUT_0_X_PAD[LOGICAL]"]), get_value_physical_logical(row["INPUT_1_X_PAD[LOGICAL]"])
+     W, Z = get_value_physical_logical(row["INPUT_0_W_PAD[LOGICAL]"]), get_value_physical_logical(row["INPUT_0_Z_PAD[LOGICAL]"])

      flops = (M * K * N * W * Z * 2) / duration_s

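Note: each output element of an M x K by K x N matmul costs one multiply and one add (the factor of 2), and W and Z batch the GEMM. A worked check with assumed numbers:

    # Hypothetical square matmul, no batching, 50 us kernel duration.
    M = K = N = 1024
    W = Z = 1
    duration_s = 50e-6
    flops = (M * K * N * W * Z * 2) / duration_s  # ~4.3e13 FLOP/s, i.e. ~42.9 TFLOPs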
@@ -276,6 +291,42 @@ def analyze_matmul(row):
          core_count, # Return the potentially adjusted core count
      )

+ def analyze_halo(row):
+     attributes = row["ATTRIBUTES"] if pd.notna(row["ATTRIBUTES"]) else ""
+
+     try:
+         window_hw = attributes.split("window_hw=")[1].split(";")[0:2]
+         window_hw = ",".join(window_hw[0:2])
+     except (IndexError, AttributeError):
+         window_hw = "x"
+
+     try:
+         stride_hw = attributes.split("stride_hw=")[1].split(";")[0:2]
+         stride_hw = ",".join(stride_hw[0:2])
+     except (IndexError, AttributeError):
+         stride_hw = "x"
+
+     try:
+         pad_hw = attributes.split("padding=")[1].split(";")[0:4]
+         pad_hw = ",".join(pad_hw[0:4])
+     except (IndexError, AttributeError):
+         pad_hw = "x"
+
+     try:
+         dilation_hw = attributes.split("dilation_hw=")[1].split(";")[0:2]
+         dilation_hw = ",".join(dilation_hw[0:2])
+     except (IndexError, AttributeError):
+         dilation_hw = "x"
+
+     try:
+         memory_layout = attributes.split("memory_layout=")[1].split(";")[0].split("::")[1]
+     except (IndexError, AttributeError):
+         memory_layout = "x"
+
+     config = f"w={window_hw} s={stride_hw} p={pad_hw} d={dilation_hw} | {memory_layout}"
+
+     return config
+
  def analyze_conv(row):
      duration_s = row["DEVICE KERNEL DURATION [ns]"] * 1e-9

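Note: analyze_halo scrapes window, stride, padding, dilation, and memory layout out of the free-form ATTRIBUTES string, degrading any missing field to "x" instead of raising. A sketch with a hypothetical ATTRIBUTES value whose shape is inferred from the split() patterns above, not taken from a real trace:

    row = {"ATTRIBUTES": "window_hw=3;3;stride_hw=1;1;padding=1;1;1;1;"
                         "dilation_hw=1;1;memory_layout=TensorMemoryLayout::HEIGHT_SHARDED;"}
    analyze_halo(row)  # -> "w=3,3 s=1,1 p=1,1,1,1 d=1,1 | HEIGHT_SHARDED"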
@@ -287,10 +338,10 @@ def analyze_conv(row):

      peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count

-     NHW = int(row["OUTPUT_0_Y"])
-     CH_IN = int(row["INPUT_0_X"])
+     NHW = get_value_physical_logical(row["OUTPUT_0_Y_PAD[LOGICAL]"])
+     CH_IN = get_value_physical_logical(row["INPUT_0_X_PAD[LOGICAL]"])
      W = [int(x) for x in (attributes.split("window_hw")[1].split("; ")[0][2:-1].split(";"))]
-     CH_OUT = int(row["INPUT_1_X"])
+     CH_OUT = get_value_physical_logical(row["INPUT_1_X_PAD[LOGICAL]"])

      M, K, N = NHW, CH_IN * W[0] * W[1], CH_OUT
      flops = (M * K * N * 2) / duration_s
@@ -300,12 +351,40 @@ def analyze_conv(row):

      flops_percentage = (flops / peak_flops_value) * 100

+     try:
+         act_block_h_ntiles = int(attributes.split("act_block_h_ntiles")[1][1:].split(";")[0])
+     except (IndexError, ValueError):
+         act_block_h_ntiles = "x"
+
+     try:
+         enable_act_double_buffer = "true" == attributes.split("enable_act_double_buffer': '")[1].split("'")[0]
+     except (IndexError, ValueError):
+         enable_act_double_buffer = "x"
+
+     try:
+         enable_split_reader = "true" == attributes.split("enable_split_reader': '")[1].split("'")[0]
+     except (IndexError, ValueError):
+         enable_split_reader = "x"
+
+     try:
+         per_core_out_matrix_height_ntile = int(attributes.split("per_core_out_matrix_height_ntile")[1][1:].split(";")[0])
+     except (IndexError, ValueError):
+         per_core_out_matrix_height_ntile = "x"
+
+     config = f"[ABH={per_core_out_matrix_height_ntile}|{act_block_h_ntiles}"
+     if (enable_act_double_buffer):
+         config += " ADB"
+     if (enable_split_reader):
+         config += " SR"
+     config += "]"
+
      return (
          flops,
          flops_percentage,
          size,
          memory_info,
-         math_fidelity
+         math_fidelity,
+         config,
      )

  def analyze_op(row, prev_row):
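Note: the conv is costed as an implicit GEMM, with M the output pixels (NHW), K the input channels times the kernel footprint, and N the output channels. A worked check with assumed numbers:

    # Hypothetical 3x3 conv, 64 -> 128 channels, 56x56 output, batch 1, 100 us kernel.
    NHW, CH_IN, CH_OUT = 1 * 56 * 56, 64, 128
    kh, kw = 3, 3
    duration_s = 100e-6
    M, K, N = NHW, CH_IN * kh * kw, CH_OUT
    flops = (M * K * N * 2) / duration_s  # ~4.6e12 FLOP/s, i.e. ~4.6 TFLOPs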
@@ -381,8 +460,9 @@ def analyze_op(row, prev_row):
              size,
              memory_info,
              math_fidelity,
+             config,
          ) = analyze_conv(row)
-         op_code = Cell(f"{op_code.raw_value} {size}")
+         op_code = Cell(f"{op_code.raw_value} {size} {config}")
          dram_speed = Cell(None, unit="GB/s", decimals=0)
          dram_percentage = Cell(None, unit="%", decimals=1)
          flops = Cell(flops / 1e12 if pd.notna(flops) else None, unit="TFLOPs", decimals=1)
@@ -392,6 +472,13 @@ def analyze_op(row, prev_row):
              if math_fidelity
              else None
          )
+     elif "HaloDeviceOperation" in op_code.raw_value:
+         config = analyze_halo(row)
+         op_code = Cell(f"{op_code.raw_value} {config}")
+         dram_speed = Cell(None, unit="GB/s", decimals=0)
+         dram_percentage = Cell(None, unit="%", decimals=1)
+         flops = Cell(None, unit="TFLOPs", decimals=1)
+         flops_percentage = Cell(None, unit="%", decimals=1)

      output = {
          "ID": None,
@@ -728,6 +815,93 @@ def generate_matmul_advice(op_data):
      return advice


+ def generate_stacked_report(rows, visible_headers, stack_by_input0_layout:bool = False):
+     if stack_by_input0_layout:
+         visible_headers.append("Input 0 Memory")
+
+     # Create a pandas DataFrame from rows and headers
+     data = {header: [row[header].raw_value for row in rows] for header in visible_headers}
+     df = pd.DataFrame(data)
+
+     if (stack_by_input0_layout):
+         df["OP Code Joined"] = df["OP Code"].str.split().str[0] \
+             + " (in0:" + df["Input 0 Memory"].str.split('_').str[-2].str.lower() + "_" + df["Input 0 Memory"].str.split('_').str[-1].str.lower() + ")"
+     else:
+         df["OP Code Joined"] = df["OP Code"].str.split().str[0]
+
+     # Group by the joined OP Code and aggregate the data
+     stacked_df = df.groupby("OP Code Joined").agg(
+         Device_Time_Sum_us=("Device Time", "sum"),
+         Ops_Count=("Device Time", "count"),
+         Flops_min=("FLOPs %", "min"),
+         Flops_max=("FLOPs %", "max"),
+         Flops_mean=("FLOPs %", "mean"),
+         Flops_std=("FLOPs %", "std"),
+     ).reset_index()
+
+     # Calculate the percentage of device time
+     total_device_time = stacked_df["Device_Time_Sum_us"].sum()
+     stacked_df["%"] = (stacked_df["Device_Time_Sum_us"] / total_device_time) * 100
+     # Reorder columns to move Device_Time_Percentage to be the 3rd column
+     cols = stacked_df.columns.tolist()
+     cols.insert(0, cols.pop(cols.index("%")))
+     stacked_df = stacked_df[cols]
+     # Sort the stacked dataframe by "Device_Time_Sum_us" in descending order
+     stacked_df = stacked_df.sort_values(by="Device_Time_Sum_us", ascending=False)
+
+     return stacked_df
+
+
+ def print_stacked_report(stacked_df: pd.DataFrame):
+     print("\n📊 Stacked report 📊\n============\n")
+     print(stacked_df.to_string(index=False, float_format="%.2f"))
+
+
+ def dump_stacked_report(stacked_df: pd.DataFrame, output_file: str):
+     stacked_df.to_csv(output_file, index=False, float_format="%.1f")
+
+
+ def plot_stacked_report(stacked_df: pd.DataFrame, output_file: str, threshold: float = 0.02):
+     # Prepare data for the stacked bar plot
+     device_time_sum = stacked_df["Device_Time_Sum_us"]
+     total_sum = device_time_sum.sum()
+
+     # Create a stacked bar plot
+     plt.figure(figsize=(6, 8), dpi=300)
+     width = 0.5
+     bottom = 0
+     colors = plt.cm.tab20.colors + plt.cm.tab20b.colors + plt.cm.tab20c.colors
+
+     for i, row in stacked_df.iterrows():
+         color = colors[i % len(colors)]
+         bar = plt.bar(1, row["Device_Time_Sum_us"], width, label=row["OP Code Joined"], bottom=bottom, color=color)
+
+         text = f"({row['%']:.1f}%) {row['OP Code Joined']} total={row['Device_Time_Sum_us']:.1f}us; {row['Ops_Count']} ops"
+         if not pd.isna(row["Flops_mean"]):
+             text += f"\n Util [{row['Flops_min']:.1f} - {row['Flops_max']:.1f}] {row['Flops_mean']:.1f} ± {row['Flops_std']:.1f} %"
+
+         # Add overlay text if the data is significant
+         if row["Device_Time_Sum_us"] >= total_sum * threshold:
+             plt.text(
+                 bar[0].get_x() + bar[0].get_width() / 2,
+                 bottom + row["Device_Time_Sum_us"] / 2,
+                 text,
+                 ha="center",
+                 va="center",
+                 fontsize=6,
+                 color="white"
+             )
+         bottom += row["Device_Time_Sum_us"]
+
+     # Set plot labels and title
+     plt.xlim(1 - width / 2 - 0.05, 1 + width / 2 + 0.05)
+     plt.ylabel("Device Time [us]")
+     plt.title(f"Stacked Device Time (Total: {total_sum:.1f} us)")
+     plt.tight_layout()
+
+     # Save the plot to a file
+     plt.savefig(output_file)
+
  def merge_device_rows(df):
      block_by_device = defaultdict(list)

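Note: the stacked report flattens the per-op Cell rows into a DataFrame, collapses each op name to its first token (optionally suffixed with Input 0's memory layout), and aggregates device time and FLOPs% per group. A toy sketch of the same groupby, with made-up rows standing in for a real trace:

    toy = pd.DataFrame({
        "OP Code Joined": ["Matmul", "Matmul", "Halo"],
        "Device Time": [120.0, 80.0, 25.0],
        "FLOPs %": [55.0, 45.0, float("nan")],
    })
    agg = toy.groupby("OP Code Joined").agg(
        Device_Time_Sum_us=("Device Time", "sum"),  # Matmul: 200.0, Halo: 25.0
        Ops_Count=("Device Time", "count"),         # Matmul: 2,     Halo: 1
        Flops_mean=("FLOPs %", "mean"),             # Matmul: 50.0,  Halo: NaN
    ).reset_index()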
@@ -765,10 +939,17 @@ def merge_device_rows(df):
          if not blocks:
              break

-         if "AllGather" in op_name or "ReduceScatter" in op_name:
-             # For collective ops, take the row with minimum duration
-             min_duration_block = min(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
-             merged_blocks.append(min_duration_block[1])
+         if "AllGather" in op_name or "ReduceScatter" in op_name or "AllReduce" in op_name:
+             # For collective ops, take the average duration over all rows within a block
+             device_kernel_durations = [d["DEVICE KERNEL DURATION [ns]"]
+                                        for _, d in blocks
+                                        if pd.notna(d["DEVICE KERNEL DURATION [ns]"])]
+             # Use the first block's data but update its duration with the average
+             base_block = blocks[0][1].copy()
+             base_block["DEVICE KERNEL DURATION [ns]"] = (
+                 sum(device_kernel_durations) / len(device_kernel_durations) if device_kernel_durations else float("nan")
+             )
+             merged_blocks.append(base_block)
          else:
              # For non-collective ops, take the row with maximum duration
              max_duration_block = max(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
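Note: per-device durations of a collective overlap rather than accumulate, so 1.1.0 reports their mean (and now also matches AllReduce) where 1.0.6 took the minimum. A toy comparison:

    # Hypothetical per-device kernel durations (ns) for one AllGather block.
    durations = [1000, 1200, 1100, 1300]
    min(durations)                   # 1000   -- what 1.0.6 reported
    sum(durations) / len(durations)  # 1150.0 -- what 1.1.0 reports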
@@ -822,7 +1003,7 @@ def main():
      args, id_range = parse_args()
      generate_perf_report(
          args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice,
-         args.tracing_mode, args.raw_op_codes, args.no_host_ops)
+         args.tracing_mode, args.raw_op_codes, args.no_host_ops, args.no_stacked_report, args.no_stack_by_in0, args.stacked_csv)


  def parse_args():
@@ -845,6 +1026,12 @@ def parse_args():
      parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode")
      parser.add_argument("--raw-op-codes", action="store_true", help="Include raw op codes in output")
      parser.add_argument("--no-host-ops", action="store_true", help="Do not include host ops in output")
+     parser.add_argument("--no-stacked-report", action="store_true", help="Do not generate a stacked report")
+     parser.add_argument("--no-stack-by-in0", action="store_true",
+                         help="Do not group the stacked report by the layout of Input 0 (extracted from the Input 0 Memory column)"
+                         )
+     parser.add_argument("--stacked-csv", type=str,
+                         help="Output filename for the stacked report CSV; Defaults to OUTPUT_FILE_stacked.csv", metavar="STACKED_FILE")

      args = parser.parse_args()

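Note: the stacked report is produced by default and grouped by Input 0's layout; the new flags opt out or redirect it. Illustrative invocations (the trace file name is a placeholder):

    tt-perf-report trace.csv                          # per-op table plus printed stacked report
    tt-perf-report trace.csv --no-stack-by-in0        # stacked report without the in0 layout split
    tt-perf-report trace.csv --no-stacked-report      # skip the stacked report entirely
    tt-perf-report trace.csv --stacked-csv stack.csv  # writes stack.csv and stack.png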
@@ -863,7 +1050,7 @@ def parse_args():

  def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
                           id_range, csv_output_file, no_advice, tracing_mode,
-                          raw_op_codes, no_host_ops):
+                          raw_op_codes, no_host_ops, no_stacked_report, no_stack_by_in0, stacked_report_file):
      df = pd.read_csv(csv_file, low_memory=False)

      # Add a column for original row numbers
@@ -891,6 +1078,7 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
      for _, row in df.iterrows():
          op_data, current_gap = analyze_op(row, prev_row)
          op_data["ID"] = Cell(row["ORIGINAL_ROW"]) # Use the original row number
+         op_data["Global Call Count"] = Cell(row["GLOBAL CALL COUNT"])
          if raw_op_codes:
              op_data["Raw OP Code"] = Cell(row["OP CODE"])
          rows.append(op_data)
@@ -941,6 +1129,7 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
          "Inner Dim Block Size",
          "Output Subblock H",
          "Output Subblock W",
+         "Global Call Count",
      ]
      if not no_advice:
          all_headers.append("Advice")
@@ -965,6 +1154,22 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
      if not no_advice:
          print_advice_section(rows, visible_headers, col_widths)

+     # handle stacked report generation
+     if not(no_stacked_report):
+         stacked_report = generate_stacked_report(rows, visible_headers, not(no_stack_by_in0))
+
+         if not(csv_output_file):
+             print_stacked_report(stacked_report)
+         if stacked_report_file or csv_output_file:
+             if not stacked_report_file:
+                 base_stacked_report_file = f"{os.path.splitext(csv_output_file)[0]}_stacked"
+             else:
+                 base_stacked_report_file = os.path.splitext(stacked_report_file)[0]
+             print(colored(f"Writing CSV stacked report to {base_stacked_report_file}.csv", "cyan"))
+             dump_stacked_report(stacked_report, f"{base_stacked_report_file}.csv")
+             print(colored(f"Plotting PNG stacked report to {base_stacked_report_file}.png", "cyan"))
+             plot_stacked_report(stacked_report, f"{base_stacked_report_file}.png")
+

  def is_host_op(op_data):
      return "(torch)" in op_data["OP Code"].raw_value
@@ -0,0 +1,2 @@
+ pandas
+ matplotlib