tt-perf-report 1.0.7__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tt-perf-report might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tt-perf-report
-Version: 1.0.7
+Version: 1.1.0
 Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
 License: Apache License
 Version 2.0, January 2004
@@ -218,6 +218,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE_understanding.txt
 Requires-Dist: pandas
+Requires-Dist: matplotlib
 Dynamic: license-file
 
 # Performance Report Analysis Tool
@@ -4,12 +4,12 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "tt-perf-report"
-version = "1.0.7"
+version = "1.1.0"
 description = "This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities."
 license = {file = "LICENSE"}
 readme = "README.md"
 keywords = ["tenstorrent", "tt-metal"]
-dependencies = ["pandas"]
+dependencies = ["pandas", "matplotlib"]
 
 [project.scripts]
 tt-perf-report = "tt_perf_report.perf_report:main"
@@ -2,17 +2,32 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
-import csv
-import sys
 import argparse
+import csv
+from collections import defaultdict
+import os
 import re
+import sys
 from typing import Any, Optional, Union
-from collections import defaultdict
+
+import matplotlib.pyplot as plt
 import pandas as pd
 
 # Global variable to store color preference
 color_output = None  # None means auto-detect, True forces color, False forces no color
 
+def get_value_physical_logical(input : str, is_physical : bool = True):
+    if "[" in input and "]" in input:
+        physical_part = input.split("[")[0]
+        logical_part = input.split("[")[1].split("]")[0]
+
+        if is_physical:
+            return int(physical_part)
+        else:
+            return int(logical_part)
+    else:
+        # back compatible
+        return int(input)
 
 def set_color_output(force_color, force_no_color):
     global color_output
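
The new get_value_physical_logical helper parses shape cells that can now carry both a padded (physical) and a logical extent. A minimal sketch of its behavior, using hypothetical cell values in the physical[logical] format implied by the *_PAD[LOGICAL] column names used below:

    get_value_physical_logical("64[57]")                     # -> 64, the physical (padded) extent
    get_value_physical_logical("64[57]", is_physical=False)  # -> 57, the logical extent
    get_value_physical_logical("64")                         # -> 64, old-format cell (backward compatible)
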
@@ -212,28 +227,28 @@ def analyze_matmul(row):
     total_data_size_bytes = 0
     if input_0_from_dram:
         total_data_size_bytes += (
-            row["INPUT_0_W"]
-            * row["INPUT_0_Y"]
-            * row["INPUT_0_Z"]
-            * row["INPUT_0_X"]
+            get_value_physical_logical(row["INPUT_0_W_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["INPUT_0_Y_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["INPUT_0_Z_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["INPUT_0_X_PAD[LOGICAL]"])
             * get_datatype_size(row["INPUT_0_DATATYPE"])
         )
     if input_1_from_dram:
         total_data_size_bytes += (
-            row["INPUT_1_W"]
-            * row["INPUT_1_Y"]
-            * row["INPUT_1_Z"]
-            * row["INPUT_1_X"]
+            get_value_physical_logical(row["INPUT_1_W_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["INPUT_1_Y_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["INPUT_1_Z_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["INPUT_1_X_PAD[LOGICAL]"])
             * get_datatype_size(row["INPUT_1_DATATYPE"])
         )
 
     # Always include output if it's written to DRAM
     if "DRAM" in row["OUTPUT_0_MEMORY"]:
         total_data_size_bytes += (
-            row["OUTPUT_0_W"]
-            * row["OUTPUT_0_Y"]
-            * row["OUTPUT_0_Z"]
-            * row["OUTPUT_0_X"]
+            get_value_physical_logical(row["OUTPUT_0_W_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["OUTPUT_0_Y_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["OUTPUT_0_Z_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["OUTPUT_0_X_PAD[LOGICAL]"])
             * get_datatype_size(row["OUTPUT_0_DATATYPE"])
         )
 
@@ -253,8 +268,8 @@ def analyze_matmul(row):
 
     peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count
 
-    M, K, N = int(row["INPUT_0_Y"]), int(row["INPUT_0_X"]), int(row["INPUT_1_X"])
-    W, Z = int(row["INPUT_0_W"]), int(row["INPUT_0_Z"])
+    M, K, N = get_value_physical_logical(row["INPUT_0_Y_PAD[LOGICAL]"]), get_value_physical_logical(row["INPUT_0_X_PAD[LOGICAL]"]), get_value_physical_logical(row["INPUT_1_X_PAD[LOGICAL]"])
+    W, Z = get_value_physical_logical(row["INPUT_0_W_PAD[LOGICAL]"]), get_value_physical_logical(row["INPUT_0_Z_PAD[LOGICAL]"])
 
     flops = (M * K * N * W * Z * 2) / duration_s
 
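
For orientation, a worked instance of the FLOPs formula above, with assumed numbers rather than values from any real trace:

    # Hypothetical square matmul: M = K = N = 1024, no batching (W = Z = 1)
    M = K = N = 1024
    W = Z = 1
    duration_s = 100e-6                           # assume the op took 100 us
    flops = (M * K * N * W * Z * 2) / duration_s  # 2*1024^3 ~= 2.15e9 FLOPs over 100 us -> ~21.5 TFLOPs/s
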
@@ -323,10 +338,10 @@ def analyze_conv(row):
 
     peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count
 
-    NHW = int(row["OUTPUT_0_Y"])
-    CH_IN = int(row["INPUT_0_X"])
+    NHW = get_value_physical_logical(row["OUTPUT_0_Y_PAD[LOGICAL]"])
+    CH_IN = get_value_physical_logical(row["INPUT_0_X_PAD[LOGICAL]"])
     W = [int(x) for x in (attributes.split("window_hw")[1].split("; ")[0][2:-1].split(";"))]
-    CH_OUT = int(row["INPUT_1_X"])
+    CH_OUT = get_value_physical_logical(row["INPUT_1_X_PAD[LOGICAL]"])
 
     M, K, N = NHW, CH_IN * W[0] * W[1], CH_OUT
     flops = (M * K * N * 2) / duration_s
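
The conv path maps the convolution onto an implicit GEMM: M is the number of output positions (NHW), K folds the input channels into the window (CH_IN * kh * kw), and N is the output channel count. A sketch with assumed sizes:

    # Assumed example: 56x56 output, 64 input channels, 3x3 window, 128 output channels
    NHW, CH_IN, CH_OUT = 56 * 56, 64, 128
    W = [3, 3]                                   # window_hw
    M, K, N = NHW, CH_IN * W[0] * W[1], CH_OUT   # 3136, 576, 128
    flops = M * K * N * 2                        # ~4.6e8 FLOPs for this op
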
@@ -800,6 +815,93 @@ def generate_matmul_advice(op_data):
     return advice
 
 
+def generate_stacked_report(rows, visible_headers, stack_by_input0_layout:bool = False):
+    if stack_by_input0_layout:
+        visible_headers.append("Input 0 Memory")
+
+    # Create a pandas DataFrame from rows and headers
+    data = {header: [row[header].raw_value for row in rows] for header in visible_headers}
+    df = pd.DataFrame(data)
+
+    if (stack_by_input0_layout):
+        df["OP Code Joined"] = df["OP Code"].str.split().str[0] \
+            + " (in0:" + df["Input 0 Memory"].str.split('_').str[-2].str.lower() + "_" + df["Input 0 Memory"].str.split('_').str[-1].str.lower() + ")"
+    else:
+        df["OP Code Joined"] = df["OP Code"].str.split().str[0]
+
+    # Group by the joined OP Code and aggregate the data
+    stacked_df = df.groupby("OP Code Joined").agg(
+        Device_Time_Sum_us=("Device Time", "sum"),
+        Ops_Count=("Device Time", "count"),
+        Flops_min=("FLOPs %", "min"),
+        Flops_max=("FLOPs %", "max"),
+        Flops_mean=("FLOPs %", "mean"),
+        Flops_std=("FLOPs %", "std"),
+    ).reset_index()
+
+    # Calculate the percentage of device time
+    total_device_time = stacked_df["Device_Time_Sum_us"].sum()
+    stacked_df["%"] = (stacked_df["Device_Time_Sum_us"] / total_device_time) * 100
+    # Reorder columns to move the "%" column to the front
+    cols = stacked_df.columns.tolist()
+    cols.insert(0, cols.pop(cols.index("%")))
+    stacked_df = stacked_df[cols]
+    # Sort the stacked dataframe by "Device_Time_Sum_us" in descending order
+    stacked_df = stacked_df.sort_values(by="Device_Time_Sum_us", ascending=False)
+
+    return stacked_df
+
+
+def print_stacked_report(stacked_df: pd.DataFrame):
+    print("\n📊 Stacked report 📊\n============\n")
+    print(stacked_df.to_string(index=False, float_format="%.2f"))
+
+
+def dump_stacked_report(stacked_df: pd.DataFrame, output_file: str):
+    stacked_df.to_csv(output_file, index=False, float_format="%.1f")
+
+
+def plot_stacked_report(stacked_df: pd.DataFrame, output_file: str, threshold: float = 0.02):
+    # Prepare data for the stacked bar plot
+    device_time_sum = stacked_df["Device_Time_Sum_us"]
+    total_sum = device_time_sum.sum()
+
+    # Create a stacked bar plot
+    plt.figure(figsize=(6, 8), dpi=300)
+    width = 0.5
+    bottom = 0
+    colors = plt.cm.tab20.colors + plt.cm.tab20b.colors + plt.cm.tab20c.colors
+
+    for i, row in stacked_df.iterrows():
+        color = colors[i % len(colors)]
+        bar = plt.bar(1, row["Device_Time_Sum_us"], width, label=row["OP Code Joined"], bottom=bottom, color=color)
+
+        text = f"({row['%']:.1f}%) {row['OP Code Joined']} total={row['Device_Time_Sum_us']:.1f}us; {row['Ops_Count']} ops"
+        if not pd.isna(row["Flops_mean"]):
+            text += f"\n Util [{row['Flops_min']:.1f} - {row['Flops_max']:.1f}] {row['Flops_mean']:.1f} ± {row['Flops_std']:.1f} %"
+
+        # Add overlay text if the data is significant
+        if row["Device_Time_Sum_us"] >= total_sum * threshold:
+            plt.text(
+                bar[0].get_x() + bar[0].get_width() / 2,
+                bottom + row["Device_Time_Sum_us"] / 2,
+                text,
+                ha="center",
+                va="center",
+                fontsize=6,
+                color="white"
+            )
+        bottom += row["Device_Time_Sum_us"]
+
+    # Set plot labels and title
+    plt.xlim(1 - width / 2 - 0.05, 1 + width / 2 + 0.05)
+    plt.ylabel("Device Time [us]")
+    plt.title(f"Stacked Device Time (Total: {total_sum:.1f} us)")
+    plt.tight_layout()
+
+    # Save the plot to a file
+    plt.savefig(output_file)
+
 def merge_device_rows(df):
     block_by_device = defaultdict(list)
 
@@ -901,7 +1003,7 @@ def main():
     args, id_range = parse_args()
     generate_perf_report(
         args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice,
-        args.tracing_mode, args.raw_op_codes, args.no_host_ops)
+        args.tracing_mode, args.raw_op_codes, args.no_host_ops, args.no_stacked_report, args.no_stack_by_in0, args.stacked_csv)
 
 
 def parse_args():
@@ -924,6 +1026,12 @@ def parse_args():
     parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode")
     parser.add_argument("--raw-op-codes", action="store_true", help="Include raw op codes in output")
     parser.add_argument("--no-host-ops", action="store_true", help="Do not include host ops in output")
+    parser.add_argument("--no-stacked-report", action="store_true", help="Do not generate a stacked report")
+    parser.add_argument("--no-stack-by-in0", action="store_true",
+        help="Do not group the stacked report by the layout of Input 0 (extracted from the Input 0 Memory column)"
+    )
+    parser.add_argument("--stacked-csv", type=str,
+        help="Output filename for the stacked report CSV; defaults to OUTPUT_FILE_stacked.csv", metavar="STACKED_FILE")
 
     args = parser.parse_args()
 
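
Together with the existing --csv OUTPUT_FILE option (inferred from args.csv in main() above), the new flags compose roughly as follows; trace.csv, report.csv, and by_op.csv are placeholder names:

    tt-perf-report trace.csv                            # stacked report printed to stdout
    tt-perf-report trace.csv --csv report.csv           # also writes report_stacked.csv and report_stacked.png
    tt-perf-report trace.csv --csv report.csv --stacked-csv by_op.csv --no-stack-by-in0
    tt-perf-report trace.csv --no-stacked-report        # opt out of the stacked report entirely
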
@@ -942,7 +1050,7 @@ def parse_args():
 
 def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
                          id_range, csv_output_file, no_advice, tracing_mode,
-                         raw_op_codes, no_host_ops):
+                         raw_op_codes, no_host_ops, no_stacked_report, no_stack_by_in0, stacked_report_file):
     df = pd.read_csv(csv_file, low_memory=False)
 
     # Add a column for original row numbers
@@ -1046,6 +1154,22 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
     if not no_advice:
         print_advice_section(rows, visible_headers, col_widths)
 
+    # handle stacked report generation
+    if not(no_stacked_report):
+        stacked_report = generate_stacked_report(rows, visible_headers, not(no_stack_by_in0))
+
+        if not(csv_output_file):
+            print_stacked_report(stacked_report)
+        if stacked_report_file or csv_output_file:
+            if not stacked_report_file:
+                base_stacked_report_file = f"{os.path.splitext(csv_output_file)[0]}_stacked"
+            else:
+                base_stacked_report_file = os.path.splitext(stacked_report_file)[0]
+            print(colored(f"Writing CSV stacked report to {base_stacked_report_file}.csv", "cyan"))
+            dump_stacked_report(stacked_report, f"{base_stacked_report_file}.csv")
+            print(colored(f"Plotting PNG stacked report to {base_stacked_report_file}.png", "cyan"))
+            plot_stacked_report(stacked_report, f"{base_stacked_report_file}.png")
+
 
 def is_host_op(op_data):
     return "(torch)" in op_data["OP Code"].raw_value
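
A note on the defaulting in the hunk above: when --stacked-csv is omitted but a main CSV output is given, the stacked base name is derived from that output file. For a hypothetical --csv report.csv:

    base_stacked_report_file = f"{os.path.splitext('report.csv')[0]}_stacked"  # "report_stacked"
    # -> report_stacked.csv and report_stacked.png are written alongside the main report
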
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tt-perf-report
-Version: 1.0.7
+Version: 1.1.0
 Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
 License: Apache License
 Version 2.0, January 2004
@@ -218,6 +218,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE_understanding.txt
 Requires-Dist: pandas
+Requires-Dist: matplotlib
 Dynamic: license-file
 
 # Performance Report Analysis Tool
@@ -0,0 +1,2 @@
+pandas
+matplotlib