tt-perf-report 1.0.7.tar.gz → 1.1.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tt-perf-report might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tt-perf-report
-Version: 1.0.7
+Version: 1.1.1
 Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
 License: Apache License
 Version 2.0, January 2004
@@ -218,6 +218,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE_understanding.txt
 Requires-Dist: pandas
+Requires-Dist: matplotlib
 Dynamic: license-file

 # Performance Report Analysis Tool
@@ -4,12 +4,12 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "tt-perf-report"
-version = "1.0.7"
+version = "1.1.1"
 description = "This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities."
 license = {file = "LICENSE"}
 readme = "README.md"
 keywords = ["tenstorrent", "tt-metal"]
-dependencies = ["pandas"]
+dependencies = ["pandas", "matplotlib"]

 [project.scripts]
 tt-perf-report = "tt_perf_report.perf_report:main"
@@ -2,18 +2,54 @@
 # SPDX-License-Identifier: Apache-2.0

 # SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
-import csv
-import sys
 import argparse
+import csv
+from collections import defaultdict
+import os
 import re
+import sys
 from typing import Any, Optional, Union
-from collections import defaultdict
+
+import matplotlib.pyplot as plt
 import pandas as pd

 # Global variable to store color preference
 color_output = None  # None means auto-detect, True forces color, False forces no color


+def get_value_physical_logical(input, is_physical: bool = True):
+    # Handle numeric inputs (old format)
+    if isinstance(input, (int, float)):
+        return int(input)
+
+    # Handle string inputs (new format)
+    if isinstance(input, str) and "[" in input and "]" in input:
+        physical_part = input.split("[")[0]
+        logical_part = input.split("[")[1].split("]")[0]
+
+        if is_physical:
+            return int(physical_part)
+        else:
+            return int(logical_part)
+    else:
+        # backwards compatibility - convert string to int
+        return int(input)
+
+
+def detect_csv_format(df):
+    """Detect if CSV uses v1 (old) or v2 (new) format by checking for _PAD[LOGICAL] columns"""
+    v2_columns = [col for col in df.columns if "_PAD[LOGICAL]" in col]
+    return "v2" if v2_columns else "v1"
+
+
+def get_column_name(base_name, csv_format):
+    """Get the appropriate column name based on CSV format version"""
+    if csv_format == "v2":
+        return f"{base_name}_PAD[LOGICAL]"
+    else:
+        return base_name
+
+
 def set_color_output(force_color, force_no_color):
     global color_output
     if force_no_color:
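
For orientation: the v2 trace format packs padded and logical extents into a single cell as PHYSICAL[LOGICAL], and the three helpers added above pick that apart. A minimal sketch of how they compose (the column name matches the diff; the sample values are hypothetical):

    import pandas as pd

    # Hypothetical v2-format cell: physical (padded) extent 64, logical extent 57.
    df = pd.DataFrame({"INPUT_0_W_PAD[LOGICAL]": ["64[57]"]})
    assert detect_csv_format(df) == "v2"

    col = get_column_name("INPUT_0_W", "v2")                   # "INPUT_0_W_PAD[LOGICAL]"
    get_value_physical_logical(df[col][0])                     # 64 (physical, the default)
    get_value_physical_logical(df[col][0], is_physical=False)  # 57 (logical)
    get_value_physical_logical(64)                             # 64 (v1 numeric pass-through)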
@@ -205,35 +241,35 @@ def evaluate_fidelity(input_0_datatype, input_1_datatype, output_datatype, math_
    )


-def analyze_matmul(row):
+def analyze_matmul(row, csv_format="v2"):
     input_0_from_dram = "DRAM" in row["INPUT_0_MEMORY"]
     input_1_from_dram = "DRAM" in row["INPUT_1_MEMORY"]

     total_data_size_bytes = 0
     if input_0_from_dram:
         total_data_size_bytes += (
-            row["INPUT_0_W"]
-            * row["INPUT_0_Y"]
-            * row["INPUT_0_Z"]
-            * row["INPUT_0_X"]
+            get_value_physical_logical(row[get_column_name("INPUT_0_W", csv_format)])
+            * get_value_physical_logical(row[get_column_name("INPUT_0_Y", csv_format)])
+            * get_value_physical_logical(row[get_column_name("INPUT_0_Z", csv_format)])
+            * get_value_physical_logical(row[get_column_name("INPUT_0_X", csv_format)])
             * get_datatype_size(row["INPUT_0_DATATYPE"])
         )
     if input_1_from_dram:
         total_data_size_bytes += (
-            row["INPUT_1_W"]
-            * row["INPUT_1_Y"]
-            * row["INPUT_1_Z"]
-            * row["INPUT_1_X"]
+            get_value_physical_logical(row[get_column_name("INPUT_1_W", csv_format)])
+            * get_value_physical_logical(row[get_column_name("INPUT_1_Y", csv_format)])
+            * get_value_physical_logical(row[get_column_name("INPUT_1_Z", csv_format)])
+            * get_value_physical_logical(row[get_column_name("INPUT_1_X", csv_format)])
             * get_datatype_size(row["INPUT_1_DATATYPE"])
         )

     # Always include output if it's written to DRAM
     if "DRAM" in row["OUTPUT_0_MEMORY"]:
         total_data_size_bytes += (
-            row["OUTPUT_0_W"]
-            * row["OUTPUT_0_Y"]
-            * row["OUTPUT_0_Z"]
-            * row["OUTPUT_0_X"]
+            get_value_physical_logical(row[get_column_name("OUTPUT_0_W", csv_format)])
+            * get_value_physical_logical(row[get_column_name("OUTPUT_0_Y", csv_format)])
+            * get_value_physical_logical(row[get_column_name("OUTPUT_0_Z", csv_format)])
+            * get_value_physical_logical(row[get_column_name("OUTPUT_0_X", csv_format)])
             * get_datatype_size(row["OUTPUT_0_DATATYPE"])
         )

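To make the byte accounting concrete, a hypothetical single DRAM-resident input: a 1 x 1 x 1024 x 1024 bfloat16 tensor contributes 1024 · 1024 · 2 bytes.

    # Hypothetical DRAM traffic for one bf16 input of shape 1 x 1 x 1024 x 1024
    # (assumes get_datatype_size reports 2 bytes for BFLOAT16).
    W, Z, Y, X, datatype_size = 1, 1, 1024, 1024, 2
    total_data_size_bytes = W * Z * Y * X * datatype_size
    print(total_data_size_bytes)  # 2097152 bytes = 2 MiB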
@@ -253,8 +289,8 @@ def analyze_matmul(row):

     peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count

-    M, K, N = int(row["INPUT_0_Y"]), int(row["INPUT_0_X"]), int(row["INPUT_1_X"])
-    W, Z = int(row["INPUT_0_W"]), int(row["INPUT_0_Z"])
+    M, K, N = get_value_physical_logical(row[get_column_name("INPUT_0_Y", csv_format)]), get_value_physical_logical(row[get_column_name("INPUT_0_X", csv_format)]), get_value_physical_logical(row[get_column_name("INPUT_1_X", csv_format)])
+    W, Z = get_value_physical_logical(row[get_column_name("INPUT_0_W", csv_format)]), get_value_physical_logical(row[get_column_name("INPUT_0_Z", csv_format)])

     flops = (M * K * N * W * Z * 2) / duration_s
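A quick sanity check of that FLOPs formula with hypothetical numbers — a square 1024 matmul (W = Z = 1) measured at 100 µs:

    # Hypothetical: 1024 x 1024 @ 1024 x 1024 matmul, W = Z = 1, 100 us kernel time.
    M = K = N = 1024
    W = Z = 1
    duration_s = 100e-6
    flops = (M * K * N * W * Z * 2) / duration_s
    print(f"{flops / 1e12:.1f} TFLOPs")  # ~21.5 TFLOPs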
@@ -312,7 +348,7 @@ def analyze_halo(row):

     return config

-def analyze_conv(row):
+def analyze_conv(row, csv_format="v2"):
     duration_s = row["DEVICE KERNEL DURATION [ns]"] * 1e-9

     core_count = 64  # we decided to normalize to the max core count
@@ -323,10 +359,10 @@ def analyze_conv(row):

     peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count

-    NHW = int(row["OUTPUT_0_Y"])
-    CH_IN = int(row["INPUT_0_X"])
+    NHW = get_value_physical_logical(row[get_column_name("OUTPUT_0_Y", csv_format)])
+    CH_IN = get_value_physical_logical(row[get_column_name("INPUT_0_X", csv_format)])
     W = [int(x) for x in (attributes.split("window_hw")[1].split("; ")[0][2:-1].split(";"))]
-    CH_OUT = int(row["INPUT_1_X"])
+    CH_OUT = get_value_physical_logical(row[get_column_name("INPUT_1_X", csv_format)])

     M, K, N = NHW, CH_IN * W[0] * W[1], CH_OUT
     flops = (M * K * N * 2) / duration_s
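
The conv path scores utilization by treating the convolution as an implicit GEMM, which the M, K, N assignment above makes explicit; a hypothetical instance:

    # Hypothetical conv: 56x56 output (batch 1), 3x3 window, 64 -> 128 channels, 50 us.
    NHW, CH_IN, CH_OUT, window = 56 * 56, 64, 128, (3, 3)
    M, K, N = NHW, CH_IN * window[0] * window[1], CH_OUT
    duration_s = 50e-6
    flops = (M * K * N * 2) / duration_s
    print(f"{flops / 1e12:.2f} TFLOPs")  # ~9.25 TFLOPs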
@@ -372,7 +408,7 @@ def analyze_conv(row):
         config,
     )

-def analyze_op(row, prev_row):
+def analyze_op(row, prev_row, csv_format="v2"):
     op_code = Cell(row["OP CODE"])
     cores = Cell(int(row["CORE COUNT"]) if pd.notna(row["CORE COUNT"]) else None)
     device_time = Cell(
@@ -425,7 +461,7 @@ def analyze_op(row, prev_row):
            math_fidelity,
            is_dram_sharded,
            adjusted_core_count,  # Get the potentially adjusted core count
-        ) = analyze_matmul(row)
+        ) = analyze_matmul(row, csv_format)
         op_code = Cell(f"{op_code.raw_value} {size}")
         dram_speed = Cell(dram_speed, unit="GB/s", decimals=0)
         dram_percentage = Cell(dram_percentage, unit="%", decimals=1)
@@ -446,7 +482,7 @@ def analyze_op(row, prev_row):
            memory_info,
            math_fidelity,
            config,
-        ) = analyze_conv(row)
+        ) = analyze_conv(row, csv_format)
         op_code = Cell(f"{op_code.raw_value} {size} {config}")
         dram_speed = Cell(None, unit="GB/s", decimals=0)
         dram_percentage = Cell(None, unit="%", decimals=1)
@@ -800,6 +836,93 @@ def generate_matmul_advice(op_data):
     return advice


+def generate_stacked_report(rows, visible_headers, stack_by_input0_layout: bool = False):
+    if stack_by_input0_layout:
+        visible_headers.append("Input 0 Memory")
+
+    # Create a pandas DataFrame from rows and headers
+    data = {header: [row[header].raw_value for row in rows] for header in visible_headers}
+    df = pd.DataFrame(data)
+
+    if (stack_by_input0_layout):
+        df["OP Code Joined"] = df["OP Code"].str.split().str[0] \
+            + " (in0:" + df["Input 0 Memory"].str.split('_').str[-2].str.lower() + "_" + df["Input 0 Memory"].str.split('_').str[-1].str.lower() + ")"
+    else:
+        df["OP Code Joined"] = df["OP Code"].str.split().str[0]
+
+    # Group by the joined OP Code and aggregate the data
+    stacked_df = df.groupby("OP Code Joined").agg(
+        Device_Time_Sum_us=("Device Time", "sum"),
+        Ops_Count=("Device Time", "count"),
+        Flops_min=("FLOPs %", "min"),
+        Flops_max=("FLOPs %", "max"),
+        Flops_mean=("FLOPs %", "mean"),
+        Flops_std=("FLOPs %", "std"),
+    ).reset_index()
+
+    # Calculate the percentage of device time
+    total_device_time = stacked_df["Device_Time_Sum_us"].sum()
+    stacked_df["%"] = (stacked_df["Device_Time_Sum_us"] / total_device_time) * 100
+    # Reorder columns to move Device_Time_Percentage to be the 3rd column
+    cols = stacked_df.columns.tolist()
+    cols.insert(0, cols.pop(cols.index("%")))
+    stacked_df = stacked_df[cols]
+    # Sort the stacked dataframe by "Device_Time_Sum_us" in descending order
+    stacked_df = stacked_df.sort_values(by="Device_Time_Sum_us", ascending=False)
+
+    return stacked_df
+
+
+def print_stacked_report(stacked_df: pd.DataFrame):
+    print("\n📊 Stacked report 📊\n============\n")
+    print(stacked_df.to_string(index=False, float_format="%.2f"))
+
+
+def dump_stacked_report(stacked_df: pd.DataFrame, output_file: str):
+    stacked_df.to_csv(output_file, index=False, float_format="%.1f")
+
+
+def plot_stacked_report(stacked_df: pd.DataFrame, output_file: str, threshold: float = 0.02):
+    # Prepare data for the stacked bar plot
+    device_time_sum = stacked_df["Device_Time_Sum_us"]
+    total_sum = device_time_sum.sum()
+
+    # Create a stacked bar plot
+    plt.figure(figsize=(6, 8), dpi=300)
+    width = 0.5
+    bottom = 0
+    colors = plt.cm.tab20.colors + plt.cm.tab20b.colors + plt.cm.tab20c.colors
+
+    for i, row in stacked_df.iterrows():
+        color = colors[i % len(colors)]
+        bar = plt.bar(1, row["Device_Time_Sum_us"], width, label=row["OP Code Joined"], bottom=bottom, color=color)
+
+        text = f"({row['%']:.1f}%) {row['OP Code Joined']} total={row['Device_Time_Sum_us']:.1f}us; {row['Ops_Count']} ops"
+        if not pd.isna(row["Flops_mean"]):
+            text += f"\n Util [{row['Flops_min']:.1f} - {row['Flops_max']:.1f}] {row['Flops_mean']:.1f} ± {row['Flops_std']:.1f} %"
+
+        # Add overlay text if the data is significant
+        if row["Device_Time_Sum_us"] >= total_sum * threshold:
+            plt.text(
+                bar[0].get_x() + bar[0].get_width() / 2,
+                bottom + row["Device_Time_Sum_us"] / 2,
+                text,
+                ha="center",
+                va="center",
+                fontsize=6,
+                color="white"
+            )
+        bottom += row["Device_Time_Sum_us"]
+
+    # Set plot labels and title
+    plt.xlim(1 - width / 2 - 0.05, 1 + width / 2 + 0.05)
+    plt.ylabel("Device Time [us]")
+    plt.title(f"Stacked Device Time (Total: {total_sum:.1f} us)")
+    plt.tight_layout()
+
+    # Save the plot to a file
+    plt.savefig(output_file)
+
 def merge_device_rows(df):
     block_by_device = defaultdict(list)

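End to end, the new reporting path is generate → print/dump/plot. A minimal sketch with a hypothetical pre-aggregated frame in the shape generate_stacked_report returns (the real tool builds it from its Cell-based rows):

    import pandas as pd

    stacked = pd.DataFrame({
        "%": [62.5, 25.0, 12.5],
        "OP Code Joined": ["Matmul (in0:dram_interleaved)", "Conv", "Reduce"],
        "Device_Time_Sum_us": [500.0, 200.0, 100.0],
        "Ops_Count": [40, 10, 5],
        "Flops_min": [12.0, 30.0, None],
        "Flops_max": [78.0, 55.0, None],
        "Flops_mean": [45.0, 42.0, None],
        "Flops_std": [15.0, 8.0, None],
    })
    print_stacked_report(stacked)                       # table on stdout
    dump_stacked_report(stacked, "report_stacked.csv")  # CSV on disk
    plot_stacked_report(stacked, "report_stacked.png")  # single stacked bar, PNG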
@@ -901,7 +1024,7 @@ def main():
     args, id_range = parse_args()
     generate_perf_report(
         args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice,
-        args.tracing_mode, args.raw_op_codes, args.no_host_ops)
+        args.tracing_mode, args.raw_op_codes, args.no_host_ops, args.no_stacked_report, args.no_stack_by_in0, args.stacked_csv)


 def parse_args():
@@ -924,6 +1047,12 @@ def parse_args():
     parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode")
     parser.add_argument("--raw-op-codes", action="store_true", help="Include raw op codes in output")
     parser.add_argument("--no-host-ops", action="store_true", help="Do not include host ops in output")
+    parser.add_argument("--no-stacked-report", action="store_true", help="Do not generate a stacked report")
+    parser.add_argument("--no-stack-by-in0", action="store_true",
+        help="Do not group the stacked report by the layout of Input 0 (extracted from the Input 0 Memory column)"
+    )
+    parser.add_argument("--stacked-csv", type=str,
+        help="Output filename for the stacked report CSV; Defaults to OUTPUT_FILE_stacked.csv", metavar="STACKED_FILE")

     args = parser.parse_args()

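Putting the new flags together (trace and output filenames hypothetical; behavior inferred from the report-generation hunk below):

    tt-perf-report trace.csv                          # per-op table + stacked report on stdout
    tt-perf-report trace.csv --csv out.csv            # also writes out_stacked.csv and out_stacked.png
    tt-perf-report trace.csv --stacked-csv ops.csv    # stacked CSV/PNG named ops.csv / ops.png
    tt-perf-report trace.csv --no-stacked-report      # opt out entirely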
@@ -942,9 +1071,15 @@ def parse_args():

 def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
                          id_range, csv_output_file, no_advice, tracing_mode,
-                         raw_op_codes, no_host_ops):
+                         raw_op_codes, no_host_ops, no_stacked_report, no_stack_by_in0, stacked_report_file):
     df = pd.read_csv(csv_file, low_memory=False)

+    # Detect CSV format version
+    csv_format = detect_csv_format(df)
+
+    if csv_format != "v2":
+        print(colored(f"Detected CSV format: v1 (legacy format)", "cyan"))
+
     # Add a column for original row numbers
     df["ORIGINAL_ROW"] = df.index + 2  # +2 to match Excel row numbers (1-based + header)

@@ -968,7 +1103,7 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
     device_ops = 0
     host_ops = 0
     for _, row in df.iterrows():
-        op_data, current_gap = analyze_op(row, prev_row)
+        op_data, current_gap = analyze_op(row, prev_row, csv_format)
         op_data["ID"] = Cell(row["ORIGINAL_ROW"])  # Use the original row number
         op_data["Global Call Count"] = Cell(row["GLOBAL CALL COUNT"])
         if raw_op_codes:
@@ -1046,6 +1181,22 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
     if not no_advice:
         print_advice_section(rows, visible_headers, col_widths)

+    # handle stacked report generation
+    if not(no_stacked_report):
+        stacked_report = generate_stacked_report(rows, visible_headers, not(no_stack_by_in0))
+
+        if not(csv_output_file):
+            print_stacked_report(stacked_report)
+        if stacked_report_file or csv_output_file:
+            if not stacked_report_file:
+                base_stacked_report_file = f"{os.path.splitext(csv_output_file)[0]}_stacked"
+            else:
+                base_stacked_report_file = os.path.splitext(stacked_report_file)[0]
+            print(colored(f"Writing CSV stacked report to {base_stacked_report_file}.csv", "cyan"))
+            dump_stacked_report(stacked_report, f"{base_stacked_report_file}.csv")
+            print(colored(f"Plotting PNG stacked report to {base_stacked_report_file}.png", "cyan"))
+            plot_stacked_report(stacked_report, f"{base_stacked_report_file}.png")
+

 def is_host_op(op_data):
     return "(torch)" in op_data["OP Code"].raw_value
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tt-perf-report
-Version: 1.0.7
+Version: 1.1.1
 Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
 License: Apache License
 Version 2.0, January 2004
@@ -218,6 +218,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE_understanding.txt
 Requires-Dist: pandas
+Requires-Dist: matplotlib
 Dynamic: license-file

 # Performance Report Analysis Tool
@@ -0,0 +1,2 @@
+pandas
+matplotlib