tt-perf-report 1.0.0__tar.gz → 1.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tt-perf-report might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: tt-perf-report
3
- Version: 1.0.0
3
+ Version: 1.0.3
4
4
  Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "tt-perf-report"
7
- version = "1.0.0"
7
+ version = "1.0.3"
8
8
  description = "This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities."
9
9
  license = {file = "LICENSE"}
10
10
  readme = "README.md"
@@ -2,11 +2,11 @@
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
4
  # SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
5
-
5
+ import csv
6
6
  import sys
7
7
  import argparse
8
8
  import re
9
- from typing import Any, Optional
9
+ from typing import Any, Optional, Union
10
10
  from collections import defaultdict
11
11
  import pandas as pd
12
12
 
@@ -280,7 +280,7 @@ def analyze_op(row, prev_row):
280
280
  op_code = Cell(row["OP CODE"])
281
281
  cores = Cell(int(row["CORE COUNT"]) if pd.notna(row["CORE COUNT"]) else None)
282
282
  device_time = Cell(
283
- row["DEVICE FW DURATION [ns]"] / 1000 if pd.notna(row["DEVICE FW DURATION [ns]"]) else None,
283
+ row["DEVICE KERNEL DURATION [ns]"] / 1000 if pd.notna(row["DEVICE KERNEL DURATION [ns]"]) else None,
284
284
  unit="us",
285
285
  decimals=0,
286
286
  )
@@ -294,9 +294,12 @@ def analyze_op(row, prev_row):
294
294
  else:
295
295
  op_to_op_gap = Cell(None, unit="us", decimals=0)
296
296
 
297
- output_datatype = row["OUTPUT_0_DATATYPE"]
298
- input_0_datatype = row["INPUT_0_DATATYPE"]
299
- input_1_datatype = row["INPUT_1_DATATYPE"]
297
+ def get_entry(k: str) -> Union[str, None]:
298
+ return row[k] if k in row else None
299
+
300
+ output_datatype = get_entry("OUTPUT_0_DATATYPE")
301
+ input_0_datatype = get_entry("INPUT_0_DATATYPE")
302
+ input_1_datatype = get_entry("INPUT_1_DATATYPE")
300
303
  output_datatype_cell = Cell(output_datatype)
301
304
  input_0_datatype_cell = Cell(input_0_datatype)
302
305
  input_1_datatype_cell = Cell(input_1_datatype)
@@ -579,88 +582,98 @@ def print_op_to_op_gap_advice(rows, headers, col_widths):
579
582
  )
580
583
 
581
584
 
585
+ def is_matmul_op(op_data):
586
+ return "Matmul" in op_data["OP Code"].raw_value
587
+
588
+
582
589
  def print_matmul_advice(rows, headers, col_widths):
583
- matmul_ops = [op_data for op_data in rows if "Matmul" in op_data["OP Code"].raw_value]
590
+ matmul_ops = [op_data for op_data in rows if is_matmul_op(op_data)]
584
591
 
585
592
  if matmul_ops:
586
593
  print("Matmul Optimization\n-------------------")
587
594
  for op_data in matmul_ops:
588
595
  print_row(op_data, col_widths, headers)
589
- advice = []
596
+ advice = generate_matmul_advice(op_data)
590
597
  color = "grey" if op_data["OP Code"].color == "grey" else "white"
591
598
 
592
- math_fidelity = (
593
- op_data["Math Fidelity"].raw_value.split()[0] if op_data["Math Fidelity"].raw_value else None
599
+ if advice:
600
+ for item in advice:
601
+ print(colored(f"- {item}", color))
602
+ else:
603
+ print(colored("✅ Optimized", color))
604
+ print() # Add a blank line between matmuls
605
+
606
+
607
+ def generate_matmul_advice(op_data):
608
+ advice = []
609
+
610
+ math_fidelity = (
611
+ op_data["Math Fidelity"].raw_value.split()[0] if op_data["Math Fidelity"].raw_value else None
612
+ )
613
+ output_datatype = op_data["Output Datatype"].raw_value
614
+ input_0_datatype = op_data["Input 0 Datatype"].raw_value
615
+ input_1_datatype = op_data["Input 1 Datatype"].raw_value
616
+ cores = op_data["Cores"].raw_value
617
+ fidelity_evaluation, fidelity_advice = evaluate_fidelity(
618
+ input_0_datatype, input_1_datatype, output_datatype, math_fidelity
619
+ )
620
+
621
+ if op_data["Bound"].raw_value in ["DRAM", "BOTH"]:
622
+ if not op_data["DRAM Sharded"].raw_value:
623
+ advice.append(
624
+ "Try a DRAM-sharded program config (MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig) to improve throughput further"
594
625
  )
595
- output_datatype = op_data["Output Datatype"].raw_value
596
- input_0_datatype = op_data["Input 0 Datatype"].raw_value
597
- input_1_datatype = op_data["Input 1 Datatype"].raw_value
598
- cores = op_data["Cores"].raw_value
599
- fidelity_evaluation, fidelity_advice = evaluate_fidelity(
600
- input_0_datatype, input_1_datatype, output_datatype, math_fidelity
626
+ if fidelity_evaluation == "too_low" and op_data["FLOPs %"].raw_value < 40:
627
+ advice.append(fidelity_advice)
628
+ if fidelity_evaluation == "too_high":
629
+ advice.append(fidelity_advice)
630
+ elif op_data["Bound"].raw_value in ["FLOP", "BOTH"]:
631
+ if cores < 64:
632
+ advice.append(f"Increase grid size (currently using {cores})")
633
+ if fidelity_evaluation == "too_high":
634
+ advice.append(fidelity_advice)
635
+ elif op_data["Bound"].raw_value == "SLOW":
636
+ input_0_memory = op_data["Input 0 Memory"].raw_value
637
+ if input_0_memory and "L1" not in input_0_memory:
638
+ advice.append(f"If possible place input 0 in L1 (currently in {input_0_memory})")
639
+
640
+ inner_dim_block = op_data["Inner Dim Block Size"].raw_value
641
+ out_h = op_data["Output Subblock H"].raw_value
642
+ out_w = op_data["Output Subblock W"].raw_value
643
+
644
+ if inner_dim_block is None and out_h is None and out_w is None:
645
+ advice.append(
646
+ "No program_config specified, try using one to override in0_block_w and out_subblock_h/w"
601
647
  )
648
+ else:
649
+ all_good = True
650
+ if inner_dim_block is not None:
651
+ if inner_dim_block < 2:
652
+ advice.append(f"in0_block_w={inner_dim_block} is small, try in0_block_w=2 or above")
653
+ all_good = False
654
+ else:
655
+ advice.append("No inner dim block size found")
656
+ all_good = False
602
657
 
603
- if op_data["Bound"].raw_value in ["DRAM", "BOTH"]:
604
- if not op_data["DRAM Sharded"].raw_value:
605
- advice.append(
606
- "- Try a DRAM-sharded program config (MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig) to improve throughput further"
607
- )
608
- if fidelity_evaluation == "too_low" and op_data["FLOPs %"].raw_value < 40:
609
- advice.append(f"- {fidelity_advice}")
610
- if fidelity_evaluation == "too_high":
611
- advice.append(f"- {fidelity_advice}")
612
- elif op_data["Bound"].raw_value in ["FLOP", "BOTH"]:
613
- if cores < 64:
614
- advice.append(f"- Increase grid size (currently using {cores})")
615
- if fidelity_evaluation == "too_high":
616
- advice.append(f"- {fidelity_advice}")
617
- elif op_data["Bound"].raw_value == "SLOW":
618
- input_0_memory = op_data["Input 0 Memory"].raw_value
619
- if input_0_memory and "L1" not in input_0_memory:
620
- advice.append(f"- If possible place input 0 in L1 (currently in {input_0_memory})")
621
-
622
- inner_dim_block = op_data["Inner Dim Block Size"].raw_value
623
- out_h = op_data["Output Subblock H"].raw_value
624
- out_w = op_data["Output Subblock W"].raw_value
625
-
626
- if inner_dim_block is None and out_h is None and out_w is None:
658
+ if out_h is not None and out_w is not None:
659
+ out_area = out_h * out_w
660
+ if out_area < 2:
627
661
  advice.append(
628
- "- No program_config specified, try using one to override in0_block_w and out_subblock_h/w"
662
+ f"Output subblock {out_h}x{out_w} is small, try out_subblock_h * out_subblock_w >= 2 if possible"
629
663
  )
630
- else:
631
- all_good = True
632
- if inner_dim_block is not None:
633
- if inner_dim_block < 2:
634
- advice.append(f"- in0_block_w={inner_dim_block} is small, try in0_block_w=2 or above")
635
- all_good = False
636
- else:
637
- advice.append("- No inner dim block size found")
638
- all_good = False
639
-
640
- if out_h is not None and out_w is not None:
641
- out_area = out_h * out_w
642
- if out_area < 2:
643
- advice.append(
644
- f"- Output subblock {out_h}x{out_w} is small, try out_subblock_h * out_subblock_w >= 2 if possible"
645
- )
646
- all_good = False
647
- else:
648
- advice.append("- No output subblock size found")
649
- all_good = False
650
-
651
- if all_good:
652
- advice.append(
653
- f"- in0_block_w={inner_dim_block} and output subblock {out_h}x{out_w} look good 🤷"
654
- )
655
- if fidelity_advice:
656
- advice.append(f"- {fidelity_advice}")
657
-
658
- if advice:
659
- for item in advice:
660
- print(colored(item, color))
664
+ all_good = False
661
665
  else:
662
- print(colored(" Optimized", color))
663
- print() # Add a blank line between matmuls
666
+ advice.append("No output subblock size found")
667
+ all_good = False
668
+
669
+ if all_good:
670
+ advice.append(
671
+ f"in0_block_w={inner_dim_block} and output subblock {out_h}x{out_w} look good 🤷"
672
+ )
673
+ if fidelity_advice:
674
+ advice.append(fidelity_advice)
675
+
676
+ return advice
664
677
 
665
678
 
666
679
  def merge_device_rows(df):
@@ -682,11 +695,11 @@ def merge_device_rows(df):
682
695
 
683
696
  if "AllGather" in op_name or "ReduceScatter" in op_name:
684
697
  # For collective ops, take the row with minimum duration
685
- min_duration_block = min(blocks, key=lambda x: x[1]["DEVICE FW DURATION [ns]"])
698
+ min_duration_block = min(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
686
699
  merged_blocks.append(min_duration_block[1])
687
700
  else:
688
701
  # For non-collective ops, take the row with maximum duration
689
- max_duration_block = max(blocks, key=lambda x: x[1]["DEVICE FW DURATION [ns]"])
702
+ max_duration_block = max(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
690
703
  merged_blocks.append(max_duration_block[1])
691
704
 
692
705
  return pd.DataFrame(merged_blocks)
@@ -730,7 +743,7 @@ def filter_by_id_range(rows, id_range):
730
743
  def main():
731
744
  args, id_range = parse_args()
732
745
  generate_perf_report(
733
- args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice
746
+ args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice, args.tracing_mode
734
747
  )
735
748
 
736
749
 
@@ -751,6 +764,7 @@ def parse_args():
751
764
  parser.add_argument("--no-color", action="store_true", help="Force output without color")
752
765
  parser.add_argument("--csv", type=str, help="Output filename for CSV format", metavar="OUTPUT_FILE")
753
766
  parser.add_argument("--no-advice", action="store_true", help="Only show the table section of the report")
767
+ parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode")
754
768
  args = parser.parse_args()
755
769
 
756
770
  # Set the global color_output variable
@@ -766,14 +780,15 @@ def parse_args():
766
780
  return args, id_range
767
781
 
768
782
 
769
- def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice):
783
+ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice, tracing_mode):
770
784
  df = pd.read_csv(csv_file, low_memory=False)
771
785
 
772
786
  # Add a column for original row numbers
773
787
  df["ORIGINAL_ROW"] = df.index + 2 # +2 to match Excel row numbers (1-based + header)
774
788
 
775
789
  # Sort the DataFrame by "HOST START TS" column
776
- if "HOST START TS" in df.columns:
790
+ # Sorting by HOST START TS is incorrect when using tracing mode since the tracing ops timestamps are the ones when captured and not executed
791
+ if "HOST START TS" in df.columns and not tracing_mode:
777
792
  print(colored("Sorting CSV by 'HOST START TS' column...", "cyan"))
778
793
  df = df.sort_values(by="HOST START TS")
779
794
  else:
@@ -839,11 +854,18 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
839
854
  "Output Subblock H",
840
855
  "Output Subblock W",
841
856
  ]
857
+ if not no_advice:
858
+ all_headers.append("Advice")
842
859
  print(colored(f"Writing CSV output to {csv_output_file}", "cyan"))
843
860
  with open(csv_output_file, "w") as f:
844
- f.write(",".join(all_headers) + "\n")
861
+ csv_writer = csv.DictWriter(f, fieldnames=all_headers)
862
+ csv_writer.writeheader()
845
863
  for op_data in rows:
846
- f.write(",".join(str(op_data[header].raw_value) for header in all_headers) + "\n")
864
+ row = {header: op_data[header].raw_value for header in all_headers if header in op_data}
865
+ if not no_advice:
866
+ advice = generate_matmul_advice(op_data) if is_matmul_op(op_data) else ""
867
+ row["Advice"] = " • ".join(advice)
868
+ csv_writer.writerow(row)
847
869
  else:
848
870
  col_widths = [
849
871
  max(max(visible_length(str(row[header])) for row in rows), visible_length(header))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: tt-perf-report
3
- Version: 1.0.0
3
+ Version: 1.0.3
4
4
  Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
File without changes
File without changes
File without changes