tt-perf-report 1.0.0__tar.gz → 1.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tt-perf-report might be problematic. Click here for more details.
- {tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/PKG-INFO +1 -1
- {tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/pyproject.toml +1 -1
- {tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/src/tt_perf_report/perf_report.py +103 -81
- {tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/src/tt_perf_report.egg-info/PKG-INFO +1 -1
- {tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/LICENSE +0 -0
- {tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/README.md +0 -0
- {tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/setup.cfg +0 -0
- {tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/src/tt_perf_report/__init__.py +0 -0
- {tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/src/tt_perf_report.egg-info/SOURCES.txt +0 -0
- {tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/src/tt_perf_report.egg-info/dependency_links.txt +0 -0
- {tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/src/tt_perf_report.egg-info/entry_points.txt +0 -0
- {tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/src/tt_perf_report.egg-info/requires.txt +0 -0
- {tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/src/tt_perf_report.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: tt-perf-report
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.0.3
|
|
4
4
|
Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
|
|
5
5
|
License: Apache License
|
|
6
6
|
Version 2.0, January 2004
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "tt-perf-report"
|
|
7
|
-
version = "1.0.0"
|
|
7
|
+
version = "1.0.3"
|
|
8
8
|
description = "This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities."
|
|
9
9
|
license = {file = "LICENSE"}
|
|
10
10
|
readme = "README.md"
|
|
@@ -2,11 +2,11 @@
|
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
4
|
# SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
|
|
5
|
-
|
|
5
|
+
import csv
|
|
6
6
|
import sys
|
|
7
7
|
import argparse
|
|
8
8
|
import re
|
|
9
|
-
from typing import Any, Optional
|
|
9
|
+
from typing import Any, Optional, Union
|
|
10
10
|
from collections import defaultdict
|
|
11
11
|
import pandas as pd
|
|
12
12
|
|
|
@@ -280,7 +280,7 @@ def analyze_op(row, prev_row):
|
|
|
280
280
|
op_code = Cell(row["OP CODE"])
|
|
281
281
|
cores = Cell(int(row["CORE COUNT"]) if pd.notna(row["CORE COUNT"]) else None)
|
|
282
282
|
device_time = Cell(
|
|
283
|
-
row["DEVICE
|
|
283
|
+
row["DEVICE KERNEL DURATION [ns]"] / 1000 if pd.notna(row["DEVICE KERNEL DURATION [ns]"]) else None,
|
|
284
284
|
unit="us",
|
|
285
285
|
decimals=0,
|
|
286
286
|
)
|
|
@@ -294,9 +294,12 @@ def analyze_op(row, prev_row):
|
|
|
294
294
|
else:
|
|
295
295
|
op_to_op_gap = Cell(None, unit="us", decimals=0)
|
|
296
296
|
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
297
|
+
def get_entry(k: str) -> Union[str, None]:
|
|
298
|
+
return row[k] if k in row else None
|
|
299
|
+
|
|
300
|
+
output_datatype = get_entry("OUTPUT_0_DATATYPE")
|
|
301
|
+
input_0_datatype = get_entry("INPUT_0_DATATYPE")
|
|
302
|
+
input_1_datatype = get_entry("INPUT_1_DATATYPE")
|
|
300
303
|
output_datatype_cell = Cell(output_datatype)
|
|
301
304
|
input_0_datatype_cell = Cell(input_0_datatype)
|
|
302
305
|
input_1_datatype_cell = Cell(input_1_datatype)
|
|
@@ -579,88 +582,98 @@ def print_op_to_op_gap_advice(rows, headers, col_widths):
|
|
|
579
582
|
)
|
|
580
583
|
|
|
581
584
|
|
|
585
|
+
def is_matmul_op(op_data):
|
|
586
|
+
return "Matmul" in op_data["OP Code"].raw_value
|
|
587
|
+
|
|
588
|
+
|
|
582
589
|
def print_matmul_advice(rows, headers, col_widths):
|
|
583
|
-
matmul_ops = [op_data for op_data in rows if
|
|
590
|
+
matmul_ops = [op_data for op_data in rows if is_matmul_op(op_data)]
|
|
584
591
|
|
|
585
592
|
if matmul_ops:
|
|
586
593
|
print("Matmul Optimization\n-------------------")
|
|
587
594
|
for op_data in matmul_ops:
|
|
588
595
|
print_row(op_data, col_widths, headers)
|
|
589
|
-
advice =
|
|
596
|
+
advice = generate_matmul_advice(op_data)
|
|
590
597
|
color = "grey" if op_data["OP Code"].color == "grey" else "white"
|
|
591
598
|
|
|
592
|
-
|
|
593
|
-
|
|
599
|
+
if advice:
|
|
600
|
+
for item in advice:
|
|
601
|
+
print(colored(f"- {item}", color))
|
|
602
|
+
else:
|
|
603
|
+
print(colored("✅ Optimized", color))
|
|
604
|
+
print() # Add a blank line between matmuls
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
def generate_matmul_advice(op_data):
|
|
608
|
+
advice = []
|
|
609
|
+
|
|
610
|
+
math_fidelity = (
|
|
611
|
+
op_data["Math Fidelity"].raw_value.split()[0] if op_data["Math Fidelity"].raw_value else None
|
|
612
|
+
)
|
|
613
|
+
output_datatype = op_data["Output Datatype"].raw_value
|
|
614
|
+
input_0_datatype = op_data["Input 0 Datatype"].raw_value
|
|
615
|
+
input_1_datatype = op_data["Input 1 Datatype"].raw_value
|
|
616
|
+
cores = op_data["Cores"].raw_value
|
|
617
|
+
fidelity_evaluation, fidelity_advice = evaluate_fidelity(
|
|
618
|
+
input_0_datatype, input_1_datatype, output_datatype, math_fidelity
|
|
619
|
+
)
|
|
620
|
+
|
|
621
|
+
if op_data["Bound"].raw_value in ["DRAM", "BOTH"]:
|
|
622
|
+
if not op_data["DRAM Sharded"].raw_value:
|
|
623
|
+
advice.append(
|
|
624
|
+
"Try a DRAM-sharded program config (MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig) to improve throughput further"
|
|
594
625
|
)
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
626
|
+
if fidelity_evaluation == "too_low" and op_data["FLOPs %"].raw_value < 40:
|
|
627
|
+
advice.append(fidelity_advice)
|
|
628
|
+
if fidelity_evaluation == "too_high":
|
|
629
|
+
advice.append(fidelity_advice)
|
|
630
|
+
elif op_data["Bound"].raw_value in ["FLOP", "BOTH"]:
|
|
631
|
+
if cores < 64:
|
|
632
|
+
advice.append(f"Increase grid size (currently using {cores})")
|
|
633
|
+
if fidelity_evaluation == "too_high":
|
|
634
|
+
advice.append(fidelity_advice)
|
|
635
|
+
elif op_data["Bound"].raw_value == "SLOW":
|
|
636
|
+
input_0_memory = op_data["Input 0 Memory"].raw_value
|
|
637
|
+
if input_0_memory and "L1" not in input_0_memory:
|
|
638
|
+
advice.append(f"If possible place input 0 in L1 (currently in {input_0_memory})")
|
|
639
|
+
|
|
640
|
+
inner_dim_block = op_data["Inner Dim Block Size"].raw_value
|
|
641
|
+
out_h = op_data["Output Subblock H"].raw_value
|
|
642
|
+
out_w = op_data["Output Subblock W"].raw_value
|
|
643
|
+
|
|
644
|
+
if inner_dim_block is None and out_h is None and out_w is None:
|
|
645
|
+
advice.append(
|
|
646
|
+
"No program_config specified, try using one to override in0_block_w and out_subblock_h/w"
|
|
601
647
|
)
|
|
648
|
+
else:
|
|
649
|
+
all_good = True
|
|
650
|
+
if inner_dim_block is not None:
|
|
651
|
+
if inner_dim_block < 2:
|
|
652
|
+
advice.append(f"in0_block_w={inner_dim_block} is small, try in0_block_w=2 or above")
|
|
653
|
+
all_good = False
|
|
654
|
+
else:
|
|
655
|
+
advice.append("No inner dim block size found")
|
|
656
|
+
all_good = False
|
|
602
657
|
|
|
603
|
-
if
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
"- Try a DRAM-sharded program config (MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig) to improve throughput further"
|
|
607
|
-
)
|
|
608
|
-
if fidelity_evaluation == "too_low" and op_data["FLOPs %"].raw_value < 40:
|
|
609
|
-
advice.append(f"- {fidelity_advice}")
|
|
610
|
-
if fidelity_evaluation == "too_high":
|
|
611
|
-
advice.append(f"- {fidelity_advice}")
|
|
612
|
-
elif op_data["Bound"].raw_value in ["FLOP", "BOTH"]:
|
|
613
|
-
if cores < 64:
|
|
614
|
-
advice.append(f"- Increase grid size (currently using {cores})")
|
|
615
|
-
if fidelity_evaluation == "too_high":
|
|
616
|
-
advice.append(f"- {fidelity_advice}")
|
|
617
|
-
elif op_data["Bound"].raw_value == "SLOW":
|
|
618
|
-
input_0_memory = op_data["Input 0 Memory"].raw_value
|
|
619
|
-
if input_0_memory and "L1" not in input_0_memory:
|
|
620
|
-
advice.append(f"- If possible place input 0 in L1 (currently in {input_0_memory})")
|
|
621
|
-
|
|
622
|
-
inner_dim_block = op_data["Inner Dim Block Size"].raw_value
|
|
623
|
-
out_h = op_data["Output Subblock H"].raw_value
|
|
624
|
-
out_w = op_data["Output Subblock W"].raw_value
|
|
625
|
-
|
|
626
|
-
if inner_dim_block is None and out_h is None and out_w is None:
|
|
658
|
+
if out_h is not None and out_w is not None:
|
|
659
|
+
out_area = out_h * out_w
|
|
660
|
+
if out_area < 2:
|
|
627
661
|
advice.append(
|
|
628
|
-
"
|
|
662
|
+
f"Output subblock {out_h}x{out_w} is small, try out_subblock_h * out_subblock_w >= 2 if possible"
|
|
629
663
|
)
|
|
630
|
-
|
|
631
|
-
all_good = True
|
|
632
|
-
if inner_dim_block is not None:
|
|
633
|
-
if inner_dim_block < 2:
|
|
634
|
-
advice.append(f"- in0_block_w={inner_dim_block} is small, try in0_block_w=2 or above")
|
|
635
|
-
all_good = False
|
|
636
|
-
else:
|
|
637
|
-
advice.append("- No inner dim block size found")
|
|
638
|
-
all_good = False
|
|
639
|
-
|
|
640
|
-
if out_h is not None and out_w is not None:
|
|
641
|
-
out_area = out_h * out_w
|
|
642
|
-
if out_area < 2:
|
|
643
|
-
advice.append(
|
|
644
|
-
f"- Output subblock {out_h}x{out_w} is small, try out_subblock_h * out_subblock_w >= 2 if possible"
|
|
645
|
-
)
|
|
646
|
-
all_good = False
|
|
647
|
-
else:
|
|
648
|
-
advice.append("- No output subblock size found")
|
|
649
|
-
all_good = False
|
|
650
|
-
|
|
651
|
-
if all_good:
|
|
652
|
-
advice.append(
|
|
653
|
-
f"- in0_block_w={inner_dim_block} and output subblock {out_h}x{out_w} look good 🤷"
|
|
654
|
-
)
|
|
655
|
-
if fidelity_advice:
|
|
656
|
-
advice.append(f"- {fidelity_advice}")
|
|
657
|
-
|
|
658
|
-
if advice:
|
|
659
|
-
for item in advice:
|
|
660
|
-
print(colored(item, color))
|
|
664
|
+
all_good = False
|
|
661
665
|
else:
|
|
662
|
-
|
|
663
|
-
|
|
666
|
+
advice.append("No output subblock size found")
|
|
667
|
+
all_good = False
|
|
668
|
+
|
|
669
|
+
if all_good:
|
|
670
|
+
advice.append(
|
|
671
|
+
f"in0_block_w={inner_dim_block} and output subblock {out_h}x{out_w} look good 🤷"
|
|
672
|
+
)
|
|
673
|
+
if fidelity_advice:
|
|
674
|
+
advice.append(fidelity_advice)
|
|
675
|
+
|
|
676
|
+
return advice
|
|
664
677
|
|
|
665
678
|
|
|
666
679
|
def merge_device_rows(df):
|
|
@@ -682,11 +695,11 @@ def merge_device_rows(df):
|
|
|
682
695
|
|
|
683
696
|
if "AllGather" in op_name or "ReduceScatter" in op_name:
|
|
684
697
|
# For collective ops, take the row with minimum duration
|
|
685
|
-
min_duration_block = min(blocks, key=lambda x: x[1]["DEVICE
|
|
698
|
+
min_duration_block = min(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
|
|
686
699
|
merged_blocks.append(min_duration_block[1])
|
|
687
700
|
else:
|
|
688
701
|
# For non-collective ops, take the row with maximum duration
|
|
689
|
-
max_duration_block = max(blocks, key=lambda x: x[1]["DEVICE
|
|
702
|
+
max_duration_block = max(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
|
|
690
703
|
merged_blocks.append(max_duration_block[1])
|
|
691
704
|
|
|
692
705
|
return pd.DataFrame(merged_blocks)
|
|
@@ -730,7 +743,7 @@ def filter_by_id_range(rows, id_range):
|
|
|
730
743
|
def main():
|
|
731
744
|
args, id_range = parse_args()
|
|
732
745
|
generate_perf_report(
|
|
733
|
-
args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice
|
|
746
|
+
args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice, args.tracing_mode
|
|
734
747
|
)
|
|
735
748
|
|
|
736
749
|
|
|
@@ -751,6 +764,7 @@ def parse_args():
|
|
|
751
764
|
parser.add_argument("--no-color", action="store_true", help="Force output without color")
|
|
752
765
|
parser.add_argument("--csv", type=str, help="Output filename for CSV format", metavar="OUTPUT_FILE")
|
|
753
766
|
parser.add_argument("--no-advice", action="store_true", help="Only show the table section of the report")
|
|
767
|
+
parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode")
|
|
754
768
|
args = parser.parse_args()
|
|
755
769
|
|
|
756
770
|
# Set the global color_output variable
|
|
@@ -766,14 +780,15 @@ def parse_args():
|
|
|
766
780
|
return args, id_range
|
|
767
781
|
|
|
768
782
|
|
|
769
|
-
def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice):
|
|
783
|
+
def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice, tracing_mode):
|
|
770
784
|
df = pd.read_csv(csv_file, low_memory=False)
|
|
771
785
|
|
|
772
786
|
# Add a column for original row numbers
|
|
773
787
|
df["ORIGINAL_ROW"] = df.index + 2 # +2 to match Excel row numbers (1-based + header)
|
|
774
788
|
|
|
775
789
|
# Sort the DataFrame by "HOST START TS" column
|
|
776
|
-
|
|
790
|
+
# Sorting by HOST START TS is incorrect when using tracing mode since the tracing ops timestamps are the ones when captured and not executed
|
|
791
|
+
if "HOST START TS" in df.columns and not tracing_mode:
|
|
777
792
|
print(colored("Sorting CSV by 'HOST START TS' column...", "cyan"))
|
|
778
793
|
df = df.sort_values(by="HOST START TS")
|
|
779
794
|
else:
|
|
@@ -839,11 +854,18 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
|
|
|
839
854
|
"Output Subblock H",
|
|
840
855
|
"Output Subblock W",
|
|
841
856
|
]
|
|
857
|
+
if not no_advice:
|
|
858
|
+
all_headers.append("Advice")
|
|
842
859
|
print(colored(f"Writing CSV output to {csv_output_file}", "cyan"))
|
|
843
860
|
with open(csv_output_file, "w") as f:
|
|
844
|
-
|
|
861
|
+
csv_writer = csv.DictWriter(f, fieldnames=all_headers)
|
|
862
|
+
csv_writer.writeheader()
|
|
845
863
|
for op_data in rows:
|
|
846
|
-
|
|
864
|
+
row = {header: op_data[header].raw_value for header in all_headers if header in op_data}
|
|
865
|
+
if not no_advice:
|
|
866
|
+
advice = generate_matmul_advice(op_data) if is_matmul_op(op_data) else ""
|
|
867
|
+
row["Advice"] = " • ".join(advice)
|
|
868
|
+
csv_writer.writerow(row)
|
|
847
869
|
else:
|
|
848
870
|
col_widths = [
|
|
849
871
|
max(max(visible_length(str(row[header])) for row in rows), visible_length(header))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: tt-perf-report
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.0.3
|
|
4
4
|
Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
|
|
5
5
|
License: Apache License
|
|
6
6
|
Version 2.0, January 2004
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{tt_perf_report-1.0.0 → tt_perf_report-1.0.3}/src/tt_perf_report.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|