tt-perf-report 1.0.1.tar.gz → 1.0.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tt-perf-report might be problematic; see the registry's advisory page for details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: tt-perf-report
3
- Version: 1.0.1
3
+ Version: 1.0.4
4
4
  Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "tt-perf-report"
7
- version = "1.0.1"
7
+ version = "1.0.4"
8
8
  description = "This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities."
9
9
  license = {file = "LICENSE"}
10
10
  readme = "README.md"
@@ -2,7 +2,7 @@
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
4
  # SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
5
-
5
+ import csv
6
6
  import sys
7
7
  import argparse
8
8
  import re
@@ -38,6 +38,7 @@ def colored(text, color):
38
38
  "yellow": "\033[38;5;11m",
39
39
  "blue": "\033[38;5;12m",
40
40
  "magenta": "\033[38;5;13m",
41
+ "orange": "\033[38;5;208m",
41
42
  "cyan": "\033[38;5;14m",
42
43
  "white": "\033[38;5;15m",
43
44
  "end": "\033[0m",
@@ -70,7 +71,7 @@ class Cell:
70
71
  if self.raw_value is None or pd.isna(self.raw_value):
71
72
  return ""
72
73
 
73
- if isinstance(self.raw_value, str) and "Matmul" in self.raw_value:
74
+ if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value):
74
75
  parts = self.raw_value.split(maxsplit=1)
75
76
  op_name = parts[0]
76
77
  size = parts[1] if len(parts) > 1 else ""
@@ -275,6 +276,37 @@ def analyze_matmul(row):
275
276
  core_count, # Return the potentially adjusted core count
276
277
  )
277
278
 
279
+ def analyze_conv(row):
280
+ duration_s = row["DEVICE KERNEL DURATION [ns]"] * 1e-9
281
+
282
+ core_count = 64 # we decided to normalize to the max core count
283
+ math_fidelity = row["MATH FIDELITY"]
284
+
285
+ # Check for DRAM-sharded program config
286
+ attributes = row["ATTRIBUTES"] if pd.notna(row["ATTRIBUTES"]) else ""
287
+
288
+ peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count
289
+
290
+ NHW = int(row["OUTPUT_0_Y"])
291
+ CH_IN = int(row["INPUT_0_X"])
292
+ W = [int(x) for x in (attributes.split("window_hw")[1].split("; ")[0][2:-1].split(";"))]
293
+ CH_OUT = int(row["INPUT_1_X"])
294
+
295
+ M, K, N = NHW, CH_IN * W[0] * W[1], CH_OUT
296
+ flops = (M * K * N * 2) / duration_s
297
+
298
+ size = f"{M} x {K} x {N}"
299
+ memory_info = f"({row['INPUT_0_DATATYPE']} {row['INPUT_0_MEMORY'].replace('DEV_0_', '')} @ {row['INPUT_1_DATATYPE']} {row['INPUT_1_MEMORY'].replace('DEV_0_', '')} => {row['OUTPUT_0_DATATYPE']} {row['OUTPUT_0_MEMORY'].replace('DEV_0_', '')})"
300
+
301
+ flops_percentage = (flops / peak_flops_value) * 100
302
+
303
+ return (
304
+ flops,
305
+ flops_percentage,
306
+ size,
307
+ memory_info,
308
+ math_fidelity
309
+ )
278
310
 
279
311
  def analyze_op(row, prev_row):
280
312
  op_code = Cell(row["OP CODE"])
@@ -305,6 +337,19 @@ def analyze_op(row, prev_row):
305
337
  input_1_datatype_cell = Cell(input_1_datatype)
306
338
  short_name = lambda n: {"BFLOAT16": "BF16", "BFLOAT8_B": "BFP8", "BFLOAT4_B": "BFP4"}.get(n, n)
307
339
 
340
+ dram_speed = Cell(None, unit="GB/s", decimals=0)
341
+ dram_percentage = Cell(None, unit="%", decimals=1)
342
+ flops = Cell(None, unit="TFLOPs", decimals=1)
343
+ flops_percentage = Cell(None, unit="%", decimals=1)
344
+
345
+ math_fidelity = ""
346
+ math_fidelity += f"{short_name(input_0_datatype)}" if pd.notna(input_0_datatype) else ""
347
+ math_fidelity += f", {short_name(input_1_datatype)}" if pd.notna(input_1_datatype) else ""
348
+ math_fidelity += f" => {short_name(output_datatype)}" if pd.notna(output_datatype) else ""
349
+ math_fidelity_cell = Cell(math_fidelity.strip())
350
+
351
+ is_dram_sharded = False
352
+
308
353
  if "Matmul" in op_code.raw_value:
309
354
  (
310
355
  dram_speed,
@@ -329,19 +374,24 @@ def analyze_op(row, prev_row):
329
374
  if math_fidelity
330
375
  else None
331
376
  )
332
- else:
377
+ elif "OptimizedConvNew" in op_code.raw_value:
378
+ (
379
+ flops,
380
+ flops_percentage,
381
+ size,
382
+ memory_info,
383
+ math_fidelity,
384
+ ) = analyze_conv(row)
385
+ op_code = Cell(f"{op_code.raw_value} {size}")
333
386
  dram_speed = Cell(None, unit="GB/s", decimals=0)
334
387
  dram_percentage = Cell(None, unit="%", decimals=1)
335
- flops = Cell(None, unit="TFLOPs", decimals=1)
336
- flops_percentage = Cell(None, unit="%", decimals=1)
337
-
338
- math_fidelity = ""
339
- math_fidelity += f"{short_name(input_0_datatype)}" if pd.notna(input_0_datatype) else ""
340
- math_fidelity += f", {short_name(input_1_datatype)}" if pd.notna(input_1_datatype) else ""
341
- math_fidelity += f" => {short_name(output_datatype)}" if pd.notna(output_datatype) else ""
342
- math_fidelity_cell = Cell(math_fidelity.strip())
343
-
344
- is_dram_sharded = False
388
+ flops = Cell(flops / 1e12 if pd.notna(flops) else None, unit="TFLOPs", decimals=1)
389
+ flops_percentage = Cell(flops_percentage, unit="%", decimals=1)
390
+ math_fidelity_cell = Cell(
391
+ f"{math_fidelity} {short_name(input_0_datatype)} x {short_name(input_1_datatype)} => {short_name(output_datatype)}".strip()
392
+ if math_fidelity
393
+ else None
394
+ )
345
395
 
346
396
  output = {
347
397
  "ID": None,
@@ -434,6 +484,7 @@ def color_row(op_data, percentage, min_percentage):
434
484
  op_colors = {
435
485
  "(torch)": "red",
436
486
  "Matmul": "magenta",
487
+ "OptimizedConvNew" : "orange",
437
488
  "LayerNorm": "cyan",
438
489
  "AllGather": "cyan",
439
490
  "AllReduce": "cyan",
@@ -484,7 +535,8 @@ def color_row(op_data, percentage, min_percentage):
484
535
  if op_data["Op-to-Op Gap"].raw_value is not None and op_data["Op-to-Op Gap"].raw_value > 6.5:
485
536
  op_data["Op-to-Op Gap"].color = "red"
486
537
 
487
- if "Matmul" in op_data["OP Code"].raw_value and op_data["Math Fidelity"].raw_value:
538
+ if ("Matmul" in op_data["OP Code"].raw_value
539
+ or "OptimizedConvNew" in op_data["OP Code"].raw_value) and op_data["Math Fidelity"].raw_value:
488
540
  math_fidelity = op_data["Math Fidelity"].raw_value.split()[0]
489
541
  input_0_datatype = op_data["Input 0 Datatype"].raw_value
490
542
  input_1_datatype = op_data["Input 1 Datatype"].raw_value
@@ -582,88 +634,98 @@ def print_op_to_op_gap_advice(rows, headers, col_widths):
582
634
  )
583
635
 
584
636
 
637
+ def is_matmul_op(op_data):
638
+ return "Matmul" in op_data["OP Code"].raw_value
639
+
640
+
585
641
  def print_matmul_advice(rows, headers, col_widths):
586
- matmul_ops = [op_data for op_data in rows if "Matmul" in op_data["OP Code"].raw_value]
642
+ matmul_ops = [op_data for op_data in rows if is_matmul_op(op_data)]
587
643
 
588
644
  if matmul_ops:
589
645
  print("Matmul Optimization\n-------------------")
590
646
  for op_data in matmul_ops:
591
647
  print_row(op_data, col_widths, headers)
592
- advice = []
648
+ advice = generate_matmul_advice(op_data)
593
649
  color = "grey" if op_data["OP Code"].color == "grey" else "white"
594
650
 
595
- math_fidelity = (
596
- op_data["Math Fidelity"].raw_value.split()[0] if op_data["Math Fidelity"].raw_value else None
651
+ if advice:
652
+ for item in advice:
653
+ print(colored(f"- {item}", color))
654
+ else:
655
+ print(colored("✅ Optimized", color))
656
+ print() # Add a blank line between matmuls
657
+
658
+
659
+ def generate_matmul_advice(op_data):
660
+ advice = []
661
+
662
+ math_fidelity = (
663
+ op_data["Math Fidelity"].raw_value.split()[0] if op_data["Math Fidelity"].raw_value else None
664
+ )
665
+ output_datatype = op_data["Output Datatype"].raw_value
666
+ input_0_datatype = op_data["Input 0 Datatype"].raw_value
667
+ input_1_datatype = op_data["Input 1 Datatype"].raw_value
668
+ cores = op_data["Cores"].raw_value
669
+ fidelity_evaluation, fidelity_advice = evaluate_fidelity(
670
+ input_0_datatype, input_1_datatype, output_datatype, math_fidelity
671
+ )
672
+
673
+ if op_data["Bound"].raw_value in ["DRAM", "BOTH"]:
674
+ if not op_data["DRAM Sharded"].raw_value:
675
+ advice.append(
676
+ "Try a DRAM-sharded program config (MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig) to improve throughput further"
597
677
  )
598
- output_datatype = op_data["Output Datatype"].raw_value
599
- input_0_datatype = op_data["Input 0 Datatype"].raw_value
600
- input_1_datatype = op_data["Input 1 Datatype"].raw_value
601
- cores = op_data["Cores"].raw_value
602
- fidelity_evaluation, fidelity_advice = evaluate_fidelity(
603
- input_0_datatype, input_1_datatype, output_datatype, math_fidelity
678
+ if fidelity_evaluation == "too_low" and op_data["FLOPs %"].raw_value < 40:
679
+ advice.append(fidelity_advice)
680
+ if fidelity_evaluation == "too_high":
681
+ advice.append(fidelity_advice)
682
+ elif op_data["Bound"].raw_value in ["FLOP", "BOTH"]:
683
+ if cores < 64:
684
+ advice.append(f"Increase grid size (currently using {cores})")
685
+ if fidelity_evaluation == "too_high":
686
+ advice.append(fidelity_advice)
687
+ elif op_data["Bound"].raw_value == "SLOW":
688
+ input_0_memory = op_data["Input 0 Memory"].raw_value
689
+ if input_0_memory and "L1" not in input_0_memory:
690
+ advice.append(f"If possible place input 0 in L1 (currently in {input_0_memory})")
691
+
692
+ inner_dim_block = op_data["Inner Dim Block Size"].raw_value
693
+ out_h = op_data["Output Subblock H"].raw_value
694
+ out_w = op_data["Output Subblock W"].raw_value
695
+
696
+ if inner_dim_block is None and out_h is None and out_w is None:
697
+ advice.append(
698
+ "No program_config specified, try using one to override in0_block_w and out_subblock_h/w"
604
699
  )
700
+ else:
701
+ all_good = True
702
+ if inner_dim_block is not None:
703
+ if inner_dim_block < 2:
704
+ advice.append(f"in0_block_w={inner_dim_block} is small, try in0_block_w=2 or above")
705
+ all_good = False
706
+ else:
707
+ advice.append("No inner dim block size found")
708
+ all_good = False
605
709
 
606
- if op_data["Bound"].raw_value in ["DRAM", "BOTH"]:
607
- if not op_data["DRAM Sharded"].raw_value:
608
- advice.append(
609
- "- Try a DRAM-sharded program config (MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig) to improve throughput further"
610
- )
611
- if fidelity_evaluation == "too_low" and op_data["FLOPs %"].raw_value < 40:
612
- advice.append(f"- {fidelity_advice}")
613
- if fidelity_evaluation == "too_high":
614
- advice.append(f"- {fidelity_advice}")
615
- elif op_data["Bound"].raw_value in ["FLOP", "BOTH"]:
616
- if cores < 64:
617
- advice.append(f"- Increase grid size (currently using {cores})")
618
- if fidelity_evaluation == "too_high":
619
- advice.append(f"- {fidelity_advice}")
620
- elif op_data["Bound"].raw_value == "SLOW":
621
- input_0_memory = op_data["Input 0 Memory"].raw_value
622
- if input_0_memory and "L1" not in input_0_memory:
623
- advice.append(f"- If possible place input 0 in L1 (currently in {input_0_memory})")
624
-
625
- inner_dim_block = op_data["Inner Dim Block Size"].raw_value
626
- out_h = op_data["Output Subblock H"].raw_value
627
- out_w = op_data["Output Subblock W"].raw_value
628
-
629
- if inner_dim_block is None and out_h is None and out_w is None:
710
+ if out_h is not None and out_w is not None:
711
+ out_area = out_h * out_w
712
+ if out_area < 2:
630
713
  advice.append(
631
- "- No program_config specified, try using one to override in0_block_w and out_subblock_h/w"
714
+ f"Output subblock {out_h}x{out_w} is small, try out_subblock_h * out_subblock_w >= 2 if possible"
632
715
  )
633
- else:
634
- all_good = True
635
- if inner_dim_block is not None:
636
- if inner_dim_block < 2:
637
- advice.append(f"- in0_block_w={inner_dim_block} is small, try in0_block_w=2 or above")
638
- all_good = False
639
- else:
640
- advice.append("- No inner dim block size found")
641
- all_good = False
642
-
643
- if out_h is not None and out_w is not None:
644
- out_area = out_h * out_w
645
- if out_area < 2:
646
- advice.append(
647
- f"- Output subblock {out_h}x{out_w} is small, try out_subblock_h * out_subblock_w >= 2 if possible"
648
- )
649
- all_good = False
650
- else:
651
- advice.append("- No output subblock size found")
652
- all_good = False
653
-
654
- if all_good:
655
- advice.append(
656
- f"- in0_block_w={inner_dim_block} and output subblock {out_h}x{out_w} look good 🤷"
657
- )
658
- if fidelity_advice:
659
- advice.append(f"- {fidelity_advice}")
660
-
661
- if advice:
662
- for item in advice:
663
- print(colored(item, color))
716
+ all_good = False
664
717
  else:
665
- print(colored(" Optimized", color))
666
- print() # Add a blank line between matmuls
718
+ advice.append("No output subblock size found")
719
+ all_good = False
720
+
721
+ if all_good:
722
+ advice.append(
723
+ f"in0_block_w={inner_dim_block} and output subblock {out_h}x{out_w} look good 🤷"
724
+ )
725
+ if fidelity_advice:
726
+ advice.append(fidelity_advice)
727
+
728
+ return advice
667
729
 
668
730
 
669
731
  def merge_device_rows(df):
@@ -733,7 +795,7 @@ def filter_by_id_range(rows, id_range):
733
795
  def main():
734
796
  args, id_range = parse_args()
735
797
  generate_perf_report(
736
- args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice, args.tracing_mode
798
+ args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice, args.tracing_mode, args.raw_op_codes,
737
799
  )
738
800
 
739
801
 
@@ -755,6 +817,7 @@ def parse_args():
755
817
  parser.add_argument("--csv", type=str, help="Output filename for CSV format", metavar="OUTPUT_FILE")
756
818
  parser.add_argument("--no-advice", action="store_true", help="Only show the table section of the report")
757
819
  parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode")
820
+ parser.add_argument("--raw-op-codes", action="store_true", help="Include raw op codes in output")
758
821
  args = parser.parse_args()
759
822
 
760
823
  # Set the global color_output variable
@@ -770,7 +833,7 @@ def parse_args():
770
833
  return args, id_range
771
834
 
772
835
 
773
- def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice, tracing_mode):
836
+ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice, tracing_mode, raw_op_codes):
774
837
  df = pd.read_csv(csv_file, low_memory=False)
775
838
 
776
839
  # Add a column for original row numbers
@@ -798,6 +861,8 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
798
861
  for _, row in df.iterrows():
799
862
  op_data, current_gap = analyze_op(row, prev_row)
800
863
  op_data["ID"] = Cell(row["ORIGINAL_ROW"]) # Use the original row number
864
+ if raw_op_codes:
865
+ op_data["Raw OP Code"] = Cell(row["OP CODE"])
801
866
  rows.append(op_data)
802
867
  prev_row = row
803
868
 
@@ -844,11 +909,20 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
844
909
  "Output Subblock H",
845
910
  "Output Subblock W",
846
911
  ]
912
+ if not no_advice:
913
+ all_headers.append("Advice")
914
+ if raw_op_codes:
915
+ all_headers.append("Raw OP Code")
847
916
  print(colored(f"Writing CSV output to {csv_output_file}", "cyan"))
848
917
  with open(csv_output_file, "w") as f:
849
- f.write(",".join(all_headers) + "\n")
918
+ csv_writer = csv.DictWriter(f, fieldnames=all_headers)
919
+ csv_writer.writeheader()
850
920
  for op_data in rows:
851
- f.write(",".join(str(op_data[header].raw_value) for header in all_headers) + "\n")
921
+ row = {header: op_data[header].raw_value for header in all_headers if header in op_data}
922
+ if not no_advice:
923
+ advice = generate_matmul_advice(op_data) if is_matmul_op(op_data) else ""
924
+ row["Advice"] = " • ".join(advice)
925
+ csv_writer.writerow(row)
852
926
  else:
853
927
  col_widths = [
854
928
  max(max(visible_length(str(row[header])) for row in rows), visible_length(header))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: tt-perf-report
3
- Version: 1.0.1
3
+ Version: 1.0.4
4
4
  Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
File without changes
File without changes
File without changes