PyPI - tt-perf-report - Versions diffs - 1.0.5__tar.gz → 1.0.7__tar.gz - Mend

tt-perf-report 1.0.5 (tar.gz) → 1.0.7 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tt-perf-report might be problematic. Click here for more details.

Files changed (14)

{tt_perf_report-1.0.5/src/tt_perf_report.egg-info → tt_perf_report-1.0.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: tt-perf-report
-Version: 1.0.5
+Version: 1.0.7
 Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -218,6 +218,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE_understanding.txt
 Requires-Dist: pandas
+Dynamic: license-file
 # Performance Report Analysis Tool

{tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "tt-perf-report"
-version = "1.0.5"
+version = "1.0.7"
 description = "This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities."
 license = {file = "LICENSE"}
 readme = "README.md"

{tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/src/tt_perf_report/perf_report.py RENAMED Viewed

@@ -71,7 +71,7 @@ class Cell:
         if self.raw_value is None or pd.isna(self.raw_value):
             return ""
-        if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value):
+        if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value or "HaloDeviceOperation" in self.raw_value):
             parts = self.raw_value.split(maxsplit=1)
             op_name = parts[0]
             size = parts[1] if len(parts) > 1 else ""
@@ -276,6 +276,42 @@ def analyze_matmul(row):
         core_count,  # Return the potentially adjusted core count
     )
+def analyze_halo(row):
+    attributes = row["ATTRIBUTES"] if pd.notna(row["ATTRIBUTES"]) else ""
+    try:
+        window_hw = attributes.split("window_hw=")[1].split(";")[0:2]
+        window_hw = ",".join(window_hw[0:2])
+    except (IndexError, AttributeError):
+        window_hw = "x"
+    try:
+        stride_hw = attributes.split("stride_hw=")[1].split(";")[0:2]
+        stride_hw = ",".join(stride_hw[0:2])
+    except (IndexError, AttributeError):
+        stride_hw = "x"
+    try:
+        pad_hw = attributes.split("padding=")[1].split(";")[0:4]
+        pad_hw = ",".join(pad_hw[0:4])
+    except (IndexError, AttributeError):
+        pad_hw = "x"
+    try:
+        dilation_hw = attributes.split("dilation_hw=")[1].split(";")[0:2]
+        dilation_hw = ",".join(dilation_hw[0:2])
+    except (IndexError, AttributeError):
+        dilation_hw = "x"
+    try:
+        memory_layout = attributes.split("memory_layout=")[1].split(";")[0].split("::")[1]
+    except (IndexError, AttributeError):
+        memory_layout = "x"
+    config = f"w={window_hw} s={stride_hw} p={pad_hw} d={dilation_hw} | {memory_layout}"
+    return config
 def analyze_conv(row):
     duration_s = row["DEVICE KERNEL DURATION [ns]"] * 1e-9
@@ -300,12 +336,40 @@ def analyze_conv(row):
     flops_percentage = (flops / peak_flops_value) * 100
+    try:
+        act_block_h_ntiles = int(attributes.split("act_block_h_ntiles")[1][1:].split(";")[0])
+    except (IndexError, ValueError):
+        act_block_h_ntiles = "x"
+    try:
+        enable_act_double_buffer = "true" == attributes.split("enable_act_double_buffer': '")[1].split("'")[0]
+    except (IndexError, ValueError):
+        enable_act_double_buffer = "x"
+    try:
+        enable_split_reader = "true" == attributes.split("enable_split_reader': '")[1].split("'")[0]
+    except (IndexError, ValueError):
+        enable_split_reader = "x"
+    try:
+        per_core_out_matrix_height_ntile = int(attributes.split("per_core_out_matrix_height_ntile")[1][1:].split(";")[0])
+    except (IndexError, ValueError):
+        per_core_out_matrix_height_ntile = "x"
+    config = f"[ABH={per_core_out_matrix_height_ntile}|{act_block_h_ntiles}"
+    if (enable_act_double_buffer):
+        config += " ADB"
+    if (enable_split_reader):
+        config += " SR"
+    config += "]"
     return (
         flops,
         flops_percentage,
         size,
         memory_info,
-        math_fidelity
+        math_fidelity,
+        config,
     )
 def analyze_op(row, prev_row):
@@ -381,8 +445,9 @@ def analyze_op(row, prev_row):
             size,
             memory_info,
             math_fidelity,
+            config,
         ) = analyze_conv(row)
-        op_code = Cell(f"{op_code.raw_value} {size}")
+        op_code = Cell(f"{op_code.raw_value} {size} {config}")
         dram_speed = Cell(None, unit="GB/s", decimals=0)
         dram_percentage = Cell(None, unit="%", decimals=1)
         flops = Cell(flops / 1e12 if pd.notna(flops) else None, unit="TFLOPs", decimals=1)
@@ -392,6 +457,13 @@ def analyze_op(row, prev_row):
             if math_fidelity
             else None
         )
+    elif "HaloDeviceOperation" in op_code.raw_value:
+        config = analyze_halo(row)
+        op_code = Cell(f"{op_code.raw_value} {config}")
+        dram_speed = Cell(None, unit="GB/s", decimals=0)
+        dram_percentage = Cell(None, unit="%", decimals=1)
+        flops = Cell(None, unit="TFLOPs", decimals=1)
+        flops_percentage = Cell(None, unit="%", decimals=1)
     output = {
         "ID": None,
@@ -765,10 +837,17 @@ def merge_device_rows(df):
         if not blocks:
             break
-        if "AllGather" in op_name or "ReduceScatter" in op_name:
-            # For collective ops, take the row with minimum duration
-            min_duration_block = min(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
-            merged_blocks.append(min_duration_block[1])
+        if "AllGather" in op_name or "ReduceScatter" in op_name or "AllReduce" in op_name:
+            # For collective ops, take the average duration over all rows within a block
+            device_kernel_durations = [d["DEVICE KERNEL DURATION [ns]"]
+                             for _, d in blocks
+                             if pd.notna(d["DEVICE KERNEL DURATION [ns]"])]
+            # Use the first block's data but update its duration with the average
+            base_block = blocks[0][1].copy()
+            base_block["DEVICE KERNEL DURATION [ns]"] = (
+                sum(device_kernel_durations) / len(device_kernel_durations) if device_kernel_durations else float("nan")
+            )
+            merged_blocks.append(base_block)
         else:
             # For non-collective ops, take the row with maximum duration
             max_duration_block = max(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
@@ -814,11 +893,15 @@ def filter_by_id_range(rows, id_range):
     return rows
+def filter_host_ops(rows):
+    return [row for row in rows if not is_host_op(row)]
 def main():
     args, id_range = parse_args()
     generate_perf_report(
-        args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice, args.tracing_mode, args.raw_op_codes,
-    )
+        args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice,
+        args.tracing_mode, args.raw_op_codes, args.no_host_ops)
 def parse_args():
@@ -840,6 +923,8 @@ def parse_args():
     parser.add_argument("--no-advice", action="store_true", help="Only show the table section of the report")
     parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode")
     parser.add_argument("--raw-op-codes", action="store_true", help="Include raw op codes in output")
+    parser.add_argument("--no-host-ops", action="store_true", help="Do not include host ops in output")
     args = parser.parse_args()
     # Set the global color_output variable
@@ -855,7 +940,9 @@ def parse_args():
     return args, id_range
-def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice, tracing_mode, raw_op_codes):
+def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
+                         id_range, csv_output_file, no_advice, tracing_mode,
+                         raw_op_codes, no_host_ops):
     df = pd.read_csv(csv_file, low_memory=False)
     # Add a column for original row numbers
@@ -883,13 +970,14 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
     for _, row in df.iterrows():
         op_data, current_gap = analyze_op(row, prev_row)
         op_data["ID"] = Cell(row["ORIGINAL_ROW"])  # Use the original row number
+        op_data["Global Call Count"] = Cell(row["GLOBAL CALL COUNT"])
         if raw_op_codes:
             op_data["Raw OP Code"] = Cell(row["OP CODE"])
         rows.append(op_data)
         prev_row = row
         # Count device and host ops
-        if "(torch)" in op_data["OP Code"].raw_value:
+        if is_host_op(op_data):
             host_ops += 1
         else:
             device_ops += 1
@@ -900,6 +988,9 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
     # Filter rows based on id_range
     rows = filter_by_id_range(rows, id_range)
+    if no_host_ops:
+        rows = filter_host_ops(rows)
     # Recalculate derived columns after filtering
     add_derived_columns(rows)
@@ -930,6 +1021,7 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
             "Inner Dim Block Size",
             "Output Subblock H",
             "Output Subblock W",
+            "Global Call Count",
         ]
         if not no_advice:
             all_headers.append("Advice")
@@ -955,5 +1047,9 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
             print_advice_section(rows, visible_headers, col_widths)
+def is_host_op(op_data):
+    return "(torch)" in op_data["OP Code"].raw_value
 if __name__ == "__main__":
     main()

{tt_perf_report-1.0.5 → tt_perf_report-1.0.7/src/tt_perf_report.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: tt-perf-report
-Version: 1.0.5
+Version: 1.0.7
 Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -218,6 +218,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE_understanding.txt
 Requires-Dist: pandas
+Dynamic: license-file
 # Performance Report Analysis Tool