tt-perf-report 1.0.6.tar.gz → 1.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of tt-perf-report might be problematic.
- {tt_perf_report-1.0.6/src/tt_perf_report.egg-info → tt_perf_report-1.1.0}/PKG-INFO +4 -2
- {tt_perf_report-1.0.6 → tt_perf_report-1.1.0}/pyproject.toml +2 -2
- {tt_perf_report-1.0.6 → tt_perf_report-1.1.0}/src/tt_perf_report/perf_report.py +234 -29
- {tt_perf_report-1.0.6 → tt_perf_report-1.1.0/src/tt_perf_report.egg-info}/PKG-INFO +4 -2
- tt_perf_report-1.1.0/src/tt_perf_report.egg-info/requires.txt +2 -0
- tt_perf_report-1.0.6/src/tt_perf_report.egg-info/requires.txt +0 -1
- {tt_perf_report-1.0.6 → tt_perf_report-1.1.0}/LICENSE +0 -0
- {tt_perf_report-1.0.6 → tt_perf_report-1.1.0}/LICENSE_understanding.txt +0 -0
- {tt_perf_report-1.0.6 → tt_perf_report-1.1.0}/README.md +0 -0
- {tt_perf_report-1.0.6 → tt_perf_report-1.1.0}/setup.cfg +0 -0
- {tt_perf_report-1.0.6 → tt_perf_report-1.1.0}/src/tt_perf_report/__init__.py +0 -0
- {tt_perf_report-1.0.6 → tt_perf_report-1.1.0}/src/tt_perf_report.egg-info/SOURCES.txt +0 -0
- {tt_perf_report-1.0.6 → tt_perf_report-1.1.0}/src/tt_perf_report.egg-info/dependency_links.txt +0 -0
- {tt_perf_report-1.0.6 → tt_perf_report-1.1.0}/src/tt_perf_report.egg-info/entry_points.txt +0 -0
- {tt_perf_report-1.0.6 → tt_perf_report-1.1.0}/src/tt_perf_report.egg-info/top_level.txt +0 -0
--- tt_perf_report-1.0.6/src/tt_perf_report.egg-info/PKG-INFO
+++ tt_perf_report-1.1.0/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: tt-perf-report
-Version: 1.0.6
+Version: 1.1.0
 Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
 License: Apache License
         Version 2.0, January 2004
@@ -218,6 +218,8 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE_understanding.txt
 Requires-Dist: pandas
+Requires-Dist: matplotlib
+Dynamic: license-file
 
 # Performance Report Analysis Tool
 
--- tt_perf_report-1.0.6/pyproject.toml
+++ tt_perf_report-1.1.0/pyproject.toml
@@ -4,12 +4,12 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "tt-perf-report"
-version = "1.0.6"
+version = "1.1.0"
 description = "This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities."
 license = {file = "LICENSE"}
 readme = "README.md"
 keywords = ["tenstorrent", "tt-metal"]
-dependencies = ["pandas"]
+dependencies = ["pandas", "matplotlib"]
 
 [project.scripts]
 tt-perf-report = "tt_perf_report.perf_report:main"
--- tt_perf_report-1.0.6/src/tt_perf_report/perf_report.py
+++ tt_perf_report-1.1.0/src/tt_perf_report/perf_report.py
@@ -2,17 +2,32 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
-import csv
-import sys
 import argparse
+import csv
+from collections import defaultdict
+import os
 import re
+import sys
 from typing import Any, Optional, Union
-
+
+import matplotlib.pyplot as plt
 import pandas as pd
 
 # Global variable to store color preference
 color_output = None # None means auto-detect, True forces color, False forces no color
 
+def get_value_physical_logical(input : str, is_physical : bool = True):
+    if "[" in input and "]" in input:
+        physical_part = input.split("[")[0]
+        logical_part = input.split("[")[1].split("]")[0]
+
+        if is_physical:
+            return int(physical_part)
+        else:
+            return int(logical_part)
+    else:
+        # back compatible
+        return int(input)
 
 def set_color_output(force_color, force_no_color):
     global color_output
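The new `get_value_physical_logical` helper is what lets the rest of this diff keep using the padded-shape columns, which now encode two sizes in one field. A minimal sketch of its behavior (the example values are hypothetical):

```python
# Shape fields may now arrive as "PHYSICAL[LOGICAL]", e.g. "64[57]";
# plain integers (the pre-1.1.0 format) still parse unchanged.
get_value_physical_logical("64[57]")                     # -> 64 (physical part)
get_value_physical_logical("64[57]", is_physical=False)  # -> 57 (logical part)
get_value_physical_logical("64")                         # -> 64 (back-compatible)
```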
@@ -71,7 +86,7 @@ class Cell:
         if self.raw_value is None or pd.isna(self.raw_value):
             return ""
 
-        if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value):
+        if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value or "HaloDeviceOperation" in self.raw_value):
             parts = self.raw_value.split(maxsplit=1)
             op_name = parts[0]
             size = parts[1] if len(parts) > 1 else ""
@@ -212,28 +227,28 @@ def analyze_matmul(row):
     total_data_size_bytes = 0
     if input_0_from_dram:
         total_data_size_bytes += (
-            row["…"]
-            * row["…"]
-            * row["…"]
-            * row["…"]
+            get_value_physical_logical(row["INPUT_0_W_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["INPUT_0_Y_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["INPUT_0_Z_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["INPUT_0_X_PAD[LOGICAL]"])
             * get_datatype_size(row["INPUT_0_DATATYPE"])
         )
     if input_1_from_dram:
         total_data_size_bytes += (
-            row["…"]
-            * row["…"]
-            * row["…"]
-            * row["…"]
+            get_value_physical_logical(row["INPUT_1_W_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["INPUT_1_Y_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["INPUT_1_Z_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["INPUT_1_X_PAD[LOGICAL]"])
             * get_datatype_size(row["INPUT_1_DATATYPE"])
         )
 
     # Always include output if it's written to DRAM
     if "DRAM" in row["OUTPUT_0_MEMORY"]:
         total_data_size_bytes += (
-            row["…"]
-            * row["…"]
-            * row["…"]
-            * row["…"]
+            get_value_physical_logical(row["OUTPUT_0_W_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["OUTPUT_0_Y_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["OUTPUT_0_Z_PAD[LOGICAL]"])
+            * get_value_physical_logical(row["OUTPUT_0_X_PAD[LOGICAL]"])
             * get_datatype_size(row["OUTPUT_0_DATATYPE"])
         )
 
@@ -253,8 +268,8 @@ def analyze_matmul(row):
 
     peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count
 
-    M, K, N = …
-    W, Z = …
+    M, K, N = get_value_physical_logical(row["INPUT_0_Y_PAD[LOGICAL]"]), get_value_physical_logical(row["INPUT_0_X_PAD[LOGICAL]"]), get_value_physical_logical(row["INPUT_1_X_PAD[LOGICAL]"])
+    W, Z = get_value_physical_logical(row["INPUT_0_W_PAD[LOGICAL]"]), get_value_physical_logical(row["INPUT_0_Z_PAD[LOGICAL]"])
 
     flops = (M * K * N * W * Z * 2) / duration_s
 
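As a quick sanity check of the matmul throughput formula above, with illustrative numbers rather than real trace values: a 1024×1024×1024 matmul (W = Z = 1) that runs for 100 µs comes out at roughly 21.5 TFLOPs.

```python
# Illustrative values, not taken from a real trace:
M = K = N = 1024
W = Z = 1
duration_s = 100e-6                           # 100 us kernel duration
flops = (M * K * N * W * Z * 2) / duration_s  # factor 2: multiply + accumulate
print(f"{flops / 1e12:.1f} TFLOPs")           # ~21.5 TFLOPs
```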
@@ -276,6 +291,42 @@ def analyze_matmul(row):
         core_count, # Return the potentially adjusted core count
     )
 
+def analyze_halo(row):
+    attributes = row["ATTRIBUTES"] if pd.notna(row["ATTRIBUTES"]) else ""
+
+    try:
+        window_hw = attributes.split("window_hw=")[1].split(";")[0:2]
+        window_hw = ",".join(window_hw[0:2])
+    except (IndexError, AttributeError):
+        window_hw = "x"
+
+    try:
+        stride_hw = attributes.split("stride_hw=")[1].split(";")[0:2]
+        stride_hw = ",".join(stride_hw[0:2])
+    except (IndexError, AttributeError):
+        stride_hw = "x"
+
+    try:
+        pad_hw = attributes.split("padding=")[1].split(";")[0:4]
+        pad_hw = ",".join(pad_hw[0:4])
+    except (IndexError, AttributeError):
+        pad_hw = "x"
+
+    try:
+        dilation_hw = attributes.split("dilation_hw=")[1].split(";")[0:2]
+        dilation_hw = ",".join(dilation_hw[0:2])
+    except (IndexError, AttributeError):
+        dilation_hw = "x"
+
+    try:
+        memory_layout = attributes.split("memory_layout=")[1].split(";")[0].split("::")[1]
+    except (IndexError, AttributeError):
+        memory_layout = "x"
+
+    config = f"w={window_hw} s={stride_hw} p={pad_hw} d={dilation_hw} | {memory_layout}"
+
+    return config
+
 def analyze_conv(row):
     duration_s = row["DEVICE KERNEL DURATION [ns]"] * 1e-9
 
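To make the halo config string concrete, here is a sketch with a hypothetical ATTRIBUTES serialization; only the key names (`window_hw=`, `stride_hw=`, `padding=`, `dilation_hw=`, `memory_layout=`) are taken from the code above, the exact value format is assumed.

```python
# Hypothetical attribute string for a 3x3 window, stride 1, padding 1, dilation 1:
row = {"ATTRIBUTES": ("window_hw=3;3;stride_hw=1;1;padding=1;1;1;1;"
                      "dilation_hw=1;1;memory_layout=TensorMemoryLayout::HEIGHT_SHARDED")}
print(analyze_halo(row))
# -> w=3,3 s=1,1 p=1,1,1,1 d=1,1 | HEIGHT_SHARDED
```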
@@ -287,10 +338,10 @@ def analyze_conv(row):
 
     peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count
 
-    NHW = …
-    CH_IN = …
+    NHW = get_value_physical_logical(row["OUTPUT_0_Y_PAD[LOGICAL]"])
+    CH_IN = get_value_physical_logical(row["INPUT_0_X_PAD[LOGICAL]"])
     W = [int(x) for x in (attributes.split("window_hw")[1].split("; ")[0][2:-1].split(";"))]
-    CH_OUT = …
+    CH_OUT = get_value_physical_logical(row["INPUT_1_X_PAD[LOGICAL]"])
 
     M, K, N = NHW, CH_IN * W[0] * W[1], CH_OUT
     flops = (M * K * N * 2) / duration_s
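The conv path maps the convolution onto an implicit GEMM before applying the same FLOPs math. With illustrative numbers, a 3×3 window over 56×56 output pixels, 64 input channels, and 128 output channels gives:

```python
# Illustrative conv-to-GEMM mapping (numbers are made up):
NHW, CH_IN, CH_OUT = 1 * 56 * 56, 64, 128    # output pixels, in/out channels
W = [3, 3]                                   # window_hw
M, K, N = NHW, CH_IN * W[0] * W[1], CH_OUT   # M=3136, K=576, N=128
total_ops = M * K * N * 2                    # ~4.6e8 FLOPs per invocation
```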
@@ -300,12 +351,40 @@ def analyze_conv(row):
 
     flops_percentage = (flops / peak_flops_value) * 100
 
+    try:
+        act_block_h_ntiles = int(attributes.split("act_block_h_ntiles")[1][1:].split(";")[0])
+    except (IndexError, ValueError):
+        act_block_h_ntiles = "x"
+
+    try:
+        enable_act_double_buffer = "true" == attributes.split("enable_act_double_buffer': '")[1].split("'")[0]
+    except (IndexError, ValueError):
+        enable_act_double_buffer = "x"
+
+    try:
+        enable_split_reader = "true" == attributes.split("enable_split_reader': '")[1].split("'")[0]
+    except (IndexError, ValueError):
+        enable_split_reader = "x"
+
+    try:
+        per_core_out_matrix_height_ntile = int(attributes.split("per_core_out_matrix_height_ntile")[1][1:].split(";")[0])
+    except (IndexError, ValueError):
+        per_core_out_matrix_height_ntile = "x"
+
+    config = f"[ABH={per_core_out_matrix_height_ntile}|{act_block_h_ntiles}"
+    if (enable_act_double_buffer):
+        config += " ADB"
+    if (enable_split_reader):
+        config += " SR"
+    config += "]"
+
     return (
         flops,
         flops_percentage,
         size,
         memory_info,
-        math_fidelity
+        math_fidelity,
+        config,
     )
 
 def analyze_op(row, prev_row):
@@ -381,8 +460,9 @@ def analyze_op(row, prev_row):
             size,
             memory_info,
             math_fidelity,
+            config,
         ) = analyze_conv(row)
-        op_code = Cell(f"{op_code.raw_value} {size}")
+        op_code = Cell(f"{op_code.raw_value} {size} {config}")
         dram_speed = Cell(None, unit="GB/s", decimals=0)
         dram_percentage = Cell(None, unit="%", decimals=1)
         flops = Cell(flops / 1e12 if pd.notna(flops) else None, unit="TFLOPs", decimals=1)
@@ -392,6 +472,13 @@ def analyze_op(row, prev_row):
             if math_fidelity
             else None
         )
+    elif "HaloDeviceOperation" in op_code.raw_value:
+        config = analyze_halo(row)
+        op_code = Cell(f"{op_code.raw_value} {config}")
+        dram_speed = Cell(None, unit="GB/s", decimals=0)
+        dram_percentage = Cell(None, unit="%", decimals=1)
+        flops = Cell(None, unit="TFLOPs", decimals=1)
+        flops_percentage = Cell(None, unit="%", decimals=1)
 
     output = {
         "ID": None,
@@ -728,6 +815,93 @@ def generate_matmul_advice(op_data):
     return advice
 
 
+def generate_stacked_report(rows, visible_headers, stack_by_input0_layout:bool = False):
+    if stack_by_input0_layout:
+        visible_headers.append("Input 0 Memory")
+
+    # Create a pandas DataFrame from rows and headers
+    data = {header: [row[header].raw_value for row in rows] for header in visible_headers}
+    df = pd.DataFrame(data)
+
+    if (stack_by_input0_layout):
+        df["OP Code Joined"] = df["OP Code"].str.split().str[0] \
+            + " (in0:" + df["Input 0 Memory"].str.split('_').str[-2].str.lower() + "_" + df["Input 0 Memory"].str.split('_').str[-1].str.lower() + ")"
+    else:
+        df["OP Code Joined"] = df["OP Code"].str.split().str[0]
+
+    # Group by the joined OP Code and aggregate the data
+    stacked_df = df.groupby("OP Code Joined").agg(
+        Device_Time_Sum_us=("Device Time", "sum"),
+        Ops_Count=("Device Time", "count"),
+        Flops_min=("FLOPs %", "min"),
+        Flops_max=("FLOPs %", "max"),
+        Flops_mean=("FLOPs %", "mean"),
+        Flops_std=("FLOPs %", "std"),
+    ).reset_index()
+
+    # Calculate the percentage of device time
+    total_device_time = stacked_df["Device_Time_Sum_us"].sum()
+    stacked_df["%"] = (stacked_df["Device_Time_Sum_us"] / total_device_time) * 100
+    # Reorder columns to move Device_Time_Percentage to be the 3rd column
+    cols = stacked_df.columns.tolist()
+    cols.insert(0, cols.pop(cols.index("%")))
+    stacked_df = stacked_df[cols]
+    # Sort the stacked dataframe by "Device_Time_Sum_us" in descending order
+    stacked_df = stacked_df.sort_values(by="Device_Time_Sum_us", ascending=False)
+
+    return stacked_df
+
+
+def print_stacked_report(stacked_df: pd.DataFrame):
+    print("\n📊 Stacked report 📊\n============\n")
+    print(stacked_df.to_string(index=False, float_format="%.2f"))
+
+
+def dump_stacked_report(stacked_df: pd.DataFrame, output_file: str):
+    stacked_df.to_csv(output_file, index=False, float_format="%.1f")
+
+
+def plot_stacked_report(stacked_df: pd.DataFrame, output_file: str, threshold: float = 0.02):
+    # Prepare data for the stacked bar plot
+    device_time_sum = stacked_df["Device_Time_Sum_us"]
+    total_sum = device_time_sum.sum()
+
+    # Create a stacked bar plot
+    plt.figure(figsize=(6, 8), dpi=300)
+    width = 0.5
+    bottom = 0
+    colors = plt.cm.tab20.colors + plt.cm.tab20b.colors + plt.cm.tab20c.colors
+
+    for i, row in stacked_df.iterrows():
+        color = colors[i % len(colors)]
+        bar = plt.bar(1, row["Device_Time_Sum_us"], width, label=row["OP Code Joined"], bottom=bottom, color=color)
+
+        text = f"({row['%']:.1f}%) {row['OP Code Joined']} total={row['Device_Time_Sum_us']:.1f}us; {row['Ops_Count']} ops"
+        if not pd.isna(row["Flops_mean"]):
+            text += f"\n Util [{row['Flops_min']:.1f} - {row['Flops_max']:.1f}] {row['Flops_mean']:.1f} ± {row['Flops_std']:.1f} %"
+
+        # Add overlay text if the data is significant
+        if row["Device_Time_Sum_us"] >= total_sum * threshold:
+            plt.text(
+                bar[0].get_x() + bar[0].get_width() / 2,
+                bottom + row["Device_Time_Sum_us"] / 2,
+                text,
+                ha="center",
+                va="center",
+                fontsize=6,
+                color="white"
+            )
+        bottom += row["Device_Time_Sum_us"]
+
+    # Set plot labels and title
+    plt.xlim(1 - width / 2 - 0.05, 1 + width / 2 + 0.05)
+    plt.ylabel("Device Time [us]")
+    plt.title(f"Stacked Device Time (Total: {total_sum:.1f} us)")
+    plt.tight_layout()
+
+    # Save the plot to a file
+    plt.savefig(output_file)
+
 def merge_device_rows(df):
     block_by_device = defaultdict(list)
 
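A minimal sketch of how these four new functions chain together, assuming rows of Cell values as produced by analyze_op (the two rows below are made up):

```python
rows = [
    {"OP Code": Cell("Matmul 1024x1024x1024"), "Device Time": Cell(10.0), "FLOPs %": Cell(40.0)},
    {"OP Code": Cell("Matmul 2048x2048x2048"), "Device Time": Cell(30.0), "FLOPs %": Cell(60.0)},
]
stacked = generate_stacked_report(rows, ["OP Code", "Device Time", "FLOPs %"])
print_stacked_report(stacked)                 # console table, one row per op type
dump_stacked_report(stacked, "stacked.csv")   # CSV dump
plot_stacked_report(stacked, "stacked.png")   # stacked bar chart via matplotlib
```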
@@ -765,10 +939,17 @@ def merge_device_rows(df):
         if not blocks:
             break
 
-        if "AllGather" in op_name or "ReduceScatter" in op_name:
-            # For collective ops, take the …
-            …
-            …
+        if "AllGather" in op_name or "ReduceScatter" in op_name or "AllReduce" in op_name:
+            # For collective ops, take the average duration over all rows within a block
+            device_kernel_durations = [d["DEVICE KERNEL DURATION [ns]"]
+                                       for _, d in blocks
+                                       if pd.notna(d["DEVICE KERNEL DURATION [ns]"])]
+            # Use the first block's data but update its duration with the average
+            base_block = blocks[0][1].copy()
+            base_block["DEVICE KERNEL DURATION [ns]"] = (
+                sum(device_kernel_durations) / len(device_kernel_durations) if device_kernel_durations else float("nan")
+            )
+            merged_blocks.append(base_block)
         else:
             # For non-collective ops, take the row with maximum duration
             max_duration_block = max(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
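The behavioral change here, shown with made-up per-device durations: collective ops (AllGather, ReduceScatter, and now AllReduce) report the average duration across devices, while all other ops keep reporting the maximum.

```python
# Made-up per-device kernel durations [ns] for one op across four devices:
durations = [1000, 1200, 1100, 900]
collective = sum(durations) / len(durations)  # 1050.0 -> reported for collectives
non_collective = max(durations)               # 1200   -> reported for everything else
```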
@@ -822,7 +1003,7 @@ def main():
     args, id_range = parse_args()
     generate_perf_report(
         args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice,
-        args.tracing_mode, args.raw_op_codes, args.no_host_ops)
+        args.tracing_mode, args.raw_op_codes, args.no_host_ops, args.no_stacked_report, args.no_stack_by_in0, args.stacked_csv)
 
 
 def parse_args():
@@ -845,6 +1026,12 @@ def parse_args():
     parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode")
     parser.add_argument("--raw-op-codes", action="store_true", help="Include raw op codes in output")
    parser.add_argument("--no-host-ops", action="store_true", help="Do not include host ops in output")
+    parser.add_argument("--no-stacked-report", action="store_true", help="Do not generate a stacked report")
+    parser.add_argument("--no-stack-by-in0", action="store_true",
+                        help="Do not group the stacked report by the layout of Input 0 (extracted from the Input 0 Memory column)"
+                        )
+    parser.add_argument("--stacked-csv", type=str,
+                        help="Output filename for the stacked report CSV; Defaults to OUTPUT_FILE_stacked.csv", metavar="STACKED_FILE")
 
     args = parser.parse_args()
 
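Taken together with the generate_perf_report changes below, the new defaults mean a plain console run also prints the stacked summary, while a run such as `tt-perf-report trace.csv --csv report.csv` additionally writes `report_stacked.csv` and `report_stacked.png` (file names here are illustrative; `--stacked-csv` overrides the base name).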
@@ -863,7 +1050,7 @@ def parse_args():
 
 def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
                          id_range, csv_output_file, no_advice, tracing_mode,
-                         raw_op_codes, no_host_ops):
+                         raw_op_codes, no_host_ops, no_stacked_report, no_stack_by_in0, stacked_report_file):
     df = pd.read_csv(csv_file, low_memory=False)
 
     # Add a column for original row numbers
@@ -891,6 +1078,7 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
     for _, row in df.iterrows():
         op_data, current_gap = analyze_op(row, prev_row)
         op_data["ID"] = Cell(row["ORIGINAL_ROW"]) # Use the original row number
+        op_data["Global Call Count"] = Cell(row["GLOBAL CALL COUNT"])
         if raw_op_codes:
             op_data["Raw OP Code"] = Cell(row["OP CODE"])
         rows.append(op_data)
@@ -941,6 +1129,7 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
         "Inner Dim Block Size",
         "Output Subblock H",
         "Output Subblock W",
+        "Global Call Count",
     ]
     if not no_advice:
         all_headers.append("Advice")
@@ -965,6 +1154,22 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
     if not no_advice:
         print_advice_section(rows, visible_headers, col_widths)
 
+    # handle stacked report generation
+    if not(no_stacked_report):
+        stacked_report = generate_stacked_report(rows, visible_headers, not(no_stack_by_in0))
+
+        if not(csv_output_file):
+            print_stacked_report(stacked_report)
+        if stacked_report_file or csv_output_file:
+            if not stacked_report_file:
+                base_stacked_report_file = f"{os.path.splitext(csv_output_file)[0]}_stacked"
+            else:
+                base_stacked_report_file = os.path.splitext(stacked_report_file)[0]
+            print(colored(f"Writing CSV stacked report to {base_stacked_report_file}.csv", "cyan"))
+            dump_stacked_report(stacked_report, f"{base_stacked_report_file}.csv")
+            print(colored(f"Plotting PNG stacked report to {base_stacked_report_file}.png", "cyan"))
+            plot_stacked_report(stacked_report, f"{base_stacked_report_file}.png")
+
 
 def is_host_op(op_data):
     return "(torch)" in op_data["OP Code"].raw_value
--- tt_perf_report-1.0.6/PKG-INFO
+++ tt_perf_report-1.1.0/src/tt_perf_report.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: tt-perf-report
-Version: 1.0.6
+Version: 1.1.0
 Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
 License: Apache License
         Version 2.0, January 2004
@@ -218,6 +218,8 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE_understanding.txt
 Requires-Dist: pandas
+Requires-Dist: matplotlib
+Dynamic: license-file
 
 # Performance Report Analysis Tool
 
--- /dev/null
+++ tt_perf_report-1.1.0/src/tt_perf_report.egg-info/requires.txt
@@ -0,0 +1,2 @@
+pandas
+matplotlib

--- tt_perf_report-1.0.6/src/tt_perf_report.egg-info/requires.txt
+++ /dev/null
@@ -1 +0,0 @@
-pandas
The remaining nine files are renamed from tt_perf_report-1.0.6 to tt_perf_report-1.1.0 without content changes: LICENSE, LICENSE_understanding.txt, README.md, setup.cfg, src/tt_perf_report/__init__.py, and the egg-info SOURCES.txt, dependency_links.txt, entry_points.txt, and top_level.txt.