tt-perf-report 1.0.5__tar.gz → 1.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tt-perf-report might be problematic. Click here for more details.
- {tt_perf_report-1.0.5/src/tt_perf_report.egg-info → tt_perf_report-1.0.7}/PKG-INFO +3 -2
- {tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/pyproject.toml +1 -1
- {tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/src/tt_perf_report/perf_report.py +107 -11
- {tt_perf_report-1.0.5 → tt_perf_report-1.0.7/src/tt_perf_report.egg-info}/PKG-INFO +3 -2
- {tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/LICENSE +0 -0
- {tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/LICENSE_understanding.txt +0 -0
- {tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/README.md +0 -0
- {tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/setup.cfg +0 -0
- {tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/src/tt_perf_report/__init__.py +0 -0
- {tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/src/tt_perf_report.egg-info/SOURCES.txt +0 -0
- {tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/src/tt_perf_report.egg-info/dependency_links.txt +0 -0
- {tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/src/tt_perf_report.egg-info/entry_points.txt +0 -0
- {tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/src/tt_perf_report.egg-info/requires.txt +0 -0
- {tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/src/tt_perf_report.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: tt-perf-report
|
|
3
|
-
Version: 1.0.5
|
|
3
|
+
Version: 1.0.7
|
|
4
4
|
Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
|
|
5
5
|
License: Apache License
|
|
6
6
|
Version 2.0, January 2004
|
|
@@ -218,6 +218,7 @@ Description-Content-Type: text/markdown
|
|
|
218
218
|
License-File: LICENSE
|
|
219
219
|
License-File: LICENSE_understanding.txt
|
|
220
220
|
Requires-Dist: pandas
|
|
221
|
+
Dynamic: license-file
|
|
221
222
|
|
|
222
223
|
# Performance Report Analysis Tool
|
|
223
224
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "tt-perf-report"
|
|
7
|
-
version = "1.0.5"
|
|
7
|
+
version = "1.0.7"
|
|
8
8
|
description = "This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities."
|
|
9
9
|
license = {file = "LICENSE"}
|
|
10
10
|
readme = "README.md"
|
|
@@ -71,7 +71,7 @@ class Cell:
|
|
|
71
71
|
if self.raw_value is None or pd.isna(self.raw_value):
|
|
72
72
|
return ""
|
|
73
73
|
|
|
74
|
-
if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value):
|
|
74
|
+
if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value or "HaloDeviceOperation" in self.raw_value):
|
|
75
75
|
parts = self.raw_value.split(maxsplit=1)
|
|
76
76
|
op_name = parts[0]
|
|
77
77
|
size = parts[1] if len(parts) > 1 else ""
|
|
@@ -276,6 +276,42 @@ def analyze_matmul(row):
|
|
|
276
276
|
core_count, # Return the potentially adjusted core count
|
|
277
277
|
)
|
|
278
278
|
|
|
279
|
+
def analyze_halo(row):
|
|
280
|
+
attributes = row["ATTRIBUTES"] if pd.notna(row["ATTRIBUTES"]) else ""
|
|
281
|
+
|
|
282
|
+
try:
|
|
283
|
+
window_hw = attributes.split("window_hw=")[1].split(";")[0:2]
|
|
284
|
+
window_hw = ",".join(window_hw[0:2])
|
|
285
|
+
except (IndexError, AttributeError):
|
|
286
|
+
window_hw = "x"
|
|
287
|
+
|
|
288
|
+
try:
|
|
289
|
+
stride_hw = attributes.split("stride_hw=")[1].split(";")[0:2]
|
|
290
|
+
stride_hw = ",".join(stride_hw[0:2])
|
|
291
|
+
except (IndexError, AttributeError):
|
|
292
|
+
stride_hw = "x"
|
|
293
|
+
|
|
294
|
+
try:
|
|
295
|
+
pad_hw = attributes.split("padding=")[1].split(";")[0:4]
|
|
296
|
+
pad_hw = ",".join(pad_hw[0:4])
|
|
297
|
+
except (IndexError, AttributeError):
|
|
298
|
+
pad_hw = "x"
|
|
299
|
+
|
|
300
|
+
try:
|
|
301
|
+
dilation_hw = attributes.split("dilation_hw=")[1].split(";")[0:2]
|
|
302
|
+
dilation_hw = ",".join(dilation_hw[0:2])
|
|
303
|
+
except (IndexError, AttributeError):
|
|
304
|
+
dilation_hw = "x"
|
|
305
|
+
|
|
306
|
+
try:
|
|
307
|
+
memory_layout = attributes.split("memory_layout=")[1].split(";")[0].split("::")[1]
|
|
308
|
+
except (IndexError, AttributeError):
|
|
309
|
+
memory_layout = "x"
|
|
310
|
+
|
|
311
|
+
config = f"w={window_hw} s={stride_hw} p={pad_hw} d={dilation_hw} | {memory_layout}"
|
|
312
|
+
|
|
313
|
+
return config
|
|
314
|
+
|
|
279
315
|
def analyze_conv(row):
|
|
280
316
|
duration_s = row["DEVICE KERNEL DURATION [ns]"] * 1e-9
|
|
281
317
|
|
|
@@ -300,12 +336,40 @@ def analyze_conv(row):
|
|
|
300
336
|
|
|
301
337
|
flops_percentage = (flops / peak_flops_value) * 100
|
|
302
338
|
|
|
339
|
+
try:
|
|
340
|
+
act_block_h_ntiles = int(attributes.split("act_block_h_ntiles")[1][1:].split(";")[0])
|
|
341
|
+
except (IndexError, ValueError):
|
|
342
|
+
act_block_h_ntiles = "x"
|
|
343
|
+
|
|
344
|
+
try:
|
|
345
|
+
enable_act_double_buffer = "true" == attributes.split("enable_act_double_buffer': '")[1].split("'")[0]
|
|
346
|
+
except (IndexError, ValueError):
|
|
347
|
+
enable_act_double_buffer = "x"
|
|
348
|
+
|
|
349
|
+
try:
|
|
350
|
+
enable_split_reader = "true" == attributes.split("enable_split_reader': '")[1].split("'")[0]
|
|
351
|
+
except (IndexError, ValueError):
|
|
352
|
+
enable_split_reader = "x"
|
|
353
|
+
|
|
354
|
+
try:
|
|
355
|
+
per_core_out_matrix_height_ntile = int(attributes.split("per_core_out_matrix_height_ntile")[1][1:].split(";")[0])
|
|
356
|
+
except (IndexError, ValueError):
|
|
357
|
+
per_core_out_matrix_height_ntile = "x"
|
|
358
|
+
|
|
359
|
+
config = f"[ABH={per_core_out_matrix_height_ntile}|{act_block_h_ntiles}"
|
|
360
|
+
if (enable_act_double_buffer):
|
|
361
|
+
config += " ADB"
|
|
362
|
+
if (enable_split_reader):
|
|
363
|
+
config += " SR"
|
|
364
|
+
config += "]"
|
|
365
|
+
|
|
303
366
|
return (
|
|
304
367
|
flops,
|
|
305
368
|
flops_percentage,
|
|
306
369
|
size,
|
|
307
370
|
memory_info,
|
|
308
|
-
math_fidelity
|
|
371
|
+
math_fidelity,
|
|
372
|
+
config,
|
|
309
373
|
)
|
|
310
374
|
|
|
311
375
|
def analyze_op(row, prev_row):
|
|
@@ -381,8 +445,9 @@ def analyze_op(row, prev_row):
|
|
|
381
445
|
size,
|
|
382
446
|
memory_info,
|
|
383
447
|
math_fidelity,
|
|
448
|
+
config,
|
|
384
449
|
) = analyze_conv(row)
|
|
385
|
-
op_code = Cell(f"{op_code.raw_value} {size}")
|
|
450
|
+
op_code = Cell(f"{op_code.raw_value} {size} {config}")
|
|
386
451
|
dram_speed = Cell(None, unit="GB/s", decimals=0)
|
|
387
452
|
dram_percentage = Cell(None, unit="%", decimals=1)
|
|
388
453
|
flops = Cell(flops / 1e12 if pd.notna(flops) else None, unit="TFLOPs", decimals=1)
|
|
@@ -392,6 +457,13 @@ def analyze_op(row, prev_row):
|
|
|
392
457
|
if math_fidelity
|
|
393
458
|
else None
|
|
394
459
|
)
|
|
460
|
+
elif "HaloDeviceOperation" in op_code.raw_value:
|
|
461
|
+
config = analyze_halo(row)
|
|
462
|
+
op_code = Cell(f"{op_code.raw_value} {config}")
|
|
463
|
+
dram_speed = Cell(None, unit="GB/s", decimals=0)
|
|
464
|
+
dram_percentage = Cell(None, unit="%", decimals=1)
|
|
465
|
+
flops = Cell(None, unit="TFLOPs", decimals=1)
|
|
466
|
+
flops_percentage = Cell(None, unit="%", decimals=1)
|
|
395
467
|
|
|
396
468
|
output = {
|
|
397
469
|
"ID": None,
|
|
@@ -765,10 +837,17 @@ def merge_device_rows(df):
|
|
|
765
837
|
if not blocks:
|
|
766
838
|
break
|
|
767
839
|
|
|
768
|
-
if "AllGather" in op_name or "ReduceScatter" in op_name:
|
|
769
|
-
# For collective ops, take the
|
|
770
|
-
|
|
771
|
-
|
|
840
|
+
if "AllGather" in op_name or "ReduceScatter" in op_name or "AllReduce" in op_name:
|
|
841
|
+
# For collective ops, take the average duration over all rows within a block
|
|
842
|
+
device_kernel_durations = [d["DEVICE KERNEL DURATION [ns]"]
|
|
843
|
+
for _, d in blocks
|
|
844
|
+
if pd.notna(d["DEVICE KERNEL DURATION [ns]"])]
|
|
845
|
+
# Use the first block's data but update its duration with the average
|
|
846
|
+
base_block = blocks[0][1].copy()
|
|
847
|
+
base_block["DEVICE KERNEL DURATION [ns]"] = (
|
|
848
|
+
sum(device_kernel_durations) / len(device_kernel_durations) if device_kernel_durations else float("nan")
|
|
849
|
+
)
|
|
850
|
+
merged_blocks.append(base_block)
|
|
772
851
|
else:
|
|
773
852
|
# For non-collective ops, take the row with maximum duration
|
|
774
853
|
max_duration_block = max(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
|
|
@@ -814,11 +893,15 @@ def filter_by_id_range(rows, id_range):
|
|
|
814
893
|
return rows
|
|
815
894
|
|
|
816
895
|
|
|
896
|
+
def filter_host_ops(rows):
|
|
897
|
+
return [row for row in rows if not is_host_op(row)]
|
|
898
|
+
|
|
899
|
+
|
|
817
900
|
def main():
|
|
818
901
|
args, id_range = parse_args()
|
|
819
902
|
generate_perf_report(
|
|
820
|
-
args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice,
|
|
821
|
-
|
|
903
|
+
args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice,
|
|
904
|
+
args.tracing_mode, args.raw_op_codes, args.no_host_ops)
|
|
822
905
|
|
|
823
906
|
|
|
824
907
|
def parse_args():
|
|
@@ -840,6 +923,8 @@ def parse_args():
|
|
|
840
923
|
parser.add_argument("--no-advice", action="store_true", help="Only show the table section of the report")
|
|
841
924
|
parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode")
|
|
842
925
|
parser.add_argument("--raw-op-codes", action="store_true", help="Include raw op codes in output")
|
|
926
|
+
parser.add_argument("--no-host-ops", action="store_true", help="Do not include host ops in output")
|
|
927
|
+
|
|
843
928
|
args = parser.parse_args()
|
|
844
929
|
|
|
845
930
|
# Set the global color_output variable
|
|
@@ -855,7 +940,9 @@ def parse_args():
|
|
|
855
940
|
return args, id_range
|
|
856
941
|
|
|
857
942
|
|
|
858
|
-
def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
|
|
943
|
+
def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
|
|
944
|
+
id_range, csv_output_file, no_advice, tracing_mode,
|
|
945
|
+
raw_op_codes, no_host_ops):
|
|
859
946
|
df = pd.read_csv(csv_file, low_memory=False)
|
|
860
947
|
|
|
861
948
|
# Add a column for original row numbers
|
|
@@ -883,13 +970,14 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
|
|
|
883
970
|
for _, row in df.iterrows():
|
|
884
971
|
op_data, current_gap = analyze_op(row, prev_row)
|
|
885
972
|
op_data["ID"] = Cell(row["ORIGINAL_ROW"]) # Use the original row number
|
|
973
|
+
op_data["Global Call Count"] = Cell(row["GLOBAL CALL COUNT"])
|
|
886
974
|
if raw_op_codes:
|
|
887
975
|
op_data["Raw OP Code"] = Cell(row["OP CODE"])
|
|
888
976
|
rows.append(op_data)
|
|
889
977
|
prev_row = row
|
|
890
978
|
|
|
891
979
|
# Count device and host ops
|
|
892
|
-
if
|
|
980
|
+
if is_host_op(op_data):
|
|
893
981
|
host_ops += 1
|
|
894
982
|
else:
|
|
895
983
|
device_ops += 1
|
|
@@ -900,6 +988,9 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
|
|
|
900
988
|
# Filter rows based on id_range
|
|
901
989
|
rows = filter_by_id_range(rows, id_range)
|
|
902
990
|
|
|
991
|
+
if no_host_ops:
|
|
992
|
+
rows = filter_host_ops(rows)
|
|
993
|
+
|
|
903
994
|
# Recalculate derived columns after filtering
|
|
904
995
|
add_derived_columns(rows)
|
|
905
996
|
|
|
@@ -930,6 +1021,7 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
|
|
|
930
1021
|
"Inner Dim Block Size",
|
|
931
1022
|
"Output Subblock H",
|
|
932
1023
|
"Output Subblock W",
|
|
1024
|
+
"Global Call Count",
|
|
933
1025
|
]
|
|
934
1026
|
if not no_advice:
|
|
935
1027
|
all_headers.append("Advice")
|
|
@@ -955,5 +1047,9 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
|
|
|
955
1047
|
print_advice_section(rows, visible_headers, col_widths)
|
|
956
1048
|
|
|
957
1049
|
|
|
1050
|
+
def is_host_op(op_data):
|
|
1051
|
+
return "(torch)" in op_data["OP Code"].raw_value
|
|
1052
|
+
|
|
1053
|
+
|
|
958
1054
|
if __name__ == "__main__":
|
|
959
1055
|
main()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: tt-perf-report
|
|
3
|
-
Version: 1.0.5
|
|
3
|
+
Version: 1.0.7
|
|
4
4
|
Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
|
|
5
5
|
License: Apache License
|
|
6
6
|
Version 2.0, January 2004
|
|
@@ -218,6 +218,7 @@ Description-Content-Type: text/markdown
|
|
|
218
218
|
License-File: LICENSE
|
|
219
219
|
License-File: LICENSE_understanding.txt
|
|
220
220
|
Requires-Dist: pandas
|
|
221
|
+
Dynamic: license-file
|
|
221
222
|
|
|
222
223
|
# Performance Report Analysis Tool
|
|
223
224
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{tt_perf_report-1.0.5 → tt_perf_report-1.0.7}/src/tt_perf_report.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|