tt-perf-report 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tt-perf-report might be problematic. Click here for more details.
- tt_perf_report/perf_report.py +96 -17
- tt_perf_report-1.0.5.dist-info/LICENSE_understanding.txt +3 -0
- {tt_perf_report-1.0.3.dist-info → tt_perf_report-1.0.5.dist-info}/METADATA +2 -1
- tt_perf_report-1.0.5.dist-info/RECORD +9 -0
- {tt_perf_report-1.0.3.dist-info → tt_perf_report-1.0.5.dist-info}/WHEEL +1 -1
- tt_perf_report-1.0.3.dist-info/RECORD +0 -8
- {tt_perf_report-1.0.3.dist-info → tt_perf_report-1.0.5.dist-info}/LICENSE +0 -0
- {tt_perf_report-1.0.3.dist-info → tt_perf_report-1.0.5.dist-info}/entry_points.txt +0 -0
- {tt_perf_report-1.0.3.dist-info → tt_perf_report-1.0.5.dist-info}/top_level.txt +0 -0
tt_perf_report/perf_report.py
CHANGED
|
@@ -38,6 +38,7 @@ def colored(text, color):
|
|
|
38
38
|
"yellow": "\033[38;5;11m",
|
|
39
39
|
"blue": "\033[38;5;12m",
|
|
40
40
|
"magenta": "\033[38;5;13m",
|
|
41
|
+
"orange": "\033[38;5;208m",
|
|
41
42
|
"cyan": "\033[38;5;14m",
|
|
42
43
|
"white": "\033[38;5;15m",
|
|
43
44
|
"end": "\033[0m",
|
|
@@ -70,7 +71,7 @@ class Cell:
|
|
|
70
71
|
if self.raw_value is None or pd.isna(self.raw_value):
|
|
71
72
|
return ""
|
|
72
73
|
|
|
73
|
-
if isinstance(self.raw_value, str) and "Matmul" in self.raw_value:
|
|
74
|
+
if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value):
|
|
74
75
|
parts = self.raw_value.split(maxsplit=1)
|
|
75
76
|
op_name = parts[0]
|
|
76
77
|
size = parts[1] if len(parts) > 1 else ""
|
|
@@ -275,6 +276,37 @@ def analyze_matmul(row):
|
|
|
275
276
|
core_count, # Return the potentially adjusted core count
|
|
276
277
|
)
|
|
277
278
|
|
|
279
|
+
def analyze_conv(row):
|
|
280
|
+
duration_s = row["DEVICE KERNEL DURATION [ns]"] * 1e-9
|
|
281
|
+
|
|
282
|
+
core_count = 64 # we decided to normalize to the max core count
|
|
283
|
+
math_fidelity = row["MATH FIDELITY"]
|
|
284
|
+
|
|
285
|
+
# Check for DRAM-sharded program config
|
|
286
|
+
attributes = row["ATTRIBUTES"] if pd.notna(row["ATTRIBUTES"]) else ""
|
|
287
|
+
|
|
288
|
+
peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count
|
|
289
|
+
|
|
290
|
+
NHW = int(row["OUTPUT_0_Y"])
|
|
291
|
+
CH_IN = int(row["INPUT_0_X"])
|
|
292
|
+
W = [int(x) for x in (attributes.split("window_hw")[1].split("; ")[0][2:-1].split(";"))]
|
|
293
|
+
CH_OUT = int(row["INPUT_1_X"])
|
|
294
|
+
|
|
295
|
+
M, K, N = NHW, CH_IN * W[0] * W[1], CH_OUT
|
|
296
|
+
flops = (M * K * N * 2) / duration_s
|
|
297
|
+
|
|
298
|
+
size = f"{M} x {K} x {N}"
|
|
299
|
+
memory_info = f"({row['INPUT_0_DATATYPE']} {row['INPUT_0_MEMORY'].replace('DEV_0_', '')} @ {row['INPUT_1_DATATYPE']} {row['INPUT_1_MEMORY'].replace('DEV_0_', '')} => {row['OUTPUT_0_DATATYPE']} {row['OUTPUT_0_MEMORY'].replace('DEV_0_', '')})"
|
|
300
|
+
|
|
301
|
+
flops_percentage = (flops / peak_flops_value) * 100
|
|
302
|
+
|
|
303
|
+
return (
|
|
304
|
+
flops,
|
|
305
|
+
flops_percentage,
|
|
306
|
+
size,
|
|
307
|
+
memory_info,
|
|
308
|
+
math_fidelity
|
|
309
|
+
)
|
|
278
310
|
|
|
279
311
|
def analyze_op(row, prev_row):
|
|
280
312
|
op_code = Cell(row["OP CODE"])
|
|
@@ -305,6 +337,19 @@ def analyze_op(row, prev_row):
|
|
|
305
337
|
input_1_datatype_cell = Cell(input_1_datatype)
|
|
306
338
|
short_name = lambda n: {"BFLOAT16": "BF16", "BFLOAT8_B": "BFP8", "BFLOAT4_B": "BFP4"}.get(n, n)
|
|
307
339
|
|
|
340
|
+
dram_speed = Cell(None, unit="GB/s", decimals=0)
|
|
341
|
+
dram_percentage = Cell(None, unit="%", decimals=1)
|
|
342
|
+
flops = Cell(None, unit="TFLOPs", decimals=1)
|
|
343
|
+
flops_percentage = Cell(None, unit="%", decimals=1)
|
|
344
|
+
|
|
345
|
+
math_fidelity = ""
|
|
346
|
+
math_fidelity += f"{short_name(input_0_datatype)}" if pd.notna(input_0_datatype) else ""
|
|
347
|
+
math_fidelity += f", {short_name(input_1_datatype)}" if pd.notna(input_1_datatype) else ""
|
|
348
|
+
math_fidelity += f" => {short_name(output_datatype)}" if pd.notna(output_datatype) else ""
|
|
349
|
+
math_fidelity_cell = Cell(math_fidelity.strip())
|
|
350
|
+
|
|
351
|
+
is_dram_sharded = False
|
|
352
|
+
|
|
308
353
|
if "Matmul" in op_code.raw_value:
|
|
309
354
|
(
|
|
310
355
|
dram_speed,
|
|
@@ -329,19 +374,24 @@ def analyze_op(row, prev_row):
|
|
|
329
374
|
if math_fidelity
|
|
330
375
|
else None
|
|
331
376
|
)
|
|
332
|
-
|
|
377
|
+
elif "OptimizedConvNew" in op_code.raw_value:
|
|
378
|
+
(
|
|
379
|
+
flops,
|
|
380
|
+
flops_percentage,
|
|
381
|
+
size,
|
|
382
|
+
memory_info,
|
|
383
|
+
math_fidelity,
|
|
384
|
+
) = analyze_conv(row)
|
|
385
|
+
op_code = Cell(f"{op_code.raw_value} {size}")
|
|
333
386
|
dram_speed = Cell(None, unit="GB/s", decimals=0)
|
|
334
387
|
dram_percentage = Cell(None, unit="%", decimals=1)
|
|
335
|
-
flops = Cell(None, unit="TFLOPs", decimals=1)
|
|
336
|
-
flops_percentage = Cell(None, unit="%", decimals=1)
|
|
337
|
-
|
|
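338
|
-
math_fidelity = ""
|
|
339
|
-
math_fidelity += f"{short_name(input_0_datatype)}" if pd.notna(input_0_datatype) else ""
|
|
340
|
-
math_fidelity += f", {short_name(input_1_datatype)}" if pd.notna(input_1_datatype) else ""
|
|
341
|
-
math_fidelity += f" => {short_name(output_datatype)}" if pd.notna(output_datatype) else ""
|
|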
342
|
-
math_fidelity_cell = Cell(math_fidelity.strip())
|
|
343
|
-
|
|
344
|
-
is_dram_sharded = False
|
|
388
|
+
flops = Cell(flops / 1e12 if pd.notna(flops) else None, unit="TFLOPs", decimals=1)
|
|
389
|
+
flops_percentage = Cell(flops_percentage, unit="%", decimals=1)
|
|
390
|
+
math_fidelity_cell = Cell(
|
|
391
|
+
f"{math_fidelity} {short_name(input_0_datatype)} x {short_name(input_1_datatype)} => {short_name(output_datatype)}".strip()
|
|
392
|
+
if math_fidelity
|
|
393
|
+
else None
|
|
394
|
+
)
|
|
345
395
|
|
|
346
396
|
output = {
|
|
347
397
|
"ID": None,
|
|
@@ -434,6 +484,7 @@ def color_row(op_data, percentage, min_percentage):
|
|
|
434
484
|
op_colors = {
|
|
435
485
|
"(torch)": "red",
|
|
436
486
|
"Matmul": "magenta",
|
|
487
|
+
"OptimizedConvNew" : "orange",
|
|
437
488
|
"LayerNorm": "cyan",
|
|
438
489
|
"AllGather": "cyan",
|
|
439
490
|
"AllReduce": "cyan",
|
|
@@ -484,7 +535,8 @@ def color_row(op_data, percentage, min_percentage):
|
|
|
484
535
|
if op_data["Op-to-Op Gap"].raw_value is not None and op_data["Op-to-Op Gap"].raw_value > 6.5:
|
|
485
536
|
op_data["Op-to-Op Gap"].color = "red"
|
|
486
537
|
|
|
487
|
-
if "Matmul" in op_data["OP Code"].raw_value:
|
|
538
|
+
if ("Matmul" in op_data["OP Code"].raw_value
|
|
539
|
+
or "OptimizedConvNew" in op_data["OP Code"].raw_value) and op_data["Math Fidelity"].raw_value:
|
|
488
540
|
math_fidelity = op_data["Math Fidelity"].raw_value.split()[0]
|
|
489
541
|
input_0_datatype = op_data["Input 0 Datatype"].raw_value
|
|
490
542
|
input_1_datatype = op_data["Input 1 Datatype"].raw_value
|
|
@@ -690,8 +742,28 @@ def merge_device_rows(df):
|
|
|
690
742
|
device_ids = sorted(block_by_device.keys())
|
|
691
743
|
merged_blocks = []
|
|
692
744
|
|
|
693
|
-
|
|
694
|
-
|
|
745
|
+
global_index = 0
|
|
746
|
+
while max(len(block_by_device[device_id]) for device_id in device_ids) > 0:
|
|
747
|
+
blocks = []
|
|
748
|
+
op_name = None
|
|
749
|
+
missing_devices = []
|
|
750
|
+
for device_id in device_ids:
|
|
751
|
+
if not len(block_by_device[device_id]):
|
|
752
|
+
print(colored(f"Warning: Device {device_id} is missing operation {op_name} at index {global_index}", "yellow"))
|
|
753
|
+
continue
|
|
754
|
+
if op_name is None:
|
|
755
|
+
op_name = block_by_device[device_id][0][0]
|
|
756
|
+
elif op_name != block_by_device[device_id][0][0]:
|
|
757
|
+
missing_devices.append(device_id)
|
|
758
|
+
continue
|
|
759
|
+
|
|
760
|
+
blocks.append(block_by_device[device_id].pop(0))
|
|
761
|
+
|
|
762
|
+
if missing_devices:
|
|
763
|
+
print(colored(f"Warning: {op_name} at index {global_index} not present in CSV for {len(missing_devices)} devices {missing_devices} - do not trust data for this op or directly subsequent ops with the same name", "yellow"))
|
|
764
|
+
|
|
765
|
+
if not blocks:
|
|
766
|
+
break
|
|
695
767
|
|
|
696
768
|
if "AllGather" in op_name or "ReduceScatter" in op_name:
|
|
697
769
|
# For collective ops, take the row with minimum duration
|
|
@@ -702,6 +774,8 @@ def merge_device_rows(df):
|
|
|
702
774
|
max_duration_block = max(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
|
|
703
775
|
merged_blocks.append(max_duration_block[1])
|
|
704
776
|
|
|
777
|
+
global_index += 1
|
|
778
|
+
|
|
705
779
|
return pd.DataFrame(merged_blocks)
|
|
706
780
|
|
|
707
781
|
|
|
@@ -743,7 +817,7 @@ def filter_by_id_range(rows, id_range):
|
|
|
743
817
|
def main():
|
|
744
818
|
args, id_range = parse_args()
|
|
745
819
|
generate_perf_report(
|
|
746
|
-
args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice, args.tracing_mode
|
|
820
|
+
args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice, args.tracing_mode, args.raw_op_codes,
|
|
747
821
|
)
|
|
748
822
|
|
|
749
823
|
|
|
@@ -765,6 +839,7 @@ def parse_args():
|
|
|
765
839
|
parser.add_argument("--csv", type=str, help="Output filename for CSV format", metavar="OUTPUT_FILE")
|
|
766
840
|
parser.add_argument("--no-advice", action="store_true", help="Only show the table section of the report")
|
|
767
841
|
parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode")
|
|
842
|
+
parser.add_argument("--raw-op-codes", action="store_true", help="Include raw op codes in output")
|
|
768
843
|
args = parser.parse_args()
|
|
769
844
|
|
|
770
845
|
# Set the global color_output variable
|
|
@@ -780,7 +855,7 @@ def parse_args():
|
|
|
780
855
|
return args, id_range
|
|
781
856
|
|
|
782
857
|
|
|
783
|
-
def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice, tracing_mode):
|
|
858
|
+
def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice, tracing_mode, raw_op_codes):
|
|
784
859
|
df = pd.read_csv(csv_file, low_memory=False)
|
|
785
860
|
|
|
786
861
|
# Add a column for original row numbers
|
|
@@ -808,6 +883,8 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
|
|
|
808
883
|
for _, row in df.iterrows():
|
|
809
884
|
op_data, current_gap = analyze_op(row, prev_row)
|
|
810
885
|
op_data["ID"] = Cell(row["ORIGINAL_ROW"]) # Use the original row number
|
|
886
|
+
if raw_op_codes:
|
|
887
|
+
op_data["Raw OP Code"] = Cell(row["OP CODE"])
|
|
811
888
|
rows.append(op_data)
|
|
812
889
|
prev_row = row
|
|
813
890
|
|
|
@@ -856,6 +933,8 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
|
|
|
856
933
|
]
|
|
857
934
|
if not no_advice:
|
|
858
935
|
all_headers.append("Advice")
|
|
936
|
+
if raw_op_codes:
|
|
937
|
+
all_headers.append("Raw OP Code")
|
|
859
938
|
print(colored(f"Writing CSV output to {csv_output_file}", "cyan"))
|
|
860
939
|
with open(csv_output_file, "w") as f:
|
|
861
940
|
csv_writer = csv.DictWriter(f, fieldnames=all_headers)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: tt-perf-report
|
|
3
|
-
Version: 1.0.3
|
|
3
|
+
Version: 1.0.5
|
|
4
4
|
Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
|
|
5
5
|
License: Apache License
|
|
6
6
|
Version 2.0, January 2004
|
|
@@ -216,6 +216,7 @@ Project-URL: Repository, https://github.com/tenstorrent/tt-perf-report
|
|
|
216
216
|
Keywords: tenstorrent,tt-metal
|
|
217
217
|
Description-Content-Type: text/markdown
|
|
218
218
|
License-File: LICENSE
|
|
219
|
+
License-File: LICENSE_understanding.txt
|
|
219
220
|
Requires-Dist: pandas
|
|
220
221
|
|
|
221
222
|
# Performance Report Analysis Tool
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
tt_perf_report/__init__.py,sha256=-j4iFYebIwgdS8uphk8-M6zasRqGBL3CQGnJH9keRuI,92
|
|
2
|
+
tt_perf_report/perf_report.py,sha256=14gQ3UMfd6pSqgbaVAhEO62u9lZrls_dx_XxlAqfrkQ,37876
|
|
3
|
+
tt_perf_report-1.0.5.dist-info/LICENSE,sha256=6dZGjPECz_ULS-sf40FLlt6OmQFcrRvmzG5mJRZCQ5I,11825
|
|
4
|
+
tt_perf_report-1.0.5.dist-info/LICENSE_understanding.txt,sha256=pymi-yb_RvYM9p2ZA4iSNsImcvhDBBxlGuJCY9dTq7M,233
|
|
5
|
+
tt_perf_report-1.0.5.dist-info/METADATA,sha256=dOYnleQMhtLCVE3W0aDQNNd92fouRnpfcb66nodBtag,18345
|
|
6
|
+
tt_perf_report-1.0.5.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
|
7
|
+
tt_perf_report-1.0.5.dist-info/entry_points.txt,sha256=ReAziglcjbAkPbklqheUISkfoEVI5ptlFrBAJTIk5dI,67
|
|
8
|
+
tt_perf_report-1.0.5.dist-info/top_level.txt,sha256=mEQ-BK3rRbmz9QyWitTCLy2xwmC5rmJno_TY_H9s9CE,15
|
|
9
|
+
tt_perf_report-1.0.5.dist-info/RECORD,,
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
tt_perf_report/__init__.py,sha256=-j4iFYebIwgdS8uphk8-M6zasRqGBL3CQGnJH9keRuI,92
|
|
2
|
-
tt_perf_report/perf_report.py,sha256=K_AXtz8ZFFkhLLIoHz2jbuw6aFg1qsJbsHF7kQwL2GI,34620
|
|
3
|
-
tt_perf_report-1.0.3.dist-info/LICENSE,sha256=6dZGjPECz_ULS-sf40FLlt6OmQFcrRvmzG5mJRZCQ5I,11825
|
|
4
|
-
tt_perf_report-1.0.3.dist-info/METADATA,sha256=mCbrFtPNT_MbvLO-Vt7ugY6dS-FhhhrtifOM98PbE6g,18305
|
|
5
|
-
tt_perf_report-1.0.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
6
|
-
tt_perf_report-1.0.3.dist-info/entry_points.txt,sha256=ReAziglcjbAkPbklqheUISkfoEVI5ptlFrBAJTIk5dI,67
|
|
7
|
-
tt_perf_report-1.0.3.dist-info/top_level.txt,sha256=mEQ-BK3rRbmz9QyWitTCLy2xwmC5rmJno_TY_H9s9CE,15
|
|
8
|
-
tt_perf_report-1.0.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|