tt-perf-report 1.0.1.tar.gz → 1.0.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tt-perf-report might be problematic; see the registry's advisory page for details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: tt-perf-report
3
- Version: 1.0.1
3
+ Version: 1.0.4
4
4
  Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "tt-perf-report"
7
- version = "1.0.1"
7
+ version = "1.0.4"
8
8
  description = "This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities."
9
9
  license = {file = "LICENSE"}
10
10
  readme = "README.md"
@@ -2,7 +2,7 @@
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
4
  # SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
5
-
5
+ import csv
6
6
  import sys
7
7
  import argparse
8
8
  import re
@@ -38,6 +38,7 @@ def colored(text, color):
38
38
  "yellow": "\033[38;5;11m",
39
39
  "blue": "\033[38;5;12m",
40
40
  "magenta": "\033[38;5;13m",
41
+ "orange": "\033[38;5;208m",
41
42
  "cyan": "\033[38;5;14m",
42
43
  "white": "\033[38;5;15m",
43
44
  "end": "\033[0m",
@@ -70,7 +71,7 @@ class Cell:
70
71
  if self.raw_value is None or pd.isna(self.raw_value):
71
72
  return ""
72
73
 
73
- if isinstance(self.raw_value, str) and "Matmul" in self.raw_value:
74
+ if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value):
74
75
  parts = self.raw_value.split(maxsplit=1)
75
76
  op_name = parts[0]
76
77
  size = parts[1] if len(parts) > 1 else ""
@@ -275,6 +276,37 @@ def analyze_matmul(row):
275
276
  core_count, # Return the potentially adjusted core count
276
277
  )
277
278
 
279
+ def analyze_conv(row):
280
+ duration_s = row["DEVICE KERNEL DURATION [ns]"] * 1e-9
281
+
282
+ core_count = 64 # we decided to normalize to the max core count
283
+ math_fidelity = row["MATH FIDELITY"]
284
+
285
+ # Check for DRAM-sharded program config
286
+ attributes = row["ATTRIBUTES"] if pd.notna(row["ATTRIBUTES"]) else ""
287
+
288
+ peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count
289
+
290
+ NHW = int(row["OUTPUT_0_Y"])
291
+ CH_IN = int(row["INPUT_0_X"])
292
+ W = [int(x) for x in (attributes.split("window_hw")[1].split("; ")[0][2:-1].split(";"))]
293
+ CH_OUT = int(row["INPUT_1_X"])
294
+
295
+ M, K, N = NHW, CH_IN * W[0] * W[1], CH_OUT
296
+ flops = (M * K * N * 2) / duration_s
297
+
298
+ size = f"{M} x {K} x {N}"
299
+ memory_info = f"({row['INPUT_0_DATATYPE']} {row['INPUT_0_MEMORY'].replace('DEV_0_', '')} @ {row['INPUT_1_DATATYPE']} {row['INPUT_1_MEMORY'].replace('DEV_0_', '')} => {row['OUTPUT_0_DATATYPE']} {row['OUTPUT_0_MEMORY'].replace('DEV_0_', '')})"
300
+
301
+ flops_percentage = (flops / peak_flops_value) * 100
302
+
303
+ return (
304
+ flops,
305
+ flops_percentage,
306
+ size,
307
+ memory_info,
308
+ math_fidelity
309
+ )
278
310
 
279
311
  def analyze_op(row, prev_row):
280
312
  op_code = Cell(row["OP CODE"])
@@ -305,6 +337,19 @@ def analyze_op(row, prev_row):
305
337
  input_1_datatype_cell = Cell(input_1_datatype)
306
338
  short_name = lambda n: {"BFLOAT16": "BF16", "BFLOAT8_B": "BFP8", "BFLOAT4_B": "BFP4"}.get(n, n)
307
339
 
340
+ dram_speed = Cell(None, unit="GB/s", decimals=0)
341
+ dram_percentage = Cell(None, unit="%", decimals=1)
342
+ flops = Cell(None, unit="TFLOPs", decimals=1)
343
+ flops_percentage = Cell(None, unit="%", decimals=1)
344
+
345
+ math_fidelity = ""
346
+ math_fidelity += f"{short_name(input_0_datatype)}" if pd.notna(input_0_datatype) else ""
347
+ math_fidelity += f", {short_name(input_1_datatype)}" if pd.notna(input_1_datatype) else ""
348
+ math_fidelity += f" => {short_name(output_datatype)}" if pd.notna(output_datatype) else ""
349
+ math_fidelity_cell = Cell(math_fidelity.strip())
350
+
351
+ is_dram_sharded = False
352
+
308
353
  if "Matmul" in op_code.raw_value:
309
354
  (
310
355
  dram_speed,
@@ -329,19 +374,24 @@ def analyze_op(row, prev_row):
329
374
  if math_fidelity
330
375
  else None
331
376
  )
332
- else:
377
+ elif "OptimizedConvNew" in op_code.raw_value:
378
+ (
379
+ flops,
380
+ flops_percentage,
381
+ size,
382
+ memory_info,
383
+ math_fidelity,
384
+ ) = analyze_conv(row)
385
+ op_code = Cell(f"{op_code.raw_value} {size}")
333
386
  dram_speed = Cell(None, unit="GB/s", decimals=0)
334
387
  dram_percentage = Cell(None, unit="%", decimals=1)
335
- flops = Cell(None, unit="TFLOPs", decimals=1)
336
- flops_percentage = Cell(None, unit="%", decimals=1)
337
-
338
- math_fidelity = ""
339
- math_fidelity += f"{short_name(input_0_datatype)}" if pd.notna(input_0_datatype) else ""
340
- math_fidelity += f", {short_name(input_1_datatype)}" if pd.notna(input_1_datatype) else ""
341
- math_fidelity += f" => {short_name(output_datatype)}" if pd.notna(output_datatype) else ""
342
- math_fidelity_cell = Cell(math_fidelity.strip())
343
-
344
- is_dram_sharded = False
388
+ flops = Cell(flops / 1e12 if pd.notna(flops) else None, unit="TFLOPs", decimals=1)
389
+ flops_percentage = Cell(flops_percentage, unit="%", decimals=1)
390
+ math_fidelity_cell = Cell(
391
+ f"{math_fidelity} {short_name(input_0_datatype)} x {short_name(input_1_datatype)} => {short_name(output_datatype)}".strip()
392
+ if math_fidelity
393
+ else None
394
+ )
345
395
 
346
396
  output = {
347
397
  "ID": None,
@@ -434,6 +484,7 @@ def color_row(op_data, percentage, min_percentage):
434
484
  op_colors = {
435
485
  "(torch)": "red",
436
486
  "Matmul": "magenta",
487
+ "OptimizedConvNew" : "orange",
437
488
  "LayerNorm": "cyan",
438
489
  "AllGather": "cyan",
439
490
  "AllReduce": "cyan",
@@ -484,7 +535,8 @@ def color_row(op_data, percentage, min_percentage):
484
535
  if op_data["Op-to-Op Gap"].raw_value is not None and op_data["Op-to-Op Gap"].raw_value > 6.5:
485
536
  op_data["Op-to-Op Gap"].color = "red"
486
537
 
487
- if "Matmul" in op_data["OP Code"].raw_value and op_data["Math Fidelity"].raw_value:
538
+ if ("Matmul" in op_data["OP Code"].raw_value
539
+ or "OptimizedConvNew" in op_data["OP Code"].raw_value) and op_data["Math Fidelity"].raw_value:
488
540
  math_fidelity = op_data["Math Fidelity"].raw_value.split()[0]
489
541
  input_0_datatype = op_data["Input 0 Datatype"].raw_value
490
542
  input_1_datatype = op_data["Input 1 Datatype"].raw_value
@@ -582,88 +634,98 @@ def print_op_to_op_gap_advice(rows, headers, col_widths):
582
634
  )
583
635
 
584
636
 
637
+ def is_matmul_op(op_data):
638
+ return "Matmul" in op_data["OP Code"].raw_value
639
+
640
+
585
641
  def print_matmul_advice(rows, headers, col_widths):
586
- matmul_ops = [op_data for op_data in rows if "Matmul" in op_data["OP Code"].raw_value]
642
+ matmul_ops = [op_data for op_data in rows if is_matmul_op(op_data)]
587
643
 
588
644
  if matmul_ops:
589
645
  print("Matmul Optimization\n-------------------")
590
646
  for op_data in matmul_ops:
591
647
  print_row(op_data, col_widths, headers)
592
- advice = []
648
+ advice = generate_matmul_advice(op_data)
593
649
  color = "grey" if op_data["OP Code"].color == "grey" else "white"
594
650
 
595
- math_fidelity = (
596
- op_data["Math Fidelity"].raw_value.split()[0] if op_data["Math Fidelity"].raw_value else None
651
+ if advice:
652
+ for item in advice:
653
+ print(colored(f"- {item}", color))
654
+ else:
655
+ print(colored("✅ Optimized", color))
656
+ print() # Add a blank line between matmuls
657
+
658
+
659
+ def generate_matmul_advice(op_data):
660
+ advice = []
661
+
662
+ math_fidelity = (
663
+ op_data["Math Fidelity"].raw_value.split()[0] if op_data["Math Fidelity"].raw_value else None
664
+ )
665
+ output_datatype = op_data["Output Datatype"].raw_value
666
+ input_0_datatype = op_data["Input 0 Datatype"].raw_value
667
+ input_1_datatype = op_data["Input 1 Datatype"].raw_value
668
+ cores = op_data["Cores"].raw_value
669
+ fidelity_evaluation, fidelity_advice = evaluate_fidelity(
670
+ input_0_datatype, input_1_datatype, output_datatype, math_fidelity
671
+ )
672
+
673
+ if op_data["Bound"].raw_value in ["DRAM", "BOTH"]:
674
+ if not op_data["DRAM Sharded"].raw_value:
675
+ advice.append(
676
+ "Try a DRAM-sharded program config (MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig) to improve throughput further"
597
677
  )
598
- output_datatype = op_data["Output Datatype"].raw_value
599
- input_0_datatype = op_data["Input 0 Datatype"].raw_value
600
- input_1_datatype = op_data["Input 1 Datatype"].raw_value
601
- cores = op_data["Cores"].raw_value
602
- fidelity_evaluation, fidelity_advice = evaluate_fidelity(
603
- input_0_datatype, input_1_datatype, output_datatype, math_fidelity
678
+ if fidelity_evaluation == "too_low" and op_data["FLOPs %"].raw_value < 40:
679
+ advice.append(fidelity_advice)
680
+ if fidelity_evaluation == "too_high":
681
+ advice.append(fidelity_advice)
682
+ elif op_data["Bound"].raw_value in ["FLOP", "BOTH"]:
683
+ if cores < 64:
684
+ advice.append(f"Increase grid size (currently using {cores})")
685
+ if fidelity_evaluation == "too_high":
686
+ advice.append(fidelity_advice)
687
+ elif op_data["Bound"].raw_value == "SLOW":
688
+ input_0_memory = op_data["Input 0 Memory"].raw_value
689
+ if input_0_memory and "L1" not in input_0_memory:
690
+ advice.append(f"If possible place input 0 in L1 (currently in {input_0_memory})")
691
+
692
+ inner_dim_block = op_data["Inner Dim Block Size"].raw_value
693
+ out_h = op_data["Output Subblock H"].raw_value
694
+ out_w = op_data["Output Subblock W"].raw_value
695
+
696
+ if inner_dim_block is None and out_h is None and out_w is None:
697
+ advice.append(
698
+ "No program_config specified, try using one to override in0_block_w and out_subblock_h/w"
604
699
  )
700
+ else:
701
+ all_good = True
702
+ if inner_dim_block is not None:
703
+ if inner_dim_block < 2:
704
+ advice.append(f"in0_block_w={inner_dim_block} is small, try in0_block_w=2 or above")
705
+ all_good = False
706
+ else:
707
+ advice.append("No inner dim block size found")
708
+ all_good = False
605
709
 
606
- if op_data["Bound"].raw_value in ["DRAM", "BOTH"]:
607
- if not op_data["DRAM Sharded"].raw_value:
608
- advice.append(
609
- "- Try a DRAM-sharded program config (MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig) to improve throughput further"
610
- )
611
- if fidelity_evaluation == "too_low" and op_data["FLOPs %"].raw_value < 40:
612
- advice.append(f"- {fidelity_advice}")
613
- if fidelity_evaluation == "too_high":
614
- advice.append(f"- {fidelity_advice}")
615
- elif op_data["Bound"].raw_value in ["FLOP", "BOTH"]:
616
- if cores < 64:
617
- advice.append(f"- Increase grid size (currently using {cores})")
618
- if fidelity_evaluation == "too_high":
619
- advice.append(f"- {fidelity_advice}")
620
- elif op_data["Bound"].raw_value == "SLOW":
621
- input_0_memory = op_data["Input 0 Memory"].raw_value
622
- if input_0_memory and "L1" not in input_0_memory:
623
- advice.append(f"- If possible place input 0 in L1 (currently in {input_0_memory})")
624
-
625
- inner_dim_block = op_data["Inner Dim Block Size"].raw_value
626
- out_h = op_data["Output Subblock H"].raw_value
627
- out_w = op_data["Output Subblock W"].raw_value
628
-
629
- if inner_dim_block is None and out_h is None and out_w is None:
710
+ if out_h is not None and out_w is not None:
711
+ out_area = out_h * out_w
712
+ if out_area < 2:
630
713
  advice.append(
631
- "- No program_config specified, try using one to override in0_block_w and out_subblock_h/w"
714
+ f"Output subblock {out_h}x{out_w} is small, try out_subblock_h * out_subblock_w >= 2 if possible"
632
715
  )
633
- else:
634
- all_good = True
635
- if inner_dim_block is not None:
636
- if inner_dim_block < 2:
637
- advice.append(f"- in0_block_w={inner_dim_block} is small, try in0_block_w=2 or above")
638
- all_good = False
639
- else:
640
- advice.append("- No inner dim block size found")
641
- all_good = False
642
-
643
- if out_h is not None and out_w is not None:
644
- out_area = out_h * out_w
645
- if out_area < 2:
646
- advice.append(
647
- f"- Output subblock {out_h}x{out_w} is small, try out_subblock_h * out_subblock_w >= 2 if possible"
648
- )
649
- all_good = False
650
- else:
651
- advice.append("- No output subblock size found")
652
- all_good = False
653
-
654
- if all_good:
655
- advice.append(
656
- f"- in0_block_w={inner_dim_block} and output subblock {out_h}x{out_w} look good 🤷"
657
- )
658
- if fidelity_advice:
659
- advice.append(f"- {fidelity_advice}")
660
-
661
- if advice:
662
- for item in advice:
663
- print(colored(item, color))
716
+ all_good = False
664
717
  else:
665
- print(colored(" Optimized", color))
666
- print() # Add a blank line between matmuls
718
+ advice.append("No output subblock size found")
719
+ all_good = False
720
+
721
+ if all_good:
722
+ advice.append(
723
+ f"in0_block_w={inner_dim_block} and output subblock {out_h}x{out_w} look good 🤷"
724
+ )
725
+ if fidelity_advice:
726
+ advice.append(fidelity_advice)
727
+
728
+ return advice
667
729
 
668
730
 
669
731
  def merge_device_rows(df):
@@ -733,7 +795,7 @@ def filter_by_id_range(rows, id_range):
733
795
  def main():
734
796
  args, id_range = parse_args()
735
797
  generate_perf_report(
736
- args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice, args.tracing_mode
798
+ args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice, args.tracing_mode, args.raw_op_codes,
737
799
  )
738
800
 
739
801
 
@@ -755,6 +817,7 @@ def parse_args():
755
817
  parser.add_argument("--csv", type=str, help="Output filename for CSV format", metavar="OUTPUT_FILE")
756
818
  parser.add_argument("--no-advice", action="store_true", help="Only show the table section of the report")
757
819
  parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode")
820
+ parser.add_argument("--raw-op-codes", action="store_true", help="Include raw op codes in output")
758
821
  args = parser.parse_args()
759
822
 
760
823
  # Set the global color_output variable
@@ -770,7 +833,7 @@ def parse_args():
770
833
  return args, id_range
771
834
 
772
835
 
773
- def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice, tracing_mode):
836
+ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice, tracing_mode, raw_op_codes):
774
837
  df = pd.read_csv(csv_file, low_memory=False)
775
838
 
776
839
  # Add a column for original row numbers
@@ -798,6 +861,8 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
798
861
  for _, row in df.iterrows():
799
862
  op_data, current_gap = analyze_op(row, prev_row)
800
863
  op_data["ID"] = Cell(row["ORIGINAL_ROW"]) # Use the original row number
864
+ if raw_op_codes:
865
+ op_data["Raw OP Code"] = Cell(row["OP CODE"])
801
866
  rows.append(op_data)
802
867
  prev_row = row
803
868
 
@@ -844,11 +909,20 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
844
909
  "Output Subblock H",
845
910
  "Output Subblock W",
846
911
  ]
912
+ if not no_advice:
913
+ all_headers.append("Advice")
914
+ if raw_op_codes:
915
+ all_headers.append("Raw OP Code")
847
916
  print(colored(f"Writing CSV output to {csv_output_file}", "cyan"))
848
917
  with open(csv_output_file, "w") as f:
849
- f.write(",".join(all_headers) + "\n")
918
+ csv_writer = csv.DictWriter(f, fieldnames=all_headers)
919
+ csv_writer.writeheader()
850
920
  for op_data in rows:
851
- f.write(",".join(str(op_data[header].raw_value) for header in all_headers) + "\n")
921
+ row = {header: op_data[header].raw_value for header in all_headers if header in op_data}
922
+ if not no_advice:
923
+ advice = generate_matmul_advice(op_data) if is_matmul_op(op_data) else ""
924
+ row["Advice"] = " • ".join(advice)
925
+ csv_writer.writerow(row)
852
926
  else:
853
927
  col_widths = [
854
928
  max(max(visible_length(str(row[header])) for row in rows), visible_length(header))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: tt-perf-report
3
- Version: 1.0.1
3
+ Version: 1.0.4
4
4
  Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
File without changes
File without changes
File without changes