tt-perf-report 1.0.3__tar.gz → 1.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tt-perf-report might be problematic. Click here for more details.

@@ -0,0 +1,3 @@
1
+ For the avoidance of doubt, this software assists in programming Tenstorrent products.
2
+
3
+ However, making, using, or selling hardware, models, or IP may require the license of rights (such as patent rights) from Tenstorrent or others.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: tt-perf-report
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -216,6 +216,7 @@ Project-URL: Repository, https://github.com/tenstorrent/tt-perf-report
216
216
  Keywords: tenstorrent,tt-metal
217
217
  Description-Content-Type: text/markdown
218
218
  License-File: LICENSE
219
+ License-File: LICENSE_understanding.txt
219
220
  Requires-Dist: pandas
220
221
 
221
222
  # Performance Report Analysis Tool
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "tt-perf-report"
7
- version = "1.0.3"
7
+ version = "1.0.5"
8
8
  description = "This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities."
9
9
  license = {file = "LICENSE"}
10
10
  readme = "README.md"
@@ -38,6 +38,7 @@ def colored(text, color):
38
38
  "yellow": "\033[38;5;11m",
39
39
  "blue": "\033[38;5;12m",
40
40
  "magenta": "\033[38;5;13m",
41
+ "orange": "\033[38;5;208m",
41
42
  "cyan": "\033[38;5;14m",
42
43
  "white": "\033[38;5;15m",
43
44
  "end": "\033[0m",
@@ -70,7 +71,7 @@ class Cell:
70
71
  if self.raw_value is None or pd.isna(self.raw_value):
71
72
  return ""
72
73
 
73
- if isinstance(self.raw_value, str) and "Matmul" in self.raw_value:
74
+ if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value):
74
75
  parts = self.raw_value.split(maxsplit=1)
75
76
  op_name = parts[0]
76
77
  size = parts[1] if len(parts) > 1 else ""
@@ -275,6 +276,37 @@ def analyze_matmul(row):
275
276
  core_count, # Return the potentially adjusted core count
276
277
  )
277
278
 
279
+ def analyze_conv(row):
280
+ duration_s = row["DEVICE KERNEL DURATION [ns]"] * 1e-9
281
+
282
+ core_count = 64 # we decided to normalize to the max core count
283
+ math_fidelity = row["MATH FIDELITY"]
284
+
285
+ # Check for DRAM-sharded program config
286
+ attributes = row["ATTRIBUTES"] if pd.notna(row["ATTRIBUTES"]) else ""
287
+
288
+ peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count
289
+
290
+ NHW = int(row["OUTPUT_0_Y"])
291
+ CH_IN = int(row["INPUT_0_X"])
292
+ W = [int(x) for x in (attributes.split("window_hw")[1].split("; ")[0][2:-1].split(";"))]
293
+ CH_OUT = int(row["INPUT_1_X"])
294
+
295
+ M, K, N = NHW, CH_IN * W[0] * W[1], CH_OUT
296
+ flops = (M * K * N * 2) / duration_s
297
+
298
+ size = f"{M} x {K} x {N}"
299
+ memory_info = f"({row['INPUT_0_DATATYPE']} {row['INPUT_0_MEMORY'].replace('DEV_0_', '')} @ {row['INPUT_1_DATATYPE']} {row['INPUT_1_MEMORY'].replace('DEV_0_', '')} => {row['OUTPUT_0_DATATYPE']} {row['OUTPUT_0_MEMORY'].replace('DEV_0_', '')})"
300
+
301
+ flops_percentage = (flops / peak_flops_value) * 100
302
+
303
+ return (
304
+ flops,
305
+ flops_percentage,
306
+ size,
307
+ memory_info,
308
+ math_fidelity
309
+ )
278
310
 
279
311
  def analyze_op(row, prev_row):
280
312
  op_code = Cell(row["OP CODE"])
@@ -305,6 +337,19 @@ def analyze_op(row, prev_row):
305
337
  input_1_datatype_cell = Cell(input_1_datatype)
306
338
  short_name = lambda n: {"BFLOAT16": "BF16", "BFLOAT8_B": "BFP8", "BFLOAT4_B": "BFP4"}.get(n, n)
307
339
 
340
+ dram_speed = Cell(None, unit="GB/s", decimals=0)
341
+ dram_percentage = Cell(None, unit="%", decimals=1)
342
+ flops = Cell(None, unit="TFLOPs", decimals=1)
343
+ flops_percentage = Cell(None, unit="%", decimals=1)
344
+
345
+ math_fidelity = ""
346
+ math_fidelity += f"{short_name(input_0_datatype)}" if pd.notna(input_0_datatype) else ""
347
+ math_fidelity += f", {short_name(input_1_datatype)}" if pd.notna(input_1_datatype) else ""
348
+ math_fidelity += f" => {short_name(output_datatype)}" if pd.notna(output_datatype) else ""
349
+ math_fidelity_cell = Cell(math_fidelity.strip())
350
+
351
+ is_dram_sharded = False
352
+
308
353
  if "Matmul" in op_code.raw_value:
309
354
  (
310
355
  dram_speed,
@@ -329,19 +374,24 @@ def analyze_op(row, prev_row):
329
374
  if math_fidelity
330
375
  else None
331
376
  )
332
- else:
377
+ elif "OptimizedConvNew" in op_code.raw_value:
378
+ (
379
+ flops,
380
+ flops_percentage,
381
+ size,
382
+ memory_info,
383
+ math_fidelity,
384
+ ) = analyze_conv(row)
385
+ op_code = Cell(f"{op_code.raw_value} {size}")
333
386
  dram_speed = Cell(None, unit="GB/s", decimals=0)
334
387
  dram_percentage = Cell(None, unit="%", decimals=1)
335
- flops = Cell(None, unit="TFLOPs", decimals=1)
336
- flops_percentage = Cell(None, unit="%", decimals=1)
337
-
338
- math_fidelity = ""
339
- math_fidelity += f"{short_name(input_0_datatype)}" if pd.notna(input_0_datatype) else ""
340
- math_fidelity += f", {short_name(input_1_datatype)}" if pd.notna(input_1_datatype) else ""
341
- math_fidelity += f" => {short_name(output_datatype)}" if pd.notna(output_datatype) else ""
342
- math_fidelity_cell = Cell(math_fidelity.strip())
343
-
344
- is_dram_sharded = False
388
+ flops = Cell(flops / 1e12 if pd.notna(flops) else None, unit="TFLOPs", decimals=1)
389
+ flops_percentage = Cell(flops_percentage, unit="%", decimals=1)
390
+ math_fidelity_cell = Cell(
391
+ f"{math_fidelity} {short_name(input_0_datatype)} x {short_name(input_1_datatype)} => {short_name(output_datatype)}".strip()
392
+ if math_fidelity
393
+ else None
394
+ )
345
395
 
346
396
  output = {
347
397
  "ID": None,
@@ -434,6 +484,7 @@ def color_row(op_data, percentage, min_percentage):
434
484
  op_colors = {
435
485
  "(torch)": "red",
436
486
  "Matmul": "magenta",
487
+ "OptimizedConvNew" : "orange",
437
488
  "LayerNorm": "cyan",
438
489
  "AllGather": "cyan",
439
490
  "AllReduce": "cyan",
@@ -484,7 +535,8 @@ def color_row(op_data, percentage, min_percentage):
484
535
  if op_data["Op-to-Op Gap"].raw_value is not None and op_data["Op-to-Op Gap"].raw_value > 6.5:
485
536
  op_data["Op-to-Op Gap"].color = "red"
486
537
 
487
- if "Matmul" in op_data["OP Code"].raw_value and op_data["Math Fidelity"].raw_value:
538
+ if ("Matmul" in op_data["OP Code"].raw_value
539
+ or "OptimizedConvNew" in op_data["OP Code"].raw_value) and op_data["Math Fidelity"].raw_value:
488
540
  math_fidelity = op_data["Math Fidelity"].raw_value.split()[0]
489
541
  input_0_datatype = op_data["Input 0 Datatype"].raw_value
490
542
  input_1_datatype = op_data["Input 1 Datatype"].raw_value
@@ -690,8 +742,28 @@ def merge_device_rows(df):
690
742
  device_ids = sorted(block_by_device.keys())
691
743
  merged_blocks = []
692
744
 
693
- for blocks in zip(*[block_by_device[device_id] for device_id in device_ids]):
694
- op_name = blocks[0][0]
745
+ global_index = 0
746
+ while max(len(block_by_device[device_id]) for device_id in device_ids) > 0:
747
+ blocks = []
748
+ op_name = None
749
+ missing_devices = []
750
+ for device_id in device_ids:
751
+ if not len(block_by_device[device_id]):
752
+ print(colored(f"Warning: Device {device_id} is missing operation {op_name} at index {global_index}", "yellow"))
753
+ continue
754
+ if op_name is None:
755
+ op_name = block_by_device[device_id][0][0]
756
+ elif op_name != block_by_device[device_id][0][0]:
757
+ missing_devices.append(device_id)
758
+ continue
759
+
760
+ blocks.append(block_by_device[device_id].pop(0))
761
+
762
+ if missing_devices:
763
+ print(colored(f"Warning: {op_name} at index {global_index} not present in CSV for {len(missing_devices)} devices {missing_devices} - do not trust data for this op or directly subsequent ops with the same name", "yellow"))
764
+
765
+ if not blocks:
766
+ break
695
767
 
696
768
  if "AllGather" in op_name or "ReduceScatter" in op_name:
697
769
  # For collective ops, take the row with minimum duration
@@ -702,6 +774,8 @@ def merge_device_rows(df):
702
774
  max_duration_block = max(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
703
775
  merged_blocks.append(max_duration_block[1])
704
776
 
777
+ global_index += 1
778
+
705
779
  return pd.DataFrame(merged_blocks)
706
780
 
707
781
 
@@ -743,7 +817,7 @@ def filter_by_id_range(rows, id_range):
743
817
  def main():
744
818
  args, id_range = parse_args()
745
819
  generate_perf_report(
746
- args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice, args.tracing_mode
820
+ args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice, args.tracing_mode, args.raw_op_codes,
747
821
  )
748
822
 
749
823
 
@@ -765,6 +839,7 @@ def parse_args():
765
839
  parser.add_argument("--csv", type=str, help="Output filename for CSV format", metavar="OUTPUT_FILE")
766
840
  parser.add_argument("--no-advice", action="store_true", help="Only show the table section of the report")
767
841
  parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode")
842
+ parser.add_argument("--raw-op-codes", action="store_true", help="Include raw op codes in output")
768
843
  args = parser.parse_args()
769
844
 
770
845
  # Set the global color_output variable
@@ -780,7 +855,7 @@ def parse_args():
780
855
  return args, id_range
781
856
 
782
857
 
783
- def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice, tracing_mode):
858
+ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice, tracing_mode, raw_op_codes):
784
859
  df = pd.read_csv(csv_file, low_memory=False)
785
860
 
786
861
  # Add a column for original row numbers
@@ -808,6 +883,8 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
808
883
  for _, row in df.iterrows():
809
884
  op_data, current_gap = analyze_op(row, prev_row)
810
885
  op_data["ID"] = Cell(row["ORIGINAL_ROW"]) # Use the original row number
886
+ if raw_op_codes:
887
+ op_data["Raw OP Code"] = Cell(row["OP CODE"])
811
888
  rows.append(op_data)
812
889
  prev_row = row
813
890
 
@@ -856,6 +933,8 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
856
933
  ]
857
934
  if not no_advice:
858
935
  all_headers.append("Advice")
936
+ if raw_op_codes:
937
+ all_headers.append("Raw OP Code")
859
938
  print(colored(f"Writing CSV output to {csv_output_file}", "cyan"))
860
939
  with open(csv_output_file, "w") as f:
861
940
  csv_writer = csv.DictWriter(f, fieldnames=all_headers)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: tt-perf-report
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -216,6 +216,7 @@ Project-URL: Repository, https://github.com/tenstorrent/tt-perf-report
216
216
  Keywords: tenstorrent,tt-metal
217
217
  Description-Content-Type: text/markdown
218
218
  License-File: LICENSE
219
+ License-File: LICENSE_understanding.txt
219
220
  Requires-Dist: pandas
220
221
 
221
222
  # Performance Report Analysis Tool
@@ -1,4 +1,5 @@
1
1
  LICENSE
2
+ LICENSE_understanding.txt
2
3
  README.md
3
4
  pyproject.toml
4
5
  src/tt_perf_report/__init__.py
File without changes
File without changes
File without changes