tt-perf-report 1.0.5__tar.gz → 1.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tt-perf-report might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: tt-perf-report
3
- Version: 1.0.5
3
+ Version: 1.0.7
4
4
  Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -218,6 +218,7 @@ Description-Content-Type: text/markdown
218
218
  License-File: LICENSE
219
219
  License-File: LICENSE_understanding.txt
220
220
  Requires-Dist: pandas
221
+ Dynamic: license-file
221
222
 
222
223
  # Performance Report Analysis Tool
223
224
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "tt-perf-report"
7
- version = "1.0.5"
7
+ version = "1.0.7"
8
8
  description = "This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities."
9
9
  license = {file = "LICENSE"}
10
10
  readme = "README.md"
@@ -71,7 +71,7 @@ class Cell:
71
71
  if self.raw_value is None or pd.isna(self.raw_value):
72
72
  return ""
73
73
 
74
- if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value):
74
+ if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value or "HaloDeviceOperation" in self.raw_value):
75
75
  parts = self.raw_value.split(maxsplit=1)
76
76
  op_name = parts[0]
77
77
  size = parts[1] if len(parts) > 1 else ""
@@ -276,6 +276,42 @@ def analyze_matmul(row):
276
276
  core_count, # Return the potentially adjusted core count
277
277
  )
278
278
 
279
def _halo_attr_values(attributes, key, count):
    """Return up to *count* ';'-separated values that follow *key*, joined by ','.

    Returns the placeholder "x" when *key* does not occur in *attributes*
    (IndexError on the split result) or when *attributes* has no ``split``
    method, i.e. it is not a string (AttributeError).
    """
    try:
        tail = attributes.split(key)[1]
    except (IndexError, AttributeError):
        return "x"
    return ",".join(tail.split(";")[:count])


def analyze_halo(row):
    """Summarize the halo-related fields of a row's "ATTRIBUTES" value.

    Args:
        row: Mapping-like object providing an "ATTRIBUTES" entry. A missing
            value (anything ``pd.notna`` rejects) is treated as an empty string.

    Returns:
        A string of the form ``"w=<window_hw> s=<stride_hw> p=<padding>
        d=<dilation_hw> | <memory_layout>"`` (on a single line). Every field
        that cannot be extracted is rendered as "x".
    """
    raw_attributes = row["ATTRIBUTES"]
    attributes = raw_attributes if pd.notna(raw_attributes) else ""

    # Each field is the text after "<key>=": two ';'-separated values for
    # window/stride/dilation, four for padding.
    window_hw = _halo_attr_values(attributes, "window_hw=", 2)
    stride_hw = _halo_attr_values(attributes, "stride_hw=", 2)
    pad_hw = _halo_attr_values(attributes, "padding=", 4)
    dilation_hw = _halo_attr_values(attributes, "dilation_hw=", 2)

    # The memory layout keeps only the part after "::" of the first value
    # following "memory_layout="; a value without "::" yields "x".
    try:
        memory_layout = attributes.split("memory_layout=")[1].split(";")[0].split("::")[1]
    except (IndexError, AttributeError):
        memory_layout = "x"

    return f"w={window_hw} s={stride_hw} p={pad_hw} d={dilation_hw} | {memory_layout}"
314
+
279
315
  def analyze_conv(row):
280
316
  duration_s = row["DEVICE KERNEL DURATION [ns]"] * 1e-9
281
317
 
@@ -300,12 +336,40 @@ def analyze_conv(row):
300
336
 
301
337
  flops_percentage = (flops / peak_flops_value) * 100
302
338
 
339
+ try:
340
+ act_block_h_ntiles = int(attributes.split("act_block_h_ntiles")[1][1:].split(";")[0])
341
+ except (IndexError, ValueError):
342
+ act_block_h_ntiles = "x"
343
+
344
+ try:
345
+ enable_act_double_buffer = "true" == attributes.split("enable_act_double_buffer': '")[1].split("'")[0]
346
+ except (IndexError, ValueError):
347
+ enable_act_double_buffer = "x"
348
+
349
+ try:
350
+ enable_split_reader = "true" == attributes.split("enable_split_reader': '")[1].split("'")[0]
351
+ except (IndexError, ValueError):
352
+ enable_split_reader = "x"
353
+
354
+ try:
355
+ per_core_out_matrix_height_ntile = int(attributes.split("per_core_out_matrix_height_ntile")[1][1:].split(";")[0])
356
+ except (IndexError, ValueError):
357
+ per_core_out_matrix_height_ntile = "x"
358
+
359
+ config = f"[ABH={per_core_out_matrix_height_ntile}|{act_block_h_ntiles}"
360
+ if (enable_act_double_buffer):
361
+ config += " ADB"
362
+ if (enable_split_reader):
363
+ config += " SR"
364
+ config += "]"
365
+
303
366
  return (
304
367
  flops,
305
368
  flops_percentage,
306
369
  size,
307
370
  memory_info,
308
- math_fidelity
371
+ math_fidelity,
372
+ config,
309
373
  )
310
374
 
311
375
  def analyze_op(row, prev_row):
@@ -381,8 +445,9 @@ def analyze_op(row, prev_row):
381
445
  size,
382
446
  memory_info,
383
447
  math_fidelity,
448
+ config,
384
449
  ) = analyze_conv(row)
385
- op_code = Cell(f"{op_code.raw_value} {size}")
450
+ op_code = Cell(f"{op_code.raw_value} {size} {config}")
386
451
  dram_speed = Cell(None, unit="GB/s", decimals=0)
387
452
  dram_percentage = Cell(None, unit="%", decimals=1)
388
453
  flops = Cell(flops / 1e12 if pd.notna(flops) else None, unit="TFLOPs", decimals=1)
@@ -392,6 +457,13 @@ def analyze_op(row, prev_row):
392
457
  if math_fidelity
393
458
  else None
394
459
  )
460
+ elif "HaloDeviceOperation" in op_code.raw_value:
461
+ config = analyze_halo(row)
462
+ op_code = Cell(f"{op_code.raw_value} {config}")
463
+ dram_speed = Cell(None, unit="GB/s", decimals=0)
464
+ dram_percentage = Cell(None, unit="%", decimals=1)
465
+ flops = Cell(None, unit="TFLOPs", decimals=1)
466
+ flops_percentage = Cell(None, unit="%", decimals=1)
395
467
 
396
468
  output = {
397
469
  "ID": None,
@@ -765,10 +837,17 @@ def merge_device_rows(df):
765
837
  if not blocks:
766
838
  break
767
839
 
768
- if "AllGather" in op_name or "ReduceScatter" in op_name:
769
- # For collective ops, take the row with minimum duration
770
- min_duration_block = min(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
771
- merged_blocks.append(min_duration_block[1])
840
+ if "AllGather" in op_name or "ReduceScatter" in op_name or "AllReduce" in op_name:
841
+ # For collective ops, take the average duration over all rows within a block
842
+ device_kernel_durations = [d["DEVICE KERNEL DURATION [ns]"]
843
+ for _, d in blocks
844
+ if pd.notna(d["DEVICE KERNEL DURATION [ns]"])]
845
+ # Use the first block's data but update its duration with the average
846
+ base_block = blocks[0][1].copy()
847
+ base_block["DEVICE KERNEL DURATION [ns]"] = (
848
+ sum(device_kernel_durations) / len(device_kernel_durations) if device_kernel_durations else float("nan")
849
+ )
850
+ merged_blocks.append(base_block)
772
851
  else:
773
852
  # For non-collective ops, take the row with maximum duration
774
853
  max_duration_block = max(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
@@ -814,11 +893,15 @@ def filter_by_id_range(rows, id_range):
814
893
  return rows
815
894
 
816
895
 
896
def filter_host_ops(rows):
    """Return a new list with the entries of *rows* that are not host ops.

    An entry is dropped when ``is_host_op`` (defined elsewhere in this file)
    returns a truthy value for it; the order of the remaining entries is kept.
    """
    device_rows = []
    for candidate in rows:
        if is_host_op(candidate):
            continue
        device_rows.append(candidate)
    return device_rows
898
+
899
+
817
900
  def main():
818
901
  args, id_range = parse_args()
819
902
  generate_perf_report(
820
- args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice, args.tracing_mode, args.raw_op_codes,
821
- )
903
+ args.csv_file, args.signpost, args.ignore_signposts, args.min_percentage, id_range, args.csv, args.no_advice,
904
+ args.tracing_mode, args.raw_op_codes, args.no_host_ops)
822
905
 
823
906
 
824
907
  def parse_args():
@@ -840,6 +923,8 @@ def parse_args():
840
923
  parser.add_argument("--no-advice", action="store_true", help="Only show the table section of the report")
841
924
  parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode")
842
925
  parser.add_argument("--raw-op-codes", action="store_true", help="Include raw op codes in output")
926
+ parser.add_argument("--no-host-ops", action="store_true", help="Do not include host ops in output")
927
+
843
928
  args = parser.parse_args()
844
929
 
845
930
  # Set the global color_output variable
@@ -855,7 +940,9 @@ def parse_args():
855
940
  return args, id_range
856
941
 
857
942
 
858
- def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice, tracing_mode, raw_op_codes):
943
+ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
944
+ id_range, csv_output_file, no_advice, tracing_mode,
945
+ raw_op_codes, no_host_ops):
859
946
  df = pd.read_csv(csv_file, low_memory=False)
860
947
 
861
948
  # Add a column for original row numbers
@@ -883,13 +970,14 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
883
970
  for _, row in df.iterrows():
884
971
  op_data, current_gap = analyze_op(row, prev_row)
885
972
  op_data["ID"] = Cell(row["ORIGINAL_ROW"]) # Use the original row number
973
+ op_data["Global Call Count"] = Cell(row["GLOBAL CALL COUNT"])
886
974
  if raw_op_codes:
887
975
  op_data["Raw OP Code"] = Cell(row["OP CODE"])
888
976
  rows.append(op_data)
889
977
  prev_row = row
890
978
 
891
979
  # Count device and host ops
892
- if "(torch)" in op_data["OP Code"].raw_value:
980
+ if is_host_op(op_data):
893
981
  host_ops += 1
894
982
  else:
895
983
  device_ops += 1
@@ -900,6 +988,9 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
900
988
  # Filter rows based on id_range
901
989
  rows = filter_by_id_range(rows, id_range)
902
990
 
991
+ if no_host_ops:
992
+ rows = filter_host_ops(rows)
993
+
903
994
  # Recalculate derived columns after filtering
904
995
  add_derived_columns(rows)
905
996
 
@@ -930,6 +1021,7 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
930
1021
  "Inner Dim Block Size",
931
1022
  "Output Subblock H",
932
1023
  "Output Subblock W",
1024
+ "Global Call Count",
933
1025
  ]
934
1026
  if not no_advice:
935
1027
  all_headers.append("Advice")
@@ -955,5 +1047,9 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage, i
955
1047
  print_advice_section(rows, visible_headers, col_widths)
956
1048
 
957
1049
 
1050
def is_host_op(op_data):
    """Return True when the row's "OP Code" cell contains the "(torch)" marker.

    Args:
        op_data: Mapping with an "OP Code" entry whose ``raw_value`` attribute
            holds the op code.

    Returns:
        True if ``raw_value`` is a string containing "(torch)", else False.
    """
    raw_op_code = op_data["OP Code"].raw_value
    # A non-string raw value (e.g. None or NaN) cannot carry the marker; report
    # it as a non-host op instead of letting the `in` test raise TypeError.
    return isinstance(raw_op_code, str) and "(torch)" in raw_op_code
1052
+
1053
+
958
1054
  if __name__ == "__main__":
959
1055
  main()
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: tt-perf-report
3
- Version: 1.0.5
3
+ Version: 1.0.7
4
4
  Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -218,6 +218,7 @@ Description-Content-Type: text/markdown
218
218
  License-File: LICENSE
219
219
  License-File: LICENSE_understanding.txt
220
220
  Requires-Dist: pandas
221
+ Dynamic: license-file
221
222
 
222
223
  # Performance Report Analysis Tool
223
224
 
File without changes
File without changes
File without changes