tt-perf-report 1.0.6__tar.gz → 1.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tt-perf-report might be problematic. See the registry's advisory page for more details.

@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: tt-perf-report
3
- Version: 1.0.6
3
+ Version: 1.0.7
4
4
  Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -218,6 +218,7 @@ Description-Content-Type: text/markdown
218
218
  License-File: LICENSE
219
219
  License-File: LICENSE_understanding.txt
220
220
  Requires-Dist: pandas
221
+ Dynamic: license-file
221
222
 
222
223
  # Performance Report Analysis Tool
223
224
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "tt-perf-report"
7
- version = "1.0.6"
7
+ version = "1.0.7"
8
8
  description = "This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities."
9
9
  license = {file = "LICENSE"}
10
10
  readme = "README.md"
@@ -71,7 +71,7 @@ class Cell:
71
71
  if self.raw_value is None or pd.isna(self.raw_value):
72
72
  return ""
73
73
 
74
- if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value):
74
+ if isinstance(self.raw_value, str) and ("Matmul" in self.raw_value or "OptimizedConvNew" in self.raw_value or "HaloDeviceOperation" in self.raw_value):
75
75
  parts = self.raw_value.split(maxsplit=1)
76
76
  op_name = parts[0]
77
77
  size = parts[1] if len(parts) > 1 else ""
@@ -276,6 +276,42 @@ def analyze_matmul(row):
276
276
  core_count, # Return the potentially adjusted core count
277
277
  )
278
278
 
279
+ def analyze_halo(row):
280
+ attributes = row["ATTRIBUTES"] if pd.notna(row["ATTRIBUTES"]) else ""
281
+
282
+ try:
283
+ window_hw = attributes.split("window_hw=")[1].split(";")[0:2]
284
+ window_hw = ",".join(window_hw[0:2])
285
+ except (IndexError, AttributeError):
286
+ window_hw = "x"
287
+
288
+ try:
289
+ stride_hw = attributes.split("stride_hw=")[1].split(";")[0:2]
290
+ stride_hw = ",".join(stride_hw[0:2])
291
+ except (IndexError, AttributeError):
292
+ stride_hw = "x"
293
+
294
+ try:
295
+ pad_hw = attributes.split("padding=")[1].split(";")[0:4]
296
+ pad_hw = ",".join(pad_hw[0:4])
297
+ except (IndexError, AttributeError):
298
+ pad_hw = "x"
299
+
300
+ try:
301
+ dilation_hw = attributes.split("dilation_hw=")[1].split(";")[0:2]
302
+ dilation_hw = ",".join(dilation_hw[0:2])
303
+ except (IndexError, AttributeError):
304
+ dilation_hw = "x"
305
+
306
+ try:
307
+ memory_layout = attributes.split("memory_layout=")[1].split(";")[0].split("::")[1]
308
+ except (IndexError, AttributeError):
309
+ memory_layout = "x"
310
+
311
+ config = f"w={window_hw} s={stride_hw} p={pad_hw} d={dilation_hw} | {memory_layout}"
312
+
313
+ return config
314
+
279
315
  def analyze_conv(row):
280
316
  duration_s = row["DEVICE KERNEL DURATION [ns]"] * 1e-9
281
317
 
@@ -300,12 +336,40 @@ def analyze_conv(row):
300
336
 
301
337
  flops_percentage = (flops / peak_flops_value) * 100
302
338
 
339
+ try:
340
+ act_block_h_ntiles = int(attributes.split("act_block_h_ntiles")[1][1:].split(";")[0])
341
+ except (IndexError, ValueError):
342
+ act_block_h_ntiles = "x"
343
+
344
+ try:
345
+ enable_act_double_buffer = "true" == attributes.split("enable_act_double_buffer': '")[1].split("'")[0]
346
+ except (IndexError, ValueError):
347
+ enable_act_double_buffer = "x"
348
+
349
+ try:
350
+ enable_split_reader = "true" == attributes.split("enable_split_reader': '")[1].split("'")[0]
351
+ except (IndexError, ValueError):
352
+ enable_split_reader = "x"
353
+
354
+ try:
355
+ per_core_out_matrix_height_ntile = int(attributes.split("per_core_out_matrix_height_ntile")[1][1:].split(";")[0])
356
+ except (IndexError, ValueError):
357
+ per_core_out_matrix_height_ntile = "x"
358
+
359
+ config = f"[ABH={per_core_out_matrix_height_ntile}|{act_block_h_ntiles}"
360
+ if (enable_act_double_buffer):
361
+ config += " ADB"
362
+ if (enable_split_reader):
363
+ config += " SR"
364
+ config += "]"
365
+
303
366
  return (
304
367
  flops,
305
368
  flops_percentage,
306
369
  size,
307
370
  memory_info,
308
- math_fidelity
371
+ math_fidelity,
372
+ config,
309
373
  )
310
374
 
311
375
  def analyze_op(row, prev_row):
@@ -381,8 +445,9 @@ def analyze_op(row, prev_row):
381
445
  size,
382
446
  memory_info,
383
447
  math_fidelity,
448
+ config,
384
449
  ) = analyze_conv(row)
385
- op_code = Cell(f"{op_code.raw_value} {size}")
450
+ op_code = Cell(f"{op_code.raw_value} {size} {config}")
386
451
  dram_speed = Cell(None, unit="GB/s", decimals=0)
387
452
  dram_percentage = Cell(None, unit="%", decimals=1)
388
453
  flops = Cell(flops / 1e12 if pd.notna(flops) else None, unit="TFLOPs", decimals=1)
@@ -392,6 +457,13 @@ def analyze_op(row, prev_row):
392
457
  if math_fidelity
393
458
  else None
394
459
  )
460
+ elif "HaloDeviceOperation" in op_code.raw_value:
461
+ config = analyze_halo(row)
462
+ op_code = Cell(f"{op_code.raw_value} {config}")
463
+ dram_speed = Cell(None, unit="GB/s", decimals=0)
464
+ dram_percentage = Cell(None, unit="%", decimals=1)
465
+ flops = Cell(None, unit="TFLOPs", decimals=1)
466
+ flops_percentage = Cell(None, unit="%", decimals=1)
395
467
 
396
468
  output = {
397
469
  "ID": None,
@@ -765,10 +837,17 @@ def merge_device_rows(df):
765
837
  if not blocks:
766
838
  break
767
839
 
768
- if "AllGather" in op_name or "ReduceScatter" in op_name:
769
- # For collective ops, take the row with minimum duration
770
- min_duration_block = min(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
771
- merged_blocks.append(min_duration_block[1])
840
+ if "AllGather" in op_name or "ReduceScatter" in op_name or "AllReduce" in op_name:
841
+ # For collective ops, take the average duration over all rows within a block
842
+ device_kernel_durations = [d["DEVICE KERNEL DURATION [ns]"]
843
+ for _, d in blocks
844
+ if pd.notna(d["DEVICE KERNEL DURATION [ns]"])]
845
+ # Use the first block's data but update its duration with the average
846
+ base_block = blocks[0][1].copy()
847
+ base_block["DEVICE KERNEL DURATION [ns]"] = (
848
+ sum(device_kernel_durations) / len(device_kernel_durations) if device_kernel_durations else float("nan")
849
+ )
850
+ merged_blocks.append(base_block)
772
851
  else:
773
852
  # For non-collective ops, take the row with maximum duration
774
853
  max_duration_block = max(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"])
@@ -891,6 +970,7 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
891
970
  for _, row in df.iterrows():
892
971
  op_data, current_gap = analyze_op(row, prev_row)
893
972
  op_data["ID"] = Cell(row["ORIGINAL_ROW"]) # Use the original row number
973
+ op_data["Global Call Count"] = Cell(row["GLOBAL CALL COUNT"])
894
974
  if raw_op_codes:
895
975
  op_data["Raw OP Code"] = Cell(row["OP CODE"])
896
976
  rows.append(op_data)
@@ -941,6 +1021,7 @@ def generate_perf_report(csv_file, signpost, ignore_signposts, min_percentage,
941
1021
  "Inner Dim Block Size",
942
1022
  "Output Subblock H",
943
1023
  "Output Subblock W",
1024
+ "Global Call Count",
944
1025
  ]
945
1026
  if not no_advice:
946
1027
  all_headers.append("Advice")
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: tt-perf-report
3
- Version: 1.0.6
3
+ Version: 1.0.7
4
4
  Summary: This tool analyzes performance traces from TT-Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -218,6 +218,7 @@ Description-Content-Type: text/markdown
218
218
  License-File: LICENSE
219
219
  License-File: LICENSE_understanding.txt
220
220
  Requires-Dist: pandas
221
+ Dynamic: license-file
221
222
 
222
223
  # Performance Report Analysis Tool
223
224
 
File without changes
File without changes
File without changes