carm-roofline 1.0.0.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. carm_roofline-1.0.0.dev0/GUI_utils.py +392 -0
  2. carm_roofline-1.0.0.dev0/PKG-INFO +227 -0
  3. carm_roofline-1.0.0.dev0/README.md +184 -0
  4. carm_roofline-1.0.0.dev0/ResultsGUI.py +3715 -0
  5. carm_roofline-1.0.0.dev0/architecture/__init__.py +22 -0
  6. carm_roofline-1.0.0.dev0/architecture/architecture.py +299 -0
  7. carm_roofline-1.0.0.dev0/architecture/arm.py +20 -0
  8. carm_roofline-1.0.0.dev0/architecture/config.py +255 -0
  9. carm_roofline-1.0.0.dev0/architecture/detect.py +468 -0
  10. carm_roofline-1.0.0.dev0/architecture/frequency.py +52 -0
  11. carm_roofline-1.0.0.dev0/architecture/memory.py +585 -0
  12. carm_roofline-1.0.0.dev0/architecture/riscv.py +71 -0
  13. carm_roofline-1.0.0.dev0/architecture/tests/arm/features.c +41 -0
  14. carm_roofline-1.0.0.dev0/architecture/tests/arm/frequency.h +86 -0
  15. carm_roofline-1.0.0.dev0/architecture/tests/arm/vlen.c +18 -0
  16. carm_roofline-1.0.0.dev0/architecture/tests/frequency.c +137 -0
  17. carm_roofline-1.0.0.dev0/architecture/tests/riscv/frequency.h +89 -0
  18. carm_roofline-1.0.0.dev0/architecture/tests/riscv/rvv_version.c +27 -0
  19. carm_roofline-1.0.0.dev0/architecture/tests/riscv/vlen.c +20 -0
  20. carm_roofline-1.0.0.dev0/architecture/tests/x86/cache.c +83 -0
  21. carm_roofline-1.0.0.dev0/architecture/tests/x86/features.c +47 -0
  22. carm_roofline-1.0.0.dev0/architecture/tests/x86/frequency.h +127 -0
  23. carm_roofline-1.0.0.dev0/architecture/tests/x86/x86_avx512/frequency.h +129 -0
  24. carm_roofline-1.0.0.dev0/architecture/x86.py +29 -0
  25. carm_roofline-1.0.0.dev0/arguments.py +142 -0
  26. carm_roofline-1.0.0.dev0/assets/CARM_icon.svg +4 -0
  27. carm_roofline-1.0.0.dev0/assets/CHAMP_logo.svg +44 -0
  28. carm_roofline-1.0.0.dev0/assets/menu_icon.png +0 -0
  29. carm_roofline-1.0.0.dev0/assets/style.css +3 -0
  30. carm_roofline-1.0.0.dev0/benchmark/__init__.py +44 -0
  31. carm_roofline-1.0.0.dev0/benchmark/benchmark.py +182 -0
  32. carm_roofline-1.0.0.dev0/benchmark/benchmarking.py +223 -0
  33. carm_roofline-1.0.0.dev0/benchmark/generation/__init__.py +51 -0
  34. carm_roofline-1.0.0.dev0/benchmark/generation/arm.py +154 -0
  35. carm_roofline-1.0.0.dev0/benchmark/generation/code_gen/__init__.py +17 -0
  36. carm_roofline-1.0.0.dev0/benchmark/generation/code_gen/data_type.py +40 -0
  37. carm_roofline-1.0.0.dev0/benchmark/generation/code_gen/instruction.py +411 -0
  38. carm_roofline-1.0.0.dev0/benchmark/generation/code_gen/operation.py +44 -0
  39. carm_roofline-1.0.0.dev0/benchmark/generation/code_gen/register.py +150 -0
  40. carm_roofline-1.0.0.dev0/benchmark/generation/isa.py +742 -0
  41. carm_roofline-1.0.0.dev0/benchmark/generation/parameters.py +90 -0
  42. carm_roofline-1.0.0.dev0/benchmark/generation/riscv.py +211 -0
  43. carm_roofline-1.0.0.dev0/benchmark/generation/x86.py +206 -0
  44. carm_roofline-1.0.0.dev0/benchmark/interface.py +162 -0
  45. carm_roofline-1.0.0.dev0/benchmark/output/__init__.py +50 -0
  46. carm_roofline-1.0.0.dev0/benchmark/output/arithmetic.py +271 -0
  47. carm_roofline-1.0.0.dev0/benchmark/output/base.py +66 -0
  48. carm_roofline-1.0.0.dev0/benchmark/output/common.py +116 -0
  49. carm_roofline-1.0.0.dev0/benchmark/output/memory.py +227 -0
  50. carm_roofline-1.0.0.dev0/benchmark/output/memory_sweep.py +280 -0
  51. carm_roofline-1.0.0.dev0/benchmark/output/mixed.py +137 -0
  52. carm_roofline-1.0.0.dev0/benchmark/output/roofline.py +382 -0
  53. carm_roofline-1.0.0.dev0/benchmark/result.py +90 -0
  54. carm_roofline-1.0.0.dev0/benchmark/suites/__init__.py +17 -0
  55. carm_roofline-1.0.0.dev0/benchmark/suites/arithmetic.py +89 -0
  56. carm_roofline-1.0.0.dev0/benchmark/suites/base.py +168 -0
  57. carm_roofline-1.0.0.dev0/benchmark/suites/memory.py +146 -0
  58. carm_roofline-1.0.0.dev0/benchmark/suites/memory_sweep.py +230 -0
  59. carm_roofline-1.0.0.dev0/benchmark/suites/mixed.py +51 -0
  60. carm_roofline-1.0.0.dev0/benchmark/suites/roofline.py +114 -0
  61. carm_roofline-1.0.0.dev0/carm.py +186 -0
  62. carm_roofline-1.0.0.dev0/carm_roofline.egg-info/PKG-INFO +227 -0
  63. carm_roofline-1.0.0.dev0/carm_roofline.egg-info/SOURCES.txt +79 -0
  64. carm_roofline-1.0.0.dev0/carm_roofline.egg-info/dependency_links.txt +1 -0
  65. carm_roofline-1.0.0.dev0/carm_roofline.egg-info/entry_points.txt +2 -0
  66. carm_roofline-1.0.0.dev0/carm_roofline.egg-info/requires.txt +24 -0
  67. carm_roofline-1.0.0.dev0/carm_roofline.egg-info/top_level.txt +14 -0
  68. carm_roofline-1.0.0.dev0/context.py +22 -0
  69. carm_roofline-1.0.0.dev0/exec_interface.py +138 -0
  70. carm_roofline-1.0.0.dev0/gui_config.py +27 -0
  71. carm_roofline-1.0.0.dev0/output_utils.py +201 -0
  72. carm_roofline-1.0.0.dev0/pyproject.toml +219 -0
  73. carm_roofline-1.0.0.dev0/run_config.py +51 -0
  74. carm_roofline-1.0.0.dev0/setup.cfg +4 -0
  75. carm_roofline-1.0.0.dev0/test_bench/__init__.py +10 -0
  76. carm_roofline-1.0.0.dev0/test_bench/builder.py +374 -0
  77. carm_roofline-1.0.0.dev0/test_bench/test_bench.c +368 -0
  78. carm_roofline-1.0.0.dev0/test_bench/test_bench.h +274 -0
  79. carm_roofline-1.0.0.dev0/test_bench/wrapper.inl +100 -0
  80. carm_roofline-1.0.0.dev0/units.py +434 -0
  81. carm_roofline-1.0.0.dev0/utils.py +313 -0
@@ -0,0 +1,392 @@
1
+ import csv
2
+ import os
3
+ import math
4
+ import plotly.graph_objects as go
5
+ import numpy as np
6
+
7
+ import utils as ut
8
+
9
+
10
def read_csv_file(file_path):
    """Parse a CARM benchmark-results CSV file.

    Layout: the first row carries machine metadata (name at column 1 and
    L1/L2/L3 sizes at the alternating label/value columns 3, 5, 7), the
    second row is a column header that is discarded, and every following
    non-blank row is one benchmark measurement.

    Returns a tuple ``(machine_name, l1_size, l2_size, l3_size, data_list)``
    where ``data_list`` is a list of per-measurement dicts.
    """
    records = []
    with open(file_path, newline="") as handle:
        rows = csv.reader(handle)
        meta = next(rows)
        machine_name = meta[1]
        l1_size, l2_size, l3_size = int(meta[3]), int(meta[5]), int(meta[7])
        next(rows)  # column-name header row; values are not needed
        for row in rows:
            # Skip rows that are empty or contain only whitespace cells.
            if not row or not "".join(row).strip():
                continue
            records.append(
                {
                    "Date": row[0],
                    "ISA": row[1],
                    "Precision": row[2],
                    "Threads": int(row[3]),
                    "Loads": int(row[4]),
                    "Stores": int(row[5]),
                    "Interleaved": row[6],
                    "DRAMBytes": int(row[7]),
                    "FPInst": row[8],
                    # Measurement columns are interleaved with secondary
                    # values, hence the stride-2 indices from 9 onward.
                    "L1": float(row[9]),
                    "L2": float(row[11]),
                    "L3": float(row[13]),
                    "DRAM": float(row[15]),
                    "FP": float(row[17]),
                    "FP_FMA": float(row[19]),
                }
            )
    return machine_name, l1_size, l2_size, l3_size, records
43
+
44
+
45
def read_application_csv_file(file_path):
    """Parse an application-profiling CSV file.

    Returns a list of per-run dicts, or ``False`` when the file is
    missing, empty, unreadable, or contains no data rows (callers test
    for this falsy sentinel rather than catching exceptions).
    """
    if not os.path.exists(file_path):
        print("Application file does not exist:", file_path)
        return False

    records = []
    try:
        with open(file_path, newline="") as handle:
            rows = csv.reader(handle)
            # The first row is a header; a missing header means an empty file.
            if next(rows, None) is None:
                print("File is empty:", file_path)
                return False
            for row in rows:
                if not row:
                    continue
                records.append(
                    {
                        "Date": row[0],
                        "Method": row[1],
                        "Name": row[2],
                        "ISA": row[3],
                        "Precision": row[4],
                        "Threads": row[5],
                        "AI": float(row[6]),
                        "GFLOPS": float(row[7]),
                        "Bandwidth": float(row[8]),
                        "Time": float(row[9]),
                    }
                )
    except Exception as err:
        print("Failed to read the file:", file_path, "Error:", err)
        return False
    # Preserve the historical contract: no data rows -> False, not [].
    return records or False
80
+
81
+
82
def extract_last_segment(s):
    """Return the text after the final underscore in *s*, or *s* itself
    when it contains no underscore."""
    return s.rsplit("_", 1)[-1]
84
+
85
+
86
def extract_prefix(s):
    """Return the text before the final underscore in *s*, or *s* itself
    when it contains no underscore."""
    return s.rsplit("_", 1)[0] if "_" in s else s
90
+
91
+
92
def interpolate_color(start_color, end_color, factor):
    """Linearly blend two RGB triples into a plotly-style ``"rgb(r,g,b)"``
    string.

    ``factor`` is the blend position: 0 yields ``start_color`` and 1
    yields ``end_color``; each channel is truncated to an int.
    """
    r, g, b = (
        int(lo + factor * (hi - lo))
        for lo, hi in zip(start_color[:3], end_color[:3])
    )
    return f"rgb({r},{g},{b})"
97
+
98
+
99
def construct_query(ISA, Precision, Threads, Loads, Stores, Interleaved, DRAMBytes, FPInst, Date):
    """Build a pandas-style query string from the given filter values.

    Falsy arguments (empty string, 0, None) are skipped. String-valued
    columns are single-quoted; numeric ones are inserted verbatim.
    Returns the ``" and "``-joined clause string, or None when every
    argument is falsy.
    """
    # (column name, value, quote-as-string?) in the order the clauses
    # must appear in the resulting query.
    fields = [
        ("ISA", ISA, True),
        ("Precision", Precision, True),
        ("Threads", Threads, False),
        ("Loads", Loads, False),
        ("Stores", Stores, False),
        ("Interleaved", Interleaved, True),
        ("DRAMBytes", DRAMBytes, False),
        ("FPInst", FPInst, True),
        ("Date", Date, True),
    ]
    clauses = [
        f"{column} == '{value}'" if quoted else f"{column} == {value}"
        for column, value, quoted in fields
        if value
    ]
    return " and ".join(clauses) if clauses else None
121
+
122
+
123
def calculate_roofline(values, min_ai):
    """Compute the characteristic points of a CARM roofline.

    Parameters:
        values: indexable of benchmark results. values[0..3] are the
            L1/L2/L3/DRAM bandwidths (used via cache_levels.index),
            values[4] is the FP peak, values[5] the FMA peak, and
            values[6] a label used as an extra key in the result
            (assumed layout based on how this module indexes it —
            TODO confirm against callers).
        min_ai: smallest arithmetic intensity to include on the x axis.

    Returns:
        dict mapping each cache level with a positive bandwidth (and
        the values[6] label) to its roofline points as
        [AI, GFLOP/s] pairs ("start"/"mid"/"ridge"/"end", or only
        "ridge"/"end" for the FP-instruction roof).
    """
    aidots = [0] * 3
    FPaidots = [0] * 2
    gflopdots = [0] * 3
    FPgflopdots = [0] * 2

    # 0.00390625 = 1/256; the AI axis spans from min(1/256, min_ai) to 256.
    ai = np.linspace(min(0.00390625, min_ai), 256, num=200000)
    traces = []  # NOTE(review): unused in this function
    cache_levels = ["L1", "L2", "L3", "DRAM"]

    dots = {}
    # Prefer the FMA peak as the roof; fall back to the plain FP peak.
    if values[5] > 0:
        peak_flops = values[5]
    else:
        peak_flops = values[4]

    for cache_level in cache_levels:
        if values[cache_levels.index(cache_level)] > 0:
            aidots = [0, 0, 0]
            gflopdots = [0, 0, 0]

            y_values = ut.carm_eq(ai, values[cache_levels.index(cache_level)], peak_flops)
            # NOTE(review): y_special is computed but never used.
            y_special = ut.carm_eq(0.00390625, values[cache_levels.index(cache_level)], peak_flops)

            # Find the point where y_values stops increasing or reaches a plateau
            for i in range(1, len(y_values)):
                if y_values[i - 1] == y_values[i]:
                    aidots[1] = float(ai[i - 1])
                    break
            else:
                # If no break occurred in the loop
                aidots[1] = float(ai[-1])
                i = len(y_values) - 1

            # Geometric midpoint of the sloped segment (log-log plot).
            mid_ai = np.sqrt(aidots[1] * min(0.00390625, min_ai))
            mid_gflops = np.sqrt(y_values[0] * y_values[i - 1])

            dots[cache_level] = {
                "start": [min(0.00390625, min_ai), y_values[0]],
                "mid": [mid_ai, mid_gflops],
                "ridge": [aidots[1], y_values[i - 1]],
                "end": [ai[-1], y_values[-1]],
            }

    # Use the highest-level cache with a non-zero bandwidth as the roof
    # for the FP-instruction line.
    # NOTE(review): if all of values[0..3] are falsy, top_roof is never
    # bound and the next line raises NameError — confirm callers always
    # supply at least one bandwidth.
    for i in range(4):
        if values[i]:
            top_roof = values[i]
            break

    y_values = ut.carm_eq(ai, top_roof, values[4])

    # Locate the ridge of the FP roofline (first plateau point).
    for i in range(1, len(y_values)):
        if y_values[i - 1] == y_values[i]:
            FPaidots[0] = float(ai[i - 1])
            break
    FPgflopdots[0] = y_values[i - 1]

    # 199999 is the last index of the 200000-sample linspace above.
    FPaidots[1] = ai[199999]
    FPgflopdots[1] = y_values[199999]

    dots[values[6]] = {"ridge": [FPaidots[0], FPgflopdots[0]], "end": [FPaidots[1], FPgflopdots[1]]}

    return dots
186
+
187
+
188
def plot_roofline(values, dots, name_suffix, ISA, line_legend, line_size, line_legend_detailed):
    """Build the plotly line traces for one CARM roofline plot.

    Parameters:
        values: indexable of benchmark results — values[0..3] are the
            L1/L2/L3/DRAM bandwidths, values[4] the FP peak, values[5]
            the FMA peak and values[6] the FP-instruction label
            (assumed layout mirroring calculate_roofline — TODO confirm).
        dots: roofline points as produced by calculate_roofline.
        name_suffix: "" selects the black (primary) color scheme; any
            other value selects red (comparison overlay).
        ISA: ISA name used in legend/hover text.
        line_legend: whether the traces appear in the legend.
        line_size: line width in pixels.
        line_legend_detailed: include peak numbers in the legend text.

    Returns:
        list of go.Scatter traces (memory roofs, FP-instruction roof,
        and FMA roof when the corresponding peaks are positive).
    """
    # Fix: removed a redundant function-local `import numpy as np`
    # (numpy is imported at module level) along with locals that were
    # initialized but never read (ai, FPaidots, FPgflopdots).
    traces = []
    cache_levels = ["L1", "L2", "L3", "DRAM"]
    if name_suffix == "":
        colors = ["black", "black", "black", "black"]
        color_inst = "black"
    else:
        colors = ["red", "red", "red", "red"]
        color_inst = "red"
    linestyles = ["solid", "solid", "dash", "dot"]

    # One trace per memory level that has computed roofline points.
    for cache_level, color, linestyle in zip(cache_levels, colors, linestyles):
        cache_dots = dots.get(cache_level)
        if cache_dots:
            if line_legend_detailed:
                legend_text = f"{cache_level} {ISA.upper()} Bandwidth: {values[cache_levels.index(cache_level)]} GB/s"
            else:
                legend_text = f"{cache_level} {ISA.upper()}"
            aidots = [cache_dots["start"][0], cache_dots["ridge"][0], cache_dots["end"][0]]
            gflopdots = [cache_dots["start"][1], cache_dots["ridge"][1], cache_dots["end"][1]]
            trace = go.Scatter(
                x=aidots,
                y=gflopdots,
                mode="lines",
                text=[
                    "",
                    f"{cache_level} {ISA.upper()} Peak Bandwidth: {values[cache_levels.index(cache_level)]} GB/s",
                    f"FP FMA {ISA.upper()} Peak: {values[5]} GFLOP/s",
                ],
                hovertemplate="<b>%{text}</b><br>(%{x}, %{y})<br><extra></extra>",
                line=dict(color=color, dash=linestyle, width=line_size),
                name=legend_text,
                showlegend=line_legend,
            )
            traces.append(trace)

    # Horizontal roof for the selected FP instruction (values[6]).
    if values[4] > 0:
        aidots = [dots[values[6]]["ridge"][0], dots[values[6]]["end"][0]]
        gflopdots = [dots[values[6]]["ridge"][1], dots[values[6]]["end"][1]]
        if line_legend_detailed:
            legend_text = f"FP {values[6].upper()} {ISA.upper()} Peak: {values[4]} GFLOP/s"
        else:
            legend_text = f"{values[6].upper()} {ISA.upper()}"
        # Dash the FP roof when an FMA roof is also drawn (solid) above it.
        if values[5] == 0:
            linedash = "solid"
        else:
            linedash = "dashdot"
        trace_inst = go.Scatter(
            x=aidots,
            y=gflopdots,
            mode="lines",
            text=[
                f"FP {ISA.upper()} {values[6].upper()} Peak Performance: {values[4]} GFLOP/s",
                f"FP {ISA.upper()} {values[6].upper()} Peak: {values[4]} GFLOP/s",
            ],
            hovertemplate="<b>%{text}</b><br>(%{x}, %{y})<br><extra></extra>",
            line=dict(color=color_inst, dash=linedash, width=line_size),
            name=legend_text,
            showlegend=line_legend,
        )
        traces.append(trace_inst)

    # Horizontal FMA roof, anchored on the L1 ridge/end points.
    if values[5] > 0:
        aidots = [dots["L1"]["ridge"][0], dots["L1"]["end"][0]]
        gflopdots = [dots["L1"]["ridge"][1], dots["L1"]["end"][1]]

        if line_legend_detailed:
            legend_text = f"FP FMA {ISA.upper()} Peak: {values[5]} GFLOP/s"
        else:
            legend_text = f"FMA {ISA.upper()}"

        trace_inst = go.Scatter(
            x=aidots,
            y=gflopdots,
            mode="lines",
            text=[
                f"FP {ISA.upper()} FMA Peak Performance: {values[5]} GFLOP/s",
                f"FP {ISA.upper()} FMA Peak: {values[5]} GFLOP/s",
            ],
            hovertemplate="<b>%{text}</b><br>(%{x}, %{y})<br><extra></extra>",
            line=dict(color=color_inst, dash="solid", width=line_size),
            name=legend_text,
            showlegend=line_legend,
        )
        traces.append(trace_inst)

    return traces
281
+
282
+
283
def draw_annotation(
    values, lines, name_suffix, ISA, cache_level, graph_width, graph_height, anon_size, x_range=None, y_range=None
):
    """Build a plotly Annotation labelling one roofline segment.

    Parameters:
        values: benchmark results indexed like in calculate_roofline
            (values[0..3] bandwidths, values[4] FP peak, values[5] FMA
            peak, values[6] FP-instruction label — assumed, TODO confirm).
        lines: roofline points dict from calculate_roofline.
        name_suffix: "1" selects the black scheme with factor 1.3;
            anything else selects red with factor 0.7 (the factor
            offsets the label from the line).
        ISA: ISA name used in the annotation text.
        cache_level: one of "L1"/"L2"/"L3"/"DRAM", or "FMA"/"FP" for
            the horizontal roofs.
        graph_width, graph_height: plot area size in pixels, used to
            convert the log-log slope into a screen-space text angle.
        anon_size: annotation font size.
        x_range, y_range: log10 axis ranges.
            NOTE(review): these default to None but are indexed
            unconditionally when a memory-level slope is computed —
            callers must always pass both for memory levels; confirm.

    Returns:
        go.layout.Annotation for the matched case, or an empty dict
        when no branch applies (callers must tolerate the {} value).
    """
    aidots = [0] * 3
    gflopdots = [0] * 3
    annotation = {}
    cache_levels = ["L1", "L2", "L3", "DRAM"]
    angle_degrees = {}

    if cache_level in cache_levels:
        if cache_level in lines and lines[cache_level]["ridge"][0] > 0:
            log_x1, log_x2 = math.log10(lines[cache_level]["start"][0]), math.log10(lines[cache_level]["ridge"][0])
            log_y1, log_y2 = math.log10(lines[cache_level]["start"][1]), math.log10(lines[cache_level]["ridge"][1])

            log_xmin, log_xmax = x_range[0], x_range[1]
            log_ymin, log_ymax = y_range[0], y_range[1]

            # Compute pixel coordinates based on log scale
            x1_pixel = ((log_x1 - log_xmin) / (log_xmax - log_xmin)) * graph_width
            x2_pixel = ((log_x2 - log_xmin) / (log_xmax - log_xmin)) * graph_width

            y1_pixel = graph_height - ((log_y1 - log_ymin) / (log_ymax - log_ymin)) * graph_height
            y2_pixel = graph_height - ((log_y2 - log_ymin) / (log_ymax - log_ymin)) * graph_height

            # Pixel slope
            pixel_slope = (y2_pixel - y1_pixel) / (x2_pixel - x1_pixel)

            # Convert pixel slope to angle in degrees
            angle_degrees[cache_level] = math.degrees(math.atan(pixel_slope))

    # 0.00390625 = 1/256, matching the AI axis used by calculate_roofline.
    ai = np.linspace(0.00390625, 256, num=200000)
    traces = []  # NOTE(review): unused in this function

    if name_suffix == "1":
        colors = ["black", "black", "black", "black"]
        color_inst = "black"
        factor = 1.3
    else:
        colors = ["red", "red", "red", "red"]
        color_inst = "red"
        factor = 0.7
    linestyles = ["solid", "solid", "dash", "dot"]  # NOTE(review): unused here

    # Sloped bandwidth label for a memory level.
    if cache_level in cache_levels and values[cache_levels.index(cache_level)] > 0:
        if cache_level in lines:
            aidots[0] = 0.00390625
            y_values = ut.carm_eq(ai, values[cache_levels.index(cache_level)], values[5])
            gflopdots[0] = y_values[0]
            # Ridge = first plateau point of the roofline curve.
            for i in range(1, len(y_values)):
                if y_values[i - 1] == y_values[i]:
                    aidots[1] = float(ai[i - 1])
                    break
            gflopdots[1] = y_values[i - 1]

            # NOTE(review): angle_degrees[cache_level] raises KeyError if
            # the slope branch above was skipped (ridge[0] <= 0) — the two
            # guards are assumed to always agree; confirm.
            annotation = go.layout.Annotation(
                x=math.log10(lines[cache_level]["mid"][0] * factor),
                y=math.log10(lines[cache_level]["mid"][1] * factor),
                text=f"{cache_level} {ISA} Bandwidth: {values[cache_levels.index(cache_level)]} GB/s",
                showarrow=False,
                font=dict(
                    color=colors[0],
                    size=anon_size,
                ),
                align="center",
                bgcolor="white",
                bordercolor=colors[0],
                borderwidth=1,
                textangle=angle_degrees[cache_level],
                name=f"{cache_level}_{name_suffix}",
            )
    # Horizontal FMA-roof label, centered on the L1 plateau.
    if cache_level == "FMA" and values[5] > 0:
        mid_ai = np.sqrt(lines["L1"]["ridge"][0] * lines["L1"]["end"][0])
        mid_gflops = lines["L1"]["ridge"][1]
        annotation = go.layout.Annotation(
            x=math.log10(mid_ai),
            y=math.log10(mid_gflops),
            text=f"FP FMA {ISA} Peak: {values[5]} GFLOP/s",
            showarrow=False,
            font=dict(
                color=colors[0],
                size=anon_size,
            ),
            align="center",
            bgcolor="white",
            bordercolor=colors[0],
            borderwidth=1,
            textangle=0,
            name=f"FP_FMA_{name_suffix}",
        )

    # Horizontal FP-instruction-roof label at the values[4] peak height.
    if cache_level == "FP" and values[4] > 0:
        mid_ai = np.sqrt(lines["L1"]["ridge"][0] * lines["L1"]["end"][0])
        mid_gflops = values[4]
        annotation = go.layout.Annotation(
            x=math.log10(mid_ai),
            y=math.log10(mid_gflops),
            text=f"FP {values[6].upper()} {ISA} Peak: {values[4]} GFLOP/s",
            showarrow=False,
            font=dict(
                color=colors[0],
                size=anon_size,
            ),
            align="center",
            bgcolor="white",
            bordercolor=colors[0],
            borderwidth=1,
            textangle=0,
            name=f"FP_{name_suffix}",
        )
    return annotation
@@ -0,0 +1,227 @@
1
+ Metadata-Version: 2.4
2
+ Name: carm-roofline
3
+ Version: 1.0.0.dev0
4
+ Summary: CARM: Cache-Aware Roofline Model benchmarking and visualization toolkit
5
+ Author: CARM Contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/yourusername/carm-roofline
8
+ Project-URL: Repository, https://github.com/yourusername/carm-roofline
9
+ Project-URL: Issues, https://github.com/yourusername/carm-roofline/issues
10
+ Keywords: benchmark,roofline,performance,hpc,cache,simd
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Topic :: Software Development :: Testing
15
+ Classifier: Topic :: System :: Benchmark
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Operating System :: POSIX :: Linux
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ Requires-Dist: rich>=14.3.2
24
+ Requires-Dist: rich-argparse>=1.7.0
25
+ Requires-Dist: numpy>=2.2.6
26
+ Requires-Dist: matplotlib>=3.7.2
27
+ Requires-Dist: tomli>=2.0.0
28
+ Provides-Extra: gui
29
+ Requires-Dist: dash; extra == "gui"
30
+ Requires-Dist: dash-bootstrap-components; extra == "gui"
31
+ Requires-Dist: dash-daq; extra == "gui"
32
+ Requires-Dist: diskcache; extra == "gui"
33
+ Requires-Dist: plotly; extra == "gui"
34
+ Requires-Dist: pandas; extra == "gui"
35
+ Provides-Extra: dev
36
+ Requires-Dist: pytest>=9.0.2; extra == "dev"
37
+ Requires-Dist: ruff>=0.14.14; extra == "dev"
38
+ Requires-Dist: pre-commit>=4.5.1; extra == "dev"
39
+ Requires-Dist: mypy>=1.8.0; extra == "dev"
40
+ Provides-Extra: analysis
41
+ Provides-Extra: all
42
+ Requires-Dist: carm-roofline[dev,gui]; extra == "all"
43
+
44
+ # The CARM Tool
45
+
46
+ This tool performs the micro-benchmarking necessary to construct the Cache-Aware Roofline Model (CARM) for floating-point operations on Intel, AMD, AARCH64, and RISCV64 CPUs. It supports different instruction set extensions (AVX512, AVX2, SSE, Scalar, SVE, Neon, RVV1.0, RVV0.7), different data precisions (double- and single-precision), different floating point instructions (fused multiply and add, addition, multiplication and division). The micro-benchmarks can be performed for any number of threads. The tool provides as output a visualization of CARM, as well as the measurements obtained for the different memory levels and selected FP instruction. The tool is also capable of the micro-benchmarking necessary to construct a memory bandwidth graph for various problem sizes, and perform mixed tests that stress the FP units and memory system at the same time.
47
+
48
+ The tool can also perform application analysis using either performance counters (via PAPI) or dynamic binary instrumentation (via DynamoRIO or Intel SDE), to view the output of these results in a CARM graph the GUI is required.
49
+
50
+ For better results visualization, ResultsGUI.py can be ran to generate a web browser based user interface for result visualization, results from other machines can be imported for visualization on any machine via the GUI, by moving the necessary result csv files to the results folder of the machine running the GUI.
51
+
52
+ The tool is currently under active development; new features will be added in the future which might not always be immediately documented, and bugs are to be expected. All feedback regarding bugs and feature requests is welcome.
53
+
54
+ ## Requirements
55
+ - gcc (>= 4.9 for AVX512 tests and only tested with gcc 9.3)
56
+ - python (only tested with python 3.8.8)
57
+ - matplotlib (only tested with 3.3.4)
58
+ - numpy
59
+ - dash (GUI only)
60
+ - dash-bootstrap-components (GUI only)
61
+ - pandas (GUI only)
62
+ - plotly (GUI only)
63
+ - diskcache (GUI only)
64
+ - DynamoRIO (only tested with 10.93.19916) - for DBI application analysis (x86 and AARCH64)
65
+ (Might require an edit to line 132 in the CustomClient Makefile to match the installed version of DynamoRIO)
66
+ - PAPI (only tested with 7.0.1) - for PMU application analysis
67
+ - Optional:
68
+ - Intel SDE (only tested with 9.33.0) - for DBI application analysis (x86)
69
+
70
+ ## How to use (CLI)
71
+
72
+ > **Note**: `run.py` is the legacy entry point. The refactored entry point is **`carm.py`** (or installed command `carm`) — see `carm.py -h` for updated arguments.
73
+
74
+ The first step is optional and consists in creating a configuration file for the system to test under the **config** folder. This configuration file is optional in x86 systems since the tool is able to automatically scan the cache sizes present, however this detection can sometimes be wrong (you can check what cache sizes have been detected by using -v 3), so a configuration file is still advised. You can also skip the configuration file by using the arguments:
75
+ -l1 <l1_size (per core)> -l2 <l2_size (per core)> -l3 <l3_size (total)> and --name <name>.
76
+
77
+ This configuration file can include four fields:
78
+ - identifier of the system
79
+ - L1 size per core (in KiB)
80
+ - L2 size per core (in KiB)
81
+ - Total L3 size (in KiB)
82
+
83
+ An example configuration file looks like:
84
+ ```
85
+ name=venus
86
+ l1_cache=32
87
+ l2_cache=1024
88
+ l3_cache=25344
89
+ ```
90
+
91
+ After the optional creation of the configuration file, the tool can be executed as:
92
+
93
+ ```
94
+ python run.py <path_config_file> --name <name> --test <test> --inst <fp_inst> --num_ops <num_ops> --isa <[isa]> --precision <[data_precision]> --ld_st_ratio <ld_st_ratio> --fp_ld_st_ratio <fp_ld_st_ratio> --l3_kbytes <l3_kbytes> --dram_kbytes <dram_kbytes> --threads <[num_threads]> --freq <frequency> --l1_size <l1_size> --l2_size <l2_size> --l3_size <l3_size> --threads_per_l1 <threads_per_l1> --threads_per_l2 <threads_per_l2> --vector_length <vector_length> --verbose [0, 1, 2, 3, 4] [--only_ld] [--only_st] [--no_freq_measure] [--set_freq] [--interleaved] [--dram_auto] [--plot]
95
+ ```
96
+
97
+ where
98
+ - <path_config_file> is the path for configuration file of the system. This should be your first argument.
99
+ - --name <name> is the name for machine running the benchmarks (Default: unnamed)
100
+ - --test <test> is the test to be performed (roofline, MEM, FP, L1, L2, L3, DRAM, mixedL1, mixedL2, mixedL3, mixedDRAM);
101
+ - --inst <fp_inst> is the floating point instruction to be used (add, mul, div), fma performance is also measured by default;
102
+ - --num_ops <num_ops> is the number of FP operations used for the FP benchmark;
103
+ - --isa <isa> is the instruction set extension, multiple options can be selected by spacing them (avx512, avx2, sse, scalar, neon, armscalar, rvv0.7, rvv1.0, riscvscalar, auto);
104
+ - --precision <data_precision> is the precision of the data, multiple options can be selected by spacing them (dp, sp);
105
+ - --ld_st_ratio <ld_st_ratio> is the number of loads per store involved in the memory benchmarks;
106
+ - --fp_ld_st_ratio <fp_ld_st_ratio> is the FP to Load/Store ratio involved in the mixed benchmarks;
107
+ - --l3_kbytes <l3_kbytes> is the total size of the array for the L3 test in KiB;
108
+ - --dram_kbytes <dram_kbytes> is the total size of the array for the DRAM test in KiB (Default: 524288 (512 MiB));
109
+ - --threads <num_threads> is the number of threads used for the test, multiple options can be selected by spacing them
110
+ - --freq <frequency> expected CPU frequency if not auto-measuring (in GHz)
111
+ - --l1_size <l1_size> is the L1 size per core of the machine being benchmarked
112
+ - --l2_size <l2_size> is the L2 size per core of the machine being benchmarked
113
+ - --l3_size <l3_size> is the total L3 size of the machine being benchmarked
114
+ - --threads_per_l1 <threads_per_l1> are the expected number of threads that will share the same L1 cache (Default: 1)
115
+ - --threads_per_l2 <threads_per_l2> are the expected number of threads that will share the same L2 cache (Default: 2)
116
+ - --vector_length <vector_length> is the desired vector length in elements to be used (for riscvvector only, tool will use the max by default)
117
+ - --verbose [0, 1, 2, 3, 4] is the level of terminal output details (0 -> No Output 1 -> Only ISA/Configuration Errors and Test Specifications, 2 -> Test Results, 3 -> Configuration Values Selected/Detected, 4 -> Debug Output)
118
+ - [--only_ld] indicates that the memory benchmarks will just contain loads (<ld_st_ratio> is ignored);
119
+ - [--only_st] indicates that the memory benchmarks will just contain stores (<ld_st_ratio> is ignored);
120
+ - [--no_freq_measure] disables the automatic frequency measuring (CPU frequency should be provided in config file or via --freq argument)
121
+ - [--set_freq] will set the cpu frequency to the specified one (sudo is required, x86 only, might not work)
122
+ - [--interleaved] indicates if the cores belong to interleaved numa domains (e.g. core 0 -> node 0, core 1 -> node 1, core 2 -> node 0, etc). Used for thread binding;
123
+ - [--dram_auto] automatically adjust the DRAM test size according to number of threads to ensure individual thread data only fits in DRAM (Default: 0)
124
+ - [--plot] enables the plotting of CARM/MEM results as an SVG image, allowing for result visualization without using the GUI (Default: 0)
125
+
126
+
127
+ A simple run can be executed with the command
128
+
129
+ ```
130
+ python run.py
131
+ ```
132
+
133
+ which by default runs the micro-benchmarks necessary to obtain CARM data, for all available ISAs using double-precision. The FP instructions used are the ADD and FMA instructions (32768 operations) and the memory benchmarks contain 2 loads per each store, with the DRAM test using an array with size 512MiB and 1 thread.
134
+
135
+
136
+ For additional information regarding the input arguments, run the command:
137
+
138
+ ```
139
+ python run.py -h
140
+ ```
141
+
142
+ To profile an application using **Performance Counters**, PMU_AI_Calculator.py should be executed with the following arguments:
143
+
144
+ - <executable_path> Path to the executable to analyze.
145
+ - <additional_args> Arguments for the executable that will be analyzed.
146
+ - --name <name> Name for the machine running the executable (Default: unnamed);
147
+ - --app_name <app_name> Name for the executable (if empty, executable name will be used);
148
+ - --isa <isa> Main ISA used by the executable, if not sure leave blank (optional only for naming facilitation);
149
+
150
+ Note that this requires the PAPI_LST_INS, PAPI_SP_OPS, and PAPI_DP_OPS events to be available on your system.
151
+
152
+ To profile an application using **Dynamic Binary Instrumentation**, DBI_AI_Calculator.py should be executed with the following arguments:
153
+
154
+ - <DBI_path> Path to the DynamoRIO directory, or Intel SDE directory if --sde is used.
155
+ - <executable_path> Path to the executable to analyze.
156
+ - [--roi] Measure only Region of Interest, or not. (Must be previously marked in the source code);
157
+ - [--sde] Measure using Intel SDE, instead of DynamoRIO (x86 only);
158
+ - --name <name> Name for the machine running the executable (Default: unnamed);
159
+ - --app_name <app_name> Name for the executable (if empty, executable name will be used);
160
+ - --isa <isa> Main ISA used by the executable, if not sure leave blank (optional only for naming facilitation);
161
+ - --threads <threads> Number of threads used by the application (optional only for naming facilitation);
162
+ - --precision <data_precision> Data Precision used by the application (optional only for naming facilitation);
163
+ - <additional_args> Arguments for the executable that will be analyzed. (This should be your last argument)
164
+
165
+ Note that both the PMU analysis and the DBI with ROI analysis require the previous injection of the source code with Region of Interest specific code; to facilitate this process you can include the dbi_carm_roi.h header file in your application directory and use the API functions to enable the DBI based ROI analysis.
166
+
167
+ ```
168
+ CARM_roi_begin();
169
+ CARM_roi_end();
170
+ ```
171
+
172
+ For PMU analysis via PAPI, the PAPI high level API must be used to define the region of interest via the functions.
173
+
174
+ ```
175
+ PAPI_hl_region_begin("");
176
+ PAPI_hl_region_end("");
177
+ ```
178
+
179
+ In case of PMU analysis the PAPI library must be linked during compilation, this can usually be done following one of these methods:
180
+
181
+ ```
182
+ Method 1:
183
+ gcc -<Compiler flags> -I/Path/To/Papi/src <source_file.c> -o <executable_file> /Path/To/Papi/src/libpapi.a
184
+
185
+ Method 2:
186
+ gcc -<Compiler flags> -I/${PAPI_DIR}/include -L/${PAPI_DIR}/lib <source_file.c> -o <executable_file> -lpapi
187
+ ```
188
+
189
+ The profiling results are automatically stored in a csv associated with the provided machine name; these results can then be viewed using the GUI. Make sure to match the machine name used in the profiling with the machine name used in the CARM benchmarks execution.
190
+
191
+ ## How to use (GUI)
192
+
193
+ Launch the GUI from the refactored CLI:
194
+
195
+ ```
196
+ ./carm.py gui
197
+ ```
198
+
199
+ By default, the GUI reads/writes results under `carm_results` relative to the current working directory. You can override this root with:
200
+
201
+ ```
202
+ ./carm.py gui --results-dir /path/to/results
203
+ ```
204
+
205
+ The GUI benchmark button now launches the refactored benchmark flow (`carm.py benchmark`) with roofline-focused settings. The benchmark output remains visible in the terminal where the GUI was launched.
206
+
207
+ The "Run Application Analysis" button is currently marked **TBI** and disabled in the GUI.
208
+
209
+ ## In papers and reports, please refer to this tool as follows
210
+
211
+ <p>
212
+ <a href="https://doi.org/10.1109/L-CA.2013.6" alt="Publication">
213
+ <img src="https://img.shields.io/badge/DOI-10.1109/L--CA.2013.6-blue.svg"/></a>
214
+
215
+ </p>
216
+
217
+ <p>
218
+ <a href="https://doi.org/10.1016/j.future.2020.01.044" alt="Publication">
219
+ <img src="https://img.shields.io/badge/DOI-10.1016/j.future.2020.01.044-blue.svg"/></a>
220
+
221
+ </p>
222
+
223
+ J. Morgado, L. Sousa, A. Ilic. "CARM Tool: Cache-Aware Roofline Model Automatic Benchmarking and Application Analysis", IEEE International Symposium on Workload Characterization (IISWC), Vancouver, British Columbia, Canada, 2024
224
+
225
+ A. Ilic, F. Pratas and L. Sousa, "Cache-aware Roofline model: Upgrading the loft," in IEEE Computer Architecture Letters, vol. 13, no. 1, pp. 21-24, 21 Jan.-June 2014, doi: 10.1109/L-CA.2013.6.
226
+
227
+ Diogo Marques, Aleksandar Ilic, Zakhar A. Matveev, and Leonel Sousa. "Application-driven cache-aware roofline model." Future Generation Computer Systems 107 (2020): 257-273.