carm-roofline 1.0.0.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. carm_roofline-1.0.0.dev0/GUI_utils.py +392 -0
  2. carm_roofline-1.0.0.dev0/PKG-INFO +227 -0
  3. carm_roofline-1.0.0.dev0/README.md +184 -0
  4. carm_roofline-1.0.0.dev0/ResultsGUI.py +3715 -0
  5. carm_roofline-1.0.0.dev0/architecture/__init__.py +22 -0
  6. carm_roofline-1.0.0.dev0/architecture/architecture.py +299 -0
  7. carm_roofline-1.0.0.dev0/architecture/arm.py +20 -0
  8. carm_roofline-1.0.0.dev0/architecture/config.py +255 -0
  9. carm_roofline-1.0.0.dev0/architecture/detect.py +468 -0
  10. carm_roofline-1.0.0.dev0/architecture/frequency.py +52 -0
  11. carm_roofline-1.0.0.dev0/architecture/memory.py +585 -0
  12. carm_roofline-1.0.0.dev0/architecture/riscv.py +71 -0
  13. carm_roofline-1.0.0.dev0/architecture/tests/arm/features.c +41 -0
  14. carm_roofline-1.0.0.dev0/architecture/tests/arm/frequency.h +86 -0
  15. carm_roofline-1.0.0.dev0/architecture/tests/arm/vlen.c +18 -0
  16. carm_roofline-1.0.0.dev0/architecture/tests/frequency.c +137 -0
  17. carm_roofline-1.0.0.dev0/architecture/tests/riscv/frequency.h +89 -0
  18. carm_roofline-1.0.0.dev0/architecture/tests/riscv/rvv_version.c +27 -0
  19. carm_roofline-1.0.0.dev0/architecture/tests/riscv/vlen.c +20 -0
  20. carm_roofline-1.0.0.dev0/architecture/tests/x86/cache.c +83 -0
  21. carm_roofline-1.0.0.dev0/architecture/tests/x86/features.c +47 -0
  22. carm_roofline-1.0.0.dev0/architecture/tests/x86/frequency.h +127 -0
  23. carm_roofline-1.0.0.dev0/architecture/tests/x86/x86_avx512/frequency.h +129 -0
  24. carm_roofline-1.0.0.dev0/architecture/x86.py +29 -0
  25. carm_roofline-1.0.0.dev0/arguments.py +142 -0
  26. carm_roofline-1.0.0.dev0/assets/CARM_icon.svg +4 -0
  27. carm_roofline-1.0.0.dev0/assets/CHAMP_logo.svg +44 -0
  28. carm_roofline-1.0.0.dev0/assets/menu_icon.png +0 -0
  29. carm_roofline-1.0.0.dev0/assets/style.css +3 -0
  30. carm_roofline-1.0.0.dev0/benchmark/__init__.py +44 -0
  31. carm_roofline-1.0.0.dev0/benchmark/benchmark.py +182 -0
  32. carm_roofline-1.0.0.dev0/benchmark/benchmarking.py +223 -0
  33. carm_roofline-1.0.0.dev0/benchmark/generation/__init__.py +51 -0
  34. carm_roofline-1.0.0.dev0/benchmark/generation/arm.py +154 -0
  35. carm_roofline-1.0.0.dev0/benchmark/generation/code_gen/__init__.py +17 -0
  36. carm_roofline-1.0.0.dev0/benchmark/generation/code_gen/data_type.py +40 -0
  37. carm_roofline-1.0.0.dev0/benchmark/generation/code_gen/instruction.py +411 -0
  38. carm_roofline-1.0.0.dev0/benchmark/generation/code_gen/operation.py +44 -0
  39. carm_roofline-1.0.0.dev0/benchmark/generation/code_gen/register.py +150 -0
  40. carm_roofline-1.0.0.dev0/benchmark/generation/isa.py +742 -0
  41. carm_roofline-1.0.0.dev0/benchmark/generation/parameters.py +90 -0
  42. carm_roofline-1.0.0.dev0/benchmark/generation/riscv.py +211 -0
  43. carm_roofline-1.0.0.dev0/benchmark/generation/x86.py +206 -0
  44. carm_roofline-1.0.0.dev0/benchmark/interface.py +162 -0
  45. carm_roofline-1.0.0.dev0/benchmark/output/__init__.py +50 -0
  46. carm_roofline-1.0.0.dev0/benchmark/output/arithmetic.py +271 -0
  47. carm_roofline-1.0.0.dev0/benchmark/output/base.py +66 -0
  48. carm_roofline-1.0.0.dev0/benchmark/output/common.py +116 -0
  49. carm_roofline-1.0.0.dev0/benchmark/output/memory.py +227 -0
  50. carm_roofline-1.0.0.dev0/benchmark/output/memory_sweep.py +280 -0
  51. carm_roofline-1.0.0.dev0/benchmark/output/mixed.py +137 -0
  52. carm_roofline-1.0.0.dev0/benchmark/output/roofline.py +382 -0
  53. carm_roofline-1.0.0.dev0/benchmark/result.py +90 -0
  54. carm_roofline-1.0.0.dev0/benchmark/suites/__init__.py +17 -0
  55. carm_roofline-1.0.0.dev0/benchmark/suites/arithmetic.py +89 -0
  56. carm_roofline-1.0.0.dev0/benchmark/suites/base.py +168 -0
  57. carm_roofline-1.0.0.dev0/benchmark/suites/memory.py +146 -0
  58. carm_roofline-1.0.0.dev0/benchmark/suites/memory_sweep.py +230 -0
  59. carm_roofline-1.0.0.dev0/benchmark/suites/mixed.py +51 -0
  60. carm_roofline-1.0.0.dev0/benchmark/suites/roofline.py +114 -0
  61. carm_roofline-1.0.0.dev0/carm.py +186 -0
  62. carm_roofline-1.0.0.dev0/carm_roofline.egg-info/PKG-INFO +227 -0
  63. carm_roofline-1.0.0.dev0/carm_roofline.egg-info/SOURCES.txt +79 -0
  64. carm_roofline-1.0.0.dev0/carm_roofline.egg-info/dependency_links.txt +1 -0
  65. carm_roofline-1.0.0.dev0/carm_roofline.egg-info/entry_points.txt +2 -0
  66. carm_roofline-1.0.0.dev0/carm_roofline.egg-info/requires.txt +24 -0
  67. carm_roofline-1.0.0.dev0/carm_roofline.egg-info/top_level.txt +14 -0
  68. carm_roofline-1.0.0.dev0/context.py +22 -0
  69. carm_roofline-1.0.0.dev0/exec_interface.py +138 -0
  70. carm_roofline-1.0.0.dev0/gui_config.py +27 -0
  71. carm_roofline-1.0.0.dev0/output_utils.py +201 -0
  72. carm_roofline-1.0.0.dev0/pyproject.toml +219 -0
  73. carm_roofline-1.0.0.dev0/run_config.py +51 -0
  74. carm_roofline-1.0.0.dev0/setup.cfg +4 -0
  75. carm_roofline-1.0.0.dev0/test_bench/__init__.py +10 -0
  76. carm_roofline-1.0.0.dev0/test_bench/builder.py +374 -0
  77. carm_roofline-1.0.0.dev0/test_bench/test_bench.c +368 -0
  78. carm_roofline-1.0.0.dev0/test_bench/test_bench.h +274 -0
  79. carm_roofline-1.0.0.dev0/test_bench/wrapper.inl +100 -0
  80. carm_roofline-1.0.0.dev0/units.py +434 -0
  81. carm_roofline-1.0.0.dev0/utils.py +313 -0
@@ -0,0 +1,392 @@
1
+ import csv
2
+ import os
3
+ import math
4
+ import plotly.graph_objects as go
5
+ import numpy as np
6
+
7
+ import utils as ut
8
+
9
+
10
def read_csv_file(file_path):
    """Parse a CARM benchmark-results CSV file.

    Layout: the first row carries machine metadata (name at column 1 and
    L1/L2/L3 sizes at the alternating label/value columns 3, 5, 7), the
    second row is a column header that is discarded, and every following
    non-blank row is one benchmark measurement.

    Returns a tuple ``(machine_name, l1_size, l2_size, l3_size, data_list)``
    where ``data_list`` is a list of per-measurement dicts.
    """
    records = []
    with open(file_path, newline="") as handle:
        rows = csv.reader(handle)
        meta = next(rows)
        machine_name = meta[1]
        l1_size, l2_size, l3_size = int(meta[3]), int(meta[5]), int(meta[7])
        next(rows)  # column-name header row; values are not needed
        for row in rows:
            # Skip rows that are empty or contain only whitespace cells.
            if not row or not "".join(row).strip():
                continue
            records.append(
                {
                    "Date": row[0],
                    "ISA": row[1],
                    "Precision": row[2],
                    "Threads": int(row[3]),
                    "Loads": int(row[4]),
                    "Stores": int(row[5]),
                    "Interleaved": row[6],
                    "DRAMBytes": int(row[7]),
                    "FPInst": row[8],
                    # Measurement columns are interleaved with secondary
                    # values, hence the stride-2 indices from 9 onward.
                    "L1": float(row[9]),
                    "L2": float(row[11]),
                    "L3": float(row[13]),
                    "DRAM": float(row[15]),
                    "FP": float(row[17]),
                    "FP_FMA": float(row[19]),
                }
            )
    return machine_name, l1_size, l2_size, l3_size, records
43
+
44
+
45
def read_application_csv_file(file_path):
    """Parse an application-profiling CSV file.

    Returns a list of per-run dicts, or ``False`` when the file is
    missing, empty, unreadable, or contains no data rows (callers test
    for this falsy sentinel rather than catching exceptions).
    """
    if not os.path.exists(file_path):
        print("Application file does not exist:", file_path)
        return False

    records = []
    try:
        with open(file_path, newline="") as handle:
            rows = csv.reader(handle)
            # The first row is a header; a missing header means an empty file.
            if next(rows, None) is None:
                print("File is empty:", file_path)
                return False
            for row in rows:
                if not row:
                    continue
                records.append(
                    {
                        "Date": row[0],
                        "Method": row[1],
                        "Name": row[2],
                        "ISA": row[3],
                        "Precision": row[4],
                        "Threads": row[5],
                        "AI": float(row[6]),
                        "GFLOPS": float(row[7]),
                        "Bandwidth": float(row[8]),
                        "Time": float(row[9]),
                    }
                )
    except Exception as err:
        print("Failed to read the file:", file_path, "Error:", err)
        return False
    # Preserve the historical contract: no data rows -> False, not [].
    return records or False
80
+
81
+
82
def extract_last_segment(s):
    """Return the text after the final underscore in *s*, or *s* itself
    when it contains no underscore."""
    return s.rsplit("_", 1)[-1]
84
+
85
+
86
def extract_prefix(s):
    """Return the text before the final underscore in *s*, or *s* itself
    when it contains no underscore."""
    return s.rsplit("_", 1)[0] if "_" in s else s
90
+
91
+
92
def interpolate_color(start_color, end_color, factor):
    """Linearly blend two RGB triples into a plotly-style ``"rgb(r,g,b)"``
    string.

    ``factor`` is the blend position: 0 yields ``start_color`` and 1
    yields ``end_color``; each channel is truncated to an int.
    """
    r, g, b = (
        int(lo + factor * (hi - lo))
        for lo, hi in zip(start_color[:3], end_color[:3])
    )
    return f"rgb({r},{g},{b})"
97
+
98
+
99
def construct_query(ISA, Precision, Threads, Loads, Stores, Interleaved, DRAMBytes, FPInst, Date):
    """Build a pandas-style query string from the given filter values.

    Falsy arguments (empty string, 0, None) are skipped. String-valued
    columns are single-quoted; numeric ones are inserted verbatim.
    Returns the ``" and "``-joined clause string, or None when every
    argument is falsy.
    """
    # (column name, value, quote-as-string?) in the order the clauses
    # must appear in the resulting query.
    fields = [
        ("ISA", ISA, True),
        ("Precision", Precision, True),
        ("Threads", Threads, False),
        ("Loads", Loads, False),
        ("Stores", Stores, False),
        ("Interleaved", Interleaved, True),
        ("DRAMBytes", DRAMBytes, False),
        ("FPInst", FPInst, True),
        ("Date", Date, True),
    ]
    clauses = [
        f"{column} == '{value}'" if quoted else f"{column} == {value}"
        for column, value, quoted in fields
        if value
    ]
    return " and ".join(clauses) if clauses else None
121
+
122
+
123
def calculate_roofline(values, min_ai):
    """Compute the characteristic points of a CARM roofline.

    Parameters:
        values: indexable of benchmark results. values[0..3] are the
            L1/L2/L3/DRAM bandwidths (used via cache_levels.index),
            values[4] is the FP peak, values[5] the FMA peak, and
            values[6] a label used as an extra key in the result
            (assumed layout based on how this module indexes it —
            TODO confirm against callers).
        min_ai: smallest arithmetic intensity to include on the x axis.

    Returns:
        dict mapping each cache level with a positive bandwidth (and
        the values[6] label) to its roofline points as
        [AI, GFLOP/s] pairs ("start"/"mid"/"ridge"/"end", or only
        "ridge"/"end" for the FP-instruction roof).
    """
    aidots = [0] * 3
    FPaidots = [0] * 2
    gflopdots = [0] * 3
    FPgflopdots = [0] * 2

    # 0.00390625 = 1/256; the AI axis spans from min(1/256, min_ai) to 256.
    ai = np.linspace(min(0.00390625, min_ai), 256, num=200000)
    traces = []  # NOTE(review): unused in this function
    cache_levels = ["L1", "L2", "L3", "DRAM"]

    dots = {}
    # Prefer the FMA peak as the roof; fall back to the plain FP peak.
    if values[5] > 0:
        peak_flops = values[5]
    else:
        peak_flops = values[4]

    for cache_level in cache_levels:
        if values[cache_levels.index(cache_level)] > 0:
            aidots = [0, 0, 0]
            gflopdots = [0, 0, 0]

            y_values = ut.carm_eq(ai, values[cache_levels.index(cache_level)], peak_flops)
            # NOTE(review): y_special is computed but never used.
            y_special = ut.carm_eq(0.00390625, values[cache_levels.index(cache_level)], peak_flops)

            # Find the point where y_values stops increasing or reaches a plateau
            for i in range(1, len(y_values)):
                if y_values[i - 1] == y_values[i]:
                    aidots[1] = float(ai[i - 1])
                    break
            else:
                # If no break occurred in the loop
                aidots[1] = float(ai[-1])
                i = len(y_values) - 1

            # Geometric midpoint of the sloped segment (log-log plot).
            mid_ai = np.sqrt(aidots[1] * min(0.00390625, min_ai))
            mid_gflops = np.sqrt(y_values[0] * y_values[i - 1])

            dots[cache_level] = {
                "start": [min(0.00390625, min_ai), y_values[0]],
                "mid": [mid_ai, mid_gflops],
                "ridge": [aidots[1], y_values[i - 1]],
                "end": [ai[-1], y_values[-1]],
            }

    # Use the highest-level cache with a non-zero bandwidth as the roof
    # for the FP-instruction line.
    # NOTE(review): if all of values[0..3] are falsy, top_roof is never
    # bound and the next line raises NameError — confirm callers always
    # supply at least one bandwidth.
    for i in range(4):
        if values[i]:
            top_roof = values[i]
            break

    y_values = ut.carm_eq(ai, top_roof, values[4])

    # Locate the ridge of the FP roofline (first plateau point).
    for i in range(1, len(y_values)):
        if y_values[i - 1] == y_values[i]:
            FPaidots[0] = float(ai[i - 1])
            break
    FPgflopdots[0] = y_values[i - 1]

    # 199999 is the last index of the 200000-sample linspace above.
    FPaidots[1] = ai[199999]
    FPgflopdots[1] = y_values[199999]

    dots[values[6]] = {"ridge": [FPaidots[0], FPgflopdots[0]], "end": [FPaidots[1], FPgflopdots[1]]}

    return dots
186
+
187
+
188
def plot_roofline(values, dots, name_suffix, ISA, line_legend, line_size, line_legend_detailed):
    """Build the plotly line traces for one CARM roofline plot.

    Parameters:
        values: indexable of benchmark results — values[0..3] are the
            L1/L2/L3/DRAM bandwidths, values[4] the FP peak, values[5]
            the FMA peak and values[6] the FP-instruction label
            (assumed layout mirroring calculate_roofline — TODO confirm).
        dots: roofline points as produced by calculate_roofline.
        name_suffix: "" selects the black (primary) color scheme; any
            other value selects red (comparison overlay).
        ISA: ISA name used in legend/hover text.
        line_legend: whether the traces appear in the legend.
        line_size: line width in pixels.
        line_legend_detailed: include peak numbers in the legend text.

    Returns:
        list of go.Scatter traces (memory roofs, FP-instruction roof,
        and FMA roof when the corresponding peaks are positive).
    """
    # Fix: removed a redundant function-local `import numpy as np`
    # (numpy is imported at module level) along with locals that were
    # initialized but never read (ai, FPaidots, FPgflopdots).
    traces = []
    cache_levels = ["L1", "L2", "L3", "DRAM"]
    if name_suffix == "":
        colors = ["black", "black", "black", "black"]
        color_inst = "black"
    else:
        colors = ["red", "red", "red", "red"]
        color_inst = "red"
    linestyles = ["solid", "solid", "dash", "dot"]

    # One trace per memory level that has computed roofline points.
    for cache_level, color, linestyle in zip(cache_levels, colors, linestyles):
        cache_dots = dots.get(cache_level)
        if cache_dots:
            if line_legend_detailed:
                legend_text = f"{cache_level} {ISA.upper()} Bandwidth: {values[cache_levels.index(cache_level)]} GB/s"
            else:
                legend_text = f"{cache_level} {ISA.upper()}"
            aidots = [cache_dots["start"][0], cache_dots["ridge"][0], cache_dots["end"][0]]
            gflopdots = [cache_dots["start"][1], cache_dots["ridge"][1], cache_dots["end"][1]]
            trace = go.Scatter(
                x=aidots,
                y=gflopdots,
                mode="lines",
                text=[
                    "",
                    f"{cache_level} {ISA.upper()} Peak Bandwidth: {values[cache_levels.index(cache_level)]} GB/s",
                    f"FP FMA {ISA.upper()} Peak: {values[5]} GFLOP/s",
                ],
                hovertemplate="<b>%{text}</b><br>(%{x}, %{y})<br><extra></extra>",
                line=dict(color=color, dash=linestyle, width=line_size),
                name=legend_text,
                showlegend=line_legend,
            )
            traces.append(trace)

    # Horizontal roof for the selected FP instruction (values[6]).
    if values[4] > 0:
        aidots = [dots[values[6]]["ridge"][0], dots[values[6]]["end"][0]]
        gflopdots = [dots[values[6]]["ridge"][1], dots[values[6]]["end"][1]]
        if line_legend_detailed:
            legend_text = f"FP {values[6].upper()} {ISA.upper()} Peak: {values[4]} GFLOP/s"
        else:
            legend_text = f"{values[6].upper()} {ISA.upper()}"
        # Dash the FP roof when an FMA roof is also drawn (solid) above it.
        if values[5] == 0:
            linedash = "solid"
        else:
            linedash = "dashdot"
        trace_inst = go.Scatter(
            x=aidots,
            y=gflopdots,
            mode="lines",
            text=[
                f"FP {ISA.upper()} {values[6].upper()} Peak Performance: {values[4]} GFLOP/s",
                f"FP {ISA.upper()} {values[6].upper()} Peak: {values[4]} GFLOP/s",
            ],
            hovertemplate="<b>%{text}</b><br>(%{x}, %{y})<br><extra></extra>",
            line=dict(color=color_inst, dash=linedash, width=line_size),
            name=legend_text,
            showlegend=line_legend,
        )
        traces.append(trace_inst)

    # Horizontal FMA roof, anchored on the L1 ridge/end points.
    if values[5] > 0:
        aidots = [dots["L1"]["ridge"][0], dots["L1"]["end"][0]]
        gflopdots = [dots["L1"]["ridge"][1], dots["L1"]["end"][1]]

        if line_legend_detailed:
            legend_text = f"FP FMA {ISA.upper()} Peak: {values[5]} GFLOP/s"
        else:
            legend_text = f"FMA {ISA.upper()}"

        trace_inst = go.Scatter(
            x=aidots,
            y=gflopdots,
            mode="lines",
            text=[
                f"FP {ISA.upper()} FMA Peak Performance: {values[5]} GFLOP/s",
                f"FP {ISA.upper()} FMA Peak: {values[5]} GFLOP/s",
            ],
            hovertemplate="<b>%{text}</b><br>(%{x}, %{y})<br><extra></extra>",
            line=dict(color=color_inst, dash="solid", width=line_size),
            name=legend_text,
            showlegend=line_legend,
        )
        traces.append(trace_inst)

    return traces
281
+
282
+
283
def draw_annotation(
    values, lines, name_suffix, ISA, cache_level, graph_width, graph_height, anon_size, x_range=None, y_range=None
):
    """Build a plotly Annotation labelling one roofline segment.

    Parameters:
        values: benchmark results indexed like in calculate_roofline
            (values[0..3] bandwidths, values[4] FP peak, values[5] FMA
            peak, values[6] FP-instruction label — assumed, TODO confirm).
        lines: roofline points dict from calculate_roofline.
        name_suffix: "1" selects the black scheme with factor 1.3;
            anything else selects red with factor 0.7 (the factor
            offsets the label from the line).
        ISA: ISA name used in the annotation text.
        cache_level: one of "L1"/"L2"/"L3"/"DRAM", or "FMA"/"FP" for
            the horizontal roofs.
        graph_width, graph_height: plot area size in pixels, used to
            convert the log-log slope into a screen-space text angle.
        anon_size: annotation font size.
        x_range, y_range: log10 axis ranges.
            NOTE(review): these default to None but are indexed
            unconditionally when a memory-level slope is computed —
            callers must always pass both for memory levels; confirm.

    Returns:
        go.layout.Annotation for the matched case, or an empty dict
        when no branch applies (callers must tolerate the {} value).
    """
    aidots = [0] * 3
    gflopdots = [0] * 3
    annotation = {}
    cache_levels = ["L1", "L2", "L3", "DRAM"]
    angle_degrees = {}

    if cache_level in cache_levels:
        if cache_level in lines and lines[cache_level]["ridge"][0] > 0:
            log_x1, log_x2 = math.log10(lines[cache_level]["start"][0]), math.log10(lines[cache_level]["ridge"][0])
            log_y1, log_y2 = math.log10(lines[cache_level]["start"][1]), math.log10(lines[cache_level]["ridge"][1])

            log_xmin, log_xmax = x_range[0], x_range[1]
            log_ymin, log_ymax = y_range[0], y_range[1]

            # Compute pixel coordinates based on log scale
            x1_pixel = ((log_x1 - log_xmin) / (log_xmax - log_xmin)) * graph_width
            x2_pixel = ((log_x2 - log_xmin) / (log_xmax - log_xmin)) * graph_width

            y1_pixel = graph_height - ((log_y1 - log_ymin) / (log_ymax - log_ymin)) * graph_height
            y2_pixel = graph_height - ((log_y2 - log_ymin) / (log_ymax - log_ymin)) * graph_height

            # Pixel slope
            pixel_slope = (y2_pixel - y1_pixel) / (x2_pixel - x1_pixel)

            # Convert pixel slope to angle in degrees
            angle_degrees[cache_level] = math.degrees(math.atan(pixel_slope))

    # 0.00390625 = 1/256, matching the AI axis used by calculate_roofline.
    ai = np.linspace(0.00390625, 256, num=200000)
    traces = []  # NOTE(review): unused in this function

    if name_suffix == "1":
        colors = ["black", "black", "black", "black"]
        color_inst = "black"
        factor = 1.3
    else:
        colors = ["red", "red", "red", "red"]
        color_inst = "red"
        factor = 0.7
    linestyles = ["solid", "solid", "dash", "dot"]  # NOTE(review): unused here

    # Sloped bandwidth label for a memory level.
    if cache_level in cache_levels and values[cache_levels.index(cache_level)] > 0:
        if cache_level in lines:
            aidots[0] = 0.00390625
            y_values = ut.carm_eq(ai, values[cache_levels.index(cache_level)], values[5])
            gflopdots[0] = y_values[0]
            # Ridge = first plateau point of the roofline curve.
            for i in range(1, len(y_values)):
                if y_values[i - 1] == y_values[i]:
                    aidots[1] = float(ai[i - 1])
                    break
            gflopdots[1] = y_values[i - 1]

            # NOTE(review): angle_degrees[cache_level] raises KeyError if
            # the slope branch above was skipped (ridge[0] <= 0) — the two
            # guards are assumed to always agree; confirm.
            annotation = go.layout.Annotation(
                x=math.log10(lines[cache_level]["mid"][0] * factor),
                y=math.log10(lines[cache_level]["mid"][1] * factor),
                text=f"{cache_level} {ISA} Bandwidth: {values[cache_levels.index(cache_level)]} GB/s",
                showarrow=False,
                font=dict(
                    color=colors[0],
                    size=anon_size,
                ),
                align="center",
                bgcolor="white",
                bordercolor=colors[0],
                borderwidth=1,
                textangle=angle_degrees[cache_level],
                name=f"{cache_level}_{name_suffix}",
            )
    # Horizontal FMA-roof label, centered on the L1 plateau.
    if cache_level == "FMA" and values[5] > 0:
        mid_ai = np.sqrt(lines["L1"]["ridge"][0] * lines["L1"]["end"][0])
        mid_gflops = lines["L1"]["ridge"][1]
        annotation = go.layout.Annotation(
            x=math.log10(mid_ai),
            y=math.log10(mid_gflops),
            text=f"FP FMA {ISA} Peak: {values[5]} GFLOP/s",
            showarrow=False,
            font=dict(
                color=colors[0],
                size=anon_size,
            ),
            align="center",
            bgcolor="white",
            bordercolor=colors[0],
            borderwidth=1,
            textangle=0,
            name=f"FP_FMA_{name_suffix}",
        )

    # Horizontal FP-instruction-roof label at the values[4] peak height.
    if cache_level == "FP" and values[4] > 0:
        mid_ai = np.sqrt(lines["L1"]["ridge"][0] * lines["L1"]["end"][0])
        mid_gflops = values[4]
        annotation = go.layout.Annotation(
            x=math.log10(mid_ai),
            y=math.log10(mid_gflops),
            text=f"FP {values[6].upper()} {ISA} Peak: {values[4]} GFLOP/s",
            showarrow=False,
            font=dict(
                color=colors[0],
                size=anon_size,
            ),
            align="center",
            bgcolor="white",
            bordercolor=colors[0],
            borderwidth=1,
            textangle=0,
            name=f"FP_{name_suffix}",
        )
    return annotation
@@ -0,0 +1,227 @@
1
+ Metadata-Version: 2.4
2
+ Name: carm-roofline
3
+ Version: 1.0.0.dev0
4
+ Summary: CARM: Cache-Aware Roofline Model benchmarking and visualization toolkit
5
+ Author: CARM Contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/yourusername/carm-roofline
8
+ Project-URL: Repository, https://github.com/yourusername/carm-roofline
9
+ Project-URL: Issues, https://github.com/yourusername/carm-roofline/issues
10
+ Keywords: benchmark,roofline,performance,hpc,cache,simd
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Topic :: Software Development :: Testing
15
+ Classifier: Topic :: System :: Benchmark
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Operating System :: POSIX :: Linux
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ Requires-Dist: rich>=14.3.2
24
+ Requires-Dist: rich-argparse>=1.7.0
25
+ Requires-Dist: numpy>=2.2.6
26
+ Requires-Dist: matplotlib>=3.7.2
27
+ Requires-Dist: tomli>=2.0.0
28
+ Provides-Extra: gui
29
+ Requires-Dist: dash; extra == "gui"
30
+ Requires-Dist: dash-bootstrap-components; extra == "gui"
31
+ Requires-Dist: dash-daq; extra == "gui"
32
+ Requires-Dist: diskcache; extra == "gui"
33
+ Requires-Dist: plotly; extra == "gui"
34
+ Requires-Dist: pandas; extra == "gui"
35
+ Provides-Extra: dev
36
+ Requires-Dist: pytest>=9.0.2; extra == "dev"
37
+ Requires-Dist: ruff>=0.14.14; extra == "dev"
38
+ Requires-Dist: pre-commit>=4.5.1; extra == "dev"
39
+ Requires-Dist: mypy>=1.8.0; extra == "dev"
40
+ Provides-Extra: analysis
41
+ Provides-Extra: all
42
+ Requires-Dist: carm-roofline[dev,gui]; extra == "all"
43
+
44
+ # The CARM Tool
45
+
46
+ This tool performs the micro-benchmarking necessary to construct the Cache-Aware Roofline Model (CARM) for floating-point operations on Intel, AMD, AARCH64, and RISCV64 CPUs. It supports different instruction set extensions (AVX512, AVX2, SSE, Scalar, SVE, Neon, RVV1.0, RVV0.7), different data precisions (double- and single-precision), different floating point instructions (fused multiply and add, addition, multiplication and division). The micro-benchmarks can be performed for any number of threads. The tool provides as output a visualization of CARM, as well as the measurements obtained for the different memory levels and selected FP instruction. The tool is also capable of the micro-benchmarking necessary to construct a memory bandwidth graph for various problem sizes, and perform mixed tests that stress the FP units and memory system at the same time.
47
+
48
+ The tool can also perform application analysis using either performance counters (via PAPI) or dynamic binary instrumentation (via DynamoRIO or Intel SDE), to view the output of these results in a CARM graph the GUI is required.
49
+
50
+ For better results visualization, ResultsGUI.py can be ran to generate a web browser based user interface for result visualization, results from other machines can be imported for visualization on any machine via the GUI, by moving the necessary result csv files to the results folder of the machine running the GUI.
51
+
52
+ The tool is currently under active development; new features will be added in the future which might not always be immediately documented, and bugs are to be expected. All feedback regarding bugs and feature requests is welcome.
53
+
54
+ ## Requirements
55
+ - gcc (>= 4.9 for AVX512 tests and only tested with gcc 9.3)
56
+ - python (only tested with python 3.8.8)
57
+ - matplotlib (only tested with 3.3.4)
58
+ - numpy
59
+ - dash (GUI only)
60
+ - dash-bootstrap-components (GUI only)
61
+ - pandas (GUI only)
62
+ - plotly (GUI only)
63
+ - diskcache (GUI only)
64
+ - DynamoRIO (only tested with 10.93.19916) - for DBI application analysis (x86 and AARCH64)
65
+ (Might require an edit to line 132 in the CustomClient Makefile to match the installed version of DynamoRIO)
66
+ - PAPI (only tested with 7.0.1) - for PMU application analysis
67
+ - Optional:
68
+ - Intel SDE (only tested with 9.33.0) - for DBI application analysis (x86)
69
+
70
+ ## How to use (CLI)
71
+
72
+ > **Note**: `run.py` is the legacy entry point. The refactored entry point is **`carm.py`** (or installed command `carm`) — see `carm.py -h` for updated arguments.
73
+
74
+ The first step is optional and consists in creating a configuration file for the system to test under the **config** folder. This configuration file is optional in x86 systems since the tool is able to automatically scan the cache sizes present, however this detection can sometimes be wrong (you can check what cache sizes have been detected by using -v 3), so a configuration file is still advised. You can also skip the configuration file by using the arguments:
75
+ -l1 <l1_size (per core)> -l2 <l2_size (per core)> -l3 <l3_size (total)> and --name <name>.
76
+
77
+ This configuration file can include four fields:
78
+ - identifier of the system
79
+ - L1 size per core (in KiB)
80
+ - L2 size per core (in KiB)
81
+ - Total L3 size (in KiB)
82
+
83
+ An example configuration file looks like:
84
+ ```
85
+ name=venus
86
+ l1_cache=32
87
+ l2_cache=1024
88
+ l3_cache=25344
89
+ ```
90
+
91
+ After the optional creation of the configuration file, the tool can be executed as:
92
+
93
+ ```
94
+ python run.py <path_config_file> --name <name> --test <test> --inst <fp_inst> --num_ops <num_ops> --isa <[isa]> --precision <[data_precision]> --ld_st_ratio <ld_st_ratio> --fp_ld_st_ratio <fp_ld_st_ratio> --l3_kbytes <l3_kbytes> --dram_kbytes <dram_kbytes> --threads <[num_threads]> --freq <frequency> --l1_size <l1_size> --l2_size <l2_size> --l3_size <l3_size> --threads_per_l1 <threads_per_l1> --threads_per_l2 <threads_per_l2> --vector_length <vector_length> --verbose [0, 1, 2, 3, 4] [--only_ld] [--only_st] [--no_freq_measure] [--set_freq] [--interleaved] [--dram_auto] [--plot]
95
+ ```
96
+
97
+ where
98
+ - <path_config_file> is the path for configuration file of the system. This should be your first argument.
99
+ - --name <name> is the name for machine running the benchmarks (Default: unnamed)
100
+ - --test <test> is the test to be performed (roofline, MEM, FP, L1, L2, L3, DRAM, mixedL1, mixedL2, mixedL3, mixedDRAM);
101
+ - --inst <fp_inst> is the floating point instruction to be used (add, mul, div), fma performance is also measured by default;
102
+ - --num_ops <num_ops> is the number of FP operations used for the FP benchmark;
103
+ - --isa <isa> is the instruction set extension, multiple options can be selected by spacing them (avx512, avx2, sse, scalar, neon, armscalar, rvv0.7, rvv1.0, riscvscalar, auto);
104
+ - --precision <data_precision> is the precision of the data, multiple options can be selected by spacing them (dp, sp);
105
+ - --ld_st_ratio <ld_st_ratio> is the number of loads per store involved in the memory benchmarks;
106
+ - --fp_ld_st_ratio <fp_ld_st_ratio> is the FP to Load/Store ratio involved in the mixed benchmarks;
107
+ - --l3_kbytes <l3_kbytes> is the total size of the array for the L3 test in KiB;
108
+ - --dram_kbytes <dram_kbytes> is the total size of the array for the DRAM test in KiB (Default: 524288 (512 MiB));
109
+ - --threads <num_threads> is the number of threads used for the test, multiple options can be selected by spacing them
110
+ - --freq <frequency> expected CPU frequency if not auto-measuring (in GHz)
111
+ - --l1_size <l1_size> is the L1 size per core of the machine being benchmarked
112
+ - --l2_size <l2_size> is the L2 size per core of the machine being benchmarked
113
+ - --l3_size <l3_size> is the total L3 size of the machine being benchmarked
114
+ - --threads_per_l1 <threads_per_l1> are the expected number of threads that will share the same L1 cache (Default: 1)
115
+ - --threads_per_l2 <threads_per_l2> are the expected number of threads that will share the same L2 cache (Default: 2)
116
+ - --vector_length <vector_length> is the desired vector length in elements to be used (for riscvvector only, tool will use the max by default)
117
+ - --verbose [0, 1, 2, 3, 4] is the level of terminal output details (0 -> No Output 1 -> Only ISA/Configuration Errors and Test Specifications, 2 -> Test Results, 3 -> Configuration Values Selected/Detected, 4 -> Debug Output)
118
+ - [--only_ld] indicates that the memory benchmarks will just contain loads (<ld_st_ratio> is ignored);
119
+ - [--only_st] indicates that the memory benchmarks will just contain stores (<ld_st_ratio> is ignored);
120
+ - [--no_freq_measure] disables the automatic frequency measuring (CPU frequency should be provided in config file or via --freq argument)
121
+ - [--set_freq] will set the cpu frequency to the specified one (sudo is required, x86 only, might not work)
122
+ - [--interleaved] indicates if the cores belong to interleaved numa domains (e.g. core 0 -> node 0, core 1 -> node 1, core 2 -> node 0, etc). Used for thread binding;
123
+ - [--dram_auto] automatically adjust the DRAM test size according to number of threads to ensure individual thread data only fits in DRAM (Default: 0)
124
+ - [--plot] enables the plotting of CARM/MEM results as an SVG image, allowing for result visualization without using the GUI (Default: 0)
125
+
126
+
127
+ A simple run can be executed with the command
128
+
129
+ ```
130
+ python run.py
131
+ ```
132
+
133
+ which by default runs the micro-benchmarks necessary to obtain CARM data, for all available ISAs using double-precision. The FP instructions used are the ADD and FMA instructions (32768 operations) and the memory benchmarks contain 2 loads per each store, with the DRAM test using an array with size 512MiB and 1 thread.
134
+
135
+
136
+ For additional information regarding the input arguments, run the command:
137
+
138
+ ```
139
+ python run.py -h
140
+ ```
141
+
142
+ To profile an application using **Performance Counters**, PMU_AI_Calculator.py should be executed with the following arguments:
143
+
144
+ - <executable_path> Path to the executable to analyze.
145
+ - <additional_args> Arguments for the executable that will be analyzed.
146
+ - --name <name> Name for the machine running the executable (Default: unnamed);
147
+ - --app_name <app_name> Name for the executable (if empty, executable name will be used);
148
+ - --isa <isa> Main ISA used by the executable, if not sure leave blank (optional only for naming facilitation);
149
+
150
+ Note that this requires the PAPI_LST_INS, PAPI_SP_OPS, and PAPI_DP_OPS events to be available on your system.
151
+
152
+ To profile an application using **Dynamic Binary Instrumentation**, DBI_AI_Calculator.py should be executed with the following arguments:
153
+
154
+ - <DBI_path> Path to the DynamoRIO directory, or Intel SDE directory if --sde is used.
155
+ - <executable_path> Path to the executable to analyze.
156
+ - [--roi] Measure only Region of Interest, or not. (Must be previously marked in the source code);
157
+ - [--sde] Measure using Intel SDE, instead of DynamoRIO (x86 only);
158
+ - --name <name> Name for the machine running the executable (Default: unnamed);
159
+ - --app_name <app_name> Name for the executable (if empty, executable name will be used);
160
+ - --isa <isa> Main ISA used by the executable, if not sure leave blank (optional only for naming facilitation);
161
+ - --threads <threads> Number of threads used by the application (optional only for naming facilitation);
162
+ - --precision <data_precision> Data Precision used by the application (optional only for naming facilitation);
163
+ - <additional_args> Arguments for the executable that will be analyzed. (This should be your last argument)
164
+
165
+ Note that both the PMU analysis and the DBI with ROI analysis require the previous injection of the source code with Region of Interest specific code; to facilitate this process you can include the dbi_carm_roi.h header file in your application directory and use the API functions to enable the DBI based ROI analysis.
166
+
167
+ ```
168
+ CARM_roi_begin();
169
+ CARM_roi_end();
170
+ ```
171
+
172
+ For PMU analysis via PAPI, the PAPI high level API must be used to define the region of interest via the functions.
173
+
174
+ ```
175
+ PAPI_hl_region_begin("");
176
+ PAPI_hl_region_end("");
177
+ ```
178
+
179
+ In case of PMU analysis the PAPI library must be linked during compilation, this can usually be done following one of these methods:
180
+
181
+ ```
182
+ Method 1:
183
+ gcc -<Compiler flags> -I/Path/To/Papi/src <source_file.c> -o <executable_file> /Path/To/Papi/src/libpapi.a
184
+
185
+ Method 2:
186
+ gcc -<Compiler flags> -I/${PAPI_DIR}/include -L/${PAPI_DIR}/lib <source_file.c> -o <executable_file> -lpapi
187
+ ```
188
+
189
+ The profiling results are automatically stored in a csv associated with the provided machine name; these results can then be viewed using the GUI. Make sure to match the machine name used in the profiling with the machine name used in the CARM benchmarks execution.
190
+
191
+ ## How to use (GUI)
192
+
193
+ Launch the GUI from the refactored CLI:
194
+
195
+ ```
196
+ ./carm.py gui
197
+ ```
198
+
199
+ By default, the GUI reads/writes results under `carm_results` relative to the current working directory. You can override this root with:
200
+
201
+ ```
202
+ ./carm.py gui --results-dir /path/to/results
203
+ ```
204
+
205
+ The GUI benchmark button now launches the refactored benchmark flow (`carm.py benchmark`) with roofline-focused settings. The benchmark output remains visible in the terminal where the GUI was launched.
206
+
207
+ The "Run Application Analysis" button is currently marked **TBI** and disabled in the GUI.
208
+
209
+ ## In papers and reports, please refer to this tool as follows
210
+
211
+ <p>
212
+ <a href="https://doi.org/10.1109/L-CA.2013.6" alt="Publication">
213
+ <img src="https://img.shields.io/badge/DOI-10.1109/L--CA.2013.6-blue.svg"/></a>
214
+
215
+ </p>
216
+
217
+ <p>
218
+ <a href="https://doi.org/10.1016/j.future.2020.01.044" alt="Publication">
219
+ <img src="https://img.shields.io/badge/DOI-10.1016/j.future.2020.01.044-blue.svg"/></a>
220
+
221
+ </p>
222
+
223
+ J. Morgado, L. Sousa, A. Ilic. "CARM Tool: Cache-Aware Roofline Model Automatic Benchmarking and Application Analysis", IEEE International Symposium on Workload Characterization (IISWC), Vancouver, British Columbia, Canada, 2024
224
+
225
+ A. Ilic, F. Pratas and L. Sousa, "Cache-aware Roofline model: Upgrading the loft," in IEEE Computer Architecture Letters, vol. 13, no. 1, pp. 21-24, 21 Jan.-June 2014, doi: 10.1109/L-CA.2013.6.
226
+
227
+ Diogo Marques, Aleksandar Ilic, Zakhar A. Matveev, and Leonel Sousa. "Application-driven cache-aware roofline model." Future Generation Computer Systems 107 (2020): 257-273.