onnx-diagnostic 0.7.3__py3-none-any.whl → 0.7.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two package versions.
Files changed (30)
  1. onnx_diagnostic/__init__.py +1 -1
  2. onnx_diagnostic/_command_lines_parser.py +82 -12
  3. onnx_diagnostic/export/shape_helper.py +71 -0
  4. onnx_diagnostic/helpers/_log_helper.py +461 -0
  5. onnx_diagnostic/helpers/cache_helper.py +11 -1
  6. onnx_diagnostic/helpers/log_helper.py +404 -315
  7. onnx_diagnostic/reference/ops/op_cast_like.py +12 -8
  8. onnx_diagnostic/tasks/automatic_speech_recognition.py +6 -2
  9. onnx_diagnostic/tasks/feature_extraction.py +92 -7
  10. onnx_diagnostic/tasks/fill_mask.py +6 -2
  11. onnx_diagnostic/tasks/image_classification.py +7 -3
  12. onnx_diagnostic/tasks/image_text_to_text.py +6 -2
  13. onnx_diagnostic/tasks/mixture_of_expert.py +1 -1
  14. onnx_diagnostic/tasks/object_detection.py +7 -3
  15. onnx_diagnostic/tasks/sentence_similarity.py +6 -2
  16. onnx_diagnostic/tasks/summarization.py +6 -2
  17. onnx_diagnostic/tasks/text2text_generation.py +8 -4
  18. onnx_diagnostic/tasks/text_classification.py +6 -2
  19. onnx_diagnostic/tasks/text_generation.py +5 -3
  20. onnx_diagnostic/tasks/text_to_image.py +6 -2
  21. onnx_diagnostic/tasks/zero_shot_image_classification.py +6 -2
  22. onnx_diagnostic/torch_export_patches/onnx_export_errors.py +63 -7
  23. onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +188 -51
  24. onnx_diagnostic/torch_models/hghub/model_inputs.py +6 -1
  25. onnx_diagnostic/torch_models/validate.py +49 -10
  26. {onnx_diagnostic-0.7.3.dist-info → onnx_diagnostic-0.7.5.dist-info}/METADATA +1 -1
  27. {onnx_diagnostic-0.7.3.dist-info → onnx_diagnostic-0.7.5.dist-info}/RECORD +30 -29
  28. {onnx_diagnostic-0.7.3.dist-info → onnx_diagnostic-0.7.5.dist-info}/WHEEL +0 -0
  29. {onnx_diagnostic-0.7.3.dist-info → onnx_diagnostic-0.7.5.dist-info}/licenses/LICENSE.txt +0 -0
  30. {onnx_diagnostic-0.7.3.dist-info → onnx_diagnostic-0.7.5.dist-info}/top_level.txt +0 -0
@@ -1,208 +1,23 @@
-import datetime
 import enum
-import glob
 import io
-import os
 import pprint
 import re
 import warnings
-import zipfile
-from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 import numpy as np
 import pandas
 from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
 from .helper import string_sig
-
-BUCKET_SCALES_VALUES = np.array(
-    [-np.inf, -20, -10, -5, -2, 0, 2, 5, 10, 20, 100, 200, 300, 400, np.inf], dtype=float
+from ._log_helper import (
+    BUCKET_SCALES,
+    breaking_last_point,
+    apply_excel_style,
+    align_dataframe_with,
+    open_dataframe,
+    enumerate_csv_files,
 )


-BUCKET_SCALES = BUCKET_SCALES_VALUES / 100 + 1
-
-
-def filter_data(
-    df: pandas.DataFrame,
-    filter_in: Optional[str] = None,
-    filter_out: Optional[str] = None,
-    verbose: int = 0,
-) -> pandas.DataFrame:
-    """
-    Argument `filter` follows the syntax
-    ``<column1>:<fmt1>//<column2>:<fmt2>``.
-
-    The format is the following:
-
-    * a value or a set of values separated by ``;``
-    """
-    if not filter_in and not filter_out:
-        return df
-
-    def _f(fmt):
-        cond = {}
-        if isinstance(fmt, str):
-            cols = fmt.split("//")
-            for c in cols:
-                assert ":" in c, f"Unexpected value {c!r} in fmt={fmt!r}"
-                spl = c.split(":")
-                assert len(spl) == 2, f"Unexpected value {c!r} in fmt={fmt!r}"
-                name, fil = spl
-                cond[name] = set(fil.split(";"))
-        return cond
-
-    if filter_in:
-        cond = _f(filter_in)
-        assert isinstance(cond, dict), f"Unexpected type {type(cond)} for fmt={filter_in!r}"
-        for k, v in cond.items():
-            if k not in df.columns:
-                continue
-            if verbose:
-                print(
-                    f"[_filter_data] filter in column {k!r}, "
-                    f"values {v!r} among {set(df[k].astype(str))}"
-                )
-            df = df[df[k].astype(str).isin(v)]
-
-    if filter_out:
-        cond = _f(filter_out)
-        assert isinstance(cond, dict), f"Unexpected type {type(cond)} for fmt={filter_out!r}"
-        for k, v in cond.items():
-            if k not in df.columns:
-                continue
-            if verbose:
-                print(
-                    f"[_filter_data] filter out column {k!r}, "
-                    f"values {v!r} among {set(df[k].astype(str))}"
-                )
-            df = df[~df[k].astype(str).isin(v)]
-    return df
-
-
-def enumerate_csv_files(
-    data: Union[
-        pandas.DataFrame, List[Union[str, Tuple[str, str]]], str, Tuple[str, str, str, str]
-    ],
-    verbose: int = 0,
-    filtering: Optional[Callable[[str], bool]] = None,
-) -> Iterator[Union[pandas.DataFrame, str, Tuple[str, str, str, str]]]:
-    """
-    Enumerates files considered for the aggregation.
-    Only csv files are considered.
-    If a zip file is given, the function digs into the zip files and
-    loops over csv candidates.
-
-    :param data: dataframe with the raw data or a file or list of files
-    :param vrbose: verbosity
-    :param filtering: function to filter in or out files in zip files,
-        must return true to keep the file, false to skip it.
-    :return: a generator yielding tuples with the filename, date, full path and zip file
-
-    data can contains:
-    * a dataframe
-    * a string for a filename, zip or csv
-    * a list of string
-    * a tuple
-    """
-    if not isinstance(data, list):
-        data = [data]
-    for itn, filename in enumerate(data):
-        if isinstance(filename, pandas.DataFrame):
-            if verbose:
-                print(f"[enumerate_csv_files] data[{itn}] is a dataframe")
-            yield filename
-            continue
-
-        if isinstance(filename, tuple):
-            # A file in a zipfile
-            if verbose:
-                print(f"[enumerate_csv_files] data[{itn}] is {filename!r}")
-            yield filename
-            continue
-
-        if os.path.exists(filename):
-            ext = os.path.splitext(filename)[-1]
-            if ext == ".csv":
-                # We check the first line is ok.
-                if verbose:
-                    print(f"[enumerate_csv_files] data[{itn}] is a csv file: {filename!r}]")
-                dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
-                du = dt.strftime("%Y-%m-%d %H:%M:%S")
-                yield (os.path.split(filename)[-1], du, filename, "")
-                continue
-
-            if ext == ".zip":
-                if verbose:
-                    print(f"[enumerate_csv_files] data[{itn}] is a zip file: {filename!r}]")
-                zf = zipfile.ZipFile(filename, "r")
-                for ii, info in enumerate(zf.infolist()):
-                    name = info.filename
-                    if filtering is None:
-                        ext = os.path.splitext(name)[-1]
-                        if ext != ".csv":
-                            continue
-                    elif not filtering(name):
-                        continue
-                    if verbose:
-                        print(
-                            f"[enumerate_csv_files] data[{itn}][{ii}] is a csv file: {name!r}]"
-                        )
-                    with zf.open(name) as zzf:
-                        first_line = zzf.readline()
-                    if b"," not in first_line:
-                        continue
-                    yield (
-                        os.path.split(name)[-1],
-                        "%04d-%02d-%02d %02d:%02d:%02d" % info.date_time,
-                        name,
-                        filename,
-                    )
-                zf.close()
-                continue
-
-            raise AssertionError(f"Unexpected format {filename!r}, cannot read it.")
-
-        # filename is a pattern.
-        found = glob.glob(filename)
-        if verbose and not found:
-            print(f"[enumerate_csv_files] unable to find file in {filename!r}")
-        for ii, f in enumerate(found):
-            if verbose:
-                print(f"[enumerate_csv_files] data[{itn}][{ii}] {f!r} from {filename!r}")
-            yield from enumerate_csv_files(f, verbose=verbose, filtering=filtering)
-
-
-def open_dataframe(
-    data: Union[str, Tuple[str, str, str, str], pandas.DataFrame],
-) -> pandas.DataFrame:
-    """
-    Opens a filename defined by function
-    :func:`onnx_diagnostic.helpers.log_helper.enumerate_csv_files`.
-
-    :param data: a dataframe, a filename, a tuple indicating the file is coming
-        from a zip file
-    :return: a dataframe
-    """
-    if isinstance(data, pandas.DataFrame):
-        return data
-    if isinstance(data, str):
-        df = pandas.read_csv(data)
-        df["RAWFILENAME"] = data
-        return df
-    if isinstance(data, tuple):
-        if not data[-1]:
-            df = pandas.read_csv(data[2])
-            df["RAWFILENAME"] = data[2]
-            return df
-        zf = zipfile.ZipFile(data[-1])
-        with zf.open(data[2]) as f:
-            df = pandas.read_csv(f)
-            df["RAWFILENAME"] = f"{data[-1]}/{data[2]}"
-        zf.close()
-        return df

-    raise ValueError(f"Unexpected value for data: {data!r}")
-
-
 class CubeViewDef:
     """
     Defines how to compute a view.
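
The hunk above is the core of the 0.7.5 refactoring: BUCKET_SCALES and the file and Excel helpers (enumerate_csv_files, open_dataframe, apply_excel_style, align_dataframe_with, breaking_last_point) now live in the new onnx_diagnostic/helpers/_log_helper.py (+461 lines in the file list) and are re-imported by log_helper. A minimal sketch of what that means for callers, following the re-import block above; the CSV pattern is an invented placeholder:

    import pandas

    # log_helper re-imports the helpers from the new private module _log_helper,
    # so the historical import path should keep working after the split.
    from onnx_diagnostic.helpers.log_helper import enumerate_csv_files, open_dataframe

    # "bench*.csv" is a hypothetical glob pattern, not a file shipped with the package.
    frames = [open_dataframe(item) for item in enumerate_csv_files("bench*.csv")]
    raw = pandas.concat(frames) if frames else pandas.DataFrame()
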
@@ -226,9 +41,46 @@ class CubeViewDef:
     :param name: name of the view, used mostly to debug
     :param plots: adds plot to the Excel sheet
     :param no_index: remove the index (but keeps the columns)
+
+    Some examples of views. First example is an aggregated view
+    for many metrics.
+
+    .. code-block:: python
+
+        cube = CubeLogs(...)
+
+        CubeViewDef(
+            key_index=cube._filter_column(fs, cube.keys_time),
+            values=cube._filter_column(
+                ["TIME_ITER", "speedup", "time_latency.*", "onnx_n_nodes"],
+                cube.values,
+            ),
+            ignore_unique=True,
+            key_agg=["model_name", "task", "model_task", "suite"],
+            agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
+            agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
+            name="agg-all",
+            plots=True,
+        )
+
+    Next one focuses on a couple of metrics.
+
+    .. code-block:: python
+
+        cube = CubeLogs(...)
+
+        CubeViewDef(
+            key_index=cube._filter_column(fs, cube.keys_time),
+            values=cube._filter_column(["speedup"], cube.values),
+            ignore_unique=True,
+            keep_columns_in_index=["suite"],
+            name="speedup",
+        )
     """

     class HighLightKind(enum.IntEnum):
+        "Codes to highlight values."
+
         NONE = 0
         RED = 1
         GREEN = 2
@@ -276,118 +128,6 @@ class CubeViewDef:
         return string_sig(self)  # type: ignore[arg-type]


-def apply_excel_style(
-    filename_or_writer: Any,
-    f_highlights: Optional[Dict[str, Callable[[Any], CubeViewDef.HighLightKind]]] = None,
-):
-    """
-    Applies styles on all sheets in a file unless the sheet is too big.
-
-    :param filename_or_writer: filename, modified inplace
-    :param f_highlight: color function to apply, one per sheet
-    """
-    from openpyxl import load_workbook
-    from openpyxl.styles import Alignment
-    from openpyxl.utils import get_column_letter
-    from openpyxl.styles import Font  # , PatternFill, numbers
-
-    if isinstance(filename_or_writer, str):
-        workbook = load_workbook(filename_or_writer)
-        save = True
-    else:
-        workbook = filename_or_writer.book
-        save = False
-
-    left = Alignment(horizontal="left")
-    left_shrink = Alignment(horizontal="left", shrink_to_fit=True)
-    right = Alignment(horizontal="right")
-    font_colors = {
-        CubeViewDef.HighLightKind.GREEN: Font(color="00AA00"),
-        CubeViewDef.HighLightKind.RED: Font(color="FF0000"),
-    }
-
-    for name in workbook.sheetnames:
-        f_highlight = f_highlights.get(name, None) if f_highlights else None
-        sheet = workbook[name]
-        n_rows = sheet.max_row
-        n_cols = sheet.max_column
-        if n_rows * n_cols > 2**18:
-            # Too big.
-            continue
-        co: Dict[int, int] = {}
-        sizes: Dict[int, int] = {}
-        cols = set()
-        for i in range(1, n_rows + 1):
-            for j, cell in enumerate(sheet[i]):
-                if j > n_cols:
-                    break
-                cols.add(cell.column)
-                if isinstance(cell.value, float):
-                    co[j] = co.get(j, 0) + 1
-                elif isinstance(cell.value, str):
-                    sizes[cell.column] = max(sizes.get(cell.column, 0), len(cell.value))
-
-        for k, v in sizes.items():
-            c = get_column_letter(k)
-            sheet.column_dimensions[c].width = min(max(8, v), 30)
-        for k in cols:
-            if k not in sizes:
-                c = get_column_letter(k)
-                sheet.column_dimensions[c].width = 15
-
-        for i in range(1, n_rows + 1):
-            for j, cell in enumerate(sheet[i]):
-                if j > n_cols:
-                    break
-                if isinstance(cell.value, pandas.Timestamp):
-                    cell.alignment = right
-                    dt = cell.value.to_pydatetime()
-                    cell.value = dt
-                    cell.number_format = (
-                        "YYYY-MM-DD"
-                        if (
-                            dt.hour == 0
-                            and dt.minute == 0
-                            and dt.second == 0
-                            and dt.microsecond == 0
-                        )
-                        else "YYYY-MM-DD 00:00:00"
-                    )
-                elif isinstance(cell.value, (float, int)):
-                    cell.alignment = right
-                    x = abs(cell.value)
-                    if int(x) == x:
-                        cell.number_format = "0"
-                    elif x > 5000:
-                        cell.number_format = "# ##0"
-                    elif x >= 500:
-                        cell.number_format = "0.0"
-                    elif x >= 50:
-                        cell.number_format = "0.00"
-                    elif x >= 5:
-                        cell.number_format = "0.000"
-                    elif x > 0.5:
-                        cell.number_format = "0.0000"
-                    elif x > 0.005:
-                        cell.number_format = "0.00000"
-                    else:
-                        cell.number_format = "0.000E+00"
-                    if f_highlight:
-                        h = f_highlight(cell.value)
-                        if h in font_colors:
-                            cell.font = font_colors[h]
-                elif isinstance(cell.value, str) and len(cell.value) > 70:
-                    cell.alignment = left_shrink
-                else:
-                    cell.alignment = left
-                    if f_highlight:
-                        h = f_highlight(cell.value)
-                        if h in font_colors:
-                            cell.font = font_colors[h]
-    if save:
-        workbook.save(filename_or_writer)
-
-
 class CubePlot:
     """
     Creates a plot.
@@ -397,6 +137,26 @@ class CubePlot:
     :param split: draw a graph per line in the dataframe
     :param timeseries: this assumes the time is one level of the columns,
         this argument indices the level name
+
+    It defines a graph. Usually *bar* or *barh* is used to
+    compare experiments for every metric, a subplot by metric.
+
+    .. code-block:: python
+
+        CubePlot(df, kind="barh", orientation="row", split=True)
+
+    *line* is usually used to plot timeseries showing the
+    evolution of metrics over time.
+
+    .. code-block:: python
+
+        CubePlot(
+            df,
+            kind="line",
+            orientation="row",
+            split=True,
+            timeseries="time",
+        )
     """

     KINDS = {"bar", "barh", "line"}
@@ -607,6 +367,35 @@ class CubePlot:
 class CubeLogs:
     """
     Processes logs coming from experiments.
+    A cube is basically a database with certain columns
+    playing specific roles.
+
+    * time: only one column, it is not mandatory but it is recommended
+      to have one
+    * keys: they are somehow coordinates, they cannot be aggregated,
+      they are not numbers, more like categories, `(time, *keys)`
+      identifies an element of the database in an unique way,
+      there cannot be more than one row sharing the same key and time
+      values
+    * values: they are not necessary numerical, but if they are,
+      they can be aggregated
+
+    Every other columns is ignored. More columns can be added
+    by using formulas.
+
+    :param data: the raw data
+    :param time: the time column
+    :param keys: the keys, can include regular expressions
+    :param values: the values, can include regular expressions
+    :param ignored: ignores some column, acts as negative regular
+        expressions for the other two
+    :param recent: if more than one rows share the same keys,
+        the cube only keeps the most recent one
+    :param formulas: columns to add, defined with formulas
+    :param fill_missing: a dictionary, defines values replacing missing one
+        for some columns
+    :param keep_last_date: overwrites all the times with the most recent
+        one, it makes things easier for timeseries
     """

     def __init__(
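
The docstring added above fixes the data model: one optional time column, key columns acting as coordinates, and value columns that can be aggregated. A rough illustration with invented column names (not part of the package), assuming the constructor arguments behave as documented:

    import pandas
    from onnx_diagnostic.helpers.log_helper import CubeLogs

    # Hypothetical benchmark logs: "date" plays the time role, "model" and
    # "exporter" are keys (categories), "speedup" is an aggregatable value.
    df = pandas.DataFrame(
        {
            "date": pandas.to_datetime(
                ["2024-01-01", "2024-01-01", "2024-01-02", "2024-01-02"]
            ),
            "model": ["m1", "m2", "m1", "m2"],
            "exporter": ["onnx", "onnx", "onnx", "onnx"],
            "speedup": [1.1, 0.9, 1.2, 1.0],
        }
    )
    cube = CubeLogs(df, time="date", keys=["model", "exporter"], values=["speedup"])
    cube.load()
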
@@ -636,6 +425,22 @@ class CubeLogs:
         self.fill_missing = fill_missing
         self.keep_last_date = keep_last_date

+    def clone(
+        self, data: Optional[pandas.DataFrame] = None, keys: Optional[Sequence[str]] = None
+    ) -> "CubeLogs":
+        """
+        Makes a copy of the dataframe.
+        It copies the processed data not the original one.
+        """
+        cube = self.__class__(
+            data if data is not None else self.data.copy(),
+            time=self.time,
+            keys=keys or self.keys_no_time,
+            values=self.values,
+        )
+        cube.load()
+        return cube
+
     def post_load_process_piece(
         self, df: pandas.DataFrame, unique: bool = False
     ) -> pandas.DataFrame:
@@ -741,17 +546,13 @@ class CubeLogs:
             print(f"[CubeLogs.load] dropped={self.dropped}")
             print(f"[CubeLogs.load] data.shape={self.data.shape}")

-        shape = self.data.shape
         if verbose:
             print(f"[CubeLogs.load] removed columns, shape={self.data.shape}")
         self._preprocess()
         if verbose:
             print(f"[CubeLogs.load] preprocess, shape={self.data.shape}")
-        assert (
-            self.data.shape[0] > 0
-        ), f"The preprocessing reduced shape {shape} to {self.data.shape}."
-        if self.recent and verbose:
-            print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")
+        if self.recent:
+            print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")

         # Let's apply the formulas
         if self._formulas:
@@ -883,6 +684,18 @@ class CubeLogs:
         "usual"
         return str(self.data) if hasattr(self, "data") else str(self._data)

+    def make_view_def(self, name: str) -> Optional[CubeViewDef]:
+        """
+        Returns a view definition.
+
+        :param name: name of a value
+        :return: a CubeViewDef or None if name does not make sense
+        """
+        assert name in self.values, f"{name!r} is not one of the values {self.values}"
+        keys = sorted(self.keys_no_time)
+        index = len(keys) // 2 + (len(keys) % 2)
+        return CubeViewDef(key_index=keys[:index], values=[name], name=name)
+
     def view(
         self,
         view_def: Union[str, CubeViewDef],
@@ -900,6 +713,12 @@ class CubeLogs:
         :param verbose: verbosity level
         :return: dataframe
         """
+        if isinstance(view_def, str):
+            # We automatically create a view for a metric
+            view_def_ = self.make_view_def(view_def)
+            assert view_def_ is not None, f"Unable to create a view from {view_def!r}"
+            view_def = view_def_
+
         assert isinstance(
             view_def, CubeViewDef
         ), f"view_def should be a CubeViewDef, got {type(view_def)}: {view_def!r} instead"
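
Together, make_view_def and the new branch in view() let a plain value name stand in for a full CubeViewDef: half of the keys go to the index, the rest to the columns. A hedged sketch, reusing the toy cube built earlier:

    # Both calls should be equivalent; the string is expanded through make_view_def.
    piv_from_name = cube.view("speedup")
    piv_from_def = cube.view(cube.make_view_def("speedup"))
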
@@ -1113,6 +932,17 @@ class CubeLogs:
         else:
             piv.sort_index(inplace=True, axis=1)

+        # final step, force columns with numerical values to be float
+        for c in list(piv.columns):
+            s = piv[c]
+            if not pandas.api.types.is_object_dtype(s):
+                continue
+            try:
+                sf = s.astype(float)
+            except (ValueError, TypeError):
+                continue
+            piv[c] = sf
+
         if verbose:
             print(f"[CubeLogs.view] levels {piv.index.names}, {piv.columns.names}")
             print(f"[CubeLogs.view] -- done view {view_def.name!r}")
@@ -1155,7 +985,9 @@ class CubeLogs:
         for c in set(key_index) | set(key_columns):
             s = new_data[c]
             if s.isna().max():
-                if pandas.api.types.is_numeric_dtype(s):
+                if pandas.api.types.is_numeric_dtype(
+                    s
+                ) and not pandas.api.types.is_object_dtype(s):
                     min_v = s.dropna().min()
                     assert (
                         min_v >= 0
@@ -1192,7 +1024,7 @@ class CubeLogs:
             )
             if len(nonan) > 0:
                 obs.update(dict(count=len(nonan)))
-                if is_numeric_dtype(nonan):
+                if is_numeric_dtype(nonan) and not pandas.api.types.is_object_dtype(nonan):
                     obs.update(
                         dict(
                             min=nonan.min(),
@@ -1228,9 +1060,11 @@ class CubeLogs:
         raw: Optional[str] = "raw",
         verbose: int = 0,
         csv: Optional[Sequence[str]] = None,
+        time_mask: bool = False,
+        sbs: Optional[Dict[str, Dict[str, Any]]] = None,
     ):
         """
-        Creates an excel file with a list of view.
+        Creates an excel file with a list of views.

         :param output: output file to create
         :param views: sequence or dictionary of views to append
@@ -1238,9 +1072,17 @@ class CubeLogs:
         :param raw: add a page with the raw data
         :param csv: views to dump as csv files (same name as outputs + view naw)
         :param verbose: verbosity
+        :param time_mask: color the background of the cells if one
+            of the value for the last date is unexpected,
+            assuming they should remain stale
+        :param sbs: configurations to compare side-by-side, this adds two tabs,
+            one gathering raw data about the two configurations, the other one
+            is aggregated by metrics
         """
         if verbose:
             print(f"[CubeLogs.to_excel] create Excel file {output}, shape={self.shape}")
+        time_mask &= len(self.data[self.time].unique()) > 2
+        cube_time = self.cube_time(fill_other_dates=True) if time_mask else None
         views = {k: k for k in views} if not isinstance(views, dict) else views
         f_highlights = {}
         plots = []
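
to_excel() gains two options in 0.7.5: time_mask (only active when the cube holds more than two distinct dates) flags last-date values that break the recent trend, and sbs adds two side-by-side sheets. A sketch of a call, with an invented file name and configuration values; each configuration must select existing rows through key columns:

    cube.to_excel(
        "report.xlsx",                     # hypothetical output file
        views=["speedup"],                 # value names are expanded into default views
        time_mask=True,                    # needs more than two distinct dates to kick in
        sbs={"CFA": {"exporter": "E1"}, "CFB": {"exporter": "E2"}},  # placeholder values
    )
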
@@ -1252,10 +1094,25 @@ class CubeLogs:
                 print(f"[CubeLogs.to_excel] add sheet {main!r} with shape {df.shape}")
             df.to_excel(writer, sheet_name=main, freeze_panes=(1, 1))

+        time_mask_view: Dict[str, pandas.DataFrame] = {}
         for name, view in views.items():
             if view is None:
                 continue
             df, tview = self.view(view, return_view_def=True, verbose=max(verbose - 1, 0))
+            if cube_time is not None:
+                cube_mask = cube_time.view(view)
+                aligned = align_dataframe_with(cube_mask, df)
+                if aligned is not None:
+                    assert aligned.shape == df.shape, (
+                        f"Shape mismatch between the view {df.shape} and the mask "
+                        f"{time_mask_view[name].shape}"
+                    )
+                    time_mask_view[name] = aligned
+                    if verbose:
+                        print(
+                            f"[CubeLogs.to_excel] compute mask for view {name!r} "
+                            f"with shape {aligned.shape}"
+                        )
             if tview is None:
                 continue
             memory = df.memory_usage(deep=True).sum()
@@ -1335,6 +1192,36 @@ class CubeLogs:
                 writer, sheet_name="raw", freeze_panes=(1, 1), index=True
             )

+        if sbs:
+            if verbose:
+                for k, v in sbs.items():
+                    print(f"[CubeLogs.to_excel] sbs {k}: {v}")
+            name = "∧".join(sbs)
+            sbs_raw, sbs_agg = self.sbs(sbs)
+            if verbose:
+                print(f"[CubeLogs.to_excel] add sheet {name!r} with shape {sbs_raw.shape}")
+                print(
+                    f"[CubeLogs.to_excel] add sheet '{name}-AGG' "
+                    f"with shape {sbs_agg.shape}"
+                )
+            sbs_raw = sbs_raw.reset_index(drop=False)
+            sbs_raw.to_excel(
+                writer,
+                sheet_name=name,
+                freeze_panes=(
+                    sbs_raw.columns.nlevels + sbs_raw.index.nlevels,
+                    sbs_raw.index.nlevels,
+                ),
+            )
+            sbs_agg.to_excel(
+                writer,
+                sheet_name=f"{name}-AGG",
+                freeze_panes=(
+                    sbs_agg.columns.nlevels + sbs_agg.index.nlevels,
+                    sbs_agg.index.nlevels,
+                ),
+            )
+
         if plots:
             from openpyxl.drawing.image import Image

@@ -1366,10 +1253,194 @@ class CubeLogs:

         if verbose:
             print(f"[CubeLogs.to_excel] applies style to {output!r}")
-        apply_excel_style(writer, f_highlights)  # type: ignore[arg-type]
+        apply_excel_style(
+            writer, f_highlights, time_mask_view=time_mask_view, verbose=verbose  # type: ignore[arg-type]
+        )
         if verbose:
             print(f"[CubeLogs.to_excel] done with {len(views)} views")

+    def cube_time(self, fill_other_dates: bool = False, threshold: float = 1.2) -> "CubeLogs":
+        """
+        Aggregates the data over time to detect changes on the last value.
+        If *fill_other_dates* is True, all dates are kept, but values
+        are filled with 0.
+        *threshold* determines the bandwidth within the values are expected,
+        should be a factor of the standard deviation.
+        """
+        unique_time = self.data[self.time].unique()
+        assert len(unique_time) > 2, f"Not enough dates to proceed: unique_time={unique_time}"
+        gr = self.data[[*self.keys_no_time, *self.values]].groupby(
+            self.keys_no_time, dropna=False
+        )
+        dgr = gr.agg(
+            lambda series, th=threshold: int(breaking_last_point(series, threshold=th)[0])
+        )
+        tm = unique_time.max()
+        assert dgr.shape[0] > 0, (
+            f"Unexpected output shape={dgr.shape}, unique_time={unique_time}, "
+            f"data.shape={self.data.shape}"
+        )
+        dgr[self.time] = tm
+        if fill_other_dates:
+            other_df = []
+            other_dates = [t for t in unique_time if t != tm]
+            for t in other_dates:
+                df = dgr.copy()
+                df[self.time] = t
+                for c in df.columns:
+                    if c != self.time:
+                        df[c] = 0
+                other_df.append(df)
+            dgr = pandas.concat([dgr, *other_df], axis=0)
+            assert dgr.shape[0] > 0, (
+                f"Unexpected output shape={dgr.shape}, unique_time={unique_time}, "
+                f"data.shape={self.data.shape}, "
+                f"other_df shapes={[df.shape for df in other_df]}"
+            )
+        return self.clone(data=dgr.reset_index(drop=False))
+
+    def sbs(
+        self, configs: Dict[str, Dict[str, Any]], column_name: str = "CONF"
+    ) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
+        """
+        Creates a side-by-side for two configurations.
+        Every configuration a dictionary column:value which filters in
+        the rows to keep in order to compute the side by side.
+        Every configuration is given a name (the key in configs),
+        it is added in column column_name.
+
+        :param configs: example
+            ``dict(CFA=dict(exporter="E1", opt="O"), CFB=dict(exporter="E2", opt="O"))``
+        :param column_name: column to add with the name of the configuration
+        :return: data and aggregated date
+        """
+        assert (
+            len(configs) >= 2
+        ), f"A side by side needs at least two configs but configs={configs}"
+        set_keys_time = set(self.keys_time)
+        columns_index = None
+        data_list = []
+        for name_conf, conf in configs.items():
+            if columns_index is None:
+                columns_index = list(conf.keys())
+                assert set(columns_index) <= set_keys_time, (
+                    f"Configuration {conf} includes columns outside the keys "
+                    f"{', '.join(sorted(set_keys_time))}"
+                )
+            else:
+                assert set(columns_index) == set(conf), (
+                    f"Every conf should share the same keys but conf={conf} "
+                    f"is different from {set(columns_index)}"
+                )
+            data = self.data
+            for k, v in conf.items():
+                data = data[data[k] == v]
+            assert data.shape[0] > 0, f"No rows found for conf={conf}"
+            assert (
+                column_name not in data.columns
+            ), f"column_name={column_name!r} is already in {data.columns}"
+            data = data.copy()
+            data[column_name] = name_conf
+            data_list.append(data)
+
+        new_data = pandas.concat(data_list, axis=0)
+        cube = self.clone(new_data, keys=[*self.keys_no_time, column_name])
+        key_index = set(self.keys_time) - {*columns_index, column_name}  # type: ignore[misc]
+        view = CubeViewDef(
+            key_index=set(key_index),  # type: ignore[arg-type]
+            name="sbs",
+            values=cube.values,
+            keep_columns_in_index=[self.time],
+        )
+        view_res = cube.view(view)
+        assert isinstance(view_res, pandas.DataFrame), "not needed but mypy complains"
+
+        # add metrics
+        index_column_name = list(view_res.columns.names).index(column_name)
+        index_metrics = list(view_res.columns.names).index("METRICS")
+
+        def _mkc(m, s):
+            c = ["" for c in view_res.columns.names]
+            c[index_column_name] = s
+            c[index_metrics] = m
+            return tuple(c)
+
+        list_configs = list(configs.items())
+        mean_columns = [
+            c
+            for c in view_res.columns
+            if pandas.api.types.is_numeric_dtype(view_res[c])
+            and not pandas.api.types.is_object_dtype(view_res[c])
+        ]
+        assert mean_columns, f"No numerical columns in {view_res.dtypes}"
+        view_res = view_res[mean_columns].copy()
+        metrics = sorted(set(c[index_metrics] for c in view_res.columns))
+        assert metrics, (
+            f"No numerical metrics detected in "
+            f"view_res.columns.names={view_res.columns.names}, "
+            f"columns={view_res.dtypes}"
+        )
+        sum_columns = []
+        columns_to_add = []
+        for i in range(len(list_configs)):
+            for j in range(i + 1, len(list_configs)):
+                for m in metrics:
+                    iname, ci = list_configs[i]
+                    jname, cj = list_configs[j]
+                    ci = ci.copy()
+                    cj = cj.copy()
+                    ci["METRICS"] = m
+                    cj["METRICS"] = m
+                    ci["CONF"] = iname
+                    cj["CONF"] = jname
+
+                    ci_name = tuple(ci[n] for n in view_res.columns.names)
+                    cj_name = tuple(cj[n] for n in view_res.columns.names)
+                    assert ci_name in view_res.columns or cj_name in view_res.columns, (
+                        f"Unable to find column {ci_name} or {cj_name} "
+                        f"in columns {view_res.columns}, metrics={metrics}"
+                    )
+                    if ci_name not in view_res.columns or cj_name not in view_res.columns:
+                        # One config does not have such metric.
+                        continue
+
+                    si = view_res[ci_name]
+                    sj = view_res[cj_name]
+
+                    sinan = si.isna()
+                    sjnan = sj.isna()
+                    n1 = iname
+                    n2 = jname
+                    nas = pandas.DataFrame(
+                        {
+                            _mkc(m, f"∅{n1}∧∅{n2}"): (sinan & sjnan).astype(int),
+                            _mkc(m, f"∅{n1}∧{n2}"): (sinan & ~sjnan).astype(int),
+                            _mkc(m, f"{n1}∧∅{n2}"): (~sinan & sjnan).astype(int),
+                            _mkc(m, f"{n1}∧{n2}"): (~sinan & ~sjnan).astype(int),
+                            _mkc(m, f"{n1}<{n2}"): (si < sj).astype(int),
+                            _mkc(m, f"{n1}=={n2}"): (si == sj).astype(int),
+                            _mkc(m, f"{n1}>{n2}"): (si > sj).astype(int),
+                        }
+                    )
+                    nas.columns.names = view_res.columns.names
+                    columns_to_add.append(nas)
+                    sum_columns.extend(nas.columns)
+
+        view_res = pandas.concat([view_res, *columns_to_add], axis=1)
+        res = view_res.stack("METRICS", future_stack=True)  # type: ignore[union-attr]
+        res = res.reorder_levels(
+            [res.index.nlevels - 1, *list(range(res.index.nlevels - 1))]
+        ).sort_index()
+
+        # aggregated metrics
+        aggs = {
+            **{k: "mean" for k in mean_columns},  # noqa: C420
+            **{k: "sum" for k in sum_columns},  # noqa: C420
+        }
+        flat = view_res.groupby(self.time).agg(aggs)
+        flat = flat.stack("METRICS", future_stack=True)
+        return res, flat
+

 class CubeLogsPerformance(CubeLogs):
     """
@@ -1456,6 +1527,24 @@ class CubeLogsPerformance(CubeLogs):
             keep_last_date=keep_last_date,
         )

+    def clone(
+        self, data: Optional[pandas.DataFrame] = None, keys: Optional[Sequence[str]] = None
+    ) -> "CubeLogs":
+        """
+        Makes a copy of the dataframe.
+        It copies the processed data not the original one.
+        keys can be changed as well.
+        """
+        cube = self.__class__(
+            data if data is not None else self.data.copy(),
+            time=self.time,
+            keys=keys or self.keys_no_time,
+            values=self.values,
+            recent=False,
+        )
+        cube.load()
+        return cube
+
     def _process_formula(
         self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
     ) -> Callable[[pandas.DataFrame], pandas.Series]:
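
The new CubeLogs.sbs shown above can also be called on its own. Following its docstring, each configuration is a column:value filter over key columns and gets a name that ends up in the CONF column level; the first returned frame is the per-row side-by-side, the second the per-date aggregate. A sketch reusing the docstring's own example values (E1, E2 and O are placeholders, as is the cube itself):

    sbs_raw, sbs_agg = cube.sbs(
        dict(CFA=dict(exporter="E1", opt="O"), CFB=dict(exporter="E2", opt="O"))
    )
    # sbs_raw: values of both configurations side by side, plus comparison counters
    # such as CFA<CFB, CFA==CFB, CFA>CFB and missing-value combinations, stacked by METRICS.
    # sbs_agg: the same columns aggregated per date (mean for metrics, sum for counters).
    print(sbs_agg)
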