onnx-diagnostic 0.8.0 py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. onnx_diagnostic/__init__.py +7 -0
  2. onnx_diagnostic/__main__.py +4 -0
  3. onnx_diagnostic/_command_lines_parser.py +1141 -0
  4. onnx_diagnostic/api.py +15 -0
  5. onnx_diagnostic/doc.py +100 -0
  6. onnx_diagnostic/export/__init__.py +2 -0
  7. onnx_diagnostic/export/api.py +124 -0
  8. onnx_diagnostic/export/dynamic_shapes.py +1083 -0
  9. onnx_diagnostic/export/shape_helper.py +296 -0
  10. onnx_diagnostic/export/validate.py +173 -0
  11. onnx_diagnostic/ext_test_case.py +1290 -0
  12. onnx_diagnostic/helpers/__init__.py +1 -0
  13. onnx_diagnostic/helpers/_log_helper.py +463 -0
  14. onnx_diagnostic/helpers/args_helper.py +132 -0
  15. onnx_diagnostic/helpers/bench_run.py +450 -0
  16. onnx_diagnostic/helpers/cache_helper.py +687 -0
  17. onnx_diagnostic/helpers/config_helper.py +170 -0
  18. onnx_diagnostic/helpers/doc_helper.py +163 -0
  19. onnx_diagnostic/helpers/fake_tensor_helper.py +273 -0
  20. onnx_diagnostic/helpers/graph_helper.py +386 -0
  21. onnx_diagnostic/helpers/helper.py +1707 -0
  22. onnx_diagnostic/helpers/log_helper.py +2245 -0
  23. onnx_diagnostic/helpers/memory_peak.py +249 -0
  24. onnx_diagnostic/helpers/mini_onnx_builder.py +600 -0
  25. onnx_diagnostic/helpers/model_builder_helper.py +469 -0
  26. onnx_diagnostic/helpers/onnx_helper.py +1200 -0
  27. onnx_diagnostic/helpers/ort_session.py +736 -0
  28. onnx_diagnostic/helpers/rt_helper.py +476 -0
  29. onnx_diagnostic/helpers/torch_helper.py +987 -0
  30. onnx_diagnostic/reference/__init__.py +4 -0
  31. onnx_diagnostic/reference/evaluator.py +254 -0
  32. onnx_diagnostic/reference/ops/__init__.py +1 -0
  33. onnx_diagnostic/reference/ops/op_add_add_mul_mul.py +68 -0
  34. onnx_diagnostic/reference/ops/op_attention.py +60 -0
  35. onnx_diagnostic/reference/ops/op_average_pool_grad.py +63 -0
  36. onnx_diagnostic/reference/ops/op_bias_softmax.py +16 -0
  37. onnx_diagnostic/reference/ops/op_cast_like.py +46 -0
  38. onnx_diagnostic/reference/ops/op_complex.py +26 -0
  39. onnx_diagnostic/reference/ops/op_concat.py +15 -0
  40. onnx_diagnostic/reference/ops/op_constant_of_shape.py +67 -0
  41. onnx_diagnostic/reference/ops/op_fused_matmul.py +31 -0
  42. onnx_diagnostic/reference/ops/op_gather.py +29 -0
  43. onnx_diagnostic/reference/ops/op_gather_elements.py +45 -0
  44. onnx_diagnostic/reference/ops/op_gather_grad.py +12 -0
  45. onnx_diagnostic/reference/ops/op_memcpy_host.py +11 -0
  46. onnx_diagnostic/reference/ops/op_mul_sigmoid.py +23 -0
  47. onnx_diagnostic/reference/ops/op_negxplus1.py +8 -0
  48. onnx_diagnostic/reference/ops/op_qlinear_average_pool.py +40 -0
  49. onnx_diagnostic/reference/ops/op_qlinear_conv.py +102 -0
  50. onnx_diagnostic/reference/ops/op_quick_gelu.py +23 -0
  51. onnx_diagnostic/reference/ops/op_replace_zero.py +13 -0
  52. onnx_diagnostic/reference/ops/op_rotary.py +19 -0
  53. onnx_diagnostic/reference/ops/op_scan.py +65 -0
  54. onnx_diagnostic/reference/ops/op_scatter_elements.py +107 -0
  55. onnx_diagnostic/reference/ops/op_scatternd_of_shape.py +22 -0
  56. onnx_diagnostic/reference/ops/op_simplified_layer_normalization.py +8 -0
  57. onnx_diagnostic/reference/ops/op_skip_layer_normalization.py +13 -0
  58. onnx_diagnostic/reference/ops/op_slice.py +20 -0
  59. onnx_diagnostic/reference/ops/op_transpose_cast.py +16 -0
  60. onnx_diagnostic/reference/ops/op_tri_matrix.py +17 -0
  61. onnx_diagnostic/reference/ort_evaluator.py +652 -0
  62. onnx_diagnostic/reference/quantized_tensor.py +46 -0
  63. onnx_diagnostic/reference/report_results_comparison.py +95 -0
  64. onnx_diagnostic/reference/torch_evaluator.py +669 -0
  65. onnx_diagnostic/reference/torch_ops/__init__.py +56 -0
  66. onnx_diagnostic/reference/torch_ops/_op_run.py +335 -0
  67. onnx_diagnostic/reference/torch_ops/access_ops.py +94 -0
  68. onnx_diagnostic/reference/torch_ops/binary_ops.py +108 -0
  69. onnx_diagnostic/reference/torch_ops/controlflow_ops.py +121 -0
  70. onnx_diagnostic/reference/torch_ops/generator_ops.py +36 -0
  71. onnx_diagnostic/reference/torch_ops/nn_ops.py +196 -0
  72. onnx_diagnostic/reference/torch_ops/other_ops.py +106 -0
  73. onnx_diagnostic/reference/torch_ops/reduce_ops.py +130 -0
  74. onnx_diagnostic/reference/torch_ops/sequence_ops.py +65 -0
  75. onnx_diagnostic/reference/torch_ops/shape_ops.py +121 -0
  76. onnx_diagnostic/reference/torch_ops/unary_ops.py +93 -0
  77. onnx_diagnostic/tasks/__init__.py +90 -0
  78. onnx_diagnostic/tasks/automatic_speech_recognition.py +188 -0
  79. onnx_diagnostic/tasks/data/__init__.py +13 -0
  80. onnx_diagnostic/tasks/data/dummies_imagetext2text_generation_gemma3.onnx +0 -0
  81. onnx_diagnostic/tasks/feature_extraction.py +162 -0
  82. onnx_diagnostic/tasks/fill_mask.py +89 -0
  83. onnx_diagnostic/tasks/image_classification.py +144 -0
  84. onnx_diagnostic/tasks/image_text_to_text.py +581 -0
  85. onnx_diagnostic/tasks/image_to_video.py +127 -0
  86. onnx_diagnostic/tasks/mask_generation.py +143 -0
  87. onnx_diagnostic/tasks/mixture_of_expert.py +79 -0
  88. onnx_diagnostic/tasks/object_detection.py +134 -0
  89. onnx_diagnostic/tasks/sentence_similarity.py +89 -0
  90. onnx_diagnostic/tasks/summarization.py +227 -0
  91. onnx_diagnostic/tasks/text2text_generation.py +230 -0
  92. onnx_diagnostic/tasks/text_classification.py +89 -0
  93. onnx_diagnostic/tasks/text_generation.py +352 -0
  94. onnx_diagnostic/tasks/text_to_image.py +95 -0
  95. onnx_diagnostic/tasks/zero_shot_image_classification.py +128 -0
  96. onnx_diagnostic/torch_export_patches/__init__.py +21 -0
  97. onnx_diagnostic/torch_export_patches/eval/__init__.py +725 -0
  98. onnx_diagnostic/torch_export_patches/eval/model_cases.py +898 -0
  99. onnx_diagnostic/torch_export_patches/onnx_export_errors.py +1098 -0
  100. onnx_diagnostic/torch_export_patches/onnx_export_serialization.py +311 -0
  101. onnx_diagnostic/torch_export_patches/patch_details.py +340 -0
  102. onnx_diagnostic/torch_export_patches/patch_expressions.py +108 -0
  103. onnx_diagnostic/torch_export_patches/patch_inputs.py +211 -0
  104. onnx_diagnostic/torch_export_patches/patch_module.py +1047 -0
  105. onnx_diagnostic/torch_export_patches/patch_module_helper.py +184 -0
  106. onnx_diagnostic/torch_export_patches/patches/__init__.py +0 -0
  107. onnx_diagnostic/torch_export_patches/patches/patch_torch.py +1090 -0
  108. onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +2139 -0
  109. onnx_diagnostic/torch_export_patches/serialization/__init__.py +46 -0
  110. onnx_diagnostic/torch_export_patches/serialization/diffusers_impl.py +34 -0
  111. onnx_diagnostic/torch_export_patches/serialization/transformers_impl.py +313 -0
  112. onnx_diagnostic/torch_models/__init__.py +0 -0
  113. onnx_diagnostic/torch_models/code_sample.py +343 -0
  114. onnx_diagnostic/torch_models/hghub/__init__.py +1 -0
  115. onnx_diagnostic/torch_models/hghub/hub_api.py +422 -0
  116. onnx_diagnostic/torch_models/hghub/hub_data.py +234 -0
  117. onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py +4905 -0
  118. onnx_diagnostic/torch_models/hghub/model_inputs.py +388 -0
  119. onnx_diagnostic/torch_models/hghub/model_specific.py +76 -0
  120. onnx_diagnostic/torch_models/llms.py +2 -0
  121. onnx_diagnostic/torch_models/untrained/__init__.py +0 -0
  122. onnx_diagnostic/torch_models/untrained/llm_phi2.py +113 -0
  123. onnx_diagnostic/torch_models/untrained/llm_tiny_llm.py +76 -0
  124. onnx_diagnostic/torch_models/validate.py +2124 -0
  125. onnx_diagnostic/torch_onnx/__init__.py +0 -0
  126. onnx_diagnostic/torch_onnx/runtime_info.py +289 -0
  127. onnx_diagnostic/torch_onnx/sbs.py +440 -0
  128. onnx_diagnostic-0.8.0.dist-info/METADATA +213 -0
  129. onnx_diagnostic-0.8.0.dist-info/RECORD +132 -0
  130. onnx_diagnostic-0.8.0.dist-info/WHEEL +5 -0
  131. onnx_diagnostic-0.8.0.dist-info/licenses/LICENSE.txt +19 -0
  132. onnx_diagnostic-0.8.0.dist-info/top_level.txt +1 -0
onnx_diagnostic/helpers/log_helper.py
@@ -0,0 +1,2245 @@
1
+ import enum
2
+ import io
3
+ import os
4
+ import pprint
5
+ import re
6
+ import warnings
7
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
8
+ import numpy as np
9
+ import pandas
10
+ from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
11
+ from .helper import string_sig
12
+ from ._log_helper import (
13
+ BUCKET_SCALES,
14
+ breaking_last_point,
15
+ apply_excel_style,
16
+ align_dataframe_with,
17
+ open_dataframe,
18
+ enumerate_csv_files,
19
+ )
20
+
21
+
22
+ class CubeViewDef:
23
+ """
24
+ Defines how to compute a view.
25
+
26
+ :param key_index: keys to put in the row index
27
+ :param values: values to show
28
+ :param ignore_unique: ignore keys with a unique value
29
+ :param order: to reorder keys in the columns index
30
+ :param key_agg: aggregate according to these columns before
31
+ creating the view
32
+ :param agg_args: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`,
33
+ it can be also a callable to return a different aggregation
34
+ method depending on the column name
35
+ :param agg_kwargs: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`
36
+ :param agg_multi: aggregation over multiple columns
37
+ :param ignore_columns: ignore these columns, known to overload the view
38
+ :param keep_columns_in_index: keeps the columns even if there is only one unique value
39
+ :param dropna: drops rows with nan if not relevant
40
+ :param transpose: transpose
41
+ :param f_highlight: to highlight some values
42
+ :param name: name of the view, used mostly to debug
43
+ :param plots: adds plots to the Excel sheet
44
+ :param no_index: removes the index (but keeps the columns)
45
+ :param fix_aggregation_change: key columns whose values changed across dates
46
+ and must be unified before aggregating
47
+
48
+ Some examples of views follow. The first one is an aggregated view
49
+ over many metrics.
50
+
51
+ .. code-block:: python
52
+
53
+ cube = CubeLogs(...)
54
+
55
+ CubeViewDef(
56
+ key_index=cube._filter_column(fs, cube.keys_time),
57
+ values=cube._filter_column(
58
+ ["TIME_ITER", "speedup", "time_latency.*", "onnx_n_nodes"],
59
+ cube.values,
60
+ ),
61
+ ignore_unique=True,
62
+ key_agg=["model_name", "task", "model_task", "suite"],
63
+ agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
64
+ agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
65
+ name="agg-all",
66
+ plots=True,
67
+ )
68
+
69
+ The next one focuses on a couple of metrics.
70
+
71
+ .. code-block:: python
72
+
73
+ cube = CubeLogs(...)
74
+
75
+ CubeViewDef(
76
+ key_index=cube._filter_column(fs, cube.keys_time),
77
+ values=cube._filter_column(["speedup"], cube.values),
78
+ ignore_unique=True,
79
+ keep_columns_in_index=["suite"],
80
+ name="speedup",
81
+ )
82
+ """
83
+
84
+ class HighLightKind(enum.IntEnum):
85
+ "Codes to highlight values."
86
+
87
+ NONE = 0
88
+ RED = 1
89
+ GREEN = 2
90
+
91
+ def __init__(
92
+ self,
93
+ key_index: Sequence[str],
94
+ values: Sequence[str],
95
+ ignore_unique: bool = True,
96
+ order: Optional[Sequence[str]] = None,
97
+ key_agg: Optional[Sequence[str]] = None,
98
+ agg_args: Union[Sequence[Any], Callable[[str], Any]] = ("sum",),
99
+ agg_kwargs: Optional[Dict[str, Any]] = None,
100
+ agg_multi: Optional[
101
+ Dict[str, Callable[[pandas.core.groupby.DataFrameGroupBy], pandas.Series]]
102
+ ] = None,
103
+ ignore_columns: Optional[Sequence[str]] = None,
104
+ keep_columns_in_index: Optional[Sequence[str]] = None,
105
+ dropna: bool = True,
106
+ transpose: bool = False,
107
+ f_highlight: Optional[Callable[[Any], "CubeViewDef.HighLightKind"]] = None,
108
+ name: Optional[str] = None,
109
+ no_index: bool = False,
110
+ plots: bool = False,
111
+ fix_aggregation_change: Optional[List["str"]] = None,
112
+ ):
113
+ self.key_index = key_index
114
+ self.values = values
115
+ self.ignore_unique = ignore_unique
116
+ self.order = order
117
+ self.key_agg = key_agg
118
+ self.agg_args = agg_args
119
+ self.agg_kwargs = agg_kwargs
120
+ self.agg_multi = agg_multi
121
+ self.dropna = dropna
122
+ self.ignore_columns = ignore_columns
123
+ self.keep_columns_in_index = keep_columns_in_index
124
+ self.f_highlight = f_highlight
125
+ self.transpose = transpose
126
+ self.name = name
127
+ self.no_index = no_index
128
+ self.plots = plots
129
+ self.fix_aggregation_change = fix_aggregation_change
130
+
131
+ def __repr__(self) -> str:
132
+ "usual"
133
+ return string_sig(self) # type: ignore[arg-type]
134
+
135
+
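# Hedged illustration, not part of the package: a possible ``f_highlight``
# callback for CubeViewDef. The thresholds are hypothetical, only the
# signature matters: a cell value comes in, a CubeViewDef.HighLightKind goes out.
def _example_highlight_speedup(value):
    "Illustration only: flags speedups noticeably below or above 1."
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        if value < 0.98:
            return CubeViewDef.HighLightKind.RED
        if value > 1.02:
            return CubeViewDef.HighLightKind.GREEN
    return CubeViewDef.HighLightKind.NONE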
136
+ class CubePlot:
137
+ """
138
+ Creates a plot.
139
+
140
+ :param df: dataframe
141
+ :param kind: kind of graph to plot, bar, barh, line
142
+ :param split: draw a graph per line in the dataframe
143
+ :param timeseries: this assumes the time is one level of the columns,
144
+ this argument indicates the level name
145
+
146
+ It defines a graph. Usually *bar* or *barh* is used to
147
+ compare experiments for every metric, with one subplot per metric.
148
+
149
+ .. code-block:: python
150
+
151
+ CubePlot(df, kind="barh", orientation="row", split=True)
152
+
153
+ *line* is usually used to plot timeseries showing the
154
+ evolution of metrics over time.
155
+
156
+ .. code-block:: python
157
+
158
+ CubePlot(
159
+ df,
160
+ kind="line",
161
+ orientation="row",
162
+ split=True,
163
+ timeseries="time",
164
+ )
165
+ """
166
+
167
+ KINDS = {"bar", "barh", "line"}
168
+
169
+ @classmethod
170
+ def group_columns(
171
+ cls, columns: List[str], sep: str = "/", depth: int = 2
172
+ ) -> List[List[str]]:
173
+ """Groups columns to have nice display."""
174
+ res: Dict[str, List[str]] = {}
175
+ for c in columns:
176
+ p = c.split(sep)
177
+ k = sep.join(p[:depth])
178
+ if k not in res:
179
+ res[k] = []
180
+ res[k].append(c)
181
+ new_res: Dict[str, List[str]] = {}
182
+ for k, v in res.items():
183
+ if len(v) >= 3:
184
+ new_res[k] = v
185
+ else:
186
+ if "0" not in new_res:
187
+ new_res["0"] = []
188
+ new_res["0"].extend(v)
189
+ groups: List[List[str]] = [sorted(v) for k, v in sorted(new_res.items())]
190
+ if depth <= 1:
191
+ return groups
192
+ new_groups: List[List[str]] = []
193
+ for v in groups:
194
+ if len(v) >= 6:
195
+ new_groups.extend(cls.group_columns(v, depth=1, sep=sep))
196
+ else:
197
+ new_groups.append(v)
198
+ return new_groups
199
+
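# Hedged, hand-traced illustration of the grouping above (not an actual run):
# columns sharing their first two "/"-separated segments form a group when at
# least three of them exist, the remaining ones are lumped into a default group.
#
#   CubePlot.group_columns(["a/x/1", "a/x/2", "a/x/3", "b/y/1", "c/z/1"])
#   -> [["b/y/1", "c/z/1"], ["a/x/1", "a/x/2", "a/x/3"]]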
200
+ def __init__(
201
+ self,
202
+ df: pandas.DataFrame,
203
+ kind: str = "bar",
204
+ orientation="col",
205
+ split: bool = True,
206
+ timeseries: Optional[str] = None,
207
+ ):
208
+ assert (
209
+ not timeseries or timeseries in df.columns.names
210
+ ), f"Level {timeseries!r} is not part of the columns levels {df.columns.names}"
211
+ assert (
212
+ kind in self.__class__.KINDS
213
+ ), f"Unexpected kind={kind!r} not in {self.__class__.KINDS}"
214
+ assert split, f"split={split} not implemented"
215
+ assert (
216
+ not timeseries or orientation == "row"
217
+ ), f"orientation={orientation!r} must be 'row' for timeseries"
218
+ self.df = df.copy()
219
+ self.kind = kind
220
+ self.orientation = orientation
221
+ self.split = split
222
+ self.timeseries = timeseries
223
+
224
+ if timeseries:
225
+ if isinstance(self.df.columns, pandas.MultiIndex):
226
+ index_time = list(self.df.columns.names).index(self.timeseries)
227
+
228
+ def _drop(t, i=index_time):
229
+ return (*t[:i], *t[i + 1 :])
230
+
231
+ self.df.columns = pandas.MultiIndex.from_tuples(
232
+ [("/".join(map(str, _drop(i))), i[index_time]) for i in self.df.columns],
233
+ names=["metric", timeseries],
234
+ )
235
+ else:
236
+ if isinstance(self.df.columns, pandas.MultiIndex):
237
+ self.df.columns = ["/".join(map(str, i)) for i in self.df.columns]
238
+ if isinstance(self.df.index, pandas.MultiIndex):
239
+ self.df.index = ["/".join(map(str, i)) for i in self.df.index]
240
+
241
+ def __repr__(self) -> str:
242
+ "usual"
243
+ return string_sig(self) # type: ignore[arg-type]
244
+
245
+ def to_images(
246
+ self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
247
+ ) -> List[bytes]:
248
+ """
249
+ Converts data into plots and images.
250
+
251
+ :param verbose: verbosity
252
+ :param merge: returns all graphs in a single image (True)
253
+ or an image for every graph (False)
254
+ :param title_suffix: suffix appended to the title of every graph
255
+ :return: list of binary images (format PNG)
256
+ """
257
+ if self.kind in ("barh", "bar"):
258
+ return self._to_images_bar(verbose=verbose, merge=merge, title_suffix=title_suffix)
259
+ if self.kind == "line":
260
+ return self._to_images_line(
261
+ verbose=verbose, merge=merge, title_suffix=title_suffix
262
+ )
263
+ raise AssertionError(f"self.kind={self.kind!r} not implemented")
264
+
265
+ @classmethod
266
+ def _make_loop(cls, ensemble, verbose):
267
+ if verbose:
268
+ from tqdm import tqdm
269
+
270
+ loop = tqdm(ensemble)
271
+ else:
272
+ loop = ensemble
273
+ return loop
274
+
275
+ def _to_images_bar(
276
+ self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
277
+ ) -> List[bytes]:
278
+ """
279
+ Environment variable ``FIGSIZEH`` can be set to increase the
280
+ graph height. Default is 1.0.
281
+ """
282
+ assert merge, f"merge={merge} not implemented yet"
283
+ import matplotlib.pyplot as plt
284
+
285
+ df = self.df.T if self.orientation == "row" else self.df
286
+ title_suffix = f"\n{title_suffix}" if title_suffix else ""
287
+
288
+ n_cols = 3
289
+ nn = df.shape[1] // n_cols
290
+ nn += int(df.shape[1] % n_cols != 0)
291
+ ratio = float(os.environ.get("FIGSIZEH", "1"))
292
+ figsize = (6 * n_cols, nn * (2.5 + df.shape[0] / 15) * ratio)
293
+ fig, axs = plt.subplots(nn, n_cols, figsize=figsize)
294
+ pos = 0
295
+ imgs = []
296
+ for c in self._make_loop(df.columns, verbose):
297
+ ax = axs[pos // n_cols, pos % n_cols]
298
+ (
299
+ df[c].plot.barh(title=f"{c}{title_suffix}", ax=ax)
300
+ if self.kind == "barh"
301
+ else df[c].plot.bar(title=f"{c}{title_suffix}", ax=ax)
302
+ )
303
+ ax.tick_params(axis="both", which="major", labelsize=8)
304
+ ax.grid(True)
305
+ pos += 1 # noqa: SIM113
306
+ fig.tight_layout()
307
+ imgdata = io.BytesIO()
308
+ fig.savefig(imgdata, format="png")
309
+ imgs.append(imgdata.getvalue())
310
+ plt.close()
311
+ return imgs
312
+
313
+ def _to_images_line(
314
+ self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
315
+ ) -> List[bytes]:
316
+ assert merge, f"merge={merge} not implemented yet"
317
+ assert (
318
+ self.orientation == "row"
319
+ ), f"self.orientation={self.orientation!r} not implemented for this kind of graph."
320
+
321
+ def rotate_align(ax, angle=15, align="right"):
322
+ for label in ax.get_xticklabels():
323
+ label.set_rotation(angle)
324
+ label.set_horizontalalignment(align)
325
+ ax.tick_params(axis="both", which="major", labelsize=8)
326
+ ax.grid(True)
327
+ ax.legend()
328
+ ax.tick_params(labelleft=True)
329
+ return ax
330
+
331
+ import matplotlib.pyplot as plt
332
+
333
+ df = self.df.T
334
+
335
+ confs = list(df.unstack(self.timeseries).index)
336
+ groups = self.group_columns(confs)
337
+ n_cols = len(groups)
338
+
339
+ title_suffix = f"\n{title_suffix}" if title_suffix else ""
340
+ ratio = float(os.environ.get("FIGSIZEH", "1"))
341
+ figsize = (5 * n_cols, max(len(g) for g in groups) * (2 + df.shape[1] / 2) * ratio)
342
+ fig, axs = plt.subplots(
343
+ df.shape[1],
344
+ n_cols,
345
+ figsize=figsize,
346
+ sharex=True,
347
+ sharey="row" if n_cols > 1 else False,
348
+ )
349
+ imgs = []
350
+ row = 0
351
+ for c in self._make_loop(df.columns, verbose):
352
+ dfc = df[[c]]
353
+ dfc = dfc.unstack(self.timeseries).T.droplevel(0)
354
+ if n_cols == 1:
355
+ dfc.plot(title=f"{c}{title_suffix}", ax=axs[row], linewidth=3)
356
+ axs[row].grid(True)
357
+ rotate_align(axs[row])
358
+ else:
359
+ x = list(range(dfc.shape[0]))
360
+ ticks = list(dfc.index)
361
+ for ii, group in enumerate(groups):
362
+ ddd = dfc.loc[:, group].copy()
363
+ axs[row, ii].set_xticks(x)
364
+ axs[row, ii].set_xticklabels(ticks)
365
+ # This is very slow
366
+ # ddd.plot(ax=axs[row, ii],linewidth=3)
367
+ for jj in range(ddd.shape[1]):
368
+ axs[row, ii].plot(x, ddd.iloc[:, jj], lw=3, label=ddd.columns[jj])
369
+ axs[row, ii].set_title(f"{c}{title_suffix}")
370
+ rotate_align(axs[row, ii])
371
+ row += 1 # noqa: SIM113
372
+ fig.tight_layout()
373
+ imgdata = io.BytesIO()
374
+ fig.savefig(imgdata, format="png")
375
+ imgs.append(imgdata.getvalue())
376
+ plt.close()
377
+ return imgs
378
+
379
+
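# Hedged usage sketch, not part of the package: building a CubePlot from a
# small, made-up dataframe. matplotlib is only needed once to_images() is called.
def _example_cubeplot():
    "Illustration only."
    df = pandas.DataFrame(
        {"speedup": [1.1, 0.9, 1.3], "time_latency": [0.5, 0.7, 0.4]},
        index=["exp1", "exp2", "exp3"],
    )
    plot = CubePlot(df, kind="barh", orientation="row", split=True)
    # plot.to_images(merge=True) would return a list of PNG images as bytes.
    return plot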
380
+ class CubeLogs:
381
+ """
382
+ Processes logs coming from experiments.
383
+ A cube is basically a database with certain columns
384
+ playing specific roles.
385
+
386
+ * time: only one column, it is not mandatory but it is recommended
387
+ to have one
388
+ * keys: they act as coordinates, they cannot be aggregated,
389
+ they are not numbers, more like categories, `(time, *keys)`
390
+ identifies an element of the database in a unique way,
391
+ there cannot be more than one row sharing the same key and time
392
+ values
393
+ * values: they are not necessarily numerical, but if they are,
394
+ they can be aggregated
395
+
396
+ Every other column is ignored. More columns can be added
397
+ by using formulas.
398
+
399
+ :param data: the raw data
400
+ :param time: the time column
401
+ :param keys: the keys, can include regular expressions
402
+ :param values: the values, can include regular expressions
403
+ :param ignored: ignores some columns, acts as a negative regular
404
+ expression for the other two
405
+ :param recent: if more than one row shares the same keys,
406
+ the cube only keeps the most recent one
407
+ :param formulas: columns to add, defined with formulas
408
+ :param fill_missing: a dictionary, defines the values replacing missing ones
409
+ for some columns
410
+ :param keep_last_date: overwrites all the times with the most recent
411
+ one, it makes things easier for timeseries
412
+ """
413
+
414
+ def __init__(
415
+ self,
416
+ data: Any,
417
+ time: str = "date",
418
+ keys: Sequence[str] = ("version_.*", "model_.*"),
419
+ values: Sequence[str] = ("time_.*", "disc_.*"),
420
+ ignored: Sequence[str] = (),
421
+ recent: bool = False,
422
+ formulas: Optional[
423
+ Union[
424
+ Sequence[str],
425
+ Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
426
+ ]
427
+ ] = None,
428
+ fill_missing: Optional[Sequence[Tuple[str, Any]]] = None,
429
+ keep_last_date: bool = False,
430
+ ):
431
+ self._data = data
432
+ self._time = time
433
+ self._keys = keys
434
+ self._values = values
435
+ self._ignored = ignored
436
+ self.recent = recent
437
+ self._formulas = formulas
438
+ self.fill_missing = fill_missing
439
+ self.keep_last_date = keep_last_date
440
+
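# Hedged usage sketch (column names are made up): a cube built from a list of
# records, then loaded. Keys and values may be given as regular expressions
# matched against the column names.
#
#   rows = [
#       dict(date="2024-01-01", model_name="tiny-llm", time_latency=1.5),
#       dict(date="2024-01-01", model_name="phi-2", time_latency=0.9),
#   ]
#   cube = CubeLogs(rows, time="date", keys=["model_.*"], values=["time_.*"])
#   cube.load()
#   print(cube.shape, cube.keys_time, cube.values)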
441
+ def clone(
442
+ self, data: Optional[pandas.DataFrame] = None, keys: Optional[Sequence[str]] = None
443
+ ) -> "CubeLogs":
444
+ """
445
+ Makes a copy of the dataframe.
446
+ It copies the processed data, not the original one.
447
+ """
448
+ cube = self.__class__(
449
+ data if data is not None else self.data.copy(),
450
+ time=self.time,
451
+ keys=keys or self.keys_no_time,
452
+ values=self.values,
453
+ )
454
+ cube.load()
455
+ return cube
456
+
457
+ def post_load_process_piece(
458
+ self, df: pandas.DataFrame, unique: bool = False
459
+ ) -> pandas.DataFrame:
460
+ """
461
+ Postprocesses a piece when a cube is made of multiple pieces
462
+ before it gets merged.
463
+ """
464
+ if not self.fill_missing:
465
+ return df
466
+ missing = dict(self.fill_missing)
467
+ for k, v in missing.items():
468
+ if k not in df.columns:
469
+ df[k] = v
470
+ return df
471
+
472
+ def load(self, verbose: int = 0):
473
+ """Loads and preprocesses the data. Returns self."""
474
+ if isinstance(self._data, pandas.DataFrame):
475
+ if verbose:
476
+ print(f"[CubeLogs.load] load from dataframe, shape={self._data.shape}")
477
+ self.data = self.post_load_process_piece(self._data, unique=True)
478
+ if verbose:
479
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
480
+ elif isinstance(self._data, list) and all(isinstance(r, dict) for r in self._data):
481
+ if verbose:
482
+ print(f"[CubeLogs.load] load from list of dicts, n={len(self._data)}")
483
+ self.data = pandas.DataFrame(self.post_load_process_piece(self._data, unique=True))
484
+ if verbose:
485
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
486
+ elif isinstance(self._data, list) and all(
487
+ isinstance(r, pandas.DataFrame) for r in self._data
488
+ ):
489
+ if verbose:
490
+ print(f"[CubeLogs.load] load from list of DataFrame, n={len(self._data)}")
491
+ self.data = pandas.concat(
492
+ [self.post_load_process_piece(c) for c in self._data], axis=0
493
+ )
494
+ if verbose:
495
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
496
+ elif isinstance(self._data, list):
497
+ if verbose:
498
+ print("[CubeLogs.load] load from list of Cubes")
499
+ cubes = []
500
+ for item in enumerate_csv_files(self._data, verbose=verbose):
501
+ df = open_dataframe(item)
502
+ cube = CubeLogs(
503
+ df,
504
+ time=self._time,
505
+ keys=self._keys,
506
+ values=self._values,
507
+ ignored=self._ignored,
508
+ recent=self.recent,
509
+ )
510
+ cube.load()
511
+ cubes.append(self.post_load_process_piece(cube.data))
512
+ self.data = pandas.concat(cubes, axis=0)
513
+ if verbose:
514
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
515
+ else:
516
+ raise NotImplementedError(
517
+ f"Not implemented with the provided data (type={type(self._data)})"
518
+ )
519
+
520
+ assert all(isinstance(c, str) for c in self.data.columns), (
521
+ f"The class only supports string as column names "
522
+ f"but found {[c for c in self.data.columns if not isinstance(c, str)]}"
523
+ )
524
+ if verbose:
525
+ print(f"[CubeLogs.load] loaded with shape={self.data.shape}")
526
+
527
+ self._initialize_columns()
528
+ if verbose:
529
+ print(f"[CubeLogs.load] time={self.time}")
530
+ print(f"[CubeLogs.load] keys={self.keys_no_time}")
531
+ print(f"[CubeLogs.load] values={self.values}")
532
+ print(f"[CubeLogs.load] ignored={self.ignored}")
533
+ print(f"[CubeLogs.load] ignored_values={self.ignored_values}")
534
+ print(f"[CubeLogs.load] ignored_keys={self.ignored_keys}")
535
+ assert self.keys_no_time, f"No keys found with {self._keys} from {self.data.columns}"
536
+ assert self.values, f"No values found with {self._values} from {self.data.columns}"
537
+ assert not (
538
+ set(self.keys_no_time) & set(self.values)
539
+ ), f"Columns {set(self.keys_no_time) & set(self.values)} cannot be keys and values"
540
+ assert not (
541
+ set(self.keys_no_time) & set(self.ignored)
542
+ ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be keys and ignored"
543
+ assert not (
544
+ set(self.values) & set(self.ignored)
545
+ ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be values and ignored"
546
+ assert (
547
+ self.time not in self.keys_no_time
548
+ and self.time not in self.values
549
+ and self.time not in self.ignored
550
+ ), (
551
+ f"Column {self.time!r} is also a key, a value or ignored, "
552
+ f"keys={sorted(self.keys_no_time)}, values={sorted(self.values)}, "
553
+ f"ignored={sorted(self.ignored)}"
554
+ )
555
+ self._columns = [self.time, *self.keys_no_time, *self.values, *self.ignored]
556
+ self.dropped = [c for c in self.data.columns if c not in set(self.columns)]
557
+ self.data = self.data[self.columns]
558
+ if verbose:
559
+ print(f"[CubeLogs.load] dropped={self.dropped}")
560
+ print(f"[CubeLogs.load] data.shape={self.data.shape}")
561
+
562
+ if verbose:
563
+ print(f"[CubeLogs.load] removed columns, shape={self.data.shape}")
564
+ self._preprocess()
565
+ if verbose:
566
+ print(f"[CubeLogs.load] preprocess, shape={self.data.shape}")
567
+ if self.recent:
568
+ print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")
569
+
570
+ # Let's apply the formulas
571
+ if self._formulas:
572
+ forms = (
573
+ {k: k for k in self._formulas}
574
+ if not isinstance(self._formulas, dict)
575
+ else self._formulas
576
+ )
577
+ cols = set(self.values)
578
+ for k, ff in forms.items():
579
+ f = self._process_formula(ff)
580
+ if k in cols or f is None:
581
+ if verbose:
582
+ print(f"[CubeLogs.load] skip formula {k!r}")
583
+ else:
584
+ if verbose:
585
+ print(f"[CubeLogs.load] apply formula {k!r}")
586
+ self.data[k] = f(self.data)
587
+ self.values.append(k)
588
+ cols.add(k)
589
+ self.values_for_key = {k: set(self.data[k].dropna()) for k in self.keys_time}
590
+ for k in self.keys_no_time:
591
+ if self.data[k].isna().max():
592
+ self.values_for_key[k].add(np.nan)
593
+ self.keys_with_nans = [
594
+ c for c in self.keys_time if self.data[c].isna().astype(int).sum() > 0
595
+ ]
596
+ if verbose:
597
+ print(f"[CubeLogs.load] convert column {self.time!r} into date")
598
+ if self.keys_with_nans:
599
+ print(f"[CubeLogs.load] keys_with_nans={self.keys_with_nans}")
600
+ self.data[self.time] = pandas.to_datetime(self.data[self.time])
601
+
602
+ if self.keep_last_date:
603
+ times = self.data[self.time].dropna()
604
+ mi, mx = times.min(), times.max()
605
+ if mi != mx:
606
+ print(f"[CubeLogs.load] setting all dates in column {self.time} to {mx!r}")
607
+ self.data.loc[~self.data[self.time].isna(), self.time] = mx
608
+ self.values_for_key[self.time] = {mx}
609
+ if self.data[self.time].isna().max():
610
+ self.values_for_key[self.time].add(np.nan)
611
+ if verbose:
612
+ print(f"[CubeLogs.load] done, shape={self.shape}")
613
+ return self
614
+
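# Hedged sketch of the ``formulas`` argument (column names are hypothetical):
# each entry receives the loaded dataframe, must return a Series, and the
# result is appended to the cube values.
#
#   cube = CubeLogs(
#       rows,
#       time="date",
#       keys=["model_.*"],
#       values=["time_.*"],
#       formulas={"speedup": lambda df: df["time_baseline"] / df["time_latency"]},
#   ).load()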
615
+ def _process_formula(
616
+ self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
617
+ ) -> Callable[[pandas.DataFrame], pandas.Series]:
618
+ assert callable(formula), f"formula={formula!r} is not supported."
619
+ return formula
620
+
621
+ @property
622
+ def shape(self) -> Tuple[int, int]:
623
+ "Returns the shape."
624
+ assert hasattr(self, "data"), "Method load was not called"
625
+ return self.data.shape
626
+
627
+ @property
628
+ def columns(self) -> Sequence[str]:
629
+ "Returns the columns."
630
+ assert hasattr(self, "data"), "Method load was not called"
631
+ return self.data.columns
632
+
633
+ def _preprocess(self):
634
+ last = self.values[0]
635
+ gr = self.data[[*self.keys_time, last]].groupby(self.keys_time, dropna=False).count()
636
+ gr = gr[gr[last] > 1]
637
+ if self.recent:
638
+ cp = self.data.copy()
639
+ assert (
640
+ "__index__" not in cp.columns
641
+ ), f"'__index__' should not be a column in {cp.columns}"
642
+ cp["__index__"] = np.arange(cp.shape[0])
643
+ gr = (
644
+ cp[[*self.keys_time, "__index__"]]
645
+ .groupby(self.keys_no_time, as_index=False, dropna=False)
646
+ .max()
647
+ )
648
+ assert gr.shape[0] > 0, (
649
+ f"Something went wrong after the groupby.\n"
650
+ f"{cp[[*self.keys_no_time, self.time, '__index__']].head().T}"
651
+ )
652
+ filtered = pandas.merge(cp, gr, on=["__index__", *self.keys_time])
653
+ assert filtered.shape[0] <= self.data.shape[0], (
654
+ f"Keeping the latest row brings more row {filtered.shape} "
655
+ f"(initial is {self.data.shape})."
656
+ )
657
+ self.data = filtered.drop("__index__", axis=1)
658
+ else:
659
+ assert gr.shape[0] == 0, f"There are duplicated rows:\n{gr}"
660
+
661
+ @classmethod
662
+ def _filter_column(cls, filters, columns, can_be_empty=False):
663
+ assert list(columns), "columns is empty"
664
+ set_cols = set()
665
+ for f in filters:
666
+ if set(f) & {'"', "^", ".", "*", "+", "{", "}"}:
667
+ reg = re.compile(f)
668
+ cols = [c for c in columns if reg.search(c)]
669
+ elif f in columns:
670
+ # No regular expression.
671
+ cols = [f]
672
+ else:
673
+ continue
674
+ set_cols |= set(cols)
675
+ assert (
676
+ can_be_empty or set_cols
677
+ ), f"Filters {filters} returns an empty set from {columns}"
678
+ return sorted(set_cols)
679
+
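# Hand-traced illustration (not an actual run): filters may be plain column
# names or regular expressions.
#
#   CubeLogs._filter_column(
#       ["time_.*", "speedup"],
#       ["time_latency", "time_export", "speedup", "model_name"],
#   )
#   -> ["speedup", "time_export", "time_latency"]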
680
+ def _initialize_columns(self):
681
+ keys = self._filter_column(self._keys, self.data.columns)
682
+ self.values = self._filter_column(self._values, self.data.columns)
683
+ self.ignored = self._filter_column(self._ignored, self.data.columns, True)
684
+ assert (
685
+ self._time in self.data.columns
686
+ ), f"Column {self._time} not found in {pprint.pformat(sorted(self.data.columns))}"
687
+ ignored_keys = set(self.ignored) & set(keys)
688
+ ignored_values = set(self.ignored) & set(self.values)
689
+ self.keys_no_time = [c for c in keys if c not in ignored_keys]
690
+ self.values = [c for c in self.values if c not in ignored_values]
691
+ self.ignored_keys = sorted(ignored_keys)
692
+ self.ignored_values = sorted(ignored_values)
693
+ self.time = self._time
694
+ self.keys_time = [self.time, *[c for c in keys if c not in ignored_keys]]
695
+
696
+ def __str__(self) -> str:
697
+ "usual"
698
+ return str(self.data) if hasattr(self, "data") else str(self._data)
699
+
700
+ def make_view_def(self, name: str) -> Optional[CubeViewDef]:
701
+ """
702
+ Returns a view definition.
703
+
704
+ :param name: name of a value
705
+ :return: a CubeViewDef or None if name does not make sense
706
+ """
707
+ assert name in self.values, f"{name!r} is not one of the values {self.values}"
708
+ keys = sorted(self.keys_no_time)
709
+ index = len(keys) // 2 + (len(keys) % 2)
710
+ return CubeViewDef(key_index=keys[:index], values=[name], name=name)
711
+
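# Hedged note: ``cube.view("time_latency")`` goes through make_view_def and
# builds a default view with roughly half of the keys in the row index, the
# remaining keys ending up in the columns (the metric name is hypothetical).
#
#   piv = cube.view("time_latency")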
712
+ def view(
713
+ self,
714
+ view_def: Union[str, CubeViewDef],
715
+ return_view_def: bool = False,
716
+ verbose: int = 0,
717
+ ) -> Union[pandas.DataFrame, Tuple[pandas.DataFrame, CubeViewDef]]:
718
+ """
719
+ Returns a dataframe, a pivot view.
720
+ `key_index` determines the index, the other key columns determines
721
+ the columns. If `ignore_unique` is True, every columns with a unique value
722
+ is removed.
723
+
724
+ :param view_def: view definition
725
+ :param return_view_def: returns the view as well
726
+ :param verbose: verbosity level
727
+ :return: dataframe
728
+ """
729
+ if isinstance(view_def, str):
730
+ # We automatically create a view for a metric
731
+ view_def_ = self.make_view_def(view_def)
732
+ assert view_def_ is not None, f"Unable to create a view from {view_def!r}"
733
+ view_def = view_def_
734
+
735
+ assert isinstance(
736
+ view_def, CubeViewDef
737
+ ), f"view_def should be a CubeViewDef, got {type(view_def)}: {view_def!r} instead"
738
+ if verbose:
739
+ print(f"[CubeLogs.view] -- start view {view_def.name!r}: {view_def}")
740
+ key_agg = (
741
+ self._filter_column(view_def.key_agg, self.keys_time) if view_def.key_agg else []
742
+ )
743
+ set_key_agg = set(key_agg)
744
+ assert set_key_agg <= set(self.keys_time), (
745
+ f"view_def.name={view_def.name!r}, "
746
+ f"non existing keys in key_agg {set_key_agg - set(self.keys_time)}",
747
+ f"keys={sorted(self.keys_time)}",
748
+ )
749
+
750
+ values = self._filter_column(view_def.values, self.values)
751
+ assert set(values) <= set(self.values), (
752
+ f"view_def.name={view_def.name!r}, "
753
+ f"non existing columns in values {set(values) - set(self.values)}, "
754
+ f"values={sorted(self.values)}"
755
+ )
756
+
757
+ if view_def.fix_aggregation_change and (
758
+ set(view_def.fix_aggregation_change) & set(self.keys_no_time)
759
+ ):
760
+ # before aggregation, let's fix some keys whose values changed over time
761
+ data_to_process = self._fix_aggregation_change(
762
+ self.data,
763
+ list(set(view_def.fix_aggregation_change) & set(self.keys_no_time)),
764
+ )
765
+ else:
766
+ data_to_process = self.data
767
+
768
+ # aggregation
769
+ if key_agg:
770
+ final_stack = True
771
+ key_index = [
772
+ c
773
+ for c in self._filter_column(view_def.key_index, self.keys_time)
774
+ if c not in set_key_agg
775
+ ]
776
+ keys_no_agg = [c for c in self.keys_time if c not in set_key_agg]
777
+ if verbose:
778
+ print(f"[CubeLogs.view] aggregation of {set_key_agg}")
779
+ print(f"[CubeLogs.view] groupby {keys_no_agg}")
780
+
781
+ data_red = data_to_process[[*keys_no_agg, *values]]
782
+ assert set(key_index) <= set(data_red.columns), (
783
+ f"view_def.name={view_def.name!r}, "
784
+ f"unable to find {set(key_index) - set(data_red.columns)}, "
785
+ f"key_agg={key_agg}, keys_no_agg={keys_no_agg},\n--\n"
786
+ f"selected={pprint.pformat(sorted(data_red.columns))},\n--\n"
787
+ f"keys={pprint.pformat(sorted(self.keys_time))}"
788
+ )
789
+ grouped_data = data_red.groupby(keys_no_agg, as_index=True, dropna=False)
790
+ if callable(view_def.agg_args):
791
+ agg_kwargs = view_def.agg_kwargs or {}
792
+ agg_args = ({c: view_def.agg_args(c) for c in values},)
793
+ else:
794
+ agg_args = view_def.agg_args # type: ignore[assignment]
795
+ agg_kwargs = view_def.agg_kwargs or {}
796
+ data = grouped_data.agg(*agg_args, **agg_kwargs)
797
+ if view_def.agg_multi:
798
+ append = []
799
+ for k, f in view_def.agg_multi.items():
800
+ cv = grouped_data.apply(f, include_groups=False)
801
+ append.append(cv.to_frame(k))
802
+ data = pandas.concat([data, *append], axis=1)
803
+ set_all_keys = set(keys_no_agg)
804
+ values = list(data.columns)
805
+ data = data.reset_index(drop=False)
806
+ else:
807
+ key_index = self._filter_column(view_def.key_index, self.keys_time)
808
+ if verbose:
809
+ print(f"[CubeLogs.view] no aggregation, index={key_index}")
810
+ data = data_to_process[[*self.keys_time, *values]]
811
+ set_all_keys = set(self.keys_time)
812
+ final_stack = False
813
+
814
+ assert set(key_index) <= set_all_keys, (
815
+ f"view_def.name={view_def.name!r}, "
816
+ f"Non existing keys in key_index {set(key_index) - set_all_keys}"
817
+ )
818
+
819
+ # remove unnecessary column
820
+ set_key_columns = {
821
+ c for c in self.keys_time if c not in key_index and c not in set(key_agg)
822
+ }
823
+ key_index0 = key_index
824
+ if view_def.ignore_unique:
825
+ unique = {
826
+ k for k, v in self.values_for_key.items() if k in set_all_keys and len(v) <= 1
827
+ }
828
+ keep_anyway = (
829
+ set(view_def.keep_columns_in_index)
830
+ if view_def.keep_columns_in_index
831
+ else set()
832
+ )
833
+ key_index = [k for k in key_index if k not in unique or k in keep_anyway]
834
+ key_columns = [k for k in set_key_columns if k not in unique or k in keep_anyway]
835
+ if verbose:
836
+ print(f"[CubeLogs.view] unique={unique}, keep_anyway={keep_anyway}")
837
+ print(
838
+ f"[CubeLogs.view] columns with unique values "
839
+ f"{set(key_index0) - set(key_index)}"
840
+ )
841
+ else:
842
+ if verbose:
843
+ print("[CubeLogs.view] keep all columns")
844
+ key_columns = sorted(set_key_columns)
845
+ unique = set()
846
+
847
+ # md = lambda s: {k: v for k, v in self.values_for_key.items() if k in s} # noqa: E731
848
+ all_cols = set(key_columns) | set(key_index) | set(key_agg) | unique
849
+ assert all_cols == set(self.keys_time), (
850
+ f"view_def.name={view_def.name!r}, "
851
+ f"key_columns + key_index + key_agg + unique != keys, left="
852
+ f"{set(self.keys_time) - all_cols}, "
853
+ f"unique={unique}, index={set(key_index)}, columns={set(key_columns)}, "
854
+ f"agg={set(key_agg)}, keys={set(self.keys_time)}, values={values}"
855
+ )
856
+
857
+ # reorder
858
+ if view_def.order:
859
+ subset = self._filter_column(view_def.order, all_cols | {self.time})
860
+ corder = [o for o in view_def.order if o in subset]
861
+ assert set(corder) <= set_key_columns, (
862
+ f"view_def.name={view_def.name!r}, "
863
+ f"non existing columns from order in key_columns "
864
+ f"{set(corder) - set_key_columns}"
865
+ )
866
+ key_columns = [
867
+ *[o for o in corder if o in key_columns],
868
+ *[c for c in key_columns if c not in view_def.order],
869
+ ]
870
+ else:
871
+ corder = None
872
+
873
+ if view_def.dropna:
874
+ data, key_index, key_columns, values = self._dropna( # type: ignore[assignment]
875
+ data,
876
+ key_index,
877
+ key_columns,
878
+ values,
879
+ keep_columns_in_index=view_def.keep_columns_in_index,
880
+ )
881
+ if view_def.ignore_columns:
882
+ if verbose:
883
+ print(f"[CubeLogs.view] ignore_columns {view_def.ignore_columns}")
884
+ data = data.drop(view_def.ignore_columns, axis=1)
885
+ seti = set(view_def.ignore_columns)
886
+ if view_def.keep_columns_in_index:
887
+ seti -= set(view_def.keep_columns_in_index)
888
+ key_index = [c for c in key_index if c not in seti]
889
+ key_columns = [c for c in key_columns if c not in seti]
890
+ values = [c for c in values if c not in seti]
891
+
892
+ # final verification
893
+ if verbose:
894
+ print(f"[CubeLogs.view] key_index={key_index}")
895
+ print(f"[CubeLogs.view] key_columns={key_columns}")
896
+ g = data[[*key_index, *key_columns]].copy()
897
+ g["count"] = 1
898
+ r = (
899
+ g.copy()
900
+ if not key_index and not key_columns
901
+ else g.groupby([*key_index, *key_columns], dropna=False).sum()
902
+ )
903
+ not_unique = r[r["count"] > 1]
904
+ assert not_unique.shape[0] == 0, (
905
+ f"view_def.name={view_def.name!r}, "
906
+ f"unable to run the pivot with index={sorted(key_index)}, "
907
+ f"key={sorted(key_columns)}, key_agg={key_agg}, values={sorted(values)}, "
908
+ f"columns={sorted(data.columns)}, ignored={view_def.ignore_columns}, "
909
+ f"not unique={set(data.columns) - unique}"
910
+ f"\n--\n{not_unique.head(10)}"
911
+ )
912
+
913
+ # pivot
914
+ if verbose:
915
+ print(f"[CubeLogs.view] values={values}")
916
+ if key_index:
917
+ piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
918
+ else:
919
+ # pivot does not return the same rank when the index is empty.
920
+ # Let's artificially add one.
921
+ data = data.copy()
922
+ data["ALL"] = "ALL"
923
+ piv = data.pivot(index=["ALL"], columns=key_columns, values=values)
924
+ if isinstance(piv, pandas.Series):
925
+ piv = piv.to_frame(name="series")
926
+ names = list(piv.columns.names)
927
+ assert (
928
+ "METRICS" not in names
929
+ ), f"Not implemented when a level METRICS already exists {names!r}"
930
+ names[0] = "METRICS"
931
+ piv.columns = piv.columns.set_names(names)
932
+ if final_stack:
933
+ piv = piv.stack("METRICS", future_stack=True)
934
+ if view_def.transpose:
935
+ piv = piv.T
936
+ if isinstance(piv, pandas.Series):
937
+ piv = piv.to_frame("VALUE")
938
+ piv.sort_index(inplace=True)
939
+
940
+ if isinstance(piv.columns, pandas.MultiIndex):
941
+ if corder:
942
+ # reorder the levels for the columns with the view definition
943
+ new_corder = [c for c in corder if c in piv.columns.names]
944
+ new_names = [
945
+ *[c for c in piv.columns.names if c not in new_corder],
946
+ *new_corder,
947
+ ]
948
+ piv.columns = piv.columns.reorder_levels(new_names)
949
+ elif self.time in piv.columns.names:
950
+ # put time at the end
951
+ new_names = list(piv.columns.names)
952
+ ind = new_names.index(self.time)
953
+ if ind < len(new_names) - 1:
954
+ del new_names[ind]
955
+ new_names.append(self.time)
956
+ piv.columns = piv.columns.reorder_levels(new_names)
957
+
958
+ if view_def.no_index:
959
+ piv = piv.reset_index(drop=False)
960
+ else:
961
+ piv.sort_index(inplace=True, axis=1)
962
+
963
+ # final step, force columns with numerical values to be float
964
+ for c in list(piv.columns):
965
+ s = piv[c]
966
+ if not pandas.api.types.is_object_dtype(s):
967
+ continue
968
+ try:
969
+ sf = s.astype(float)
970
+ except (ValueError, TypeError):
971
+ continue
972
+ piv[c] = sf
973
+
974
+ if verbose:
975
+ print(f"[CubeLogs.view] levels {piv.index.names}, {piv.columns.names}")
976
+ print(f"[CubeLogs.view] -- done view {view_def.name!r}")
977
+ return (piv, view_def) if return_view_def else piv
978
+
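# Hedged usage sketch (metric and key names are made up): a pivot view with the
# model name as row index and the remaining keys spread over the columns.
#
#   piv = cube.view(
#       CubeViewDef(
#           key_index=["model_name"],
#           values=["time_latency", "speedup"],
#           ignore_unique=True,
#           name="latency",
#       )
#   )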
979
+ def _fix_aggregation_change(
980
+ self,
981
+ data: pandas.DataFrame,
982
+ columns_to_fix: Union[str, List[str]],
983
+ overwrite_or_merge: bool = True,
984
+ ) -> pandas.DataFrame:
985
+ """
986
+ Fixes columns used to aggregate values because their meaning changed over time.
987
+
988
+ :param data: data to fix
989
+ :param columns_to_fix: list of columns to fix
990
+ :param overwrite_or_merge: if True, overwrite all values by the concatenation
991
+ of all existing values, if merge, merges existing values found
992
+ and grouped by the other keys
993
+ :return: fixed data
994
+ """
995
+ if not isinstance(columns_to_fix, str):
996
+ for c in columns_to_fix:
997
+ data = self._fix_aggregation_change(data, c)
998
+ return data
999
+ # Let's process one column.
1000
+ keys = set(self.keys_time) - {columns_to_fix}
1001
+ select = data[self.keys_time]
1002
+ select_agg = select.groupby(list(keys)).count()
1003
+ assert select_agg[columns_to_fix].max() <= 1, (
1004
+ f"Column {columns_to_fix!r} has two distinct values at least for one date\n"
1005
+ f"{select_agg[select_agg[columns_to_fix] > 1]}"
1006
+ )
1007
+
1008
+ # unique value (to fill NaN)
1009
+ unique = "-".join(sorted(set(data[columns_to_fix].dropna())))
1010
+
1011
+ keys = set(self.keys_no_time) - {columns_to_fix}
1012
+ select = data[self.keys_no_time]
1013
+ select_agg = select.groupby(list(keys), as_index=True).apply(
1014
+ lambda x: "-".join(sorted(set(x[columns_to_fix].dropna()))), include_groups=False
1015
+ )
1016
+ select_agg = select_agg.to_frame(name=columns_to_fix)
1017
+ res = pandas.merge(
1018
+ data.drop([columns_to_fix], axis=1),
1019
+ select_agg,
1020
+ how="left",
1021
+ left_on=list(keys),
1022
+ right_index=True,
1023
+ )
1024
+ val = f"?{unique}?"
1025
+ res[columns_to_fix] = res[columns_to_fix].fillna(val).replace("", val)
1026
+ assert (
1027
+ data.shape == res.shape
1028
+ and sorted(data.columns) == sorted(res.columns)
1029
+ and sorted(data.index) == sorted(res.index)
1030
+ ), (
1031
+ f"Shape should match, data.shape={data.shape}, res.shape={res.shape}, "
1032
+ f"lost={set(data.columns) - set(res.columns)}, "
1033
+ f"added={set(res.columns) - set(data.columns)}"
1034
+ )
1035
+ res = res[data.columns]
1036
+ assert data.columns.equals(res.columns) and data.index.equals(res.index), (
1037
+ f"Columns or index mismatch "
1038
+ f"data.columns.equals(res.columns)={data.columns.equals(res.columns)}, "
1039
+ f"data.index.equals(res.columns)={data.index.equals(res.columns)}, "
1040
+ )
1041
+ return res
1042
+
1043
+ def _dropna(
1044
+ self,
1045
+ data: pandas.DataFrame,
1046
+ key_index: Sequence[str],
1047
+ key_columns: Sequence[str],
1048
+ values: Sequence[str],
1049
+ keep_columns_in_index: Optional[Sequence[str]] = None,
1050
+ ) -> Tuple[pandas.DataFrame, Sequence[str], Sequence[str], Sequence[str]]:
1051
+ set_keep_columns_in_index = (
1052
+ set(keep_columns_in_index) if keep_columns_in_index else set()
1053
+ )
1054
+ v = data[values]
1055
+ new_data = data[~v.isnull().all(1)]
1056
+ if data.shape == new_data.shape:
1057
+ return data, key_index, key_columns, values
1058
+ new_data = new_data.copy()
1059
+ new_key_index = []
1060
+ for c in key_index:
1061
+ if c in set_keep_columns_in_index:
1062
+ new_key_index.append(c)
1063
+ continue
1064
+ v = new_data[c]
1065
+ sv = set(v.dropna())
1066
+ if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
1067
+ new_key_index.append(c)
1068
+ new_key_columns = []
1069
+ for c in key_columns:
1070
+ if c in set_keep_columns_in_index:
1071
+ new_key_columns.append(c)
1072
+ continue
1073
+ v = new_data[c]
1074
+ sv = set(v.dropna())
1075
+ if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
1076
+ new_key_columns.append(c)
1077
+ for c in set(key_index) | set(key_columns):
1078
+ s = new_data[c]
1079
+ if s.isna().max():
1080
+ if pandas.api.types.is_numeric_dtype(
1081
+ s
1082
+ ) and not pandas.api.types.is_object_dtype(s):
1083
+ min_v = s.dropna().min()
1084
+ assert (
1085
+ min_v >= 0
1086
+ ), f"Unable to replace nan values in column {c!r}, min_v={min_v}"
1087
+ new_data[c] = s.fillna(-1)
1088
+ else:
1089
+ new_data[c] = s.fillna("NAN")
1090
+ return new_data, new_key_index, new_key_columns, values
1091
+
1092
+ def describe(self) -> pandas.DataFrame:
1093
+ """Basic description of all variables."""
1094
+ rows = []
1095
+ for name in self.data.columns:
1096
+ values = self.data[name]
1097
+ dtype = values.dtype
1098
+ nonan = values.dropna()
1099
+ obs = dict(
1100
+ name=name,
1101
+ dtype=str(dtype),
1102
+ missing=len(values) - len(nonan),
1103
+ kind=(
1104
+ "time"
1105
+ if name == self.time
1106
+ else (
1107
+ "keys"
1108
+ if name in self.keys_no_time
1109
+ else (
1110
+ "values"
1111
+ if name in self.values
1112
+ else ("ignored" if name in self.ignored else "unused")
1113
+ )
1114
+ )
1115
+ ),
1116
+ )
1117
+ if len(nonan) > 0:
1118
+ obs.update(dict(count=len(nonan)))
1119
+ if is_numeric_dtype(nonan) and not pandas.api.types.is_object_dtype(nonan):
1120
+ obs.update(
1121
+ dict(
1122
+ min=nonan.min(),
1123
+ max=nonan.max(),
1124
+ mean=nonan.mean(),
1125
+ sum=nonan.sum(),
1126
+ n_values=len(set(nonan)),
1127
+ )
1128
+ )
1129
+ elif obs["kind"] == "time":
1130
+ unique = set(nonan)
1131
+ obs["n_values"] = len(unique)
1132
+ o = dict(
1133
+ min=str(nonan.min()),
1134
+ max=str(nonan.max()),
1135
+ n_values=len(set(nonan)),
1136
+ )
1137
+ o["values"] = f"{o['min']} - {o['max']}"
1138
+ obs.update(o)
1139
+ else:
1140
+ unique = set(nonan)
1141
+ obs["n_values"] = len(unique)
1142
+ if len(unique) < 20:
1143
+ obs["values"] = ",".join(map(str, sorted(unique)))
1144
+ rows.append(obs)
1145
+ return pandas.DataFrame(rows).set_index("name")
1146
+
1147
+ def to_excel(
1148
+ self,
1149
+ output: str,
1150
+ views: Union[Sequence[str], Dict[str, Union[str, CubeViewDef]]],
1151
+ main: Optional[str] = "main",
1152
+ raw: Optional[str] = "raw",
1153
+ verbose: int = 0,
1154
+ csv: Optional[Sequence[str]] = None,
1155
+ time_mask: bool = False,
1156
+ sbs: Optional[Dict[str, Dict[str, Any]]] = None,
1157
+ ):
1158
+ """
1159
+ Creates an Excel file with a list of views.
1160
+
1161
+ :param output: output file to create
1162
+ :param views: sequence or dictionary of views to append
1163
+ :param main: add a page with statistics on all variables
1164
+ :param raw: add a page with the raw data
1165
+ :param csv: views to dump as csv files (same name as the output + view name)
1166
+ :param verbose: verbosity
1167
+ :param time_mask: color the background of the cells if one
1168
+ of the values for the last date is unexpected,
1169
+ assuming they should remain stable
1170
+ :param sbs: configurations to compare side-by-side, this adds extra tabs,
1171
+ one gathering the raw data of the compared configurations, one aggregated
1172
+ by metric, and one with a row per model
1173
+ """
1174
+ if verbose:
1175
+ print(f"[CubeLogs.to_excel] create Excel file {output}, shape={self.shape}")
1176
+ time_mask &= len(self.data[self.time].unique()) > 2
1177
+ cube_time = self.cube_time(fill_other_dates=True) if time_mask else None
1178
+ views = {k: k for k in views} if not isinstance(views, dict) else views
1179
+ f_highlights = {}
1180
+ plots = []
1181
+ with pandas.ExcelWriter(output, engine="openpyxl") as writer:
1182
+ if main:
1183
+ assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
1184
+ df = self.describe().sort_values("name")
1185
+ if verbose:
1186
+ print(f"[CubeLogs.to_excel] add sheet {main!r} with shape {df.shape}")
1187
+ df.to_excel(writer, sheet_name=main, freeze_panes=(1, 1))
1188
+
1189
+ time_mask_view: Dict[str, pandas.DataFrame] = {}
1190
+ for name, view in views.items():
1191
+ if view is None:
1192
+ continue
1193
+ df, tview = self.view(view, return_view_def=True, verbose=max(verbose - 1, 0))
1194
+ if cube_time is not None:
1195
+ cube_mask = cube_time.view(view)
1196
+ aligned = align_dataframe_with(cube_mask, df)
1197
+ if aligned is not None:
1198
+ assert aligned.shape == df.shape, (
1199
+ f"Shape mismatch between the view {df.shape} and the mask "
1200
+ f"{time_mask_view[name].shape}"
1201
+ )
1202
+ time_mask_view[name] = aligned
1203
+ if verbose:
1204
+ print(
1205
+ f"[CubeLogs.to_excel] compute mask for view {name!r} "
1206
+ f"with shape {aligned.shape}"
1207
+ )
1208
+ if tview is None:
1209
+ continue
1210
+ memory = df.memory_usage(deep=True).sum()
1211
+ if verbose:
1212
+ print(
1213
+ f"[CubeLogs.to_excel] add sheet {name!r} with shape "
1214
+ f"{df.shape} ({memory} bytes), index={df.index.names}, "
1215
+ f"columns={df.columns.names}"
1216
+ )
1217
+ if self.time in df.columns.names:
1218
+ # Let's convert the time into str
1219
+ fr = df.columns.to_frame()
1220
+ if is_datetime64_any_dtype(fr[self.time]):
1221
+ dt = fr[self.time]
1222
+ has_time = (dt != dt.dt.normalize()).any()
1223
+ sdt = dt.apply(
1224
+ lambda t, has_time=has_time: t.strftime(
1225
+ "%Y-%m-%dT%H-%M-%S" if has_time else "%Y-%m-%d"
1226
+ )
1227
+ )
1228
+ fr[self.time] = sdt
1229
+ df.columns = pandas.MultiIndex.from_frame(fr)
1230
+ if csv and name in csv:
1231
+ name_csv = f"{output}.{name}.csv"
1232
+ if verbose:
1233
+ print(f"[CubeLogs.to_excel] saving sheet {name!r} in {name_csv!r}")
1234
+ df.reset_index(drop=False).to_csv(f"{output}.{name}.csv", index=False)
1235
+
1236
+ if memory > 2**22:
1237
+ msg = (
1238
+ f"[CubeLogs.to_excel] skipping {name!r}, "
1239
+ f"too big for excel with {memory} bytes"
1240
+ )
1241
+ if verbose:
1242
+ print(msg)
1243
+ else:
1244
+ warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
1245
+ else:
1246
+ df.to_excel(
1247
+ writer,
1248
+ sheet_name=name,
1249
+ freeze_panes=(df.columns.nlevels + 1, df.index.nlevels),
1250
+ )
1251
+ f_highlights[name] = tview.f_highlight
1252
+ if tview.plots:
1253
+ plots.append(
1254
+ CubePlot(
1255
+ df,
1256
+ kind="line",
1257
+ orientation="row",
1258
+ split=True,
1259
+ timeseries=self.time,
1260
+ )
1261
+ if self.time in df.columns.names
1262
+ else CubePlot(df, kind="barh", orientation="row", split=True)
1263
+ )
1264
+ if raw:
1265
+ assert raw not in views, f"{raw!r} is duplicated in views {sorted(views)}"
1266
+ # Too long.
1267
+ # self._apply_excel_style(raw, writer, self.data)
1268
+ if csv and "raw" in csv:
1269
+ self.data.reset_index(drop=False).to_csv(f"{output}.raw.csv", index=False)
1270
+ memory = df.memory_usage(deep=True).sum()
1271
+ if memory > 2**22:
1272
+ msg = (
1273
+ f"[CubeLogs.to_excel] skipping 'raw', "
1274
+ f"too big for excel with {memory} bytes"
1275
+ )
1276
+ if verbose:
1277
+ print(msg)
1278
+ else:
1279
+ warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
1280
+ else:
1281
+ if verbose:
1282
+ print(f"[CubeLogs.to_excel] add sheet 'raw' with shape {self.shape}")
1283
+ self.data.to_excel(
1284
+ writer, sheet_name="raw", freeze_panes=(1, 1), index=True
1285
+ )
1286
+
1287
+ if sbs:
1288
+ if verbose:
1289
+ for k, v in sbs.items():
1290
+ print(f"[CubeLogs.to_excel] sbs {k}: {v}")
1291
+ name = "∧".join(sbs)
1292
+ sbs_raw, sbs_agg, sbs_col = self.sbs(sbs)
1293
+ if verbose:
1294
+ print(f"[CubeLogs.to_excel] add sheet {name!r} with shape {sbs_raw.shape}")
1295
+ print(
1296
+ f"[CubeLogs.to_excel] add sheet '{name}-AGG' "
1297
+ f"with shape {sbs_agg.shape}"
1298
+ )
1299
+ sbs_raw = sbs_raw.reset_index(drop=False)
1300
+ sbs_raw.to_excel(
1301
+ writer,
1302
+ sheet_name=name,
1303
+ freeze_panes=(
1304
+ sbs_raw.columns.nlevels + 1,
1305
+ sbs_raw.index.nlevels,
1306
+ ),
1307
+ )
1308
+ sbs_agg.to_excel(
1309
+ writer,
1310
+ sheet_name=f"{name}-AGG",
1311
+ freeze_panes=(
1312
+ sbs_agg.columns.nlevels + 1,
1313
+ sbs_agg.index.nlevels,
1314
+ ),
1315
+ )
1316
+ sbs_col.to_excel(
1317
+ writer,
1318
+ sheet_name=f"{name}-COL",
1319
+ freeze_panes=(
1320
+ sbs_col.columns.nlevels + 1,
1321
+ sbs_col.index.nlevels,
1322
+ ),
1323
+ )
1324
+
1325
+ if plots:
1326
+ from openpyxl.drawing.image import Image
1327
+
1328
+ if verbose:
1329
+ print(f"[CubeLogs.to_excel] plots {len(plots)} plots")
1330
+ sheet = writer.book.create_sheet("plots")
1331
+ pos = 0
1332
+ empty_row = 1
1333
+ times = self.data[self.time].dropna()
1334
+ mini, maxi = times.min(), times.max()
1335
+ title_suffix = (str(mini) if mini == maxi else f"{mini}-{maxi}").replace(
1336
+ " 00:00:00", ""
1337
+ )
1338
+ for plot in plots:
1339
+ imgs = plot.to_images(
1340
+ verbose=verbose, merge=True, title_suffix=title_suffix
1341
+ )
1342
+ for img in imgs:
1343
+ y = (pos // 2) * 16
1344
+ loc = f"A{y}" if pos % 2 == 0 else f"M{y}"
1345
+ sheet.add_image(Image(io.BytesIO(img)), loc)
1346
+ if verbose:
1347
+ no = f"{output}.png"
1348
+ print(f"[CubeLogs.to_excel] dump graphs into {no!r}")
1349
+ with open(no, "wb") as f:
1350
+ f.write(img)
1351
+ pos += 1
1352
+ empty_row += len(plots) + 2
1353
+
1354
+ if verbose:
1355
+ print(f"[CubeLogs.to_excel] applies style to {output!r}")
1356
+ apply_excel_style(
1357
+ writer, f_highlights, time_mask_view=time_mask_view, verbose=verbose # type: ignore[arg-type]
1358
+ )
1359
+ if verbose:
1360
+ print(f"[CubeLogs.to_excel] done with {len(views)} views")
1361
+
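# Hedged usage sketch (paths, views and configuration names are made up); the
# sbs dictionary follows the format documented in ``sbs`` below.
#
#   cube.to_excel(
#       "report.xlsx",
#       views={"speedup": CubeViewDef(key_index=["model_name"], values=["speedup"])},
#       main="main",
#       raw="raw",
#       sbs=dict(
#           CFA=dict(exporter="E1", opt="O"),
#           CFB=dict(exporter="E2", opt="O"),
#       ),
#   )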
1362
+ def cube_time(self, fill_other_dates: bool = False, threshold: float = 1.2) -> "CubeLogs":
1363
+ """
1364
+ Aggregates the data over time to detect changes in the last value.
1365
+ If *fill_other_dates* is True, all dates are kept, but values
1366
+ are filled with 0.
1367
+ *threshold* determines the bandwidth within which the values are expected;
1368
+ it should be a factor of the standard deviation.
1369
+ """
1370
+ unique_time = self.data[self.time].unique()
1371
+ assert len(unique_time) > 2, f"Not enough dates to proceed: unique_time={unique_time}"
1372
+ gr = self.data[[*self.keys_no_time, *self.values]].groupby(
1373
+ self.keys_no_time, dropna=False
1374
+ )
1375
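+ # ``breaking_last_point`` (used below) flags the last value of each series when
+ # it leaves the expected band around the history; conceptually (a simplified
+ # sketch, not the actual implementation):
+ #     history, last = series.iloc[:-1], series.iloc[-1]
+ #     breaks = abs(last - history.mean()) > threshold * history.std()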
+ dgr = gr.agg(
1376
+ lambda series, th=threshold: int(breaking_last_point(series, threshold=th)[0])
1377
+ )
1378
+ tm = unique_time.max()
1379
+ assert dgr.shape[0] > 0, (
1380
+ f"Unexpected output shape={dgr.shape}, unique_time={unique_time}, "
1381
+ f"data.shape={self.data.shape}"
1382
+ )
1383
+ dgr[self.time] = tm
1384
+ if fill_other_dates:
1385
+ other_df = []
1386
+ other_dates = [t for t in unique_time if t != tm]
1387
+ for t in other_dates:
1388
+ df = dgr.copy()
1389
+ df[self.time] = t
1390
+ for c in df.columns:
1391
+ if c != self.time:
1392
+ df[c] = 0
1393
+ other_df.append(df)
1394
+ dgr = pandas.concat([dgr, *other_df], axis=0)
1395
+ assert dgr.shape[0] > 0, (
1396
+ f"Unexpected output shape={dgr.shape}, unique_time={unique_time}, "
1397
+ f"data.shape={self.data.shape}, "
1398
+ f"other_df shapes={[df.shape for df in other_df]}"
1399
+ )
1400
+ return self.clone(data=dgr.reset_index(drop=False))
1401
+
1402
+ def sbs(
1403
+ self, configs: Dict[str, Dict[str, Any]], column_name: str = "CONF"
1404
+ ) -> Tuple[pandas.DataFrame, pandas.DataFrame, pandas.DataFrame]:
1405
+ """
1406
+ Creates a side-by-side for two or more configurations.
1407
+ Every configuration is a dictionary column:value which filters
1408
+ the rows to keep in order to compute the side by side.
1409
+ Every configuration is given a name (the key in configs),
1410
+ and it is added in column *column_name*.
1411
+
1412
+ :param configs: example
1413
+ ``dict(CFA=dict(exporter="E1", opt="O"), CFB=dict(exporter="E2", opt="O"))``
1414
+ :param column_name: column to add with the name of the configuration
1415
+ :return: data, aggregated data, data with a row per model
1416
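+
+ Example (hypothetical data, assuming ``exporter`` and ``opt`` are key columns)::
+
+ sbs_raw, sbs_agg, sbs_col = cube.sbs(
+ dict(CFA=dict(exporter="E1", opt="O"),
+ CFB=dict(exporter="E2", opt="O")),
+ )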
+ """
1417
+ assert (
1418
+ len(configs) >= 2
1419
+ ), f"A side by side needs at least two configs but configs={configs}"
1420
+ set_keys_time = set(self.keys_time)
1421
+ columns_index = None
1422
+ data_list = []
1423
+ for name_conf, conf in configs.items():
1424
+ if columns_index is None:
1425
+ columns_index = list(conf.keys())
1426
+ assert set(columns_index) <= set_keys_time, (
1427
+ f"Configuration {conf} includes columns outside the keys "
1428
+ f"{', '.join(sorted(set_keys_time))}"
1429
+ )
1430
+ else:
1431
+ assert set(columns_index) == set(conf), (
1432
+ f"Every conf should share the same keys but conf={conf} "
1433
+ f"is different from {set(columns_index)}"
1434
+ )
1435
+ data = self.data
1436
+ for k, v in conf.items():
1437
+ data = data[data[k] == v]
1438
+ assert data.shape[0] > 0, f"No rows found for conf={conf}"
1439
+ assert (
1440
+ column_name not in data.columns
1441
+ ), f"column_name={column_name!r} is already in {data.columns}"
1442
+ data = data.copy()
1443
+ data[column_name] = name_conf
1444
+ data_list.append(data)
1445
+
1446
+ new_data = pandas.concat(data_list, axis=0)
1447
+ cube = self.clone(new_data, keys=[*self.keys_no_time, column_name])
1448
+ key_index = set(self.keys_time) - {*columns_index, column_name} # type: ignore[misc]
1449
+ view = CubeViewDef(
1450
+ key_index=set(key_index), # type: ignore[arg-type]
1451
+ name="sbs",
1452
+ values=cube.values,
1453
+ keep_columns_in_index=[self.time],
1454
+ )
1455
+ view_res = cube.view(view)
1456
+ assert isinstance(view_res, pandas.DataFrame), "not needed but mypy complains"
1457
+
1458
+ # add metrics
1459
+ index_column_name = list(view_res.columns.names).index(column_name)
1460
+ index_metrics = list(view_res.columns.names).index("METRICS")
1461
+
1462
+ def _mkc(m, s):
1463
+ c = ["" for c in view_res.columns.names]
1464
+ c[index_column_name] = s
1465
+ c[index_metrics] = m
1466
+ return tuple(c)
1467
+
1468
+ list_configs = list(configs.items())
1469
+ mean_columns = [
1470
+ c
1471
+ for c in view_res.columns
1472
+ if pandas.api.types.is_numeric_dtype(view_res[c])
1473
+ and not pandas.api.types.is_object_dtype(view_res[c])
1474
+ ]
1475
+ assert mean_columns, f"No numerical columns in {view_res.dtypes}"
1476
+ view_res = view_res[mean_columns].copy()
1477
+ metrics = sorted(set(c[index_metrics] for c in view_res.columns))
1478
+ assert metrics, (
1479
+ f"No numerical metrics detected in "
1480
+ f"view_res.columns.names={view_res.columns.names}, "
1481
+ f"columns={view_res.dtypes}"
1482
+ )
1483
+ sum_columns = []
1484
+ columns_to_add = []
1485
+ for i in range(len(list_configs)):
1486
+ for j in range(i + 1, len(list_configs)):
1487
+ for m in metrics:
1488
+ iname, ci = list_configs[i]
1489
+ jname, cj = list_configs[j]
1490
+ ci = ci.copy()
1491
+ cj = cj.copy()
1492
+ ci["METRICS"] = m
1493
+ cj["METRICS"] = m
1494
+ ci["CONF"] = iname
1495
+ cj["CONF"] = jname
1496
+
1497
+ ci_name = tuple(ci[n] for n in view_res.columns.names)
1498
+ cj_name = tuple(cj[n] for n in view_res.columns.names)
1499
+ assert ci_name in view_res.columns or cj_name in view_res.columns, (
1500
+ f"Unable to find column {ci_name} or {cj_name} "
1501
+ f"in columns {view_res.columns}, metrics={metrics}"
1502
+ )
1503
+ if ci_name not in view_res.columns or cj_name not in view_res.columns:
1504
+ # One config does not have such metric.
1505
+ continue
1506
+
1507
+ si = view_res[ci_name]
1508
+ sj = view_res[cj_name]
1509
+
1510
+ sinan = si.isna()
1511
+ sjnan = sj.isna()
1512
+ n1 = iname
1513
+ n2 = jname
1514
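+ # For each metric and pair of configurations, add indicator columns: which side
+ # has a value (∅ marks a missing one), how the two compare (<, ==, >), and the
+ # metric restricted to rows where both configurations produced a value.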
+ nas = pandas.DataFrame(
1515
+ {
1516
+ _mkc(m, f"∅{n1}∧∅{n2}"): (sinan & sjnan).astype(int),
1517
+ _mkc(m, f"∅{n1}∧{n2}"): (sinan & ~sjnan).astype(int),
1518
+ _mkc(m, f"{n1}∧∅{n2}"): (~sinan & sjnan).astype(int),
1519
+ _mkc(m, f"{n1}∧{n2}"): (~sinan & ~sjnan).astype(int),
1520
+ _mkc(m, f"{n1}<{n2}"): (si < sj).astype(int),
1521
+ _mkc(m, f"{n1}=={n2}"): (si == sj).astype(int),
1522
+ _mkc(m, f"{n1}>{n2}"): (si > sj).astype(int),
1523
+ _mkc(m, f"{n1}*({n1}∧{n2})"): si * (~sinan & ~sjnan).astype(float),
1524
+ _mkc(m, f"{n2}*({n1}∧{n2})"): sj * (~sinan & ~sjnan).astype(float),
1525
+ }
1526
+ )
1527
+ nas.columns.names = view_res.columns.names
1528
+ columns_to_add.append(nas)
1529
+ sum_columns.extend(nas.columns)
1530
+
1531
+ view_res = pandas.concat([view_res, *columns_to_add], axis=1)
1532
+ res = view_res.stack("METRICS", future_stack=True) # type: ignore[union-attr]
1533
+ res = res.reorder_levels(
1534
+ [res.index.nlevels - 1, *list(range(res.index.nlevels - 1))]
1535
+ ).sort_index()
1536
+
1537
+ # aggregated metrics
1538
+ aggs = {
1539
+ **{k: "mean" for k in mean_columns}, # noqa: C420
1540
+ **{k: "sum" for k in sum_columns}, # noqa: C420
1541
+ }
1542
+ flat = view_res.groupby(self.time).agg(aggs)
1543
+ flat = flat.stack("METRICS", future_stack=True)
1544
+ return res, flat, view_res.T.sort_index().T
1545
+
1546
+
1547
+ class CubeLogsPerformance(CubeLogs):
1548
+ """Processes logs coming from experiments."""
1549
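+ # Typical usage might look like (hypothetical dataframe name, sketch only):
+ #     cube = CubeLogsPerformance(df_logs)   # df_logs: raw benchmark logs
+ #     cube.load()
+ #     speedup_view = cube.view("speedup")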
+
1550
+ def __init__(
1551
+ self,
1552
+ data: Any,
1553
+ time: str = "DATE",
1554
+ keys: Sequence[str] = (
1555
+ "^version_.*",
1556
+ "^model_.*",
1557
+ "device",
1558
+ "opt_patterns",
1559
+ "suite",
1560
+ "memory_peak",
1561
+ "machine",
1562
+ "exporter",
1563
+ "dynamic",
1564
+ "rtopt",
1565
+ "dtype",
1566
+ "device",
1567
+ "architecture",
1568
+ ),
1569
+ values: Sequence[str] = (
1570
+ "^time_.*",
1571
+ "^disc.*",
1572
+ "^ERR_.*",
1573
+ "CMD",
1574
+ "^ITER",
1575
+ "^onnx_.*",
1576
+ "^op_onnx_.*",
1577
+ "^peak_gpu_.*",
1578
+ ),
1579
+ ignored: Sequence[str] = ("version_python",),
1580
+ recent: bool = True,
1581
+ formulas: Optional[
1582
+ Union[
1583
+ Sequence[str],
1584
+ Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
1585
+ ]
1586
+ ] = (
1587
+ "speedup",
1588
+ "bucket[speedup]",
1589
+ "ERR1",
1590
+ "n_models",
1591
+ "n_model_eager",
1592
+ "n_model_running",
1593
+ "n_model_acc01",
1594
+ "n_model_acc001",
1595
+ "n_model_dynamic",
1596
+ "n_model_pass",
1597
+ "n_model_faster",
1598
+ "n_model_faster2x",
1599
+ "n_model_faster3x",
1600
+ "n_model_faster4x",
1601
+ "n_model_faster5x",
1602
+ "n_node_attention",
1603
+ "n_node_attention23",
1604
+ "n_node_causal_mask",
1605
+ "n_node_constant",
1606
+ "n_node_control_flow",
1607
+ "n_node_expand",
1608
+ "n_node_function",
1609
+ "n_node_gqa",
1610
+ "n_node_initializer",
1611
+ "n_node_initializer_small",
1612
+ "n_node_layer_normalization",
1613
+ "n_node_layer_normalization23",
1614
+ "n_node_reshape",
1615
+ "n_node_rotary_embedding",
1616
+ "n_node_rotary_embedding23",
1617
+ "n_node_scatter",
1618
+ "n_node_sequence",
1619
+ "n_node_shape",
1620
+ "onnx_n_nodes_no_cst",
1621
+ "peak_gpu_torch",
1622
+ "peak_gpu_nvidia",
1623
+ "time_export_unbiased",
1624
+ ),
1625
+ fill_missing: Optional[Sequence[Tuple[str, Any]]] = (("model_attn_impl", "eager"),),
1626
+ keep_last_date: bool = False,
1627
+ ):
1628
+ super().__init__(
1629
+ data=data,
1630
+ time=time,
1631
+ keys=keys,
1632
+ values=values,
1633
+ ignored=ignored,
1634
+ recent=recent,
1635
+ formulas=formulas,
1636
+ fill_missing=fill_missing,
1637
+ keep_last_date=keep_last_date,
1638
+ )
1639
+
1640
+ def clone(
1641
+ self, data: Optional[pandas.DataFrame] = None, keys: Optional[Sequence[str]] = None
1642
+ ) -> "CubeLogs":
1643
+ """
1644
+ Makes a copy of the dataframe.
1645
+ It copies the processed data, not the original one.
1646
+ keys can be changed as well.
1647
+ """
1648
+ cube = self.__class__(
1649
+ data if data is not None else self.data.copy(),
1650
+ time=self.time,
1651
+ keys=keys or self.keys_no_time,
1652
+ values=self.values,
1653
+ recent=False,
1654
+ )
1655
+ cube.load()
1656
+ return cube
1657
+
1658
+ def _process_formula(
1659
+ self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
1660
+ ) -> Callable[[pandas.DataFrame], pandas.Series]:
1661
+ """
1662
+ Processes a formula, converting it into a function.
1663
+
1664
+ :param formula: a formula string or a callable
1665
+ :return: a function
1666
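+
+ Example (hypothetical, assuming the cube exposes latency columns)::
+
+ fct = cube._process_formula("speedup")
+ speedup = fct(cube.data)  # time_latency_eager / time_latency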
+ """
1667
+ if callable(formula):
1668
+ return formula
1669
+ assert isinstance(
1670
+ formula, str
1671
+ ), f"Unexpected type for formula {type(formula)}: {formula!r}"
1672
+
1673
+ def gdf(df, cname, default_value=np.nan):
1674
+ if cname in df.columns:
1675
+ if np.isnan(default_value):
1676
+ return df[cname]
1677
+ return df[cname].fillna(default_value)
1678
+ return pandas.Series(default_value, index=df.index)
1679
+
1680
+ def ghas_value(df, cname):
1681
+ if cname not in df.columns:
1682
+ return pandas.Series(np.nan, index=df.index)
1683
+ isna = df[cname].isna()
1684
+ return pandas.Series(np.where(isna, np.nan, 1.0), index=df.index)
1685
+
1686
+ def gpreserve(df, cname, series):
1687
+ if cname not in df.columns:
1688
+ return pandas.Series(np.nan, index=df.index)
1689
+ isna = df[cname].isna()
1690
+ return pandas.Series(np.where(isna, np.nan, series), index=df.index).astype(float)
1691
+
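+ # Helper semantics (illustrative): gdf(df, c, d) returns column c (NaNs filled
+ # with d unless d is NaN), or a constant Series of d when c is missing;
+ # ghas_value(df, c) is 1.0 where the column has a value and NaN elsewhere;
+ # gpreserve(df, c, s) keeps s where c is filled and NaN where it is not.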
1692
+ if formula == "speedup":
1693
+ columns = set(self._filter_column(["^time_.*"], self.data.columns))
1694
+ assert "time_latency" in columns and "time_latency_eager" in columns, (
1695
+ f"Unable to apply formula {formula!r}, with columns\n"
1696
+ f"{pprint.pformat(sorted(columns))}"
1697
+ )
1698
+ return lambda df: df["time_latency_eager"] / df["time_latency"]
1699
+
1700
+ if formula == "bucket[speedup]":
1701
+ columns = set(self._filter_column(["^time_.*", "speedup"], self.data.columns))
1702
+ assert "speedup" in columns, (
1703
+ f"Unable to apply formula {formula!r}, with columns\n"
1704
+ f"{pprint.pformat(sorted(columns))}"
1705
+ )
1706
+ # return lambda df: df["time_latency_eager"] / df["time_latency"]
1707
+ return lambda df: pandas.cut(
1708
+ df["speedup"], bins=BUCKET_SCALES, right=False, duplicates="raise"
1709
+ )
1710
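+ # Example (hypothetical values): time_latency_eager=2.0 and time_latency=1.0
+ # give speedup=2.0, which pandas.cut maps to the BUCKET_SCALES interval
+ # containing 2.0.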
+
1711
+ if formula == "ERR1":
1712
+ columns = set(self._filter_column(["^ERR_.*"], self.data.columns))
1713
+ if not columns:
1714
+ return lambda df: np.nan
1715
+
1716
+ def first_err(df: pandas.DataFrame) -> pandas.Series:
1717
+ ordered = [
1718
+ c
1719
+ for c in [
1720
+ "ERR_timeout",
1721
+ "ERR_load",
1722
+ "ERR_feeds",
1723
+ "ERR_warmup_eager",
1724
+ "ERR_export",
1725
+ "ERR_ort",
1726
+ "ERR_warmup",
1727
+ # "ERR_std",
1728
+ # "ERR_crash",
1729
+ # "ERR_stdout",
1730
+ ]
1731
+ if c in df.columns
1732
+ ]
1733
+ res = None
1734
+ for c in ordered:
1735
+ if res is None:
1736
+ res = df[c].fillna("")
1737
+ else:
1738
+ res = pandas.Series(np.where(res != "", res, df[c].fillna("")))
1739
+ return res
1740
+
1741
+ return first_err
1742
+
1743
+ if formula.startswith("n_"):
1744
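+ # Each n_* formula below yields a per-row indicator (1/0) or node count, with
+ # NaN preserved when the underlying column is missing, so a "sum" aggregation
+ # gives counts, e.g. n_model_faster2x counts models whose eager latency exceeds
+ # roughly twice the exported latency.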
+ lambdas = dict(
1745
+ n_models=lambda df: ghas_value(df, "model_name"),
1746
+ n_model_eager=lambda df: ghas_value(df, "time_latency_eager"),
1747
+ n_model_running=lambda df: ghas_value(df, "time_latency"),
1748
+ n_model_acc01=lambda df: gpreserve(
1749
+ df, "discrepancies_abs", (gdf(df, "discrepancies_abs") <= 0.1)
1750
+ ),
1751
+ n_model_acc001=lambda df: gpreserve(
1752
+ df, "discrepancies_abs", gdf(df, "discrepancies_abs") <= 0.01
1753
+ ),
1754
+ n_model_dynamic=lambda df: gpreserve(
1755
+ df,
1756
+ "discrepancies_dynamic_abs",
1757
+ (gdf(df, "discrepancies_dynamic_abs") <= 0.1),
1758
+ ),
1759
+ n_model_pass=lambda df: gpreserve(
1760
+ df,
1761
+ "time_latency",
1762
+ (gdf(df, "discrepancies_abs", np.inf) < 0.1)
1763
+ & (gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98),
1764
+ ),
1765
+ n_model_faster=lambda df: gpreserve(
1766
+ df,
1767
+ "time_latency",
1768
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98,
1769
+ ),
1770
+ n_model_faster2x=lambda df: gpreserve(
1771
+ df,
1772
+ "time_latency",
1773
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 1.98,
1774
+ ),
1775
+ n_model_faster3x=lambda df: gpreserve(
1776
+ df,
1777
+ "time_latency",
1778
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 2.98,
1779
+ ),
1780
+ n_model_faster4x=lambda df: gpreserve(
1781
+ df,
1782
+ "time_latency",
1783
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 3.98,
1784
+ ),
1785
+ n_model_faster5x=lambda df: gpreserve(
1786
+ df,
1787
+ "time_latency",
1788
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 4.98,
1789
+ ),
1790
+ n_node_attention23=lambda df: gpreserve(
1791
+ df, "time_latency_eager", gdf(df, "op_onnx__Attention")
1792
+ ),
1793
+ n_node_rotary_embedding23=lambda df: gpreserve(
1794
+ df, "time_latency_eager", gdf(df, "op_onnx__RotaryEmbedding")
1795
+ ),
1796
+ n_node_layer_normalization23=lambda df: gpreserve(
1797
+ df,
1798
+ "time_latency_eager",
1799
+ gdf(df, "op_onnx__LayerNormalization", 0)
1800
+ + gdf(df, "op_onnx__RMSNormalization", 0)
1801
+ + gdf(df, "op_onnx__BatchNormlization", 0)
1802
+ + gdf(df, "op_onnx__InstanceNormlization", 0)
1803
+ + gdf(df, "op_onnx__GroupNormalization", 0),
1804
+ ),
1805
+ n_node_attention=lambda df: gpreserve(
1806
+ df,
1807
+ "time_latency_eager",
1808
+ gdf(df, "op_onnx_com.microsoft_Attention", 0)
1809
+ + gdf(df, "op_onnx_com.microsoft_MultiHeadAttention", 0)
1810
+ + gdf(df, "op_onnx_com.microsoft_PackedAttention", 0)
1811
+ + gdf(df, "op_onnx_com.microsoft_PackedMultiHeadAttention", 0)
1812
+ + gdf(df, "op_onnx_com.microsoft_GroupQueryAttention", 0)
1813
+ + gdf(df, "op_onnx_com.microsoft_PagedAttention", 0)
1814
+ + gdf(df, "op_onnx_com.microsoft_DecoderAttention", 0)
1815
+ + gdf(df, "op_onnx_com.microsoft_LongformerAttention", 0)
1816
+ + gdf(df, "op_onnx_com.microsoft_DecoderMaskedSelfAttention", 0)
1817
+ + gdf(df, "op_onnx_com.microsoft_DecoderMaskedMultiHeadAttention", 0)
1818
+ + gdf(df, "op_onnx_com.microsoft_SparseAttention", 0),
1819
+ ),
1820
+ n_node_gqa=lambda df: gpreserve(
1821
+ df,
1822
+ "time_latency_eager",
1823
+ gdf(df, "op_onnx_com.microsoft_GroupQueryAttention", 0),
1824
+ ),
1825
+ n_node_layer_normalization=lambda df: gpreserve(
1826
+ df,
1827
+ "time_latency_eager",
1828
+ gdf(df, "op_onnx_com.microsoft_EmbedLayerNormalization", 0)
1829
+ + gdf(df, "op_onnx_com.microsoft_SkipLayerNormalization", 0)
1830
+ + gdf(df, "op_onnx_com.microsoft_LayerNormalization", 0)
1831
+ + gdf(df, "op_onnx_com.microsoft_SkipSimplifiedLayerNormalization", 0)
1832
+ + gdf(df, "op_onnx_com.microsoft_SimplifiedLayerNormalization", 0),
1833
+ ),
1834
+ n_node_rotary_embedding=lambda df: gpreserve(
1835
+ df,
1836
+ "time_latency_eager",
1837
+ gdf(df, "op_onnx_com.microsoft_GemmaRotaryEmbedding", 0)
1838
+ + gdf(df, "op_onnx_com.microsoft_RotaryEmbedding", 0),
1839
+ ),
1840
+ n_node_control_flow=lambda df: gpreserve(
1841
+ df,
1842
+ "time_latency_eager",
1843
+ (
1844
+ gdf(df, "op_onnx__If", 0)
1845
+ + gdf(df, "op_onnx__Scan", 0)
1846
+ + gdf(df, "op_onnx__Loop", 0)
1847
+ ),
1848
+ ),
1849
+ n_node_scatter=lambda df: gpreserve(
1850
+ df,
1851
+ "time_latency_eager",
1852
+ gdf(df, "op_onnx__ScatterND", 0) + gdf(df, "op_onnx__ScatterElements", 0),
1853
+ ),
1854
+ n_node_function=lambda df: gpreserve(
1855
+ df, "onnx_n_functions", gdf(df, "onnx_n_functions")
1856
+ ),
1857
+ n_node_initializer_small=lambda df: gpreserve(
1858
+ df, "op_onnx_initializer_small", gdf(df, "op_onnx_initializer_small")
1859
+ ),
1860
+ n_node_initializer=lambda df: gpreserve(
1861
+ df, "onnx_n_initializer", gdf(df, "onnx_n_initializer")
1862
+ ),
1863
+ n_node_constant=lambda df: gpreserve(
1864
+ df, "time_latency_eager", gdf(df, "op_onnx__Constant")
1865
+ ),
1866
+ n_node_shape=lambda df: gpreserve(
1867
+ df, "time_latency_eager", gdf(df, "op_onnx__Shape")
1868
+ ),
1869
+ n_node_reshape=lambda df: gpreserve(
1870
+ df, "time_latency_eager", gdf(df, "op_onnx__Reshape")
1871
+ ),
1872
+ n_node_expand=lambda df: gpreserve(
1873
+ df, "time_latency_eager", gdf(df, "op_onnx__Expand")
1874
+ ),
1875
+ n_node_causal_mask=lambda df: gpreserve(
1876
+ df,
1877
+ "time_latency_eager",
1878
+ gdf(df, "op_onnx__CausalMask", 0),
1879
+ ),
1880
+ n_node_sequence=lambda df: gpreserve(
1881
+ df,
1882
+ "time_latency_eager",
1883
+ gdf(df, "op_onnx__SequenceAt", 0) + gdf(df, "op_onnx__SplitToSequence", 0),
1884
+ ),
1885
+ )
1886
+ assert (
1887
+ formula in lambdas
1888
+ ), f"Unexpected formula={formula!r}, should be in {sorted(lambdas)}"
1889
+ return lambdas[formula]
1890
+
1891
+ if formula == "onnx_n_nodes_no_cst":
1892
+ return lambda df: gdf(df, "onnx_n_nodes", 0) - gdf(
1893
+ df, "op_onnx__Constant", 0
1894
+ ).fillna(0)
1895
+ if formula == "peak_gpu_torch":
1896
+ return lambda df: gdf(df, "mema_gpu_5_after_export") - gdf(df, "mema_gpu_4_reset")
1897
+ if formula == "peak_gpu_nvidia":
1898
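+ # memory_gpu0_* metrics appear to be reported in MiB, hence the 2**20 factor
+ # to convert the delta to bytes.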
+ return (
1899
+ lambda df: (gdf(df, "memory_gpu0_peak") - gdf(df, "memory_gpu0_begin")) * 2**20
1900
+ )
1901
+ if formula == "time_export_unbiased":
1902
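+ # For the "inductor" exporter, the first warmup iteration is added to the
+ # export time, presumably because compilation happens there rather than in
+ # time_export_success (hence "unbiased").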
+
1903
+ def unbiased_export(df):
1904
+ if "time_warmup_first_iteration" not in df.columns:
1905
+ return pandas.Series(np.nan, index=df.index)
1906
+ return pandas.Series(
1907
+ np.where(
1908
+ df["exporter"] == "inductor",
1909
+ df["time_warmup_first_iteration"] + df["time_export_success"],
1910
+ df["time_export_success"],
1911
+ ),
1912
+ index=df.index,
1913
+ )
1914
+
1915
+ return lambda df: gpreserve(df, "time_warmup_first_iteration", unbiased_export(df))
1916
+
1917
+ raise ValueError(
1918
+ f"Unexpected formula {formula!r}, available columns are\n"
1919
+ f"{pprint.pformat(sorted(self.data.columns))}"
1920
+ )
1921
+
1922
+ def view(
1923
+ self,
1924
+ view_def: Optional[Union[str, CubeViewDef]],
1925
+ return_view_def: bool = False,
1926
+ verbose: int = 0,
1927
+ ) -> Union[
1928
+ Optional[pandas.DataFrame], Tuple[Optional[pandas.DataFrame], Optional[CubeViewDef]]
1929
+ ]:
1930
+ """
1931
+ Returns a dataframe, a pivot view.
1932
+
1933
+ If view_def is a string, it is replaced by a predefined view.
1934
+
1935
+ :param view_def: view definition or a string
1936
+ :param return_view_def: returns the view definition as well
1937
+ :param verbose: verbosity level
1938
+ :return: dataframe or a couple (dataframe, view definition),
1939
+ both of them can be None if view_def cannot be interpreted
1940
+ """
1941
+ assert view_def is not None, "view_def is None, this is not allowed."
1942
+ if isinstance(view_def, str):
1943
+ view_def = self.make_view_def(view_def)
1944
+ if view_def is None:
1945
+ return (None, None) if return_view_def else None
1946
+ return super().view(view_def, return_view_def=return_view_def, verbose=verbose)
1947
+
1948
+ def make_view_def(self, name: str) -> Optional[CubeViewDef]:
1949
+ """
1950
+ Returns a view definition.
1951
+
1952
+ :param name: name of the view
1953
+ :return: a CubeViewDef or None if name does not make sense
1954
+
1955
+ Available views:
1956
+
1957
+ * **agg-suite:** aggregation per suite
1958
+ * **disc:** discrepancies
1959
+ * **speedup:** speedup
1960
+ * **bucket-speedup:** speedup in buckets
1961
+ * **time:** latency
1962
+ * **time_export:** time to export
1963
+ * **counts:** status, running, faster, has control flow, ...
1964
+ * **err:** important errors
1965
+ * **cmd:** command lines
1966
+ * **raw-short:** raw data without the ERR_std and ERR_stdout columns
+ * **agg-all:** aggregation over all models and suites
+ * **peak-gpu:** GPU memory peaks
+ * **onnx:** ONNX model statistics (file size, number of nodes, ...)
1967
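+
+ Example (hypothetical cube instance)::
+
+ view_def = cube.make_view_def("speedup")
+ df = cube.view(view_def)  # equivalent to cube.view("speedup")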
+ """
1968
+ fix_aggregation_change = ["model_speedup_input_set", "model_test_with"]
1969
+ fs = ["suite", "model_suite", "task", "model_name", "model_task"]
1970
+ index_cols = self._filter_column(fs, self.keys_time)
1971
+ assert index_cols, (
1972
+ f"No index columns found for {fs!r} in "
1973
+ f"{pprint.pformat(sorted(self.keys_time))}"
1974
+ )
1975
+ index_cols = [c for c in fs if c in set(index_cols)]
1976
+
1977
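+ # Highlight rules used by the views below: speedup < 0.9 red, > 1.1 green;
+ # discrepancy > 0.1 red, < 0.01 green; speedup buckets below 0.95 red,
+ # buckets around 1.0 neutral, higher buckets green.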
+ f_speedup = lambda x: ( # noqa: E731
1978
+ CubeViewDef.HighLightKind.NONE
1979
+ if not isinstance(x, (float, int))
1980
+ else (
1981
+ CubeViewDef.HighLightKind.RED
1982
+ if x < 0.9
1983
+ else (
1984
+ CubeViewDef.HighLightKind.GREEN
1985
+ if x > 1.1
1986
+ else CubeViewDef.HighLightKind.NONE
1987
+ )
1988
+ )
1989
+ )
1990
+ f_disc = lambda x: ( # noqa: E731
1991
+ CubeViewDef.HighLightKind.NONE
1992
+ if not isinstance(x, (float, int))
1993
+ else (
1994
+ CubeViewDef.HighLightKind.RED
1995
+ if x > 0.1
1996
+ else (
1997
+ CubeViewDef.HighLightKind.GREEN
1998
+ if x < 0.01
1999
+ else CubeViewDef.HighLightKind.NONE
2000
+ )
2001
+ )
2002
+ )
2003
+ f_bucket = lambda x: ( # noqa: E731
2004
+ CubeViewDef.HighLightKind.NONE
2005
+ if not isinstance(x, str)
2006
+ else (
2007
+ CubeViewDef.HighLightKind.RED
2008
+ if x in {"[-inf, 0.8)", "[0.8, 0.9)", "[0.9, 0.95)"}
2009
+ else (
2010
+ CubeViewDef.HighLightKind.NONE
2011
+ if x in {"[0.95, 0.98)", "[0.98, 1.02)", "[1.02, 1.05)"}
2012
+ else (
2013
+ CubeViewDef.HighLightKind.GREEN
2014
+ if "[" in x
2015
+ else CubeViewDef.HighLightKind.NONE
2016
+ )
2017
+ )
2018
+ )
2019
+ )
2020
+
2021
+ def mean_weight(gr):
2022
+ weight = gr["time_latency_eager"]
2023
+ x = gr["speedup"]
2024
+ if x.shape[0] == 0:
2025
+ return np.nan
2026
+ div = weight.sum()
2027
+ if div > 0:
2028
+ return (x * weight).sum() / div
2029
+ return np.nan
2030
+
2031
+ def mean_geo(gr):
2032
+ x = gr["speedup"]
2033
+ return np.exp(np.log(x.dropna()).mean())
2034
+
2035
+ order = ["model_attn_impl", "exporter", "opt_patterns", "DATE"]
2036
+ implemented_views = {
2037
+ "agg-suite": lambda: CubeViewDef(
2038
+ key_index=index_cols,
2039
+ values=self._filter_column(
2040
+ [
2041
+ "TIME_ITER",
2042
+ "speedup",
2043
+ "time_latency",
2044
+ "time_latency_eager",
2045
+ "time_export_success",
2046
+ "time_export_unbiased",
2047
+ "^n_.*",
2048
+ "target_opset",
2049
+ "onnx_filesize",
2050
+ "onnx_weight_size_torch",
2051
+ "onnx_weight_size_proto",
2052
+ "onnx_n_nodes",
2053
+ "onnx_n_nodes_no_cst",
2054
+ "op_onnx__Constant",
2055
+ "peak_gpu_torch",
2056
+ "peak_gpu_nvidia",
2057
+ ],
2058
+ self.values,
2059
+ ),
2060
+ ignore_unique=True,
2061
+ key_agg=["model_name", "task", "model_task"],
2062
+ agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
2063
+ agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
2064
+ keep_columns_in_index=["suite"],
2065
+ name="agg-suite",
2066
+ order=order,
2067
+ fix_aggregation_change=fix_aggregation_change,
2068
+ ),
2069
+ "agg-all": lambda: CubeViewDef(
2070
+ key_index=index_cols,
2071
+ values=self._filter_column(
2072
+ [
2073
+ "TIME_ITER",
2074
+ "speedup",
2075
+ "time_latency",
2076
+ "time_latency_eager",
2077
+ "time_export_success",
2078
+ "time_export_unbiased",
2079
+ "^n_.*",
2080
+ "target_opset",
2081
+ "onnx_filesize",
2082
+ "onnx_weight_size_torch",
2083
+ "onnx_weight_size_proto",
2084
+ "onnx_n_nodes",
2085
+ "onnx_n_nodes_no_cst",
2086
+ "peak_gpu_torch",
2087
+ "peak_gpu_nvidia",
2088
+ ],
2089
+ self.values,
2090
+ ),
2091
+ ignore_unique=True,
2092
+ key_agg=["model_name", "task", "model_task", "suite"],
2093
+ agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
2094
+ agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
2095
+ name="agg-all",
2096
+ order=order,
2097
+ plots=True,
2098
+ fix_aggregation_change=fix_aggregation_change,
2099
+ ),
2100
+ "disc": lambda: CubeViewDef(
2101
+ key_index=index_cols,
2102
+ values=self._filter_column(["discrepancies_abs"], self.values),
2103
+ ignore_unique=True,
2104
+ keep_columns_in_index=["suite"],
2105
+ f_highlight=f_disc,
2106
+ name="disc",
2107
+ order=order,
2108
+ fix_aggregation_change=fix_aggregation_change,
2109
+ ),
2110
+ "speedup": lambda: CubeViewDef(
2111
+ key_index=index_cols,
2112
+ values=self._filter_column(["speedup"], self.values),
2113
+ ignore_unique=True,
2114
+ keep_columns_in_index=["suite"],
2115
+ f_highlight=f_speedup,
2116
+ name="speedup",
2117
+ order=order,
2118
+ fix_aggregation_change=fix_aggregation_change,
2119
+ ),
2120
+ "counts": lambda: CubeViewDef(
2121
+ key_index=index_cols,
2122
+ values=self._filter_column(["^n_.*"], self.values),
2123
+ ignore_unique=True,
2124
+ keep_columns_in_index=["suite"],
2125
+ name="counts",
2126
+ order=order,
2127
+ ),
2128
+ "peak-gpu": lambda: CubeViewDef(
2129
+ key_index=index_cols,
2130
+ values=self._filter_column(["^peak_gpu_.*"], self.values),
2131
+ ignore_unique=True,
2132
+ keep_columns_in_index=["suite"],
2133
+ name="peak-gpu",
2134
+ order=order,
2135
+ fix_aggregation_change=fix_aggregation_change,
2136
+ ),
2137
+ "time": lambda: CubeViewDef(
2138
+ key_index=index_cols,
2139
+ values=self._filter_column(
2140
+ ["time_latency", "time_latency_eager"], self.values
2141
+ ),
2142
+ ignore_unique=True,
2143
+ keep_columns_in_index=["suite"],
2144
+ name="time",
2145
+ order=order,
2146
+ fix_aggregation_change=fix_aggregation_change,
2147
+ ),
2148
+ "time_export": lambda: CubeViewDef(
2149
+ key_index=index_cols,
2150
+ values=self._filter_column(["time_export_unbiased"], self.values),
2151
+ ignore_unique=True,
2152
+ keep_columns_in_index=["suite"],
2153
+ name="time_export",
2154
+ order=order,
2155
+ fix_aggregation_change=fix_aggregation_change,
2156
+ ),
2157
+ "err": lambda: CubeViewDef(
2158
+ key_index=index_cols,
2159
+ values=self._filter_column(
2160
+ ["ERR1", "ERR_timeout", "ERR_export", "ERR_crash"], self.values
2161
+ ),
2162
+ ignore_unique=True,
2163
+ keep_columns_in_index=["suite"],
2164
+ name="err",
2165
+ order=order,
2166
+ fix_aggregation_change=fix_aggregation_change,
2167
+ ),
2168
+ "bucket-speedup": lambda: CubeViewDef(
2169
+ key_index=index_cols,
2170
+ values=self._filter_column(["bucket[speedup]"], self.values),
2171
+ ignore_unique=True,
2172
+ keep_columns_in_index=["suite"],
2173
+ name="bucket-speedup",
2174
+ f_highlight=f_bucket,
2175
+ order=order,
2176
+ fix_aggregation_change=fix_aggregation_change,
2177
+ ),
2178
+ "onnx": lambda: CubeViewDef(
2179
+ key_index=index_cols,
2180
+ values=self._filter_column(
2181
+ [
2182
+ "onnx_filesize",
2183
+ "onnx_n_nodes",
2184
+ "onnx_n_nodes_no_cst",
2185
+ "onnx_weight_size_proto",
2186
+ "onnx_weight_size_torch",
2187
+ "op_onnx_initializer_small",
2188
+ ],
2189
+ self.values,
2190
+ ),
2191
+ ignore_unique=True,
2192
+ keep_columns_in_index=["suite"],
2193
+ name="onnx",
2194
+ order=order,
2195
+ fix_aggregation_change=fix_aggregation_change,
2196
+ ),
2197
+ "raw-short": lambda: CubeViewDef(
2198
+ key_index=self.keys_time,
2199
+ values=[c for c in self.values if c not in {"ERR_std", "ERR_stdout"}],
2200
+ ignore_unique=False,
2201
+ keep_columns_in_index=["suite"],
2202
+ name="raw-short",
2203
+ no_index=True,
2204
+ fix_aggregation_change=fix_aggregation_change,
2205
+ ),
2206
+ }
2207
+
2208
+ cmd_col = self._filter_column(["CMD"], self.values, can_be_empty=True)
2209
+ if cmd_col:
2210
+ implemented_views["cmd"] = lambda: CubeViewDef(
2211
+ key_index=index_cols,
2212
+ values=cmd_col,
2213
+ ignore_unique=True,
2214
+ keep_columns_in_index=["suite"],
2215
+ name="cmd",
2216
+ order=order,
2217
+ fix_aggregation_change=fix_aggregation_change,
2218
+ )
2219
+
2220
+ assert name in implemented_views or name in {"cmd"}, (
2221
+ f"Unknown view {name!r}, expected a name in {sorted(implemented_views)},"
2222
+ f"\n--\nkeys={pprint.pformat(sorted(self.keys_time))}, "
2223
+ f"\n--\nvalues={pprint.pformat(sorted(self.values))}"
2224
+ )
2225
+ if name not in implemented_views:
2226
+ return None
2227
+ return implemented_views[name]()
2228
+
2229
+ def post_load_process_piece(
2230
+ self, df: pandas.DataFrame, unique: bool = False
2231
+ ) -> pandas.DataFrame:
2232
+ df = super().post_load_process_piece(df, unique=unique)
2233
+ if unique:
2234
+ return df
2235
+ cols = self._filter_column(self._keys, df)
2236
+ res = None
2237
+ for c in cols:
2238
+ if df[c].isna().any():
2239
+ # Missing values for keys are not supposed to happen.
2240
+ uniq = set(df[c].dropna())
2241
+ if len(uniq) == 1:
2242
+ if res is None:
2243
+ res = df.copy()
2244
+ res[c] = res[c].fillna(uniq.pop())
2245
+ return df if res is None else res