onnx-diagnostic 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onnx_diagnostic/__init__.py +7 -0
- onnx_diagnostic/__main__.py +4 -0
- onnx_diagnostic/_command_lines_parser.py +1141 -0
- onnx_diagnostic/api.py +15 -0
- onnx_diagnostic/doc.py +100 -0
- onnx_diagnostic/export/__init__.py +2 -0
- onnx_diagnostic/export/api.py +124 -0
- onnx_diagnostic/export/dynamic_shapes.py +1083 -0
- onnx_diagnostic/export/shape_helper.py +296 -0
- onnx_diagnostic/export/validate.py +173 -0
- onnx_diagnostic/ext_test_case.py +1290 -0
- onnx_diagnostic/helpers/__init__.py +1 -0
- onnx_diagnostic/helpers/_log_helper.py +463 -0
- onnx_diagnostic/helpers/args_helper.py +132 -0
- onnx_diagnostic/helpers/bench_run.py +450 -0
- onnx_diagnostic/helpers/cache_helper.py +687 -0
- onnx_diagnostic/helpers/config_helper.py +170 -0
- onnx_diagnostic/helpers/doc_helper.py +163 -0
- onnx_diagnostic/helpers/fake_tensor_helper.py +273 -0
- onnx_diagnostic/helpers/graph_helper.py +386 -0
- onnx_diagnostic/helpers/helper.py +1707 -0
- onnx_diagnostic/helpers/log_helper.py +2245 -0
- onnx_diagnostic/helpers/memory_peak.py +249 -0
- onnx_diagnostic/helpers/mini_onnx_builder.py +600 -0
- onnx_diagnostic/helpers/model_builder_helper.py +469 -0
- onnx_diagnostic/helpers/onnx_helper.py +1200 -0
- onnx_diagnostic/helpers/ort_session.py +736 -0
- onnx_diagnostic/helpers/rt_helper.py +476 -0
- onnx_diagnostic/helpers/torch_helper.py +987 -0
- onnx_diagnostic/reference/__init__.py +4 -0
- onnx_diagnostic/reference/evaluator.py +254 -0
- onnx_diagnostic/reference/ops/__init__.py +1 -0
- onnx_diagnostic/reference/ops/op_add_add_mul_mul.py +68 -0
- onnx_diagnostic/reference/ops/op_attention.py +60 -0
- onnx_diagnostic/reference/ops/op_average_pool_grad.py +63 -0
- onnx_diagnostic/reference/ops/op_bias_softmax.py +16 -0
- onnx_diagnostic/reference/ops/op_cast_like.py +46 -0
- onnx_diagnostic/reference/ops/op_complex.py +26 -0
- onnx_diagnostic/reference/ops/op_concat.py +15 -0
- onnx_diagnostic/reference/ops/op_constant_of_shape.py +67 -0
- onnx_diagnostic/reference/ops/op_fused_matmul.py +31 -0
- onnx_diagnostic/reference/ops/op_gather.py +29 -0
- onnx_diagnostic/reference/ops/op_gather_elements.py +45 -0
- onnx_diagnostic/reference/ops/op_gather_grad.py +12 -0
- onnx_diagnostic/reference/ops/op_memcpy_host.py +11 -0
- onnx_diagnostic/reference/ops/op_mul_sigmoid.py +23 -0
- onnx_diagnostic/reference/ops/op_negxplus1.py +8 -0
- onnx_diagnostic/reference/ops/op_qlinear_average_pool.py +40 -0
- onnx_diagnostic/reference/ops/op_qlinear_conv.py +102 -0
- onnx_diagnostic/reference/ops/op_quick_gelu.py +23 -0
- onnx_diagnostic/reference/ops/op_replace_zero.py +13 -0
- onnx_diagnostic/reference/ops/op_rotary.py +19 -0
- onnx_diagnostic/reference/ops/op_scan.py +65 -0
- onnx_diagnostic/reference/ops/op_scatter_elements.py +107 -0
- onnx_diagnostic/reference/ops/op_scatternd_of_shape.py +22 -0
- onnx_diagnostic/reference/ops/op_simplified_layer_normalization.py +8 -0
- onnx_diagnostic/reference/ops/op_skip_layer_normalization.py +13 -0
- onnx_diagnostic/reference/ops/op_slice.py +20 -0
- onnx_diagnostic/reference/ops/op_transpose_cast.py +16 -0
- onnx_diagnostic/reference/ops/op_tri_matrix.py +17 -0
- onnx_diagnostic/reference/ort_evaluator.py +652 -0
- onnx_diagnostic/reference/quantized_tensor.py +46 -0
- onnx_diagnostic/reference/report_results_comparison.py +95 -0
- onnx_diagnostic/reference/torch_evaluator.py +669 -0
- onnx_diagnostic/reference/torch_ops/__init__.py +56 -0
- onnx_diagnostic/reference/torch_ops/_op_run.py +335 -0
- onnx_diagnostic/reference/torch_ops/access_ops.py +94 -0
- onnx_diagnostic/reference/torch_ops/binary_ops.py +108 -0
- onnx_diagnostic/reference/torch_ops/controlflow_ops.py +121 -0
- onnx_diagnostic/reference/torch_ops/generator_ops.py +36 -0
- onnx_diagnostic/reference/torch_ops/nn_ops.py +196 -0
- onnx_diagnostic/reference/torch_ops/other_ops.py +106 -0
- onnx_diagnostic/reference/torch_ops/reduce_ops.py +130 -0
- onnx_diagnostic/reference/torch_ops/sequence_ops.py +65 -0
- onnx_diagnostic/reference/torch_ops/shape_ops.py +121 -0
- onnx_diagnostic/reference/torch_ops/unary_ops.py +93 -0
- onnx_diagnostic/tasks/__init__.py +90 -0
- onnx_diagnostic/tasks/automatic_speech_recognition.py +188 -0
- onnx_diagnostic/tasks/data/__init__.py +13 -0
- onnx_diagnostic/tasks/data/dummies_imagetext2text_generation_gemma3.onnx +0 -0
- onnx_diagnostic/tasks/feature_extraction.py +162 -0
- onnx_diagnostic/tasks/fill_mask.py +89 -0
- onnx_diagnostic/tasks/image_classification.py +144 -0
- onnx_diagnostic/tasks/image_text_to_text.py +581 -0
- onnx_diagnostic/tasks/image_to_video.py +127 -0
- onnx_diagnostic/tasks/mask_generation.py +143 -0
- onnx_diagnostic/tasks/mixture_of_expert.py +79 -0
- onnx_diagnostic/tasks/object_detection.py +134 -0
- onnx_diagnostic/tasks/sentence_similarity.py +89 -0
- onnx_diagnostic/tasks/summarization.py +227 -0
- onnx_diagnostic/tasks/text2text_generation.py +230 -0
- onnx_diagnostic/tasks/text_classification.py +89 -0
- onnx_diagnostic/tasks/text_generation.py +352 -0
- onnx_diagnostic/tasks/text_to_image.py +95 -0
- onnx_diagnostic/tasks/zero_shot_image_classification.py +128 -0
- onnx_diagnostic/torch_export_patches/__init__.py +21 -0
- onnx_diagnostic/torch_export_patches/eval/__init__.py +725 -0
- onnx_diagnostic/torch_export_patches/eval/model_cases.py +898 -0
- onnx_diagnostic/torch_export_patches/onnx_export_errors.py +1098 -0
- onnx_diagnostic/torch_export_patches/onnx_export_serialization.py +311 -0
- onnx_diagnostic/torch_export_patches/patch_details.py +340 -0
- onnx_diagnostic/torch_export_patches/patch_expressions.py +108 -0
- onnx_diagnostic/torch_export_patches/patch_inputs.py +211 -0
- onnx_diagnostic/torch_export_patches/patch_module.py +1047 -0
- onnx_diagnostic/torch_export_patches/patch_module_helper.py +184 -0
- onnx_diagnostic/torch_export_patches/patches/__init__.py +0 -0
- onnx_diagnostic/torch_export_patches/patches/patch_torch.py +1090 -0
- onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +2139 -0
- onnx_diagnostic/torch_export_patches/serialization/__init__.py +46 -0
- onnx_diagnostic/torch_export_patches/serialization/diffusers_impl.py +34 -0
- onnx_diagnostic/torch_export_patches/serialization/transformers_impl.py +313 -0
- onnx_diagnostic/torch_models/__init__.py +0 -0
- onnx_diagnostic/torch_models/code_sample.py +343 -0
- onnx_diagnostic/torch_models/hghub/__init__.py +1 -0
- onnx_diagnostic/torch_models/hghub/hub_api.py +422 -0
- onnx_diagnostic/torch_models/hghub/hub_data.py +234 -0
- onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py +4905 -0
- onnx_diagnostic/torch_models/hghub/model_inputs.py +388 -0
- onnx_diagnostic/torch_models/hghub/model_specific.py +76 -0
- onnx_diagnostic/torch_models/llms.py +2 -0
- onnx_diagnostic/torch_models/untrained/__init__.py +0 -0
- onnx_diagnostic/torch_models/untrained/llm_phi2.py +113 -0
- onnx_diagnostic/torch_models/untrained/llm_tiny_llm.py +76 -0
- onnx_diagnostic/torch_models/validate.py +2124 -0
- onnx_diagnostic/torch_onnx/__init__.py +0 -0
- onnx_diagnostic/torch_onnx/runtime_info.py +289 -0
- onnx_diagnostic/torch_onnx/sbs.py +440 -0
- onnx_diagnostic-0.8.0.dist-info/METADATA +213 -0
- onnx_diagnostic-0.8.0.dist-info/RECORD +132 -0
- onnx_diagnostic-0.8.0.dist-info/WHEEL +5 -0
- onnx_diagnostic-0.8.0.dist-info/licenses/LICENSE.txt +19 -0
- onnx_diagnostic-0.8.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2245 @@
import enum
import io
import os
import pprint
import re
import warnings
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
import numpy as np
import pandas
from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
from .helper import string_sig
from ._log_helper import (
    BUCKET_SCALES,
    breaking_last_point,
    apply_excel_style,
    align_dataframe_with,
    open_dataframe,
    enumerate_csv_files,
)

class CubeViewDef:
    """
    Defines how to compute a view.

    :param key_index: keys to put in the row index
    :param values: values to show
    :param ignore_unique: ignore keys with a unique value
    :param order: to reorder keys in the columns index
    :param key_agg: aggregate according to these columns before
        creating the view
    :param agg_args: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`,
        it can also be a callable returning a different aggregation
        method depending on the column name
    :param agg_kwargs: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`
    :param agg_multi: aggregation over multiple columns
    :param ignore_columns: ignore the following columns if known to overload the view
    :param keep_columns_in_index: keeps the columns even if there is only one unique value
    :param dropna: drops rows with nan if not relevant
    :param transpose: transpose
    :param f_highlight: to highlight some values
    :param name: name of the view, used mostly to debug
    :param plots: adds plots to the Excel sheet
    :param no_index: remove the index (but keep the columns)
    :param fix_aggregation_change: a column among the keys which changes aggregation value
        for different dates

    Some examples of views. The first example is an aggregated view
    for many metrics.

    .. code-block:: python

        cube = CubeLogs(...)

        CubeViewDef(
            key_index=cube._filter_column(fs, cube.keys_time),
            values=cube._filter_column(
                ["TIME_ITER", "speedup", "time_latency.*", "onnx_n_nodes"],
                cube.values,
            ),
            ignore_unique=True,
            key_agg=["model_name", "task", "model_task", "suite"],
            agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
            agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
            name="agg-all",
            plots=True,
        )

    The next one focuses on a couple of metrics.

    .. code-block:: python

        cube = CubeLogs(...)

        CubeViewDef(
            key_index=cube._filter_column(fs, cube.keys_time),
            values=cube._filter_column(["speedup"], cube.values),
            ignore_unique=True,
            keep_columns_in_index=["suite"],
            name="speedup",
        )
    """

    class HighLightKind(enum.IntEnum):
        "Codes to highlight values."

        NONE = 0
        RED = 1
        GREEN = 2

    def __init__(
        self,
        key_index: Sequence[str],
        values: Sequence[str],
        ignore_unique: bool = True,
        order: Optional[Sequence[str]] = None,
        key_agg: Optional[Sequence[str]] = None,
        agg_args: Union[Sequence[Any], Callable[[str], Any]] = ("sum",),
        agg_kwargs: Optional[Dict[str, Any]] = None,
        agg_multi: Optional[
            Dict[str, Callable[[pandas.core.groupby.DataFrameGroupBy], pandas.Series]]
        ] = None,
        ignore_columns: Optional[Sequence[str]] = None,
        keep_columns_in_index: Optional[Sequence[str]] = None,
        dropna: bool = True,
        transpose: bool = False,
        f_highlight: Optional[Callable[[Any], "CubeViewDef.HighLightKind"]] = None,
        name: Optional[str] = None,
        no_index: bool = False,
        plots: bool = False,
        fix_aggregation_change: Optional[List["str"]] = None,
    ):
        self.key_index = key_index
        self.values = values
        self.ignore_unique = ignore_unique
        self.order = order
        self.key_agg = key_agg
        self.agg_args = agg_args
        self.agg_kwargs = agg_kwargs
        self.agg_multi = agg_multi
        self.dropna = dropna
        self.ignore_columns = ignore_columns
        self.keep_columns_in_index = keep_columns_in_index
        self.f_highlight = f_highlight
        self.transpose = transpose
        self.name = name
        self.no_index = no_index
        self.plots = plots
        self.fix_aggregation_change = fix_aggregation_change

    def __repr__(self) -> str:
        "usual"
        return string_sig(self)  # type: ignore[arg-type]
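Since the diff only shows the signature of ``f_highlight``, here is a minimal illustrative sketch (editorial, not part of the package) of a highlighting rule for a speedup metric; the 0.98/1.02 thresholds and the column names are invented for the example.

from onnx_diagnostic.helpers.log_helper import CubeViewDef

def highlight_speedup(value) -> "CubeViewDef.HighLightKind":
    # Regressions turn red, gains turn green, everything else keeps the default style.
    if isinstance(value, (int, float)):
        if value < 0.98:
            return CubeViewDef.HighLightKind.RED
        if value > 1.02:
            return CubeViewDef.HighLightKind.GREEN
    return CubeViewDef.HighLightKind.NONE

view = CubeViewDef(
    key_index=["model_name"],
    values=["speedup"],
    f_highlight=highlight_speedup,
    name="speedup",
)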

class CubePlot:
    """
    Creates a plot.

    :param df: dataframe
    :param kind: kind of graph to plot, bar, barh, line
    :param split: draw a graph per line in the dataframe
    :param timeseries: this assumes the time is one level of the columns,
        this argument indicates the level name

    It defines a graph. Usually *bar* or *barh* is used to
    compare experiments for every metric, a subplot by metric.

    .. code-block:: python

        CubePlot(df, kind="barh", orientation="row", split=True)

    *line* is usually used to plot timeseries showing the
    evolution of metrics over time.

    .. code-block:: python

        CubePlot(
            df,
            kind="line",
            orientation="row",
            split=True,
            timeseries="time",
        )
    """

    KINDS = {"bar", "barh", "line"}

    @classmethod
    def group_columns(
        cls, columns: List[str], sep: str = "/", depth: int = 2
    ) -> List[List[str]]:
        """Groups columns to have a nice display."""
        res: Dict[str, List[str]] = {}
        for c in columns:
            p = c.split("/")
            k = "/".join(p[:depth])
            if k not in res:
                res[k] = []
            res[k].append(c)
        new_res: Dict[str, List[str]] = {}
        for k, v in res.items():
            if len(v) >= 3:
                new_res[k] = v
            else:
                if "0" not in new_res:
                    new_res["0"] = []
                new_res["0"].extend(v)
        groups: List[List[str]] = [sorted(v) for k, v in sorted(new_res.items())]
        if depth <= 1:
            return groups
        new_groups: List[List[str]] = []
        for v in groups:
            if len(v) >= 6:
                new_groups.extend(cls.group_columns(v, depth=1, sep=sep))
            else:
                new_groups.append(v)
        return new_groups

    def __init__(
        self,
        df: pandas.DataFrame,
        kind: str = "bar",
        orientation="col",
        split: bool = True,
        timeseries: Optional[str] = None,
    ):
        assert (
            not timeseries or timeseries in df.columns.names
        ), f"Level {timeseries!r} is not part of the columns levels {df.columns.names}"
        assert (
            kind in self.__class__.KINDS
        ), f"Unexpected kind={kind!r} not in {self.__class__.KINDS}"
        assert split, f"split={split} not implemented"
        assert (
            not timeseries or orientation == "row"
        ), f"orientation={orientation!r} must be 'row' for timeseries"
        self.df = df.copy()
        self.kind = kind
        self.orientation = orientation
        self.split = split
        self.timeseries = timeseries

        if timeseries:
            if isinstance(self.df.columns, pandas.MultiIndex):
                index_time = list(self.df.columns.names).index(self.timeseries)

                def _drop(t, i=index_time):
                    return (*t[:i], *t[i + 1 :])

                self.df.columns = pandas.MultiIndex.from_tuples(
                    [("/".join(map(str, _drop(i))), i[index_time]) for i in self.df.columns],
                    names=["metric", timeseries],
                )
        else:
            if isinstance(self.df.columns, pandas.MultiIndex):
                self.df.columns = ["/".join(map(str, i)) for i in self.df.columns]
            if isinstance(self.df.index, pandas.MultiIndex):
                self.df.index = ["/".join(map(str, i)) for i in self.df.index]

    def __repr__(self) -> str:
        "usual"
        return string_sig(self)  # type: ignore[arg-type]

    def to_images(
        self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
    ) -> List[bytes]:
        """
        Converts data into plots and images.

        :param verbose: verbosity
        :param merge: returns all graphs in a single image (True)
            or an image for every graph (False)
        :param title_suffix: suffix for the title of every graph
        :return: list of binary images (PNG format)
        """
        if self.kind in ("barh", "bar"):
            return self._to_images_bar(verbose=verbose, merge=merge, title_suffix=title_suffix)
        if self.kind == "line":
            return self._to_images_line(
                verbose=verbose, merge=merge, title_suffix=title_suffix
            )
        raise AssertionError(f"self.kind={self.kind!r} not implemented")

    @classmethod
    def _make_loop(cls, ensemble, verbose):
        if verbose:
            from tqdm import tqdm

            loop = tqdm(ensemble)
        else:
            loop = ensemble
        return loop

    def _to_images_bar(
        self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
    ) -> List[bytes]:
        """
        Environment variable ``FIGSIZEH`` can be set to increase the
        graph height. Default is 1.0.
        """
        assert merge, f"merge={merge} not implemented yet"
        import matplotlib.pyplot as plt

        df = self.df.T if self.orientation == "row" else self.df
        title_suffix = f"\n{title_suffix}" if title_suffix else ""

        n_cols = 3
        nn = df.shape[1] // n_cols
        nn += int(df.shape[1] % n_cols != 0)
        ratio = float(os.environ.get("FIGSIZEH", "1"))
        figsize = (6 * n_cols, nn * (2.5 + df.shape[0] / 15) * ratio)
        fig, axs = plt.subplots(nn, n_cols, figsize=figsize)
        pos = 0
        imgs = []
        for c in self._make_loop(df.columns, verbose):
            ax = axs[pos // n_cols, pos % n_cols]
            (
                df[c].plot.barh(title=f"{c}{title_suffix}", ax=ax)
                if self.kind == "barh"
                else df[c].plot.bar(title=f"{c}{title_suffix}", ax=ax)
            )
            ax.tick_params(axis="both", which="major", labelsize=8)
            ax.grid(True)
            pos += 1  # noqa: SIM113
        fig.tight_layout()
        imgdata = io.BytesIO()
        fig.savefig(imgdata, format="png")
        imgs.append(imgdata.getvalue())
        plt.close()
        return imgs

    def _to_images_line(
        self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
    ) -> List[bytes]:
        assert merge, f"merge={merge} not implemented yet"
        assert (
            self.orientation == "row"
        ), f"self.orientation={self.orientation!r} not implemented for this kind of graph."

        def rotate_align(ax, angle=15, align="right"):
            for label in ax.get_xticklabels():
                label.set_rotation(angle)
                label.set_horizontalalignment(align)
            ax.tick_params(axis="both", which="major", labelsize=8)
            ax.grid(True)
            ax.legend()
            ax.tick_params(labelleft=True)
            return ax

        import matplotlib.pyplot as plt

        df = self.df.T

        confs = list(df.unstack(self.timeseries).index)
        groups = self.group_columns(confs)
        n_cols = len(groups)

        title_suffix = f"\n{title_suffix}" if title_suffix else ""
        ratio = float(os.environ.get("FIGSIZEH", "1"))
        figsize = (5 * n_cols, max(len(g) for g in groups) * (2 + df.shape[1] / 2) * ratio)
        fig, axs = plt.subplots(
            df.shape[1],
            n_cols,
            figsize=figsize,
            sharex=True,
            sharey="row" if n_cols > 1 else False,
        )
        imgs = []
        row = 0
        for c in self._make_loop(df.columns, verbose):
            dfc = df[[c]]
            dfc = dfc.unstack(self.timeseries).T.droplevel(0)
            if n_cols == 1:
                dfc.plot(title=f"{c}{title_suffix}", ax=axs[row], linewidth=3)
                axs[row].grid(True)
                rotate_align(axs[row])
            else:
                x = list(range(dfc.shape[0]))
                ticks = list(dfc.index)
                for ii, group in enumerate(groups):
                    ddd = dfc.loc[:, group].copy()
                    axs[row, ii].set_xticks(x)
                    axs[row, ii].set_xticklabels(ticks)
                    # This is very slow
                    # ddd.plot(ax=axs[row, ii], linewidth=3)
                    for jj in range(ddd.shape[1]):
                        axs[row, ii].plot(x, ddd.iloc[:, jj], lw=3, label=ddd.columns[jj])
                    axs[row, ii].set_title(f"{c}{title_suffix}")
                    rotate_align(axs[row, ii])
            row += 1  # noqa: SIM113
        fig.tight_layout()
        imgdata = io.BytesIO()
        fig.savefig(imgdata, format="png")
        imgs.append(imgdata.getvalue())
        plt.close()
        return imgs
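As a quick orientation for readers of the diff, here is a minimal, hypothetical usage sketch for ``CubePlot`` (editorial, not part of the package); the metric and experiment names and the numbers are invented, and matplotlib is assumed to be installed. It relies only on the constructor and ``to_images`` shown above.

import pandas
from onnx_diagnostic.helpers.log_helper import CubePlot

# One row per metric, one column per experiment (invented values).
df = pandas.DataFrame(
    {"baseline": [1.0, 2.0, 3.0, 10.0], "candidate": [0.9, 2.1, 2.5, 9.0]},
    index=["time_export", "time_latency", "speedup", "onnx_n_nodes"],
)
plot = CubePlot(df, kind="barh", orientation="row", split=True)
images = plot.to_images(merge=True)  # a single PNG (bytes) gathering one subplot per metric
with open("metrics.png", "wb") as f:
    f.write(images[0])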

class CubeLogs:
    """
    Processes logs coming from experiments.
    A cube is basically a database with certain columns
    playing specific roles.

    * time: only one column, it is not mandatory but it is recommended
      to have one
    * keys: they are somehow coordinates, they cannot be aggregated,
      they are not numbers, more like categories, ``(time, *keys)``
      identifies an element of the database in a unique way,
      there cannot be more than one row sharing the same key and time
      values
    * values: they are not necessarily numerical, but if they are,
      they can be aggregated

    Every other column is ignored. More columns can be added
    by using formulas.

    :param data: the raw data
    :param time: the time column
    :param keys: the keys, can include regular expressions
    :param values: the values, can include regular expressions
    :param ignored: ignores some columns, acts as negative regular
        expressions for the other two
    :param recent: if more than one row shares the same keys,
        the cube only keeps the most recent one
    :param formulas: columns to add, defined with formulas
    :param fill_missing: a dictionary, defines values replacing missing ones
        for some columns
    :param keep_last_date: overwrites all the times with the most recent
        one, it makes things easier for timeseries
    """

    def __init__(
        self,
        data: Any,
        time: str = "date",
        keys: Sequence[str] = ("version_.*", "model_.*"),
        values: Sequence[str] = ("time_.*", "disc_.*"),
        ignored: Sequence[str] = (),
        recent: bool = False,
        formulas: Optional[
            Union[
                Sequence[str],
                Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
            ]
        ] = None,
        fill_missing: Optional[Sequence[Tuple[str, Any]]] = None,
        keep_last_date: bool = False,
    ):
        self._data = data
        self._time = time
        self._keys = keys
        self._values = values
        self._ignored = ignored
        self.recent = recent
        self._formulas = formulas
        self.fill_missing = fill_missing
        self.keep_last_date = keep_last_date

    def clone(
        self, data: Optional[pandas.DataFrame] = None, keys: Optional[Sequence[str]] = None
    ) -> "CubeLogs":
        """
        Makes a copy of the dataframe.
        It copies the processed data, not the original one.
        """
        cube = self.__class__(
            data if data is not None else self.data.copy(),
            time=self.time,
            keys=keys or self.keys_no_time,
            values=self.values,
        )
        cube.load()
        return cube

    def post_load_process_piece(
        self, df: pandas.DataFrame, unique: bool = False
    ) -> pandas.DataFrame:
        """
        Postprocesses a piece when a cube is made of multiple pieces,
        before it gets merged.
        """
        if not self.fill_missing:
            return df
        missing = dict(self.fill_missing)
        for k, v in missing.items():
            if k not in df.columns:
                df[k] = v
        return df

    def load(self, verbose: int = 0):
        """Loads and preprocesses the data. Returns self."""
        if isinstance(self._data, pandas.DataFrame):
            if verbose:
                print(f"[CubeLogs.load] load from dataframe, shape={self._data.shape}")
            self.data = self.post_load_process_piece(self._data, unique=True)
            if verbose:
                print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
        elif isinstance(self._data, list) and all(isinstance(r, dict) for r in self._data):
            if verbose:
                print(f"[CubeLogs.load] load from list of dicts, n={len(self._data)}")
            self.data = pandas.DataFrame(self.post_load_process_piece(self._data, unique=True))
            if verbose:
                print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
        elif isinstance(self._data, list) and all(
            isinstance(r, pandas.DataFrame) for r in self._data
        ):
            if verbose:
                print(f"[CubeLogs.load] load from list of DataFrame, n={len(self._data)}")
            self.data = pandas.concat(
                [self.post_load_process_piece(c) for c in self._data], axis=0
            )
            if verbose:
                print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
        elif isinstance(self._data, list):
            if verbose:
                print("[CubeLogs.load] load from list of Cubes")
            cubes = []
            for item in enumerate_csv_files(self._data, verbose=verbose):
                df = open_dataframe(item)
                cube = CubeLogs(
                    df,
                    time=self._time,
                    keys=self._keys,
                    values=self._values,
                    ignored=self._ignored,
                    recent=self.recent,
                )
                cube.load()
                cubes.append(self.post_load_process_piece(cube.data))
            self.data = pandas.concat(cubes, axis=0)
            if verbose:
                print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
        else:
            raise NotImplementedError(
                f"Not implemented with the provided data (type={type(self._data)})"
            )

        assert all(isinstance(c, str) for c in self.data.columns), (
            f"The class only supports string as column names "
            f"but found {[c for c in self.data.columns if not isinstance(c, str)]}"
        )
        if verbose:
            print(f"[CubeLogs.load] loaded with shape={self.data.shape}")

        self._initialize_columns()
        if verbose:
            print(f"[CubeLogs.load] time={self.time}")
            print(f"[CubeLogs.load] keys={self.keys_no_time}")
            print(f"[CubeLogs.load] values={self.values}")
            print(f"[CubeLogs.load] ignored={self.ignored}")
            print(f"[CubeLogs.load] ignored_values={self.ignored_values}")
            print(f"[CubeLogs.load] ignored_keys={self.ignored_keys}")
        assert self.keys_no_time, f"No keys found with {self._keys} from {self.data.columns}"
        assert self.values, f"No values found with {self._values} from {self.data.columns}"
        assert not (
            set(self.keys_no_time) & set(self.values)
        ), f"Columns {set(self.keys_no_time) & set(self.values)} cannot be keys and values"
        assert not (
            set(self.keys_no_time) & set(self.ignored)
        ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be keys and ignored"
        assert not (
            set(self.values) & set(self.ignored)
        ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be values and ignored"
        assert (
            self.time not in self.keys_no_time
            and self.time not in self.values
            and self.time not in self.ignored
        ), (
            f"Column {self.time!r} is also a key, a value or ignored, "
            f"keys={sorted(self.keys_no_time)}, values={sorted(self.values)}, "
            f"ignored={sorted(self.ignored)}"
        )
        self._columns = [self.time, *self.keys_no_time, *self.values, *self.ignored]
        self.dropped = [c for c in self.data.columns if c not in set(self.columns)]
        self.data = self.data[self.columns]
        if verbose:
            print(f"[CubeLogs.load] dropped={self.dropped}")
            print(f"[CubeLogs.load] data.shape={self.data.shape}")

        if verbose:
            print(f"[CubeLogs.load] removed columns, shape={self.data.shape}")
        self._preprocess()
        if verbose:
            print(f"[CubeLogs.load] preprocess, shape={self.data.shape}")
            if self.recent:
                print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")

        # Let's apply the formulas
        if self._formulas:
            forms = (
                {k: k for k in self._formulas}
                if not isinstance(self._formulas, dict)
                else self._formulas
            )
            cols = set(self.values)
            for k, ff in forms.items():
                f = self._process_formula(ff)
                if k in cols or f is None:
                    if verbose:
                        print(f"[CubeLogs.load] skip formula {k!r}")
                else:
                    if verbose:
                        print(f"[CubeLogs.load] apply formula {k!r}")
                    self.data[k] = f(self.data)
                    self.values.append(k)
                    cols.add(k)
        self.values_for_key = {k: set(self.data[k].dropna()) for k in self.keys_time}
        for k in self.keys_no_time:
            if self.data[k].isna().max():
                self.values_for_key[k].add(np.nan)
        self.keys_with_nans = [
            c for c in self.keys_time if self.data[c].isna().astype(int).sum() > 0
        ]
        if verbose:
            print(f"[CubeLogs.load] convert column {self.time!r} into date")
            if self.keys_with_nans:
                print(f"[CubeLogs.load] keys_with_nans={self.keys_with_nans}")
        self.data[self.time] = pandas.to_datetime(self.data[self.time])

        if self.keep_last_date:
            times = self.data[self.time].dropna()
            mi, mx = times.min(), times.max()
            if mi != mx:
                print(f"[CubeLogs.load] setting all dates in column {self.time} to {mx!r}")
                self.data.loc[~self.data[self.time].isna(), self.time] = mx
                self.values_for_key[self.time] = {mx}
                if self.data[self.time].isna().max():
                    self.values_for_key[self.time].add(np.nan)
        if verbose:
            print(f"[CubeLogs.load] done, shape={self.shape}")
        return self

    def _process_formula(
        self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
    ) -> Callable[[pandas.DataFrame], pandas.Series]:
        assert callable(formula), f"formula={formula!r} is not supported."
        return formula

    @property
    def shape(self) -> Tuple[int, int]:
        "Returns the shape."
        assert hasattr(self, "data"), "Method load was not called"
        return self.data.shape

    @property
    def columns(self) -> Sequence[str]:
        "Returns the columns."
        assert hasattr(self, "data"), "Method load was not called"
        return self.data.columns

    def _preprocess(self):
        last = self.values[0]
        gr = self.data[[*self.keys_time, last]].groupby(self.keys_time, dropna=False).count()
        gr = gr[gr[last] > 1]
        if self.recent:
            cp = self.data.copy()
            assert (
                "__index__" not in cp.columns
            ), f"'__index__' should not be a column in {cp.columns}"
            cp["__index__"] = np.arange(cp.shape[0])
            gr = (
                cp[[*self.keys_time, "__index__"]]
                .groupby(self.keys_no_time, as_index=False, dropna=False)
                .max()
            )
            assert gr.shape[0] > 0, (
                f"Something went wrong after the groupby.\n"
                f"{cp[[*self.keys, self.time, '__index__']].head().T}"
            )
            filtered = pandas.merge(cp, gr, on=["__index__", *self.keys_time])
            assert filtered.shape[0] <= self.data.shape[0], (
                f"Keeping the latest row brings more row {filtered.shape} "
                f"(initial is {self.data.shape})."
            )
            self.data = filtered.drop("__index__", axis=1)
        else:
            assert gr.shape[0] == 0, f"There are duplicated rows:\n{gr}"

    @classmethod
    def _filter_column(cls, filters, columns, can_be_empty=False):
        assert list(columns), "columns is empty"
        set_cols = set()
        for f in filters:
            if set(f) & {'"', "^", ".", "*", "+", "{", "}"}:
                reg = re.compile(f)
                cols = [c for c in columns if reg.search(c)]
            elif f in columns:
                # No regular expression.
                cols = [f]
            else:
                continue
            set_cols |= set(cols)
        assert (
            can_be_empty or set_cols
        ), f"Filters {filters} returns an empty set from {columns}"
        return sorted(set_cols)

    def _initialize_columns(self):
        keys = self._filter_column(self._keys, self.data.columns)
        self.values = self._filter_column(self._values, self.data.columns)
        self.ignored = self._filter_column(self._ignored, self.data.columns, True)
        assert (
            self._time in self.data.columns
        ), f"Column {self._time} not found in {pprint.pformat(sorted(self.data.columns))}"
        ignored_keys = set(self.ignored) & set(keys)
        ignored_values = set(self.ignored) & set(self.values)
        self.keys_no_time = [c for c in keys if c not in ignored_keys]
        self.values = [c for c in self.values if c not in ignored_values]
        self.ignored_keys = sorted(ignored_keys)
        self.ignored_values = sorted(ignored_values)
        self.time = self._time
        self.keys_time = [self.time, *[c for c in keys if c not in ignored_keys]]

    def __str__(self) -> str:
        "usual"
        return str(self.data) if hasattr(self, "data") else str(self._data)

    def make_view_def(self, name: str) -> Optional[CubeViewDef]:
        """
        Returns a view definition.

        :param name: name of a value
        :return: a CubeViewDef or None if name does not make sense
        """
        assert name in self.values, f"{name!r} is not one of the values {self.values}"
        keys = sorted(self.keys_no_time)
        index = len(keys) // 2 + (len(keys) % 2)
        return CubeViewDef(key_index=keys[:index], values=[name], name=name)

    def view(
        self,
        view_def: Union[str, CubeViewDef],
        return_view_def: bool = False,
        verbose: int = 0,
    ) -> Union[pandas.DataFrame, Tuple[pandas.DataFrame, CubeViewDef]]:
        """
        Returns a dataframe, a pivot view.
        ``key_index`` determines the index, the other key columns determine
        the columns. If ``ignore_unique`` is True, every column with a unique value
        is removed.

        :param view_def: view definition
        :param return_view_def: returns the view definition as well
        :param verbose: verbosity level
        :return: dataframe
        """
        if isinstance(view_def, str):
            # We automatically create a view for a metric.
            view_def_ = self.make_view_def(view_def)
            assert view_def_ is not None, f"Unable to create a view from {view_def!r}"
            view_def = view_def_

        assert isinstance(
            view_def, CubeViewDef
        ), f"view_def should be a CubeViewDef, got {type(view_def)}: {view_def!r} instead"
        if verbose:
            print(f"[CubeLogs.view] -- start view {view_def.name!r}: {view_def}")
        key_agg = (
            self._filter_column(view_def.key_agg, self.keys_time) if view_def.key_agg else []
        )
        set_key_agg = set(key_agg)
        assert set_key_agg <= set(self.keys_time), (
            f"view_def.name={view_def.name!r}, "
            f"non existing keys in key_agg {set_key_agg - set(self.keys_time)}, "
            f"keys={sorted(self.keys_time)}"
        )

        values = self._filter_column(view_def.values, self.values)
        assert set(values) <= set(self.values), (
            f"view_def.name={view_def.name!r}, "
            f"non existing columns in values {set(values) - set(self.values)}, "
            f"values={sorted(self.values)}"
        )

        if view_def.fix_aggregation_change and (
            set(view_def.fix_aggregation_change) & set(self.keys_no_time)
        ):
            # before aggregation, let's fix some keys whose values changed over time
            data_to_process = self._fix_aggregation_change(
                self.data,
                list(set(view_def.fix_aggregation_change) & set(self.keys_no_time)),
            )
        else:
            data_to_process = self.data

        # aggregation
        if key_agg:
            final_stack = True
            key_index = [
                c
                for c in self._filter_column(view_def.key_index, self.keys_time)
                if c not in set_key_agg
            ]
            keys_no_agg = [c for c in self.keys_time if c not in set_key_agg]
            if verbose:
                print(f"[CubeLogs.view] aggregation of {set_key_agg}")
                print(f"[CubeLogs.view] groupby {keys_no_agg}")

            data_red = data_to_process[[*keys_no_agg, *values]]
            assert set(key_index) <= set(data_red.columns), (
                f"view_def.name={view_def.name!r}, "
                f"unable to find {set(key_index) - set(data_red.columns)}, "
                f"key_agg={key_agg}, keys_no_agg={keys_no_agg},\n--\n"
                f"selected={pprint.pformat(sorted(data_red.columns))},\n--\n"
                f"keys={pprint.pformat(sorted(self.keys_time))}"
            )
            grouped_data = data_red.groupby(keys_no_agg, as_index=True, dropna=False)
            if callable(view_def.agg_args):
                agg_kwargs = view_def.agg_kwargs or {}
                agg_args = ({c: view_def.agg_args(c) for c in values},)
            else:
                agg_args = view_def.agg_args  # type: ignore[assignment]
                agg_kwargs = view_def.agg_kwargs or {}
            data = grouped_data.agg(*agg_args, **agg_kwargs)
            if view_def.agg_multi:
                append = []
                for k, f in view_def.agg_multi.items():
                    cv = grouped_data.apply(f, include_groups=False)
                    append.append(cv.to_frame(k))
                data = pandas.concat([data, *append], axis=1)
            set_all_keys = set(keys_no_agg)
            values = list(data.columns)
            data = data.reset_index(drop=False)
        else:
            key_index = self._filter_column(view_def.key_index, self.keys_time)
            if verbose:
                print(f"[CubeLogs.view] no aggregation, index={key_index}")
            data = data_to_process[[*self.keys_time, *values]]
            set_all_keys = set(self.keys_time)
            final_stack = False

        assert set(key_index) <= set_all_keys, (
            f"view_def.name={view_def.name!r}, "
            f"Non existing keys in key_index {set(key_index) - set_all_keys}"
        )

        # remove unnecessary columns
        set_key_columns = {
            c for c in self.keys_time if c not in key_index and c not in set(key_agg)
        }
        key_index0 = key_index
        if view_def.ignore_unique:
            unique = {
                k for k, v in self.values_for_key.items() if k in set_all_keys and len(v) <= 1
            }
            keep_anyway = (
                set(view_def.keep_columns_in_index)
                if view_def.keep_columns_in_index
                else set()
            )
            key_index = [k for k in key_index if k not in unique or k in keep_anyway]
            key_columns = [k for k in set_key_columns if k not in unique or k in keep_anyway]
            if verbose:
                print(f"[CubeLogs.view] unique={unique}, keep_anyway={keep_anyway}")
                print(
                    f"[CubeLogs.view] columns with unique values "
                    f"{set(key_index0) - set(key_index)}"
                )
        else:
            if verbose:
                print("[CubeLogs.view] keep all columns")
            key_columns = sorted(set_key_columns)
            unique = set()

        # md = lambda s: {k: v for k, v in self.values_for_key.items() if k in s}  # noqa: E731
        all_cols = set(key_columns) | set(key_index) | set(key_agg) | unique
        assert all_cols == set(self.keys_time), (
            f"view_def.name={view_def.name!r}, "
            f"key_columns + key_index + key_agg + unique != keys, left="
            f"{set(self.keys_time) - all_cols}, "
            f"unique={unique}, index={set(key_index)}, columns={set(key_columns)}, "
            f"agg={set(key_agg)}, keys={set(self.keys_time)}, values={values}"
        )

        # reorder
        if view_def.order:
            subset = self._filter_column(view_def.order, all_cols | {self.time})
            corder = [o for o in view_def.order if o in subset]
            assert set(corder) <= set_key_columns, (
                f"view_def.name={view_def.name!r}, "
                f"non existing columns from order in key_columns "
                f"{set(corder) - set_key_columns}"
            )
            key_columns = [
                *[o for o in corder if o in key_columns],
                *[c for c in key_columns if c not in view_def.order],
            ]
        else:
            corder = None

        if view_def.dropna:
            data, key_index, key_columns, values = self._dropna(  # type: ignore[assignment]
                data,
                key_index,
                key_columns,
                values,
                keep_columns_in_index=view_def.keep_columns_in_index,
            )
        if view_def.ignore_columns:
            if verbose:
                print(f"[CubeLogs.view] ignore_columns {view_def.ignore_columns}")
            data = data.drop(view_def.ignore_columns, axis=1)
            seti = set(view_def.ignore_columns)
            if view_def.keep_columns_in_index:
                seti -= set(view_def.keep_columns_in_index)
            key_index = [c for c in key_index if c not in seti]
            key_columns = [c for c in key_columns if c not in seti]
            values = [c for c in values if c not in seti]

        # final verification
        if verbose:
            print(f"[CubeLogs.view] key_index={key_index}")
            print(f"[CubeLogs.view] key_columns={key_columns}")
        g = data[[*key_index, *key_columns]].copy()
        g["count"] = 1
        r = (
            g.copy()
            if not key_index and not key_columns
            else g.groupby([*key_index, *key_columns], dropna=False).sum()
        )
        not_unique = r[r["count"] > 1]
        assert not_unique.shape[0] == 0, (
            f"view_def.name={view_def.name!r}, "
            f"unable to run the pivot with index={sorted(key_index)}, "
            f"key={sorted(key_columns)}, key_agg={key_agg}, values={sorted(values)}, "
            f"columns={sorted(data.columns)}, ignored={view_def.ignore_columns}, "
            f"not unique={set(data.columns) - unique}"
            f"\n--\n{not_unique.head(10)}"
        )

        # pivot
        if verbose:
            print(f"[CubeLogs.view] values={values}")
        if key_index:
            piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
        else:
            # pivot does not return the same rank when the index is empty,
            # let's artificially add one.
            data = data.copy()
            data["ALL"] = "ALL"
            piv = data.pivot(index=["ALL"], columns=key_columns, values=values)
        if isinstance(piv, pandas.Series):
            piv = piv.to_frame(name="series")
        names = list(piv.columns.names)
        assert (
            "METRICS" not in names
        ), f"Not implemented when a level METRICS already exists {names!r}"
        names[0] = "METRICS"
        piv.columns = piv.columns.set_names(names)
        if final_stack:
            piv = piv.stack("METRICS", future_stack=True)
        if view_def.transpose:
            piv = piv.T
        if isinstance(piv, pandas.Series):
            piv = piv.to_frame("VALUE")
        piv.sort_index(inplace=True)

        if isinstance(piv.columns, pandas.MultiIndex):
            if corder:
                # reorder the levels for the columns with the view definition
                new_corder = [c for c in corder if c in piv.columns.names]
                new_names = [
                    *[c for c in piv.columns.names if c not in new_corder],
                    *new_corder,
                ]
                piv.columns = piv.columns.reorder_levels(new_names)
            elif self.time in piv.columns.names:
                # put time at the end
                new_names = list(piv.columns.names)
                ind = new_names.index(self.time)
                if ind < len(new_names) - 1:
                    del new_names[ind]
                    new_names.append(self.time)
                    piv.columns = piv.columns.reorder_levels(new_names)

        if view_def.no_index:
            piv = piv.reset_index(drop=False)
        else:
            piv.sort_index(inplace=True, axis=1)

        # final step, force columns with numerical values to be float
        for c in list(piv.columns):
            s = piv[c]
            if not pandas.api.types.is_object_dtype(s):
                continue
            try:
                sf = s.astype(float)
            except (ValueError, TypeError):
                continue
            piv[c] = sf

        if verbose:
            print(f"[CubeLogs.view] levels {piv.index.names}, {piv.columns.names}")
            print(f"[CubeLogs.view] -- done view {view_def.name!r}")
        return (piv, view_def) if return_view_def else piv

    def _fix_aggregation_change(
        self,
        data: pandas.DataFrame,
        columns_to_fix: Union[str, List[str]],
        overwrite_or_merge: bool = True,
    ) -> pandas.DataFrame:
        """
        Fixes columns used to aggregate values because their meaning changed over time.

        :param data: data to fix
        :param columns_to_fix: list of columns to fix
        :param overwrite_or_merge: if True, overwrites all values with the concatenation
            of all existing values, if merge, merges existing values found
            and grouped by the other keys
        :return: fixed data
        """
        if not isinstance(columns_to_fix, str):
            for c in columns_to_fix:
                data = self._fix_aggregation_change(data, c)
            return data
        # Let's process one column.
        keys = set(self.keys_time) - {columns_to_fix}
        select = data[self.keys_time]
        select_agg = select.groupby(list(keys)).count()
        assert select_agg[columns_to_fix].max() <= 1, (
            f"Column {columns_to_fix!r} has at least two distinct values for one date\n"
            f"{select_agg[select_agg[columns_to_fix] > 1]}"
        )

        # unique value (to fill NaN)
        unique = "-".join(sorted(set(data[columns_to_fix].dropna())))

        keys = set(self.keys_no_time) - {columns_to_fix}
        select = data[self.keys_no_time]
        select_agg = select.groupby(list(keys), as_index=True).apply(
            lambda x: "-".join(sorted(set(x[columns_to_fix].dropna()))), include_groups=False
        )
        select_agg = select_agg.to_frame(name=columns_to_fix)
        res = pandas.merge(
            data.drop([columns_to_fix], axis=1),
            select_agg,
            how="left",
            left_on=list(keys),
            right_index=True,
        )
        val = f"?{unique}?"
        res[columns_to_fix] = res[columns_to_fix].fillna(val).replace("", val)
        assert (
            data.shape == res.shape
            and sorted(data.columns) == sorted(res.columns)
            and sorted(data.index) == sorted(res.index)
        ), (
            f"Shape should match, data.shape={data.shape}, res.shape={res.shape}, "
            f"lost={set(data.columns) - set(res.columns)}, "
            f"added={set(res.columns) - set(data.columns)}"
        )
        res = res[data.columns]
        assert data.columns.equals(res.columns) and data.index.equals(res.index), (
            f"Columns or index mismatch "
            f"data.columns.equals(res.columns)={data.columns.equals(res.columns)}, "
            f"data.index.equals(res.columns)={data.index.equals(res.columns)}, "
        )
        return res

    def _dropna(
        self,
        data: pandas.DataFrame,
        key_index: Sequence[str],
        key_columns: Sequence[str],
        values: Sequence[str],
        keep_columns_in_index: Optional[Sequence[str]] = None,
    ) -> Tuple[pandas.DataFrame, Sequence[str], Sequence[str], Sequence[str]]:
        set_keep_columns_in_index = (
            set(keep_columns_in_index) if keep_columns_in_index else set()
        )
        v = data[values]
        new_data = data[~v.isnull().all(1)]
        if data.shape == new_data.shape:
            return data, key_index, key_columns, values
        new_data = new_data.copy()
        new_key_index = []
        for c in key_index:
            if c in set_keep_columns_in_index:
                new_key_index.append(c)
                continue
            v = new_data[c]
            sv = set(v.dropna())
            if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
                new_key_index.append(c)
        new_key_columns = []
        for c in key_columns:
            if c in set_keep_columns_in_index:
                new_key_columns.append(c)
                continue
            v = new_data[c]
            sv = set(v.dropna())
            if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
                new_key_columns.append(c)
        for c in set(key_index) | set(key_columns):
            s = new_data[c]
            if s.isna().max():
                if pandas.api.types.is_numeric_dtype(
                    s
                ) and not pandas.api.types.is_object_dtype(s):
                    min_v = s.dropna().min()
                    assert (
                        min_v >= 0
                    ), f"Unable to replace nan values in column {c!r}, min_v={min_v}"
                    new_data[c] = s.fillna(-1)
                else:
                    new_data[c] = s.fillna("NAN")
        return new_data, new_key_index, new_key_columns, values

    def describe(self) -> pandas.DataFrame:
        """Basic description of all variables."""
        rows = []
        for name in self.data.columns:
            values = self.data[name]
            dtype = values.dtype
            nonan = values.dropna()
            obs = dict(
                name=name,
                dtype=str(dtype),
                missing=len(values) - len(nonan),
                kind=(
                    "time"
                    if name == self.time
                    else (
                        "keys"
                        if name in self.keys_no_time
                        else (
                            "values"
                            if name in self.values
                            else ("ignored" if name in self.ignored else "unused")
                        )
                    )
                ),
            )
            if len(nonan) > 0:
                obs.update(dict(count=len(nonan)))
                if is_numeric_dtype(nonan) and not pandas.api.types.is_object_dtype(nonan):
                    obs.update(
                        dict(
                            min=nonan.min(),
                            max=nonan.max(),
                            mean=nonan.mean(),
                            sum=nonan.sum(),
                            n_values=len(set(nonan)),
                        )
                    )
                elif obs["kind"] == "time":
                    unique = set(nonan)
                    obs["n_values"] = len(unique)
                    o = dict(
                        min=str(nonan.min()),
                        max=str(nonan.max()),
                        n_values=len(set(nonan)),
                    )
                    o["values"] = f"{o['min']} - {o['max']}"
                    obs.update(o)
                else:
                    unique = set(nonan)
                    obs["n_values"] = len(unique)
                    if len(unique) < 20:
                        obs["values"] = ",".join(map(str, sorted(unique)))
            rows.append(obs)
        return pandas.DataFrame(rows).set_index("name")

    def to_excel(
        self,
        output: str,
        views: Union[Sequence[str], Dict[str, Union[str, CubeViewDef]]],
        main: Optional[str] = "main",
        raw: Optional[str] = "raw",
        verbose: int = 0,
        csv: Optional[Sequence[str]] = None,
        time_mask: bool = False,
        sbs: Optional[Dict[str, Dict[str, Any]]] = None,
    ):
        """
        Creates an Excel file with a list of views.

        :param output: output file to create
        :param views: sequence or dictionary of views to append
        :param main: add a page with statistics on all variables
        :param raw: add a page with the raw data
        :param csv: views to dump as csv files (same name as output + view name)
        :param verbose: verbosity
        :param time_mask: color the background of the cells if one
            of the values for the last date is unexpected,
            assuming they should remain stable
        :param sbs: configurations to compare side-by-side, this adds two tabs,
            one gathering raw data about the two configurations, the other one
            is aggregated by metrics
        """
        if verbose:
            print(f"[CubeLogs.to_excel] create Excel file {output}, shape={self.shape}")
        time_mask &= len(self.data[self.time].unique()) > 2
        cube_time = self.cube_time(fill_other_dates=True) if time_mask else None
        views = {k: k for k in views} if not isinstance(views, dict) else views
        f_highlights = {}
        plots = []
        with pandas.ExcelWriter(output, engine="openpyxl") as writer:
            if main:
                assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
                df = self.describe().sort_values("name")
                if verbose:
                    print(f"[CubeLogs.to_excel] add sheet {main!r} with shape {df.shape}")
                df.to_excel(writer, sheet_name=main, freeze_panes=(1, 1))

            time_mask_view: Dict[str, pandas.DataFrame] = {}
            for name, view in views.items():
                if view is None:
                    continue
                df, tview = self.view(view, return_view_def=True, verbose=max(verbose - 1, 0))
                if cube_time is not None:
                    cube_mask = cube_time.view(view)
                    aligned = align_dataframe_with(cube_mask, df)
                    if aligned is not None:
                        assert aligned.shape == df.shape, (
                            f"Shape mismatch between the view {df.shape} and the mask "
                            f"{time_mask_view[name].shape}"
                        )
                        time_mask_view[name] = aligned
                        if verbose:
                            print(
                                f"[CubeLogs.to_excel] compute mask for view {name!r} "
                                f"with shape {aligned.shape}"
                            )
                if tview is None:
                    continue
                memory = df.memory_usage(deep=True).sum()
                if verbose:
                    print(
                        f"[CubeLogs.to_excel] add sheet {name!r} with shape "
                        f"{df.shape} ({memory} bytes), index={df.index.names}, "
                        f"columns={df.columns.names}"
                    )
                if self.time in df.columns.names:
                    # Let's convert the time into str
                    fr = df.columns.to_frame()
                    if is_datetime64_any_dtype(fr[self.time]):
                        dt = fr[self.time]
                        has_time = (dt != dt.dt.normalize()).any()
                        sdt = dt.apply(
                            lambda t, has_time=has_time: t.strftime(
                                "%Y-%m-%dT%H-%M-%S" if has_time else "%Y-%m-%d"
                            )
                        )
                        fr[self.time] = sdt
                        df.columns = pandas.MultiIndex.from_frame(fr)
                if csv and name in csv:
                    name_csv = f"{output}.{name}.csv"
                    if verbose:
                        print(f"[CubeLogs.to_excel] saving sheet {name!r} in {name_csv!r}")
                    df.reset_index(drop=False).to_csv(f"{output}.{name}.csv", index=False)

                if memory > 2**22:
                    msg = (
                        f"[CubeLogs.to_excel] skipping {name!r}, "
                        f"too big for excel with {memory} bytes"
                    )
                    if verbose:
                        print(msg)
                    else:
                        warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
                else:
                    df.to_excel(
                        writer,
                        sheet_name=name,
                        freeze_panes=(df.columns.nlevels + 1, df.index.nlevels),
                    )
                f_highlights[name] = tview.f_highlight
                if tview.plots:
|
|
1253
|
+
plots.append(
|
|
1254
|
+
CubePlot(
|
|
1255
|
+
df,
|
|
1256
|
+
kind="line",
|
|
1257
|
+
orientation="row",
|
|
1258
|
+
split=True,
|
|
1259
|
+
timeseries=self.time,
|
|
1260
|
+
)
|
|
1261
|
+
if self.time in df.columns.names
|
|
1262
|
+
else CubePlot(df, kind="barh", orientation="row", split=True)
|
|
1263
|
+
)
|
|
1264
|
+
if raw:
|
|
1265
|
+
assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
|
|
1266
|
+
# Too long.
|
|
1267
|
+
# self._apply_excel_style(raw, writer, self.data)
|
|
1268
|
+
if csv and "raw" in csv:
|
|
1269
|
+
df.reset_index(drop=False).to_csv(f"{output}.raw.csv", index=False)
|
|
1270
|
+
memory = df.memory_usage(deep=True).sum()
|
|
1271
|
+
if memory > 2**22:
|
|
1272
|
+
msg = (
|
|
1273
|
+
f"[CubeLogs.to_excel] skipping 'raw', "
|
|
1274
|
+
f"too big for excel with {memory} bytes"
|
|
1275
|
+
)
|
|
1276
|
+
if verbose:
|
|
1277
|
+
print(msg)
|
|
1278
|
+
else:
|
|
1279
|
+
warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
|
|
1280
|
+
else:
|
|
1281
|
+
if verbose:
|
|
1282
|
+
print(f"[CubeLogs.to_excel] add sheet 'raw' with shape {self.shape}")
|
|
1283
|
+
self.data.to_excel(
|
|
1284
|
+
writer, sheet_name="raw", freeze_panes=(1, 1), index=True
|
|
1285
|
+
)
|
|
1286
|
+
|
|
1287
|
+
if sbs:
|
|
1288
|
+
if verbose:
|
|
1289
|
+
for k, v in sbs.items():
|
|
1290
|
+
print(f"[CubeLogs.to_excel] sbs {k}: {v}")
|
|
1291
|
+
name = "∧".join(sbs)
|
|
1292
|
+
sbs_raw, sbs_agg, sbs_col = self.sbs(sbs)
|
|
1293
|
+
if verbose:
|
|
1294
|
+
print(f"[CubeLogs.to_excel] add sheet {name!r} with shape {sbs_raw.shape}")
|
|
1295
|
+
print(
|
|
1296
|
+
f"[CubeLogs.to_excel] add sheet '{name}-AGG' "
|
|
1297
|
+
f"with shape {sbs_agg.shape}"
|
|
1298
|
+
)
|
|
1299
|
+
sbs_raw = sbs_raw.reset_index(drop=False)
|
|
1300
|
+
sbs_raw.to_excel(
|
|
1301
|
+
writer,
|
|
1302
|
+
sheet_name=name,
|
|
1303
|
+
freeze_panes=(
|
|
1304
|
+
sbs_raw.columns.nlevels + 1,
|
|
1305
|
+
sbs_raw.index.nlevels,
|
|
1306
|
+
),
|
|
1307
|
+
)
|
|
1308
|
+
sbs_agg.to_excel(
|
|
1309
|
+
writer,
|
|
1310
|
+
sheet_name=f"{name}-AGG",
|
|
1311
|
+
freeze_panes=(
|
|
1312
|
+
sbs_agg.columns.nlevels + 1,
|
|
1313
|
+
sbs_agg.index.nlevels,
|
|
1314
|
+
),
|
|
1315
|
+
)
|
|
1316
|
+
sbs_col.to_excel(
|
|
1317
|
+
writer,
|
|
1318
|
+
sheet_name=f"{name}-COL",
|
|
1319
|
+
freeze_panes=(
|
|
1320
|
+
sbs_col.columns.nlevels + 1,
|
|
1321
|
+
sbs_col.index.nlevels,
|
|
1322
|
+
),
|
|
1323
|
+
)
|
|
1324
|
+
|
|
1325
|
+
if plots:
|
|
1326
|
+
from openpyxl.drawing.image import Image
|
|
1327
|
+
|
|
1328
|
+
if verbose:
|
|
1329
|
+
print(f"[CubeLogs.to_excel] plots {len(plots)} plots")
|
|
1330
|
+
sheet = writer.book.create_sheet("plots")
|
|
1331
|
+
pos = 0
|
|
1332
|
+
empty_row = 1
|
|
1333
|
+
times = self.data[self.time].dropna()
|
|
1334
|
+
mini, maxi = times.min(), times.max()
|
|
1335
|
+
title_suffix = (str(mini) if mini == maxi else f"{mini}-{maxi}").replace(
|
|
1336
|
+
" 00:00:00", ""
|
|
1337
|
+
)
|
|
1338
|
+
for plot in plots:
|
|
1339
|
+
imgs = plot.to_images(
|
|
1340
|
+
verbose=verbose, merge=True, title_suffix=title_suffix
|
|
1341
|
+
)
|
|
1342
|
+
for img in imgs:
|
|
1343
|
+
y = (pos // 2) * 16
|
|
1344
|
+
loc = f"A{y}" if pos % 2 == 0 else f"M{y}"
|
|
1345
|
+
sheet.add_image(Image(io.BytesIO(img)), loc)
|
|
1346
|
+
if verbose:
|
|
1347
|
+
no = f"{output}.png"
|
|
1348
|
+
print(f"[CubeLogs.to_excel] dump graphs into {no!r}")
|
|
1349
|
+
with open(no, "wb") as f:
|
|
1350
|
+
f.write(img)
|
|
1351
|
+
pos += 1
|
|
1352
|
+
empty_row += len(plots) + 2
|
|
1353
|
+
|
|
1354
|
+
if verbose:
|
|
1355
|
+
print(f"[CubeLogs.to_excel] applies style to {output!r}")
|
|
1356
|
+
apply_excel_style(
|
|
1357
|
+
writer, f_highlights, time_mask_view=time_mask_view, verbose=verbose # type: ignore[arg-type]
|
|
1358
|
+
)
|
|
1359
|
+
if verbose:
|
|
1360
|
+
print(f"[CubeLogs.to_excel] done with {len(views)} views")
|
|
1361
|
+
|
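    # Illustrative sketch of to_excel() (hedged example; the file name and the chosen
    # views are hypothetical, based only on the signature and docstring above):
    #
    #     cube.to_excel(
    #         "report.xlsx",
    #         views=["agg-suite", "speedup", "time"],
    #         csv=["speedup"],       # also dumps report.xlsx.speedup.csv
    #         time_mask=True,        # only has an effect with more than two dates
    #         verbose=1,
    #     )
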
    def cube_time(self, fill_other_dates: bool = False, threshold: float = 1.2) -> "CubeLogs":
        """
        Aggregates the data over time to detect changes on the last value.
        If *fill_other_dates* is True, all dates are kept, but values
        are filled with 0.
        *threshold* determines the bandwidth within which the values are expected;
        it should be a factor of the standard deviation.
        """
        unique_time = self.data[self.time].unique()
        assert len(unique_time) > 2, f"Not enough dates to proceed: unique_time={unique_time}"
        gr = self.data[[*self.keys_no_time, *self.values]].groupby(
            self.keys_no_time, dropna=False
        )
        dgr = gr.agg(
            lambda series, th=threshold: int(breaking_last_point(series, threshold=th)[0])
        )
        tm = unique_time.max()
        assert dgr.shape[0] > 0, (
            f"Unexpected output shape={dgr.shape}, unique_time={unique_time}, "
            f"data.shape={self.data.shape}"
        )
        dgr[self.time] = tm
        if fill_other_dates:
            other_df = []
            other_dates = [t for t in unique_time if t != tm]
            for t in other_dates:
                df = dgr.copy()
                df[self.time] = t
                for c in df.columns:
                    if c != self.time:
                        df[c] = 0
                other_df.append(df)
            dgr = pandas.concat([dgr, *other_df], axis=0)
            assert dgr.shape[0] > 0, (
                f"Unexpected output shape={dgr.shape}, unique_time={unique_time}, "
                f"data.shape={self.data.shape}, "
                f"other_df shapes={[df.shape for df in other_df]}"
            )
        return self.clone(data=dgr.reset_index(drop=False))

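    # Illustrative sketch of cube_time() (hedged example; it assumes the data holds at
    # least three distinct dates, as required by the assertion above):
    #
    #     flags = cube.cube_time(fill_other_dates=True, threshold=1.2)
    #     # every value column now holds 0/1 flags telling whether the last point
    #     # breaks out of the expected bandwidth
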
    def sbs(
        self, configs: Dict[str, Dict[str, Any]], column_name: str = "CONF"
    ) -> Tuple[pandas.DataFrame, pandas.DataFrame, pandas.DataFrame]:
        """
        Creates a side-by-side for two configurations.
        Every configuration is a dictionary column:value which filters in
        the rows to keep in order to compute the side by side.
        Every configuration is given a name (the key in configs),
        it is added in column column_name.

        :param configs: example
            ``dict(CFA=dict(exporter="E1", opt="O"), CFB=dict(exporter="E2", opt="O"))``
        :param column_name: column to add with the name of the configuration
        :return: data, aggregated data, data with a row per model
        """
        assert (
            len(configs) >= 2
        ), f"A side by side needs at least two configs but configs={configs}"
        set_keys_time = set(self.keys_time)
        columns_index = None
        data_list = []
        for name_conf, conf in configs.items():
            if columns_index is None:
                columns_index = list(conf.keys())
                assert set(columns_index) <= set_keys_time, (
                    f"Configuration {conf} includes columns outside the keys "
                    f"{', '.join(sorted(set_keys_time))}"
                )
            else:
                assert set(columns_index) == set(conf), (
                    f"Every conf should share the same keys but conf={conf} "
                    f"is different from {set(columns_index)}"
                )
            data = self.data
            for k, v in conf.items():
                data = data[data[k] == v]
            assert data.shape[0] > 0, f"No rows found for conf={conf}"
            assert (
                column_name not in data.columns
            ), f"column_name={column_name!r} is already in {data.columns}"
            data = data.copy()
            data[column_name] = name_conf
            data_list.append(data)

        new_data = pandas.concat(data_list, axis=0)
        cube = self.clone(new_data, keys=[*self.keys_no_time, column_name])
        key_index = set(self.keys_time) - {*columns_index, column_name}  # type: ignore[misc]
        view = CubeViewDef(
            key_index=set(key_index),  # type: ignore[arg-type]
            name="sbs",
            values=cube.values,
            keep_columns_in_index=[self.time],
        )
        view_res = cube.view(view)
        assert isinstance(view_res, pandas.DataFrame), "not needed but mypy complains"

        # add metrics
        index_column_name = list(view_res.columns.names).index(column_name)
        index_metrics = list(view_res.columns.names).index("METRICS")

        def _mkc(m, s):
            c = ["" for c in view_res.columns.names]
            c[index_column_name] = s
            c[index_metrics] = m
            return tuple(c)

        list_configs = list(configs.items())
        mean_columns = [
            c
            for c in view_res.columns
            if pandas.api.types.is_numeric_dtype(view_res[c])
            and not pandas.api.types.is_object_dtype(view_res[c])
        ]
        assert mean_columns, f"No numerical columns in {view_res.dtypes}"
        view_res = view_res[mean_columns].copy()
        metrics = sorted(set(c[index_metrics] for c in view_res.columns))
        assert metrics, (
            f"No numerical metrics detected in "
            f"view_res.columns.names={view_res.columns.names}, "
            f"columns={view_res.dtypes}"
        )
        sum_columns = []
        columns_to_add = []
        for i in range(len(list_configs)):
            for j in range(i + 1, len(list_configs)):
                for m in metrics:
                    iname, ci = list_configs[i]
                    jname, cj = list_configs[j]
                    ci = ci.copy()
                    cj = cj.copy()
                    ci["METRICS"] = m
                    cj["METRICS"] = m
                    ci["CONF"] = iname
                    cj["CONF"] = jname

                    ci_name = tuple(ci[n] for n in view_res.columns.names)
                    cj_name = tuple(cj[n] for n in view_res.columns.names)
                    assert ci_name in view_res.columns or cj_name in view_res.columns, (
                        f"Unable to find column {ci_name} or {cj_name} "
                        f"in columns {view_res.columns}, metrics={metrics}"
                    )
                    if ci_name not in view_res.columns or cj_name not in view_res.columns:
                        # One config does not have such metric.
                        continue

                    si = view_res[ci_name]
                    sj = view_res[cj_name]

                    sinan = si.isna()
                    sjnan = sj.isna()
                    n1 = iname
                    n2 = jname
                    nas = pandas.DataFrame(
                        {
                            _mkc(m, f"∅{n1}∧∅{n2}"): (sinan & sjnan).astype(int),
                            _mkc(m, f"∅{n1}∧{n2}"): (sinan & ~sjnan).astype(int),
                            _mkc(m, f"{n1}∧∅{n2}"): (~sinan & sjnan).astype(int),
                            _mkc(m, f"{n1}∧{n2}"): (~sinan & ~sjnan).astype(int),
                            _mkc(m, f"{n1}<{n2}"): (si < sj).astype(int),
                            _mkc(m, f"{n1}=={n2}"): (si == sj).astype(int),
                            _mkc(m, f"{n1}>{n2}"): (si > sj).astype(int),
                            _mkc(m, f"{n1}*({n1}∧{n2})"): si * (~sinan & ~sjnan).astype(float),
                            _mkc(m, f"{n2}*({n1}∧{n2})"): sj * (~sinan & ~sjnan).astype(float),
                        }
                    )
                    nas.columns.names = view_res.columns.names
                    columns_to_add.append(nas)
                    sum_columns.extend(nas.columns)

        view_res = pandas.concat([view_res, *columns_to_add], axis=1)
        res = view_res.stack("METRICS", future_stack=True)  # type: ignore[union-attr]
        res = res.reorder_levels(
            [res.index.nlevels - 1, *list(range(res.index.nlevels - 1))]
        ).sort_index()

        # aggregated metrics
        aggs = {
            **{k: "mean" for k in mean_columns},  # noqa: C420
            **{k: "sum" for k in sum_columns},  # noqa: C420
        }
        flat = view_res.groupby(self.time).agg(aggs)
        flat = flat.stack("METRICS", future_stack=True)
        return res, flat, view_res.T.sort_index().T

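# Illustrative sketch of CubeLogs.sbs above (hedged example; the configuration names and
# filter values are hypothetical, mirroring the example given in its docstring):
#
#     raw, agg, per_model = cube.sbs(
#         dict(CFA=dict(exporter="E1", opt="O"), CFB=dict(exporter="E2", opt="O"))
#     )
#     # raw: side-by-side data, agg: aggregated by date and metric,
#     # per_model: one row per model with the comparison columns added
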
class CubeLogsPerformance(CubeLogs):
    """Processes logs coming from experiments."""

    def __init__(
        self,
        data: Any,
        time: str = "DATE",
        keys: Sequence[str] = (
            "^version_.*",
            "^model_.*",
            "device",
            "opt_patterns",
            "suite",
            "memory_peak",
            "machine",
            "exporter",
            "dynamic",
            "rtopt",
            "dtype",
            "device",
            "architecture",
        ),
        values: Sequence[str] = (
            "^time_.*",
            "^disc.*",
            "^ERR_.*",
            "CMD",
            "^ITER",
            "^onnx_.*",
            "^op_onnx_.*",
            "^peak_gpu_.*",
        ),
        ignored: Sequence[str] = ("version_python",),
        recent: bool = True,
        formulas: Optional[
            Union[
                Sequence[str],
                Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
            ]
        ] = (
            "speedup",
            "bucket[speedup]",
            "ERR1",
            "n_models",
            "n_model_eager",
            "n_model_running",
            "n_model_acc01",
            "n_model_acc001",
            "n_model_dynamic",
            "n_model_pass",
            "n_model_faster",
            "n_model_faster2x",
            "n_model_faster3x",
            "n_model_faster4x",
            "n_model_faster5x",
            "n_node_attention",
            "n_node_attention23",
            "n_node_causal_mask",
            "n_node_constant",
            "n_node_control_flow",
            "n_node_expand",
            "n_node_function",
            "n_node_gqa",
            "n_node_initializer",
            "n_node_initializer_small",
            "n_node_layer_normalization",
            "n_node_layer_normalization23",
            "n_node_reshape",
            "n_node_rotary_embedding",
            "n_node_rotary_embedding23",
            "n_node_scatter",
            "n_node_sequence",
            "n_node_shape",
            "onnx_n_nodes_no_cst",
            "peak_gpu_torch",
            "peak_gpu_nvidia",
            "time_export_unbiased",
        ),
        fill_missing: Optional[Sequence[Tuple[str, Any]]] = (("model_attn_impl", "eager"),),
        keep_last_date: bool = False,
    ):
        super().__init__(
            data=data,
            time=time,
            keys=keys,
            values=values,
            ignored=ignored,
            recent=recent,
            formulas=formulas,
            fill_missing=fill_missing,
            keep_last_date=keep_last_date,
        )

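    # Illustrative sketch of the constructor (hedged example; the input path is
    # hypothetical, `data` accepts whatever the base class accepts, and keys/values
    # keep the defaults listed above):
    #
    #     cube = CubeLogsPerformance("benchmark_logs.csv")
    #     cube.load()
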
    def clone(
        self, data: Optional[pandas.DataFrame] = None, keys: Optional[Sequence[str]] = None
    ) -> "CubeLogs":
        """
        Makes a copy of the dataframe.
        It copies the processed data, not the original one.
        keys can be changed as well.
        """
        cube = self.__class__(
            data if data is not None else self.data.copy(),
            time=self.time,
            keys=keys or self.keys_no_time,
            values=self.values,
            recent=False,
        )
        cube.load()
        return cube

    def _process_formula(
        self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
    ) -> Callable[[pandas.DataFrame], pandas.Series]:
        """
        Processes a formula, converting it into a function.

        :param formula: a formula string
        :return: a function
        """
        if callable(formula):
            return formula
        assert isinstance(
            formula, str
        ), f"Unexpected type for formula {type(formula)}: {formula!r}"

        def gdf(df, cname, default_value=np.nan):
            if cname in df.columns:
                if np.isnan(default_value):
                    return df[cname]
                return df[cname].fillna(default_value)
            return pandas.Series(default_value, index=df.index)

        def ghas_value(df, cname):
            if cname not in df.columns:
                return pandas.Series(np.nan, index=df.index)
            isna = df[cname].isna()
            return pandas.Series(np.where(isna, np.nan, 1.0), index=df.index)

        def gpreserve(df, cname, series):
            if cname not in df.columns:
                return pandas.Series(np.nan, index=df.index)
            isna = df[cname].isna()
            return pandas.Series(np.where(isna, np.nan, series), index=df.index).astype(float)

        if formula == "speedup":
            columns = set(self._filter_column(["^time_.*"], self.data.columns))
            assert "time_latency" in columns and "time_latency_eager" in columns, (
                f"Unable to apply formula {formula!r}, with columns\n"
                f"{pprint.pformat(sorted(columns))}"
            )
            return lambda df: df["time_latency_eager"] / df["time_latency"]

        if formula == "bucket[speedup]":
            columns = set(self._filter_column(["^time_.*", "speedup"], self.data.columns))
            assert "speedup" in columns, (
                f"Unable to apply formula {formula!r}, with columns\n"
                f"{pprint.pformat(sorted(columns))}"
            )
            # return lambda df: df["time_latency_eager"] / df["time_latency"]
            return lambda df: pandas.cut(
                df["speedup"], bins=BUCKET_SCALES, right=False, duplicates="raise"
            )

        if formula == "ERR1":
            columns = set(self._filter_column(["^ERR_.*"], self.data.columns))
            if not columns:
                return lambda df: np.nan

            def first_err(df: pandas.DataFrame) -> pandas.Series:
                ordered = [
                    c
                    for c in [
                        "ERR_timeout",
                        "ERR_load",
                        "ERR_feeds",
                        "ERR_warmup_eager",
                        "ERR_export",
                        "ERR_ort",
                        "ERR_warmup",
                        # "ERR_std",
                        # "ERR_crash",
                        # "ERR_stdout",
                    ]
                    if c in df.columns
                ]
                res = None
                for c in ordered:
                    if res is None:
                        res = df[c].fillna("")
                    else:
                        res = pandas.Series(np.where(res != "", res, df[c].fillna("")))
                return res

            return first_err

        if formula.startswith("n_"):
            lambdas = dict(
                n_models=lambda df: ghas_value(df, "model_name"),
                n_model_eager=lambda df: ghas_value(df, "time_latency_eager"),
                n_model_running=lambda df: ghas_value(df, "time_latency"),
                n_model_acc01=lambda df: gpreserve(
                    df, "discrepancies_abs", (gdf(df, "discrepancies_abs") <= 0.1)
                ),
                n_model_acc001=lambda df: gpreserve(
                    df, "discrepancies_abs", gdf(df, "discrepancies_abs") <= 0.01
                ),
                n_model_dynamic=lambda df: gpreserve(
                    df,
                    "discrepancies_dynamic_abs",
                    (gdf(df, "discrepancies_dynamic_abs") <= 0.1),
                ),
                n_model_pass=lambda df: gpreserve(
                    df,
                    "time_latency",
                    (gdf(df, "discrepancies_abs", np.inf) < 0.1)
                    & (gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98),
                ),
                n_model_faster=lambda df: gpreserve(
                    df,
                    "time_latency",
                    gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98,
                ),
                n_model_faster2x=lambda df: gpreserve(
                    df,
                    "time_latency",
                    gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 1.98,
                ),
                n_model_faster3x=lambda df: gpreserve(
                    df,
                    "time_latency",
                    gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 2.98,
                ),
                n_model_faster4x=lambda df: gpreserve(
                    df,
                    "time_latency",
                    gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 3.98,
                ),
                n_model_faster5x=lambda df: gpreserve(
                    df,
                    "time_latency",
                    gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 4.98,
                ),
                n_node_attention23=lambda df: gpreserve(
                    df, "time_latency_eager", gdf(df, "op_onnx__Attention")
                ),
                n_node_rotary_embedding23=lambda df: gpreserve(
                    df, "time_latency_eager", gdf(df, "op_onnx__RotaryEmbedding")
                ),
                n_node_layer_normalization23=lambda df: gpreserve(
                    df,
                    "time_latency_eager",
                    gdf(df, "op_onnx__LayerNormalization", 0)
                    + gdf(df, "op_onnx__RMSNormalization", 0)
                    + gdf(df, "op_onnx__BatchNormlization", 0)
                    + gdf(df, "op_onnx__InstanceNormlization", 0)
                    + gdf(df, "op_onnx__GroupNormalization", 0),
                ),
                n_node_attention=lambda df: gpreserve(
                    df,
                    "time_latency_eager",
                    gdf(df, "op_onnx_com.microsoft_Attention", 0)
                    + gdf(df, "op_onnx_com.microsoft_MultiHeadAttention", 0)
                    + gdf(df, "op_onnx_com.microsoft_PackedAttention", 0)
                    + gdf(df, "op_onnx_com.microsoft_PackedMultiHeadAttention", 0)
                    + gdf(df, "op_onnx_com.microsoft_GroupQueryAttention", 0)
                    + gdf(df, "op_onnx_com.microsoft_PagedAttention", 0)
                    + gdf(df, "op_onnx_com.microsoft_DecoderAttention", 0)
                    + gdf(df, "op_onnx_com.microsoft_LongformerAttention", 0)
                    + gdf(df, "op_onnx_com.microsoft_DecoderMaskedSelfAttention", 0)
                    + gdf(df, "op_onnx_com.microsoft_DecoderMaskedMultiHeadAttention", 0)
                    + gdf(df, "op_onnx_com.microsoft_SparseAttention", 0),
                ),
                n_node_gqa=lambda df: gpreserve(
                    df,
                    "time_latency_eager",
                    gdf(df, "op_onnx_com.microsoft_GroupQueryAttention", 0),
                ),
                n_node_layer_normalization=lambda df: gpreserve(
                    df,
                    "time_latency_eager",
                    gdf(df, "op_onnx_com.microsoft_EmbedLayerNormalization", 0)
                    + gdf(df, "op_onnx_com.microsoft_SkipLayerNormalization", 0)
                    + gdf(df, "op_onnx_com.microsoft_LayerNormalization", 0)
                    + gdf(df, "op_onnx_com.microsoft_SkipSimplifiedLayerNormalization", 0)
                    + gdf(df, "op_onnx_com.microsoft_SimplifiedLayerNormalization", 0),
                ),
                n_node_rotary_embedding=lambda df: gpreserve(
                    df,
                    "time_latency_eager",
                    gdf(df, "op_onnx_com.microsoft_GemmaRotaryEmbedding", 0)
                    + gdf(df, "op_onnx_com.microsoft_RotaryEmbedding", 0),
                ),
                n_node_control_flow=lambda df: gpreserve(
                    df,
                    "time_latency_eager",
                    (
                        gdf(df, "op_onnx__If", 0)
                        + gdf(df, "op_onnx__Scan", 0)
                        + gdf(df, "op_onnx__Loop", 0)
                    ),
                ),
                n_node_scatter=lambda df: gpreserve(
                    df,
                    "time_latency_eager",
                    gdf(df, "op_onnx__ScatterND", 0) + gdf(df, "op_onnx__ScatterElements", 0),
                ),
                n_node_function=lambda df: gpreserve(
                    df, "onnx_n_functions", gdf(df, "onnx_n_functions")
                ),
                n_node_initializer_small=lambda df: gpreserve(
                    df, "op_onnx_initializer_small", gdf(df, "op_onnx_initializer_small")
                ),
                n_node_initializer=lambda df: gpreserve(
                    df, "onnx_n_initializer", gdf(df, "onnx_n_initializer")
                ),
                n_node_constant=lambda df: gpreserve(
                    df, "time_latency_eager", gdf(df, "op_onnx__Constant")
                ),
                n_node_shape=lambda df: gpreserve(
                    df, "time_latency_eager", gdf(df, "op_onnx__Shape")
                ),
                n_node_reshape=lambda df: gpreserve(
                    df, "time_latency_eager", gdf(df, "op_onnx__Reshape")
                ),
                n_node_expand=lambda df: gpreserve(
                    df, "time_latency_eager", gdf(df, "op_onnx__Expand")
                ),
                n_node_causal_mask=lambda df: gpreserve(
                    df,
                    "time_latency_eager",
                    gdf(df, "op_onnx__CausalMask", 0),
                ),
                n_node_sequence=lambda df: gpreserve(
                    df,
                    "time_latency_eager",
                    gdf(df, "op_onnx__SequenceAt", 0) + gdf(df, "op_onnx__SplitToSequence", 0),
                ),
            )
            assert (
                formula in lambdas
            ), f"Unexpected formula={formula!r}, should be in {sorted(lambdas)}"
            return lambdas[formula]

        if formula == "onnx_n_nodes_no_cst":
            return lambda df: gdf(df, "onnx_n_nodes", 0) - gdf(
                df, "op_onnx__Constant", 0
            ).fillna(0)
        if formula == "peak_gpu_torch":
            return lambda df: gdf(df, "mema_gpu_5_after_export") - gdf(df, "mema_gpu_4_reset")
        if formula == "peak_gpu_nvidia":
            return (
                lambda df: (gdf(df, "memory_gpu0_peak") - gdf(df, "memory_gpu0_begin")) * 2**20
            )
        if formula == "time_export_unbiased":

            def unbiased_export(df):
                if "time_warmup_first_iteration" not in df.columns:
                    return pandas.Series(np.nan, index=df.index)
                return pandas.Series(
                    np.where(
                        df["exporter"] == "inductor",
                        df["time_warmup_first_iteration"] + df["time_export_success"],
                        df["time_export_success"],
                    ),
                    index=df.index,
                )

            return lambda df: gpreserve(df, "time_warmup_first_iteration", unbiased_export(df))

        raise ValueError(
            f"Unexpected formula {formula!r}, available columns are\n"
            f"{pprint.pformat(sorted(self.data.columns))}"
        )

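    # Illustrative sketch of a derived formula (hedged example; the column names follow
    # the ones used by the "speedup" branch above, the variable names are hypothetical):
    #
    #     f = cube._process_formula("speedup")
    #     speedup = f(cube.data)   # time_latency_eager / time_latency
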
    def view(
        self,
        view_def: Optional[Union[str, CubeViewDef]],
        return_view_def: bool = False,
        verbose: int = 0,
    ) -> Union[
        Optional[pandas.DataFrame], Tuple[Optional[pandas.DataFrame], Optional[CubeViewDef]]
    ]:
        """
        Returns a dataframe, a pivot view.

        If view_def is a string, it is replaced by a predefined view.

        :param view_def: view definition or a string
        :param return_view_def: returns the view definition as well
        :param verbose: verbosity level
        :return: dataframe or a couple (dataframe, view definition),
            both of them can be None if view_def cannot be interpreted
        """
        assert view_def is not None, "view_def is None, this is not allowed."
        if isinstance(view_def, str):
            view_def = self.make_view_def(view_def)
            if view_def is None:
                return (None, None) if return_view_def else None
        return super().view(view_def, return_view_def=return_view_def, verbose=verbose)

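    # Illustrative sketch of view() (hedged example; the view name must be one of the
    # predefined views listed in make_view_def below):
    #
    #     df = cube.view("speedup")
    #     df, vdef = cube.view("speedup", return_view_def=True)
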
    def make_view_def(self, name: str) -> Optional[CubeViewDef]:
        """
        Returns a view definition.

        :param name: name of the view
        :return: a CubeViewDef or None if name does not make sense

        Available views:

        * **agg-suite:** aggregation per suite
        * **disc:** discrepancies
        * **speedup:** speedup
        * **bucket-speedup:** speedup in buckets
        * **time:** latency
        * **time_export:** time to export
        * **counts:** status, running, faster, has control flow, ...
        * **err:** important errors
        * **cmd:** command lines
        * **raw-short:** raw data without all the unused columns
        """
        fix_aggregation_change = ["model_speedup_input_set", "model_test_with"]
        fs = ["suite", "model_suite", "task", "model_name", "model_task"]
        index_cols = self._filter_column(fs, self.keys_time)
        assert index_cols, (
            f"No index columns found for {fs!r} in "
            f"{pprint.pformat(sorted(self.keys_time))}"
        )
        index_cols = [c for c in fs if c in set(index_cols)]

        f_speedup = lambda x: (  # noqa: E731
            CubeViewDef.HighLightKind.NONE
            if not isinstance(x, (float, int))
            else (
                CubeViewDef.HighLightKind.RED
                if x < 0.9
                else (
                    CubeViewDef.HighLightKind.GREEN
                    if x > 1.1
                    else CubeViewDef.HighLightKind.NONE
                )
            )
        )
        f_disc = lambda x: (  # noqa: E731
            CubeViewDef.HighLightKind.NONE
            if not isinstance(x, (float, int))
            else (
                CubeViewDef.HighLightKind.RED
                if x > 0.1
                else (
                    CubeViewDef.HighLightKind.GREEN
                    if x < 0.01
                    else CubeViewDef.HighLightKind.NONE
                )
            )
        )
        f_bucket = lambda x: (  # noqa: E731
            CubeViewDef.HighLightKind.NONE
            if not isinstance(x, str)
            else (
                CubeViewDef.HighLightKind.RED
                if x in {"[-inf, 0.8)", "[0.8, 0.9)", "[0.9, 0.95)"}
                else (
                    CubeViewDef.HighLightKind.NONE
                    if x in {"[0.95, 0.98)", "[0.98, 1.02)", "[1.02, 1.05)"}
                    else (
                        CubeViewDef.HighLightKind.GREEN
                        if "[" in x
                        else CubeViewDef.HighLightKind.NONE
                    )
                )
            )
        )

        def mean_weight(gr):
            weight = gr["time_latency_eager"]
            x = gr["speedup"]
            if x.shape[0] == 0:
                return np.nan
            div = weight.sum()
            if div > 0:
                return (x * weight).sum() / div
            return np.nan

        def mean_geo(gr):
            x = gr["speedup"]
            return np.exp(np.log(x.dropna()).mean())

        order = ["model_attn_impl", "exporter", "opt_patterns", "DATE"]
        implemented_views = {
            "agg-suite": lambda: CubeViewDef(
                key_index=index_cols,
                values=self._filter_column(
                    [
                        "TIME_ITER",
                        "speedup",
                        "time_latency",
                        "time_latency_eager",
                        "time_export_success",
                        "time_export_unbiased",
                        "^n_.*",
                        "target_opset",
                        "onnx_filesize",
                        "onnx_weight_size_torch",
                        "onnx_weight_size_proto",
                        "onnx_n_nodes",
                        "onnx_n_nodes_no_cst",
                        "op_onnx__Constant",
                        "peak_gpu_torch",
                        "peak_gpu_nvidia",
                    ],
                    self.values,
                ),
                ignore_unique=True,
                key_agg=["model_name", "task", "model_task"],
                agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
                agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
                keep_columns_in_index=["suite"],
                name="agg-suite",
                order=order,
                fix_aggregation_change=fix_aggregation_change,
            ),
            "agg-all": lambda: CubeViewDef(
                key_index=index_cols,
                values=self._filter_column(
                    [
                        "TIME_ITER",
                        "speedup",
                        "time_latency",
                        "time_latency_eager",
                        "time_export_success",
                        "time_export_unbiased",
                        "^n_.*",
                        "target_opset",
                        "onnx_filesize",
                        "onnx_weight_size_torch",
                        "onnx_weight_size_proto",
                        "onnx_n_nodes",
                        "onnx_n_nodes_no_cst",
                        "peak_gpu_torch",
                        "peak_gpu_nvidia",
                    ],
                    self.values,
                ),
                ignore_unique=True,
                key_agg=["model_name", "task", "model_task", "suite"],
                agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
                agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
                name="agg-all",
                order=order,
                plots=True,
                fix_aggregation_change=fix_aggregation_change,
            ),
            "disc": lambda: CubeViewDef(
                key_index=index_cols,
                values=self._filter_column(["discrepancies_abs"], self.values),
                ignore_unique=True,
                keep_columns_in_index=["suite"],
                f_highlight=f_disc,
                name="disc",
                order=order,
                fix_aggregation_change=fix_aggregation_change,
            ),
            "speedup": lambda: CubeViewDef(
                key_index=index_cols,
                values=self._filter_column(["speedup"], self.values),
                ignore_unique=True,
                keep_columns_in_index=["suite"],
                f_highlight=f_speedup,
                name="speedup",
                order=order,
                fix_aggregation_change=fix_aggregation_change,
            ),
            "counts": lambda: CubeViewDef(
                key_index=index_cols,
                values=self._filter_column(["^n_.*"], self.values),
                ignore_unique=True,
                keep_columns_in_index=["suite"],
                name="counts",
                order=order,
            ),
            "peak-gpu": lambda: CubeViewDef(
                key_index=index_cols,
                values=self._filter_column(["^peak_gpu_.*"], self.values),
                ignore_unique=True,
                keep_columns_in_index=["suite"],
                name="peak-gpu",
                order=order,
                fix_aggregation_change=fix_aggregation_change,
            ),
            "time": lambda: CubeViewDef(
                key_index=index_cols,
                values=self._filter_column(
                    ["time_latency", "time_latency_eager"], self.values
                ),
                ignore_unique=True,
                keep_columns_in_index=["suite"],
                name="time",
                order=order,
                fix_aggregation_change=fix_aggregation_change,
            ),
            "time_export": lambda: CubeViewDef(
                key_index=index_cols,
                values=self._filter_column(["time_export_unbiased"], self.values),
                ignore_unique=True,
                keep_columns_in_index=["suite"],
                name="time_export",
                order=order,
                fix_aggregation_change=fix_aggregation_change,
            ),
            "err": lambda: CubeViewDef(
                key_index=index_cols,
                values=self._filter_column(
                    ["ERR1", "ERR_timeout", "ERR_export", "ERR_crash"], self.values
                ),
                ignore_unique=True,
                keep_columns_in_index=["suite"],
                name="err",
                order=order,
                fix_aggregation_change=fix_aggregation_change,
            ),
            "bucket-speedup": lambda: CubeViewDef(
                key_index=index_cols,
                values=self._filter_column(["bucket[speedup]"], self.values),
                ignore_unique=True,
                keep_columns_in_index=["suite"],
                name="bucket-speedup",
                f_highlight=f_bucket,
                order=order,
                fix_aggregation_change=fix_aggregation_change,
            ),
            "onnx": lambda: CubeViewDef(
                key_index=index_cols,
                values=self._filter_column(
                    [
                        "onnx_filesize",
                        "onnx_n_nodes",
                        "onnx_n_nodes_no_cst",
                        "onnx_weight_size_proto",
                        "onnx_weight_size_torch",
                        "op_onnx_initializer_small",
                    ],
                    self.values,
                ),
                ignore_unique=True,
                keep_columns_in_index=["suite"],
                name="onnx",
                order=order,
                fix_aggregation_change=fix_aggregation_change,
            ),
            "raw-short": lambda: CubeViewDef(
                key_index=self.keys_time,
                values=[c for c in self.values if c not in {"ERR_std", "ERR_stdout"}],
                ignore_unique=False,
                keep_columns_in_index=["suite"],
                name="raw-short",
                no_index=True,
                fix_aggregation_change=fix_aggregation_change,
            ),
        }

        cmd_col = self._filter_column(["CMD"], self.values, can_be_empty=True)
        if cmd_col:
            implemented_views["cmd"] = lambda: CubeViewDef(
                key_index=index_cols,
                values=cmd_col,
                ignore_unique=True,
                keep_columns_in_index=["suite"],
                name="cmd",
                order=order,
                fix_aggregation_change=fix_aggregation_change,
            )

        assert name in implemented_views or name in {"cmd"}, (
            f"Unknown view {name!r}, expected a name in {sorted(implemented_views)},"
            f"\n--\nkeys={pprint.pformat(sorted(self.keys_time))}, "
            f"\n--\nvalues={pprint.pformat(sorted(self.values))}"
        )
        if name not in implemented_views:
            return None
        return implemented_views[name]()

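    # Illustrative sketch of make_view_def() (hedged example; view names are the ones
    # listed in its docstring above):
    #
    #     vdef = cube.make_view_def("agg-suite")
    #     if vdef is not None:
    #         df = cube.view(vdef)
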
    def post_load_process_piece(
        self, df: pandas.DataFrame, unique: bool = False
    ) -> pandas.DataFrame:
        df = super().post_load_process_piece(df, unique=unique)
        if unique:
            return df
        cols = self._filter_column(self._keys, df)
        res = None
        for c in cols:
            if df[c].isna().any():
                # Missing values for keys are not supposed to happen.
                uniq = set(df[c].dropna())
                if len(uniq) == 1:
                    if res is None:
                        res = df.copy()
                    res[c] = res[c].fillna(uniq.pop())
        return df if res is None else res