onnx-diagnostic 0.7.0__py3-none-any.whl → 0.7.2__py3-none-any.whl
This diff represents the content of the publicly available package versions as released to their public registry.
- onnx_diagnostic/__init__.py +1 -1
- onnx_diagnostic/_command_lines_parser.py +213 -5
- onnx_diagnostic/export/dynamic_shapes.py +48 -20
- onnx_diagnostic/export/shape_helper.py +126 -0
- onnx_diagnostic/ext_test_case.py +31 -0
- onnx_diagnostic/helpers/cache_helper.py +42 -20
- onnx_diagnostic/helpers/config_helper.py +16 -1
- onnx_diagnostic/helpers/log_helper.py +1561 -177
- onnx_diagnostic/helpers/torch_helper.py +6 -2
- onnx_diagnostic/tasks/__init__.py +2 -0
- onnx_diagnostic/tasks/image_text_to_text.py +69 -18
- onnx_diagnostic/tasks/text_generation.py +17 -8
- onnx_diagnostic/tasks/text_to_image.py +91 -0
- onnx_diagnostic/torch_export_patches/onnx_export_errors.py +24 -7
- onnx_diagnostic/torch_export_patches/onnx_export_serialization.py +144 -349
- onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +87 -7
- onnx_diagnostic/torch_export_patches/serialization/__init__.py +46 -0
- onnx_diagnostic/torch_export_patches/serialization/diffusers_impl.py +34 -0
- onnx_diagnostic/torch_export_patches/serialization/transformers_impl.py +259 -0
- onnx_diagnostic/torch_models/hghub/hub_api.py +73 -5
- onnx_diagnostic/torch_models/hghub/hub_data.py +7 -2
- onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py +28 -0
- onnx_diagnostic/torch_models/hghub/model_inputs.py +74 -14
- onnx_diagnostic/torch_models/validate.py +45 -16
- {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.2.dist-info}/METADATA +1 -1
- {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.2.dist-info}/RECORD +29 -24
- {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.2.dist-info}/WHEEL +0 -0
- {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.2.dist-info}/licenses/LICENSE.txt +0 -0
- {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.2.dist-info}/top_level.txt +0 -0
@@ -1,20 +1,89 @@
 import datetime
+import enum
 import glob
+import io
 import os
+import pprint
 import re
+import warnings
 import zipfile
 from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union
 import numpy as np
 import pandas
-from pandas.api.types import is_numeric_dtype
+from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
 from .helper import string_sig
 
+BUCKET_SCALES_VALUES = np.array(
+    [-np.inf, -20, -10, -5, -2, 0, 2, 5, 10, 20, 100, 200, 300, 400, np.inf], dtype=float
+)
+
+
+BUCKET_SCALES = BUCKET_SCALES_VALUES / 100 + 1
+
+
+def filter_data(
+    df: pandas.DataFrame,
+    filter_in: Optional[str] = None,
+    filter_out: Optional[str] = None,
+    verbose: int = 0,
+) -> pandas.DataFrame:
+    """
+    Argument `filter` follows the syntax
+    ``<column1>:<fmt1>//<column2>:<fmt2>``.
+
+    The format is the following:
+
+    * a value or a set of values separated by ``;``
+    """
+    if not filter_in and not filter_out:
+        return df
+
+    def _f(fmt):
+        cond = {}
+        if isinstance(fmt, str):
+            cols = fmt.split("//")
+            for c in cols:
+                assert ":" in c, f"Unexpected value {c!r} in fmt={fmt!r}"
+                spl = c.split(":")
+                assert len(spl) == 2, f"Unexpected value {c!r} in fmt={fmt!r}"
+                name, fil = spl
+                cond[name] = set(fil.split(";"))
+        return cond
+
+    if filter_in:
+        cond = _f(filter_in)
+        assert isinstance(cond, dict), f"Unexpected type {type(cond)} for fmt={filter_in!r}"
+        for k, v in cond.items():
+            if k not in df.columns:
+                continue
+            if verbose:
+                print(
+                    f"[_filter_data] filter in column {k!r}, "
+                    f"values {v!r} among {set(df[k].astype(str))}"
+                )
+            df = df[df[k].astype(str).isin(v)]
+
+    if filter_out:
+        cond = _f(filter_out)
+        assert isinstance(cond, dict), f"Unexpected type {type(cond)} for fmt={filter_out!r}"
+        for k, v in cond.items():
+            if k not in df.columns:
+                continue
+            if verbose:
+                print(
+                    f"[_filter_data] filter out column {k!r}, "
+                    f"values {v!r} among {set(df[k].astype(str))}"
+                )
+            df = df[~df[k].astype(str).isin(v)]
+    return df
+
 
 def enumerate_csv_files(
     data: Union[
         pandas.DataFrame, List[Union[str, Tuple[str, str]]], str, Tuple[str, str, str, str]
     ],
     verbose: int = 0,
+    filtering: Optional[Callable[[str], bool]] = None,
 ) -> Iterator[Union[pandas.DataFrame, str, Tuple[str, str, str, str]]]:
     """
     Enumerates files considered for the aggregation.
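The new `filter_data` helper keeps or drops rows based on the `<column>:<value1>;<value2>` syntax described in its docstring. A minimal usage sketch, assuming onnx-diagnostic 0.7.2 is installed; the column names and values below are invented for illustration only:

```python
import pandas
from onnx_diagnostic.helpers.log_helper import filter_data

df = pandas.DataFrame(
    {"exporter": ["onnx", "eager", "onnx"], "model_name": ["a", "b", "c"]}
)
# Keep rows whose exporter is "onnx", then drop rows whose model_name is "c".
kept = filter_data(df, filter_in="exporter:onnx", filter_out="model_name:c", verbose=1)
print(kept)
```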
@@ -23,6 +92,10 @@ def enumerate_csv_files(
     loops over csv candidates.
 
     :param data: dataframe with the raw data or a file or list of files
+    :param vrbose: verbosity
+    :param filtering: function to filter in or out files in zip files,
+        must return true to keep the file, false to skip it.
+    :return: a generator yielding tuples with the filename, date, full path and zip file
 
     data can contains:
     * a dataframe
@@ -52,13 +125,9 @@ def enumerate_csv_files(
             # We check the first line is ok.
             if verbose:
                 print(f"[enumerate_csv_files] data[{itn}] is a csv file: {filename!r}]")
-
-
-
-                continue
-
-
-
+            dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
+            du = dt.strftime("%Y-%m-%d %H:%M:%S")
+            yield (os.path.split(filename)[-1], du, filename, "")
             continue
 
         if ext == ".zip":
@@ -67,8 +136,11 @@ def enumerate_csv_files(
             zf = zipfile.ZipFile(filename, "r")
             for ii, info in enumerate(zf.infolist()):
                 name = info.filename
-
-
+                if filtering is None:
+                    ext = os.path.splitext(name)[-1]
+                    if ext != ".csv":
+                        continue
+                elif not filtering(name):
                     continue
                 if verbose:
                     print(
@@ -96,14 +168,15 @@ def enumerate_csv_files(
         for ii, f in enumerate(found):
             if verbose:
                 print(f"[enumerate_csv_files] data[{itn}][{ii}] {f!r} from {filename!r}")
-            yield from enumerate_csv_files(f, verbose=verbose)
+            yield from enumerate_csv_files(f, verbose=verbose, filtering=filtering)
 
 
 def open_dataframe(
     data: Union[str, Tuple[str, str, str, str], pandas.DataFrame],
 ) -> pandas.DataFrame:
     """
-    Opens a filename
+    Opens a filename defined by function
+    :func:`onnx_diagnostic.helpers.log_helper.enumerate_csv_files`.
 
     :param data: a dataframe, a filename, a tuple indicating the file is coming
         from a zip file
@@ -140,10 +213,26 @@ class CubeViewDef:
    :param order: to reorder key in columns index
    :param key_agg: aggregate according to these columns before
        creating the view
-    :param agg_args: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg
+    :param agg_args: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`,
+        it can be also a callable to return a different aggregation
+        method depending on the column name
    :param agg_kwargs: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`
+    :param agg_multi: aggregation over multiple columns
+    :param ignore_columns: ignore the following columns if known to overload the view
+    :param keep_columns_in_index: keeps the columns even if there is only one unique value
+    :param dropna: drops rows with nan if not relevant
+    :param transpose: transpose
+    :param f_highlight: to highlights some values
+    :param name: name of the view, used mostly to debug
+    :param plots: adds plot to the Excel sheet
+    :param no_index: remove the index (but keeps the columns)
    """
 
+    class HighLightKind(enum.IntEnum):
+        NONE = 0
+        RED = 1
+        GREEN = 2
+
    def __init__(
        self,
        key_index: Sequence[str],
@@ -151,8 +240,19 @@ class CubeViewDef:
        ignore_unique: bool = True,
        order: Optional[Sequence[str]] = None,
        key_agg: Optional[Sequence[str]] = None,
-        agg_args: Sequence[Any] = ("sum",),
+        agg_args: Union[Sequence[Any], Callable[[str], Any]] = ("sum",),
        agg_kwargs: Optional[Dict[str, Any]] = None,
+        agg_multi: Optional[
+            Dict[str, Callable[[pandas.core.groupby.DataFrameGroupBy], pandas.Series]]
+        ] = None,
+        ignore_columns: Optional[Sequence[str]] = None,
+        keep_columns_in_index: Optional[Sequence[str]] = None,
+        dropna: bool = True,
+        transpose: bool = False,
+        f_highlight: Optional[Callable[[Any], "CubeViewDef.HighLightKind"]] = None,
+        name: Optional[str] = None,
+        no_index: bool = False,
+        plots: bool = False,
    ):
        self.key_index = key_index
        self.values = values
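`CubeViewDef` gains several options in 0.7.2 (callable `agg_args`, highlighting, plots, a nested `HighLightKind` enum). A minimal sketch of a view definition using them; the key and value column names below are placeholders, not part of the API:

```python
from onnx_diagnostic.helpers.log_helper import CubeViewDef

def highlight(value):
    # Color numeric regressions in red, everything else in green.
    if isinstance(value, float):
        return CubeViewDef.HighLightKind.RED if value < 0 else CubeViewDef.HighLightKind.GREEN
    return CubeViewDef.HighLightKind.NONE

view = CubeViewDef(
    key_index=["model_name"],
    values=["time_latency", "disc_abs"],
    # callable agg_args: pick the aggregation per value column
    agg_args=lambda col: "max" if col.startswith("disc_") else "mean",
    keep_columns_in_index=["exporter"],
    f_highlight=highlight,
    name="latency_by_model",
    plots=True,
)
```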
@@ -161,12 +261,349 @@ class CubeViewDef:
        self.key_agg = key_agg
        self.agg_args = agg_args
        self.agg_kwargs = agg_kwargs
+        self.agg_multi = agg_multi
+        self.dropna = dropna
+        self.ignore_columns = ignore_columns
+        self.keep_columns_in_index = keep_columns_in_index
+        self.f_highlight = f_highlight
+        self.transpose = transpose
+        self.name = name
+        self.no_index = no_index
+        self.plots = plots
 
    def __repr__(self) -> str:
        "usual"
        return string_sig(self)  # type: ignore[arg-type]
 
 
+def apply_excel_style(
+    filename_or_writer: Any,
+    f_highlights: Optional[Dict[str, Callable[[Any], CubeViewDef.HighLightKind]]] = None,
+):
+    """
+    Applies styles on all sheets in a file unless the sheet is too big.
+
+    :param filename_or_writer: filename, modified inplace
+    :param f_highlight: color function to apply, one per sheet
+    """
+    from openpyxl import load_workbook
+    from openpyxl.styles import Alignment
+    from openpyxl.utils import get_column_letter
+    from openpyxl.styles import Font  # , PatternFill, numbers
+
+    if isinstance(filename_or_writer, str):
+        workbook = load_workbook(filename_or_writer)
+        save = True
+    else:
+        workbook = filename_or_writer.book
+        save = False
+
+    left = Alignment(horizontal="left")
+    left_shrink = Alignment(horizontal="left", shrink_to_fit=True)
+    right = Alignment(horizontal="right")
+    font_colors = {
+        CubeViewDef.HighLightKind.GREEN: Font(color="00AA00"),
+        CubeViewDef.HighLightKind.RED: Font(color="FF0000"),
+    }
+
+    for name in workbook.sheetnames:
+        f_highlight = f_highlights.get(name, None) if f_highlights else None
+        sheet = workbook[name]
+        n_rows = sheet.max_row
+        n_cols = sheet.max_column
+        if n_rows * n_cols > 2**18:
+            # Too big.
+            continue
+        co: Dict[int, int] = {}
+        sizes: Dict[int, int] = {}
+        cols = set()
+        for i in range(1, n_rows + 1):
+            for j, cell in enumerate(sheet[i]):
+                if j > n_cols:
+                    break
+                cols.add(cell.column)
+                if isinstance(cell.value, float):
+                    co[j] = co.get(j, 0) + 1
+                elif isinstance(cell.value, str):
+                    sizes[cell.column] = max(sizes.get(cell.column, 0), len(cell.value))
+
+        for k, v in sizes.items():
+            c = get_column_letter(k)
+            sheet.column_dimensions[c].width = min(max(8, v), 30)
+        for k in cols:
+            if k not in sizes:
+                c = get_column_letter(k)
+                sheet.column_dimensions[c].width = 15
+
+        for i in range(1, n_rows + 1):
+            for j, cell in enumerate(sheet[i]):
+                if j > n_cols:
+                    break
+                if isinstance(cell.value, pandas.Timestamp):
+                    cell.alignment = right
+                    dt = cell.value.to_pydatetime()
+                    cell.value = dt
+                    cell.number_format = (
+                        "YYYY-MM-DD"
+                        if (
+                            dt.hour == 0
+                            and dt.minute == 0
+                            and dt.second == 0
+                            and dt.microsecond == 0
+                        )
+                        else "YYYY-MM-DD 00:00:00"
+                    )
+                elif isinstance(cell.value, (float, int)):
+                    cell.alignment = right
+                    x = abs(cell.value)
+                    if int(x) == x:
+                        cell.number_format = "0"
+                    elif x > 5000:
+                        cell.number_format = "# ##0"
+                    elif x >= 500:
+                        cell.number_format = "0.0"
+                    elif x >= 50:
+                        cell.number_format = "0.00"
+                    elif x >= 5:
+                        cell.number_format = "0.000"
+                    elif x > 0.5:
+                        cell.number_format = "0.0000"
+                    elif x > 0.005:
+                        cell.number_format = "0.00000"
+                    else:
+                        cell.number_format = "0.000E+00"
+                    if f_highlight:
+                        h = f_highlight(cell.value)
+                        if h in font_colors:
+                            cell.font = font_colors[h]
+                elif isinstance(cell.value, str) and len(cell.value) > 70:
+                    cell.alignment = left_shrink
+                else:
+                    cell.alignment = left
+                    if f_highlight:
+                        h = f_highlight(cell.value)
+                        if h in font_colors:
+                            cell.font = font_colors[h]
+    if save:
+        workbook.save(filename_or_writer)
+
+
+class CubePlot:
+    """
+    Creates a plot.
+
+    :param df: dataframe
+    :param kind: kind of graph to plot, bar, barh, line
+    :param split: draw a graph per line in the dataframe
+    :param timeseries: this assumes the time is one level of the columns,
+        this argument indices the level name
+    """
+
+    KINDS = {"bar", "barh", "line"}
+
+    @classmethod
+    def group_columns(
+        cls, columns: List[str], sep: str = "/", depth: int = 2
+    ) -> List[List[str]]:
+        """Groups columns to have nice display."""
+        res: Dict[str, List[str]] = {}
+        for c in columns:
+            p = c.split("/")
+            k = "/".join(p[:depth])
+            if k not in res:
+                res[k] = []
+            res[k].append(c)
+        new_res: Dict[str, List[str]] = {}
+        for k, v in res.items():
+            if len(v) >= 3:
+                new_res[k] = v
+            else:
+                if "0" not in new_res:
+                    new_res["0"] = []
+                new_res["0"].extend(v)
+        groups: List[List[str]] = [sorted(v) for k, v in sorted(new_res.items())]
+        if depth <= 1:
+            return groups
+        new_groups: List[List[str]] = []
+        for v in groups:
+            if len(v) >= 6:
+                new_groups.extend(cls.group_columns(v, depth=1, sep=sep))
+            else:
+                new_groups.append(v)
+        return new_groups
+
+    def __init__(
+        self,
+        df: pandas.DataFrame,
+        kind: str = "bar",
+        orientation="col",
+        split: bool = True,
+        timeseries: Optional[str] = None,
+    ):
+        assert (
+            not timeseries or timeseries in df.columns.names
+        ), f"Level {timeseries!r} is not part of the columns levels {df.columns.names}"
+        assert (
+            kind in self.__class__.KINDS
+        ), f"Unexpected kind={kind!r} not in {self.__class__.KINDS}"
+        assert split, f"split={split} not implemented"
+        assert (
+            not timeseries or orientation == "row"
+        ), f"orientation={orientation!r} must be 'row' for timeseries"
+        self.df = df.copy()
+        self.kind = kind
+        self.orientation = orientation
+        self.split = split
+        self.timeseries = timeseries
+
+        if timeseries:
+            if isinstance(self.df.columns, pandas.MultiIndex):
+                index_time = list(self.df.columns.names).index(self.timeseries)
+
+                def _drop(t, i=index_time):
+                    return (*t[:i], *t[i + 1 :])
+
+                self.df.columns = pandas.MultiIndex.from_tuples(
+                    [("/".join(map(str, _drop(i))), i[index_time]) for i in self.df.columns],
+                    names=["metric", timeseries],
+                )
+        else:
+            if isinstance(self.df.columns, pandas.MultiIndex):
+                self.df.columns = ["/".join(map(str, i)) for i in self.df.columns]
+            if isinstance(self.df.index, pandas.MultiIndex):
+                self.df.index = ["/".join(map(str, i)) for i in self.df.index]
+
+    def __repr__(self) -> str:
+        "usual"
+        return string_sig(self)  # type: ignore[arg-type]
+
+    def to_images(
+        self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
+    ) -> List[bytes]:
+        """
+        Converts data into plots and images.
+
+        :param verbose: verbosity
+        :param merge: returns all graphs in a single image (True)
+            or an image for every graph (False)
+        :param title_suffix: prefix for the title of every graph
+        :return: list of binary images (format PNG)
+        """
+        if self.kind in ("barh", "bar"):
+            return self._to_images_bar(verbose=verbose, merge=merge, title_suffix=title_suffix)
+        if self.kind == "line":
+            return self._to_images_line(
+                verbose=verbose, merge=merge, title_suffix=title_suffix
+            )
+        raise AssertionError(f"self.kind={self.kind!r} not implemented")
+
+    @classmethod
+    def _make_loop(cls, ensemble, verbose):
+        if verbose:
+            from tqdm import tqdm
+
+            loop = tqdm(ensemble)
+        else:
+            loop = ensemble
+        return loop
+
+    def _to_images_bar(
+        self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
+    ) -> List[bytes]:
+        assert merge, f"merge={merge} not implemented yet"
+        import matplotlib.pyplot as plt
+
+        df = self.df.T if self.orientation == "row" else self.df
+        title_suffix = f"\n{title_suffix}" if title_suffix else ""
+
+        n_cols = 3
+        nn = df.shape[1] // n_cols
+        nn += int(df.shape[1] % n_cols != 0)
+        fig, axs = plt.subplots(nn, n_cols, figsize=(6 * n_cols, nn * df.shape[0] / 5))
+        pos = 0
+        imgs = []
+        for c in self._make_loop(df.columns, verbose):
+            ax = axs[pos // n_cols, pos % n_cols]
+            (
+                df[c].plot.barh(title=f"{c}{title_suffix}", ax=ax)
+                if self.kind == "barh"
+                else df[c].plot.bar(title=f"{c}{title_suffix}", ax=ax)
+            )
+            ax.tick_params(axis="both", which="major", labelsize=8)
+            ax.grid(True)
+            pos += 1  # noqa: SIM113
+        fig.tight_layout()
+        imgdata = io.BytesIO()
+        fig.savefig(imgdata, format="png")
+        imgs.append(imgdata.getvalue())
+        plt.close()
+        return imgs
+
+    def _to_images_line(
+        self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
+    ) -> List[bytes]:
+        assert merge, f"merge={merge} not implemented yet"
+        assert (
+            self.orientation == "row"
+        ), f"self.orientation={self.orientation!r} not implemented for this kind of graph."
+
+        def rotate_align(ax, angle=15, align="right"):
+            for label in ax.get_xticklabels():
+                label.set_rotation(angle)
+                label.set_horizontalalignment(align)
+            ax.tick_params(axis="both", which="major", labelsize=8)
+            ax.grid(True)
+            ax.legend()
+            ax.tick_params(labelleft=True)
+            return ax
+
+        import matplotlib.pyplot as plt
+
+        df = self.df.T
+
+        confs = list(df.unstack(self.timeseries).index)
+        groups = self.group_columns(confs)
+        n_cols = len(groups)
+
+        title_suffix = f"\n{title_suffix}" if title_suffix else ""
+        fig, axs = plt.subplots(
+            df.shape[1],
+            n_cols,
+            figsize=(5 * n_cols, max(len(g) for g in groups) * df.shape[1] / 2),
+            sharex=True,
+            sharey="row" if n_cols > 1 else False,
+        )
+        imgs = []
+        row = 0
+        for c in self._make_loop(df.columns, verbose):
+            dfc = df[[c]]
+            dfc = dfc.unstack(self.timeseries).T.droplevel(0)
+            if n_cols == 1:
+                dfc.plot(title=f"{c}{title_suffix}", ax=axs[row], linewidth=3)
+                axs[row].grid(True)
+                rotate_align(axs[row])
+            else:
+                x = list(range(dfc.shape[0]))
+                ticks = list(dfc.index)
+                for ii, group in enumerate(groups):
+                    ddd = dfc.loc[:, group].copy()
+                    axs[row, ii].set_xticks(x)
+                    axs[row, ii].set_xticklabels(ticks)
+                    # This is very slow
+                    # ddd.plot(ax=axs[row, ii],linewidth=3)
+                    for jj in range(ddd.shape[1]):
+                        axs[row, ii].plot(x, ddd.iloc[:, jj], lw=3, label=ddd.columns[jj])
+                    axs[row, ii].set_title(f"{c}{title_suffix}")
+                    rotate_align(axs[row, ii])
+            row += 1  # noqa: SIM113
+        fig.tight_layout()
+        imgdata = io.BytesIO()
+        fig.savefig(imgdata, format="png")
+        imgs.append(imgdata.getvalue())
+        plt.close()
+        return imgs
+
+
 class CubeLogs:
    """
    Processes logs coming from experiments.
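The new module-level `apply_excel_style` restyles every sheet of a workbook, optionally coloring values with a per-sheet highlight function. A minimal sketch, assuming openpyxl is installed; the file name "report.xlsx" and sheet name "view_time" are placeholders:

```python
from onnx_diagnostic.helpers.log_helper import CubeViewDef, apply_excel_style

def red_if_missing(value):
    # Flag empty cells in red, leave everything else untouched.
    return (
        CubeViewDef.HighLightKind.RED
        if value is None
        else CubeViewDef.HighLightKind.NONE
    )

apply_excel_style("report.xlsx", f_highlights={"view_time": red_if_missing})
```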
@@ -180,7 +617,14 @@ class CubeLogs:
        values: Sequence[str] = ("time_.*", "disc_.*"),
        ignored: Sequence[str] = (),
        recent: bool = False,
-        formulas: Optional[
+        formulas: Optional[
+            Union[
+                Sequence[str],
+                Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
+            ]
+        ] = None,
+        fill_missing: Optional[Sequence[Tuple[str, Any]]] = None,
+        keep_last_date: bool = False,
    ):
        self._data = data
        self._time = time
@@ -189,24 +633,51 @@ class CubeLogs:
        self._ignored = ignored
        self.recent = recent
        self._formulas = formulas
+        self.fill_missing = fill_missing
+        self.keep_last_date = keep_last_date
+
+    def post_load_process_piece(
+        self, df: pandas.DataFrame, unique: bool = False
+    ) -> pandas.DataFrame:
+        """
+        Postprocesses a piece when a cube is made of multiple pieces
+        before it gets merged.
+        """
+        if not self.fill_missing:
+            return df
+        missing = dict(self.fill_missing)
+        for k, v in missing.items():
+            if k not in df.columns:
+                df[k] = v
+        return df
 
    def load(self, verbose: int = 0):
        """Loads and preprocesses the data. Returns self."""
        if isinstance(self._data, pandas.DataFrame):
            if verbose:
                print(f"[CubeLogs.load] load from dataframe, shape={self._data.shape}")
-            self.data = self._data
+            self.data = self.post_load_process_piece(self._data, unique=True)
+            if verbose:
+                print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
        elif isinstance(self._data, list) and all(isinstance(r, dict) for r in self._data):
            if verbose:
                print(f"[CubeLogs.load] load from list of dicts, n={len(self._data)}")
-            self.data = pandas.DataFrame(self._data)
+            self.data = pandas.DataFrame(self.post_load_process_piece(self._data, unique=True))
+            if verbose:
+                print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
        elif isinstance(self._data, list) and all(
            isinstance(r, pandas.DataFrame) for r in self._data
        ):
            if verbose:
                print(f"[CubeLogs.load] load from list of DataFrame, n={len(self._data)}")
-            self.data = pandas.concat(
+            self.data = pandas.concat(
+                [self.post_load_process_piece(c) for c in self._data], axis=0
+            )
+            if verbose:
+                print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
        elif isinstance(self._data, list):
+            if verbose:
+                print("[CubeLogs.load] load from list of Cubes")
            cubes = []
            for item in enumerate_csv_files(self._data, verbose=verbose):
                df = open_dataframe(item)
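`CubeLogs` now accepts `fill_missing` (default values injected per piece before merging) and `formulas` (derived value columns; per `_process_formula` in this version the entries must be callables). A minimal sketch, assuming onnx-diagnostic 0.7.2; every column name below is invented:

```python
import pandas
from onnx_diagnostic.helpers.log_helper import CubeLogs

rows = [
    dict(date="2024-05-01", exporter="onnx", model="a", time_export=1.5),
    dict(date="2024-05-02", exporter="onnx", model="a", time_export=1.2),
]
cube = CubeLogs(
    pandas.DataFrame(rows),
    time="date",
    keys=("exporter", "model", "machine"),
    values=("time_.*",),
    recent=True,  # keep only the most recent row per key
    formulas={"time_export_x2": lambda df: df["time_export"] * 2},
    fill_missing=[("machine", "unknown")],  # add the missing key column
).load(verbose=1)
print(cube.describe())
```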
@@ -219,8 +690,10 @@ class CubeLogs:
                    recent=self.recent,
                )
                cube.load()
-                cubes.append(cube.data)
+                cubes.append(self.post_load_process_piece(cube.data))
            self.data = pandas.concat(cubes, axis=0)
+            if verbose:
+                print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
        else:
            raise NotImplementedError(
                f"Not implemented with the provided data (type={type(self._data)})"
@@ -236,59 +709,101 @@ class CubeLogs:
        self._initialize_columns()
        if verbose:
            print(f"[CubeLogs.load] time={self.time}")
-            print(f"[CubeLogs.load] keys={self.
+            print(f"[CubeLogs.load] keys={self.keys_no_time}")
            print(f"[CubeLogs.load] values={self.values}")
            print(f"[CubeLogs.load] ignored={self.ignored}")
            print(f"[CubeLogs.load] ignored_values={self.ignored_values}")
            print(f"[CubeLogs.load] ignored_keys={self.ignored_keys}")
+        assert self.keys_no_time, f"No keys found with {self._keys} from {self.data.columns}"
+        assert self.values, f"No values found with {self._values} from {self.data.columns}"
        assert not (
-            set(self.
-        ), f"Columns {set(self.
+            set(self.keys_no_time) & set(self.values)
+        ), f"Columns {set(self.keys_no_time) & set(self.values)} cannot be keys and values"
        assert not (
-            set(self.
-        ), f"Columns {set(self.
+            set(self.keys_no_time) & set(self.ignored)
+        ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be keys and ignored"
        assert not (
            set(self.values) & set(self.ignored)
-        ), f"Columns {set(self.
+        ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be values and ignored"
        assert (
-            self.time not in self.
+            self.time not in self.keys_no_time
            and self.time not in self.values
            and self.time not in self.ignored
-        ),
-
+        ), (
+            f"Column {self.time!r} is also a key, a value or ignored, "
+            f"keys={sorted(self.keys_no_time)}, values={sorted(self.values)}, "
+            f"ignored={sorted(self.ignored)}"
+        )
+        self._columns = [self.time, *self.keys_no_time, *self.values, *self.ignored]
        self.dropped = [c for c in self.data.columns if c not in set(self.columns)]
        self.data = self.data[self.columns]
        if verbose:
            print(f"[CubeLogs.load] dropped={self.dropped}")
            print(f"[CubeLogs.load] data.shape={self.data.shape}")
 
+        shape = self.data.shape
+        if verbose:
+            print(f"[CubeLogs.load] removed columns, shape={self.data.shape}")
        self._preprocess()
+        if verbose:
+            print(f"[CubeLogs.load] preprocess, shape={self.data.shape}")
+        assert (
+            self.data.shape[0] > 0
+        ), f"The preprocessing reduced shape {shape} to {self.data.shape}."
        if self.recent and verbose:
            print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")
 
        # Let's apply the formulas
        if self._formulas:
-
-
-            if
+            forms = (
+                {k: k for k in self._formulas}
+                if not isinstance(self._formulas, dict)
+                else self._formulas
+            )
+            cols = set(self.values)
+            for k, ff in forms.items():
+                f = self._process_formula(ff)
+                if k in cols or f is None:
                    if verbose:
                        print(f"[CubeLogs.load] skip formula {k!r}")
                else:
                    if verbose:
                        print(f"[CubeLogs.load] apply formula {k!r}")
                    self.data[k] = f(self.data)
-
-
-
+                    self.values.append(k)
+                    cols.add(k)
+        self.values_for_key = {k: set(self.data[k].dropna()) for k in self.keys_time}
+        for k in self.keys_no_time:
+            if self.data[k].isna().max():
+                self.values_for_key[k].add(np.nan)
+        self.keys_with_nans = [
+            c for c in self.keys_time if self.data[c].isna().astype(int).sum() > 0
        ]
-        assert not nans, f"The following keys {nans} have nan values. This is not allowed."
        if verbose:
            print(f"[CubeLogs.load] convert column {self.time!r} into date")
+            if self.keys_with_nans:
+                print(f"[CubeLogs.load] keys_with_nans={self.keys_with_nans}")
        self.data[self.time] = pandas.to_datetime(self.data[self.time])
+
+        if self.keep_last_date:
+            times = self.data[self.time].dropna()
+            mi, mx = times.min(), times.max()
+            if mi != mx:
+                print(f"[CubeLogs.load] setting all dates in column {self.time} to {mx!r}")
+                self.data.loc[~self.data[self.time].isna(), self.time] = mx
+            self.values_for_key[self.time] = {mx}
+            if self.data[self.time].isna().max():
+                self.values_for_key[self.time].add(np.nan)
        if verbose:
            print(f"[CubeLogs.load] done, shape={self.shape}")
        return self
 
+    def _process_formula(
+        self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
+    ) -> Callable[[pandas.DataFrame], pandas.Series]:
+        assert callable(formula), f"formula={formula!r} is not supported."
+        return formula
+
    @property
    def shape(self) -> Tuple[int, int]:
        "Returns the shape."
@@ -303,7 +818,7 @@ class CubeLogs:
 
    def _preprocess(self):
        last = self.values[0]
-        gr = self.data[[
+        gr = self.data[[*self.keys_time, last]].groupby(self.keys_time, dropna=False).count()
        gr = gr[gr[last] > 1]
        if self.recent:
            cp = self.data.copy()
@@ -312,11 +827,15 @@ class CubeLogs:
            ), f"'__index__' should not be a column in {cp.columns}"
            cp["__index__"] = np.arange(cp.shape[0])
            gr = (
-                cp[[*self.
-                .groupby(self.
+                cp[[*self.keys_time, "__index__"]]
+                .groupby(self.keys_no_time, as_index=False, dropna=False)
                .max()
            )
-
+            assert gr.shape[0] > 0, (
+                f"Something went wrong after the groupby.\n"
+                f"{cp[[*self.keys, self.time, '__index__']].head().T}"
+            )
+            filtered = pandas.merge(cp, gr, on=["__index__", *self.keys_time])
            assert filtered.shape[0] <= self.data.shape[0], (
                f"Keeping the latest row brings more row {filtered.shape} "
                f"(initial is {self.data.shape})."
@@ -324,18 +843,20 @@ class CubeLogs:
            self.data = filtered.drop("__index__", axis=1)
        else:
            assert gr.shape[0] == 0, f"There are duplicated rows:\n{gr}"
-            gr = self.data[[*self.keys, self.time]].groupby(self.keys).count()
-            gr = gr[gr[self.time] > 1]
-            assert (
-                gr.shape[0] == 0
-            ), f"recent should be true to keep the most recent row:\n{gr}"
 
    @classmethod
    def _filter_column(cls, filters, columns, can_be_empty=False):
+        assert list(columns), "columns is empty"
        set_cols = set()
        for f in filters:
-
-
+            if set(f) & {'"', "^", ".", "*", "+", "{", "}"}:
+                reg = re.compile(f)
+                cols = [c for c in columns if reg.search(c)]
+            elif f in columns:
+                # No regular expression.
+                cols = [f]
+            else:
+                continue
            set_cols |= set(cols)
        assert (
            can_be_empty or set_cols
@@ -343,25 +864,31 @@ class CubeLogs:
        return sorted(set_cols)
 
    def _initialize_columns(self):
-
+        keys = self._filter_column(self._keys, self.data.columns)
        self.values = self._filter_column(self._values, self.data.columns)
        self.ignored = self._filter_column(self._ignored, self.data.columns, True)
        assert (
            self._time in self.data.columns
-        ), f"Column {self._time} not found in {self.data.columns}"
-        ignored_keys = set(self.ignored) & set(
+        ), f"Column {self._time} not found in {pprint.pformat(sorted(self.data.columns))}"
+        ignored_keys = set(self.ignored) & set(keys)
        ignored_values = set(self.ignored) & set(self.values)
-        self.
+        self.keys_no_time = [c for c in keys if c not in ignored_keys]
        self.values = [c for c in self.values if c not in ignored_values]
        self.ignored_keys = sorted(ignored_keys)
        self.ignored_values = sorted(ignored_values)
        self.time = self._time
+        self.keys_time = [self.time, *[c for c in keys if c not in ignored_keys]]
 
    def __str__(self) -> str:
        "usual"
        return str(self.data) if hasattr(self, "data") else str(self._data)
 
-    def view(
+    def view(
+        self,
+        view_def: Union[str, CubeViewDef],
+        return_view_def: bool = False,
+        verbose: int = 0,
+    ) -> Union[pandas.DataFrame, Tuple[pandas.DataFrame, CubeViewDef]]:
        """
        Returns a dataframe, a pivot view.
        `key_index` determines the index, the other key columns determines
@@ -369,58 +896,274 @@ class CubeLogs:
        is removed.
 
        :param view_def: view definition
+        :param return_view_def: returns the view as well
+        :param verbose: verbosity level
        :return: dataframe
        """
-
+        assert isinstance(
+            view_def, CubeViewDef
+        ), f"view_def should be a CubeViewDef, got {type(view_def)}: {view_def!r} instead"
+        if verbose:
+            print(f"[CubeLogs.view] -- start view {view_def.name!r}: {view_def}")
+        key_agg = (
+            self._filter_column(view_def.key_agg, self.keys_time) if view_def.key_agg else []
+        )
        set_key_agg = set(key_agg)
-        assert set_key_agg <= set(
-
-
+        assert set_key_agg <= set(self.keys_time), (
+            f"view_def.name={view_def.name!r}, "
+            f"non existing keys in key_agg {set_key_agg - set(self.keys_time)}",
+            f"keys={sorted(self.keys_time)}",
+        )
 
        values = self._filter_column(view_def.values, self.values)
-        assert set(values) <= set(
-
-
+        assert set(values) <= set(self.values), (
+            f"view_def.name={view_def.name!r}, "
+            f"non existing columns in values {set(values) - set(self.values)}, "
+            f"values={sorted(self.values)}"
+        )
 
+        # aggregation
        if key_agg:
+            final_stack = True
            key_index = [
                c
-                for c in self._filter_column(view_def.key_index, self.
+                for c in self._filter_column(view_def.key_index, self.keys_time)
                if c not in set_key_agg
            ]
-            keys_no_agg = [c for c in self.
-
-
-                .groupby
-
+            keys_no_agg = [c for c in self.keys_time if c not in set_key_agg]
+            if verbose:
+                print(f"[CubeLogs.view] aggregation of {set_key_agg}")
+                print(f"[CubeLogs.view] groupby {keys_no_agg}")
+
+            data_red = self.data[[*keys_no_agg, *values]]
+            assert set(key_index) <= set(data_red.columns), (
+                f"view_def.name={view_def.name!r}, "
+                f"nnable to find {set(key_index) - set(data_red.columns)}, "
+                f"key_agg={key_agg}, keys_no_agg={keys_no_agg},\n--\n"
+                f"selected={pprint.pformat(sorted(data_red.columns))},\n--\n"
+                f"keys={pprint.pformat(sorted(self.keys_time))}"
            )
+            grouped_data = data_red.groupby(keys_no_agg, as_index=True, dropna=False)
+            if callable(view_def.agg_args):
+                agg_kwargs = view_def.agg_kwargs or {}
+                agg_args = ({c: view_def.agg_args(c) for c in values},)
+            else:
+                agg_args = view_def.agg_args  # type: ignore[assignment]
+                agg_kwargs = view_def.agg_kwargs or {}
+            data = grouped_data.agg(*agg_args, **agg_kwargs)
+            if view_def.agg_multi:
+                append = []
+                for k, f in view_def.agg_multi.items():
+                    cv = grouped_data.apply(f, include_groups=False)
+                    append.append(cv.to_frame(k))
+                data = pandas.concat([data, *append], axis=1)
+            set_all_keys = set(keys_no_agg)
+            values = list(data.columns)
+            data = data.reset_index(drop=False)
        else:
-            key_index = self._filter_column(view_def.key_index, self.
-
+            key_index = self._filter_column(view_def.key_index, self.keys_time)
+            if verbose:
+                print(f"[CubeLogs.view] no aggregation, index={key_index}")
+            data = self.data[[*self.keys_time, *values]]
+            set_all_keys = set(self.keys_time)
+            final_stack = False
 
-        assert set(key_index) <=
-
-
+        assert set(key_index) <= set_all_keys, (
+            f"view_def.name={view_def.name!r}, "
+            f"Non existing keys in key_index {set(key_index) - set_all_keys}"
+        )
 
+        # remove unnecessary column
        set_key_columns = {
-            c for c in self.
+            c for c in self.keys_time if c not in key_index and c not in set(key_agg)
        }
+        key_index0 = key_index
        if view_def.ignore_unique:
-
-
+            unique = {
+                k for k, v in self.values_for_key.items() if k in set_all_keys and len(v) <= 1
+            }
+            keep_anyway = (
+                set(view_def.keep_columns_in_index)
+                if view_def.keep_columns_in_index
+                else set()
+            )
+            key_index = [k for k in key_index if k not in unique or k in keep_anyway]
+            key_columns = [k for k in set_key_columns if k not in unique or k in keep_anyway]
+            if verbose:
+                print(f"[CubeLogs.view] unique={unique}, keep_anyway={keep_anyway}")
+                print(
+                    f"[CubeLogs.view] columns with unique values "
+                    f"{set(key_index0) - set(key_index)}"
+                )
        else:
+            if verbose:
+                print("[CubeLogs.view] keep all columns")
            key_columns = sorted(set_key_columns)
+            unique = set()
 
+        _md = lambda s: {k: v for k, v in self.values_for_key.items() if k in s}  # noqa: E731
+        all_cols = set(key_columns) | set(key_index) | set(key_agg) | unique
+        assert all_cols == set(self.keys_time), (
+            f"view_def.name={view_def.name!r}, "
+            f"key_columns + key_index + key_agg + unique != keys, left="
+            f"{set(self.keys_time) - all_cols}, "
+            f"unique={unique}, index={set(key_index)}, columns={set(key_columns)}, "
+            f"agg={set(key_agg)}, keys={set(self.keys_time)}, values={values}"
+        )
+
+        # reorder
        if view_def.order:
-
-
-
+            subset = self._filter_column(view_def.order, all_cols | {self.time})
+            corder = [o for o in view_def.order if o in subset]
+            assert set(corder) <= set_key_columns, (
+                f"view_def.name={view_def.name!r}, "
+                f"non existing columns from order in key_columns "
+                f"{set(corder) - set_key_columns}"
            )
            key_columns = [
-                *
+                *[o for o in corder if o in key_columns],
                *[c for c in key_columns if c not in view_def.order],
            ]
-
+        else:
+            corder = None
+
+        if view_def.dropna:
+            data, key_index, key_columns, values = self._dropna(  # type: ignore[assignment]
+                data,
+                key_index,
+                key_columns,
+                values,
+                keep_columns_in_index=view_def.keep_columns_in_index,
+            )
+        if view_def.ignore_columns:
+            if verbose:
+                print(f"[CubeLogs.view] ignore_columns {view_def.ignore_columns}")
+            data = data.drop(view_def.ignore_columns, axis=1)
+            seti = set(view_def.ignore_columns)
+            if view_def.keep_columns_in_index:
+                seti -= set(view_def.keep_columns_in_index)
+            key_index = [c for c in key_index if c not in seti]
+            key_columns = [c for c in key_columns if c not in seti]
+            values = [c for c in values if c not in seti]
+
+        # final verification
+        if verbose:
+            print(f"[CubeLogs.view] key_index={key_index}")
+            print(f"[CubeLogs.view] key_columns={key_columns}")
+        g = data[[*key_index, *key_columns]].copy()
+        g["count"] = 1
+        r = g.groupby([*key_index, *key_columns], dropna=False).sum()
+        not_unique = r[r["count"] > 1]
+        assert not_unique.shape[0] == 0, (
+            f"view_def.name={view_def.name!r}, "
+            f"unable to run the pivot with index={sorted(key_index)}, "
+            f"key={sorted(key_columns)}, key_agg={key_agg}, values={sorted(values)}, "
+            f"columns={sorted(data.columns)}, ignored={view_def.ignore_columns}, "
+            f"not unique={set(data.columns) - unique}"
+            f"\n--\n{not_unique.head()}"
+        )
+
+        # pivot
+        if verbose:
+            print(f"[CubeLogs.view] values={values}")
+        if key_index:
+            piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
+        else:
+            # pivot does return the same rank with it is empty.
+            # Let's add arficially one
+            data = data.copy()
+            data["ALL"] = "ALL"
+            piv = data.pivot(index=["ALL"], columns=key_columns, values=values)
+        if isinstance(piv, pandas.Series):
+            piv = piv.to_frame(name="series")
+        names = list(piv.columns.names)
+        assert (
+            "METRICS" not in names
+        ), f"Not implemented when a level METRICS already exists {names!r}"
+        names[0] = "METRICS"
+        piv.columns = piv.columns.set_names(names)
+        if final_stack:
+            piv = piv.stack("METRICS", future_stack=True)
+        if view_def.transpose:
+            piv = piv.T
+            if isinstance(piv, pandas.Series):
+                piv = piv.to_frame("VALUE")
+        piv.sort_index(inplace=True)
+
+        if isinstance(piv.columns, pandas.MultiIndex):
+            if corder:
+                # reorder the levels for the columns with the view definition
+                new_corder = [c for c in corder if c in piv.columns.names]
+                new_names = [
+                    *[c for c in piv.columns.names if c not in new_corder],
+                    *new_corder,
+                ]
+                piv.columns = piv.columns.reorder_levels(new_names)
+            elif self.time in piv.columns.names:
+                # put time at the end
+                new_names = list(piv.columns.names)
+                ind = new_names.index(self.time)
+                if ind < len(new_names) - 1:
+                    del new_names[ind]
+                    new_names.append(self.time)
+                    piv.columns = piv.columns.reorder_levels(new_names)
+
+        if view_def.no_index:
+            piv = piv.reset_index(drop=False)
+        else:
+            piv.sort_index(inplace=True, axis=1)
+
+        if verbose:
+            print(f"[CubeLogs.view] levels {piv.index.names}, {piv.columns.names}")
+            print(f"[CubeLogs.view] -- done view {view_def.name!r}")
+        return (piv, view_def) if return_view_def else piv
+
+    def _dropna(
+        self,
+        data: pandas.DataFrame,
+        key_index: Sequence[str],
+        key_columns: Sequence[str],
+        values: Sequence[str],
+        keep_columns_in_index: Optional[Sequence[str]] = None,
+    ) -> Tuple[pandas.DataFrame, Sequence[str], Sequence[str], Sequence[str]]:
+        set_keep_columns_in_index = (
+            set(keep_columns_in_index) if keep_columns_in_index else set()
+        )
+        v = data[values]
+        new_data = data[~v.isnull().all(1)]
+        if data.shape == new_data.shape:
+            return data, key_index, key_columns, values
+        new_data = new_data.copy()
+        new_key_index = []
+        for c in key_index:
+            if c in set_keep_columns_in_index:
+                new_key_index.append(c)
+                continue
+            v = new_data[c]
+            sv = set(v.dropna())
+            if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
+                new_key_index.append(c)
+        new_key_columns = []
+        for c in key_columns:
+            if c in set_keep_columns_in_index:
+                new_key_columns.append(c)
+                continue
+            v = new_data[c]
+            sv = set(v.dropna())
+            if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
+                new_key_columns.append(c)
+        for c in set(key_index) | set(key_columns):
+            s = new_data[c]
+            if s.isna().max():
+                if pandas.api.types.is_numeric_dtype(s):
+                    min_v = s.dropna().min()
+                    assert (
+                        min_v >= 0
+                    ), f"Unable to replace nan values in column {c!r}, min_v={min_v}"
+                    new_data[c] = s.fillna(-1)
+                else:
+                    new_data[c] = s.fillna("NAN")
+        return new_data, new_key_index, new_key_columns, values
 
    def describe(self) -> pandas.DataFrame:
        """Basic description of all variables."""
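The rewritten `view` pivots the cube for one `CubeViewDef`, and `to_excel` can now also dump selected views as CSV. A minimal end-to-end sketch, assuming onnx-diagnostic 0.7.2 plus openpyxl; all column, view, and file names are invented:

```python
import pandas
from onnx_diagnostic.helpers.log_helper import CubeLogs, CubeViewDef

rows = [
    dict(date="2024-05-01", exporter=e, model=m, time_export=t)
    for (e, m, t) in [("onnx", "a", 1.5), ("onnx", "b", 2.0),
                      ("eager", "a", 1.1), ("eager", "b", 1.8)]
]
cube = CubeLogs(
    pandas.DataFrame(rows), time="date",
    keys=("exporter", "model"), values=("time_.*",), recent=True,
).load()

view = CubeViewDef(key_index=["model"], values=["time_export"], name="by_model")
piv = cube.view(view, verbose=1)  # pivoted DataFrame: model x exporter
cube.to_excel("report.xlsx", views={"by_model": view}, csv=["by_model"], verbose=1)
```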
@@ -433,22 +1176,42 @@ class CubeLogs:
                name=name,
                dtype=str(dtype),
                missing=len(values) - len(nonan),
+                kind=(
+                    "time"
+                    if name == self.time
+                    else (
+                        "keys"
+                        if name in self.keys_no_time
+                        else (
+                            "values"
+                            if name in self.values
+                            else ("ignored" if name in self.ignored else "unused")
+                        )
+                    )
+                ),
            )
            if len(nonan) > 0:
-                obs.update(
-                    dict(
-                        min=nonan.min(),
-                        max=nonan.max(),
-                        count=len(nonan),
-                    )
-                )
+                obs.update(dict(count=len(nonan)))
                if is_numeric_dtype(nonan):
                    obs.update(
                        dict(
+                            min=nonan.min(),
+                            max=nonan.max(),
                            mean=nonan.mean(),
                            sum=nonan.sum(),
+                            n_values=len(set(nonan)),
                        )
                    )
+                elif obs["kind"] == "time":
+                    unique = set(nonan)
+                    obs["n_values"] = len(unique)
+                    o = dict(
+                        min=str(nonan.min()),
+                        max=str(nonan.max()),
+                        n_values=len(set(nonan)),
+                    )
+                    o["values"] = f"{o['min']} - {o['max']}"
+                    obs.update(o)
                else:
                    unique = set(nonan)
                    obs["n_values"] = len(unique)
@@ -460,126 +1223,747 @@ class CubeLogs:
|
|
|
460
1223
|
def to_excel(
|
|
461
1224
|
self,
|
|
462
1225
|
output: str,
|
|
463
|
-
views: Dict[str, CubeViewDef],
|
|
1226
|
+
views: Union[Sequence[str], Dict[str, Union[str, CubeViewDef]]],
|
|
464
1227
|
main: Optional[str] = "main",
|
|
465
1228
|
raw: Optional[str] = "raw",
|
|
466
1229
|
verbose: int = 0,
|
|
1230
|
+
csv: Optional[Sequence[str]] = None,
|
|
467
1231
|
):
|
|
468
1232
|
"""
|
|
469
1233
|
Creates an excel file with a list of view.
|
|
470
1234
|
|
|
471
1235
|
:param output: output file to create
|
|
472
|
-
:param views:
|
|
1236
|
+
:param views: sequence or dictionary of views to append
|
|
473
1237
|
:param main: add a page with statitcs on all variables
|
|
474
1238
|
:param raw: add a page with the raw data
|
|
1239
|
+
:param csv: views to dump as csv files (same name as outputs + view naw)
|
|
475
1240
|
:param verbose: verbosity
|
|
476
1241
|
"""
|
|
477
|
-
|
|
1242
|
+
if verbose:
|
|
1243
|
+
print(f"[CubeLogs.to_excel] create Excel file {output}, shape={self.shape}")
|
|
1244
|
+
views = {k: k for k in views} if not isinstance(views, dict) else views
|
|
1245
|
+
f_highlights = {}
|
|
1246
|
+
plots = []
|
|
478
1247
|
with pandas.ExcelWriter(output, engine="openpyxl") as writer:
|
|
479
1248
|
if main:
|
|
480
1249
|
assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
|
|
481
|
-
df = self.describe()
|
|
1250
|
+
df = self.describe().sort_values("name")
|
|
482
1251
|
if verbose:
|
|
483
|
-
print(f"[CubeLogs.
|
|
1252
|
+
print(f"[CubeLogs.to_excel] add sheet {main!r} with shape {df.shape}")
|
|
484
1253
|
df.to_excel(writer, sheet_name=main, freeze_panes=(1, 1))
|
|
485
|
-
self._apply_excel_style(main, writer, df)
|
|
486
|
-
if raw:
|
|
487
|
-
assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
|
|
488
|
-
if verbose:
|
|
489
|
-
print(f"[CubeLogs.to_helper] add sheet {raw!r} with shape {self.shape}")
|
|
490
|
-
self.data.to_excel(writer, sheet_name=raw, freeze_panes=(1, 1), index=True)
|
|
491
|
-
self._apply_excel_style(raw, writer, self.data)
|
|
492
1254
|
|
|
493
1255
|
for name, view in views.items():
|
|
494
|
-
|
|
1256
|
+
if view is None:
|
|
1257
|
+
continue
|
|
1258
|
+
df, tview = self.view(view, return_view_def=True, verbose=max(verbose - 1, 0))
|
|
1259
|
+
if tview is None:
|
|
1260
|
+
continue
|
|
1261
|
+
memory = df.memory_usage(deep=True).sum()
|
|
495
1262
|
if verbose:
|
|
496
1263
|
print(
|
|
497
|
-
f"[CubeLogs.
|
|
498
|
-
f"{df.shape}, index={df.index.names},
|
|
1264
|
+
f"[CubeLogs.to_excel] add sheet {name!r} with shape "
|
|
1265
|
+
f"{df.shape} ({memory} bytes), index={df.index.names}, "
|
|
1266
|
+
f"columns={df.columns.names}"
|
|
1267
|
+
)
|
|
1268
|
+
if self.time in df.columns.names:
|
|
1269
|
+
# Let's convert the time into str
|
|
1270
|
+
fr = df.columns.to_frame()
|
|
1271
|
+
if is_datetime64_any_dtype(fr[self.time]):
|
|
1272
|
+
dt = fr[self.time]
|
|
1273
|
+
has_time = (dt != dt.dt.normalize()).any()
|
|
1274
|
+
sdt = dt.apply(
|
|
1275
|
+
lambda t, has_time=has_time: t.strftime(
|
|
1276
|
+
"%Y-%m-%dT%H-%M-%S" if has_time else "%Y-%m-%d"
|
|
1277
|
+
)
|
|
1278
|
+
)
|
|
1279
|
+
fr[self.time] = sdt
|
|
1280
|
+
df.columns = pandas.MultiIndex.from_frame(fr)
|
|
1281
|
+
if csv and name in csv:
|
|
1282
|
+
name_csv = f"{output}.{name}.csv"
|
|
1283
|
+
if verbose:
|
|
1284
|
+
print(f"[CubeLogs.to_excel] saving sheet {name!r} in {name_csv!r}")
|
|
1285
|
+
df.reset_index(drop=False).to_csv(f"{output}.{name}.csv", index=False)
|
|
1286
|
+
|
|
1287
|
+
if memory > 2**22:
|
|
1288
|
+
msg = (
|
|
1289
|
+
f"[CubeLogs.to_excel] skipping {name!r}, "
|
|
1290
|
+
f"too big for excel with {memory} bytes"
|
|
1291
|
+
)
|
|
1292
|
+
if verbose:
|
|
1293
|
+
print(msg)
|
|
1294
|
+
else:
|
|
1295
|
+
warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
|
|
1296
|
+
else:
|
|
1297
|
+
df.to_excel(
|
|
1298
|
+
writer,
|
|
1299
|
+
sheet_name=name,
|
|
1300
|
+
freeze_panes=(df.columns.nlevels + df.index.nlevels, df.index.nlevels),
|
|
499
1301
|
)
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
1302
|
+
f_highlights[name] = tview.f_highlight
|
|
1303
|
+
if tview.plots:
|
|
1304
|
+
plots.append(
|
|
1305
|
+
CubePlot(
|
|
1306
|
+
df,
|
|
1307
|
+
kind="line",
|
|
1308
|
+
orientation="row",
|
|
1309
|
+
split=True,
|
|
1310
|
+
timeseries=self.time,
|
|
1311
|
+
)
|
|
1312
|
+
if self.time in df.columns.names
|
|
1313
|
+
else CubePlot(df, kind="barh", orientation="row", split=True)
|
|
1314
|
+
)
+        if raw:
+            assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
+            # Too long.
+            # self._apply_excel_style(raw, writer, self.data)
+            if csv and "raw" in csv:
+                df.reset_index(drop=False).to_csv(f"{output}.raw.csv", index=False)
+            memory = df.memory_usage(deep=True).sum()
+            if memory > 2**22:
+                msg = (
+                    f"[CubeLogs.to_excel] skipping 'raw', "
+                    f"too big for excel with {memory} bytes"
+                )
+                if verbose:
+                    print(msg)
+                else:
+                    warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
+            else:
+                if verbose:
+                    print(f"[CubeLogs.to_excel] add sheet 'raw' with shape {self.shape}")
+                self.data.to_excel(
+                    writer, sheet_name="raw", freeze_panes=(1, 1), index=True
+                )
+
+        if plots:
+            from openpyxl.drawing.image import Image
+
+            if verbose:
+                print(f"[CubeLogs.to_excel] plots {len(plots)} plots")
+            sheet = writer.book.create_sheet("plots")
+            pos = 0
+            empty_row = 1
+            times = self.data[self.time].dropna()
+            mini, maxi = times.min(), times.max()
+            title_suffix = (str(mini) if mini == maxi else f"{mini}-{maxi}").replace(
+                " 00:00:00", ""
             )
+            for plot in plots:
+                imgs = plot.to_images(
+                    verbose=verbose, merge=True, title_suffix=title_suffix
+                )
+                for img in imgs:
+                    y = (pos // 2) * 16
+                    loc = f"A{y}" if pos % 2 == 0 else f"M{y}"
+                    sheet.add_image(Image(io.BytesIO(img)), loc)
+                    if verbose:
+                        no = f"{output}.png"
+                        print(f"[CubeLogs.to_excel] dump graphs into {no!r}")
+                        with open(no, "wb") as f:
+                            f.write(img)
+                    pos += 1
+                empty_row += len(plots) + 2

+        if verbose:
+            print(f"[CubeLogs.to_excel] applies style to {output!r}")
+        apply_excel_style(writer, f_highlights)  # type: ignore[arg-type]
+        if verbose:
+            print(f"[CubeLogs.to_excel] done with {len(views)} views")

-        # from openpyxl.styles import Font, PatternFill, numbers

-        # red = Font(color="FF0000")
-        # yellow = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
-        # redf = PatternFill(start_color="FF0000", end_color="FF0000", fill_type="solid")
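The `plots` branch above turns each `CubePlot` into PNG bytes and anchors the images two per row on a dedicated sheet. A minimal sketch of that mechanism with openpyxl alone, assuming matplotlib and Pillow are installed; the figure content and the output file name are made up for the example:

```python
import io
import matplotlib.pyplot as plt
import pandas
from openpyxl.drawing.image import Image  # needs Pillow to read from a buffer

# one small PNG kept in memory, standing in for CubePlot.to_images()
buf = io.BytesIO()
plt.plot([1, 2, 3], [1.0, 1.2, 0.9])
plt.title("speedup")
plt.savefig(buf, format="png")
plt.close()

with pandas.ExcelWriter("report.xlsx", engine="openpyxl") as writer:
    pandas.DataFrame({"speedup": [1.0, 1.2, 0.9]}).to_excel(writer, sheet_name="data")
    sheet = writer.book.create_sheet("plots")
    # anchor the image at a cell reference, as the loop above does with A{y} / M{y}
    sheet.add_image(Image(io.BytesIO(buf.getvalue())), "A1")
```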
+class CubeLogsPerformance(CubeLogs):
+    """
+    Processes logs coming from experiments.
+    """
+
+    def __init__(
+        self,
+        data: Any,
+        time: str = "DATE",
+        keys: Sequence[str] = (
+            "^version_.*",
+            "^model_.*",
+            "device",
+            "opt_patterns",
+            "suite",
+            "memory_peak",
+            "machine",
+            "exporter",
+            "dynamic",
+            "rtopt",
+            "dtype",
+            "device",
+            "architecture",
+        ),
+        values: Sequence[str] = (
+            "^time_.*",
+            "^disc.*",
+            "^ERR_.*",
+            "CMD",
+            "^ITER",
+            "^onnx_.*",
+            "^op_onnx_.*",
+            "^peak_gpu_.*",
+        ),
+        ignored: Sequence[str] = ("version_python",),
+        recent: bool = True,
+        formulas: Optional[
+            Union[
+                Sequence[str],
+                Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
+            ]
+        ] = (
+            "speedup",
+            "bucket[speedup]",
+            "ERR1",
+            "n_models",
+            "n_model_eager",
+            "n_model_running",
+            "n_model_acc01",
+            "n_model_acc001",
+            "n_model_dynamic",
+            "n_model_pass",
+            "n_model_faster",
+            "n_model_faster2x",
+            "n_model_faster3x",
+            "n_model_faster4x",
+            "n_node_attention",
+            "n_node_control_flow",
+            "n_node_scatter",
+            "n_node_function",
+            "n_node_initializer",
+            "n_node_initializer_small",
+            "n_node_constant",
+            "n_node_shape",
+            "n_node_expand",
+            "onnx_n_nodes_no_cst",
+            "peak_gpu_torch",
+            "peak_gpu_nvidia",
+            "time_export_unbiased",
+        ),
+        fill_missing: Optional[Sequence[Tuple[str, Any]]] = (("model_attn_impl", "eager"),),
+        keep_last_date: bool = False,
+    ):
+        super().__init__(
+            data=data,
+            time=time,
+            keys=keys,
+            values=values,
+            ignored=ignored,
+            recent=recent,
+            formulas=formulas,
+            fill_missing=fill_missing,
+            keep_last_date=keep_last_date,
+        )
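`CubeLogsPerformance` only supplies defaults (the key/value regexes, the formula list, `fill_missing`) and delegates everything else to `CubeLogs`. A hedged sketch of how it might be instantiated; the in-memory frame below is hypothetical, and the full loading workflow (files vs. DataFrame, any explicit load step) is not shown in this excerpt:

```python
import pandas
from onnx_diagnostic.helpers.log_helper import CubeLogsPerformance

# hypothetical benchmark rows; column names follow the default keys/values above
raw = pandas.DataFrame(
    {
        "DATE": ["2024-05-01", "2024-05-01"],
        "model_name": ["llama", "phi"],
        "exporter": ["custom", "custom"],
        "device": ["cuda", "cuda"],
        "suite": ["hf", "hf"],
        "time_latency": [0.08, 0.05],
        "time_latency_eager": [0.10, 0.04],
    }
)
cube = CubeLogsPerformance(raw)  # keys, values and formulas keep the defaults listed above
```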

-            c = get_column_letter(k)
-            sheet.column_dimensions[c].width = 15
+    def _process_formula(
+        self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
+    ) -> Callable[[pandas.DataFrame], pandas.Series]:
+        """
+        Processes a formula, converting it into a function.

+        :param formula: a formula string
+        :return: a function
+        """
+        if callable(formula):
+            return formula
+        assert isinstance(
+            formula, str
+        ), f"Unexpected type for formula {type(formula)}: {formula!r}"
+
+        def gdf(df, cname, default_value=np.nan):
+            if cname in df.columns:
+                return df[cname]
+            return pandas.Series(default_value, index=df.index)
+
+        def ghas_value(df, cname):
+            if cname not in df.columns:
+                return pandas.Series(np.nan, index=df.index)
+            isna = df[cname].isna()
+            return pandas.Series(np.where(isna, np.nan, 1.0), index=df.index)
+
+        def gpreserve(df, cname, series):
+            if cname not in df.columns:
+                return pandas.Series(np.nan, index=df.index)
+            isna = df[cname].isna()
+            return pandas.Series(np.where(isna, np.nan, series), index=df.index).astype(float)
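The three helpers above make every formula total over incomplete logs: `gdf` substitutes a constant column when one is missing, `ghas_value` turns presence into a 1.0/NaN indicator, and `gpreserve` masks a computed series wherever the reference column is NaN. A self-contained rerun of the first two on a toy frame:

```python
import numpy as np
import pandas

df = pandas.DataFrame({"time_latency": [0.5, np.nan, 0.2]})

def gdf(df, cname, default_value=np.nan):
    # the column when present, otherwise a constant Series so formulas never raise KeyError
    if cname in df.columns:
        return df[cname]
    return pandas.Series(default_value, index=df.index)

def ghas_value(df, cname):
    # 1.0 where the column has a value, NaN otherwise (or everywhere if the column is absent)
    if cname not in df.columns:
        return pandas.Series(np.nan, index=df.index)
    isna = df[cname].isna()
    return pandas.Series(np.where(isna, np.nan, 1.0), index=df.index)

print(gdf(df, "time_latency_eager", 0.0).tolist())  # [0.0, 0.0, 0.0]
print(ghas_value(df, "time_latency").tolist())       # [1.0, nan, 1.0]
```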
+
+        if formula == "speedup":
+            columns = set(self._filter_column(["^time_.*"], self.data.columns))
+            assert "time_latency" in columns and "time_latency_eager" in columns, (
+                f"Unable to apply formula {formula!r}, with columns\n"
+                f"{pprint.pformat(sorted(columns))}"
+            )
+            return lambda df: df["time_latency_eager"] / df["time_latency"]
+
+        if formula == "bucket[speedup]":
+            columns = set(self._filter_column(["^time_.*", "speedup"], self.data.columns))
+            assert "speedup" in columns, (
+                f"Unable to apply formula {formula!r}, with columns\n"
+                f"{pprint.pformat(sorted(columns))}"
+            )
+            # return lambda df: df["time_latency_eager"] / df["time_latency"]
+            return lambda df: pandas.cut(
+                df["speedup"], bins=BUCKET_SCALES, right=False, duplicates="raise"
+            )
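`bucket[speedup]` relies on `pandas.cut` and the module-level `BUCKET_SCALES` (percentage deltas mapped to ratios at the top of the file). A small standalone check of what the buckets look like:

```python
import numpy as np
import pandas

# same construction as BUCKET_SCALES in this module
BUCKET_SCALES_VALUES = np.array(
    [-np.inf, -20, -10, -5, -2, 0, 2, 5, 10, 20, 100, 200, 300, 400, np.inf], dtype=float
)
BUCKET_SCALES = BUCKET_SCALES_VALUES / 100 + 1

speedup = pandas.Series([0.75, 0.97, 1.3, 2.5])
buckets = pandas.cut(speedup, bins=BUCKET_SCALES, right=False, duplicates="raise")
# each value falls into a half-open interval such as [-inf, 0.8), [0.95, 0.98),
# [1.2, 2.0) or [2.0, 3.0); these labels are what f_highlight matches against below
print(buckets.astype(str).tolist())
```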
+
+        if formula == "ERR1":
+            columns = set(self._filter_column(["^ERR_.*"], self.data.columns))
+            if not columns:
+                return lambda df: np.nan
+
+            def first_err(df: pandas.DataFrame) -> pandas.Series:
+                ordered = [
+                    c
+                    for c in [
+                        "ERR_timeout",
+                        "ERR_load",
+                        "ERR_feeds",
+                        "ERR_warmup_eager",
+                        "ERR_export",
+                        "ERR_ort",
+                        "ERR_warmup",
+                        # "ERR_std",
+                        # "ERR_crash",
+                        # "ERR_stdout",
+                    ]
+                    if c in df.columns
+                ]
+                res = None
+                for c in ordered:
+                    if res is None:
+                        res = df[c].fillna("")
                     else:
+                        res = pandas.Series(np.where(res != "", res, df[c].fillna("")))
+                return res
+
+            return first_err
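`first_err` scans the `ERR_*` columns in priority order and keeps the first non-empty message per row; the `np.where` chaining never overwrites a message found earlier. A toy run with invented error strings:

```python
import numpy as np
import pandas

df = pandas.DataFrame(
    {
        "ERR_timeout": [None, "timeout after 600s", None],
        "ERR_export": ["RuntimeError: ...", None, None],
        "ERR_warmup": [None, "oom", None],
    }
)
res = None
for c in ["ERR_timeout", "ERR_export", "ERR_warmup"]:  # priority order, as above
    if res is None:
        res = df[c].fillna("")
    else:
        res = pandas.Series(np.where(res != "", res, df[c].fillna("")))
print(res.tolist())  # ['RuntimeError: ...', 'timeout after 600s', '']
```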
+
+        if formula.startswith("n_"):
+            lambdas = dict(
+                n_models=lambda df: ghas_value(df, "model_name"),
+                n_model_eager=lambda df: ghas_value(df, "time_latency_eager"),
+                n_model_running=lambda df: ghas_value(df, "time_latency"),
+                n_model_acc01=lambda df: gpreserve(
+                    df, "discrepancies_abs", (gdf(df, "discrepancies_abs") <= 0.1)
+                ),
+                n_model_acc001=lambda df: gpreserve(
+                    df, "discrepancies_abs", gdf(df, "discrepancies_abs") <= 0.01
+                ),
+                n_model_dynamic=lambda df: gpreserve(
+                    df,
+                    "discrepancies_dynamic_abs",
+                    (gdf(df, "discrepancies_dynamic_abs") <= 0.1),
+                ),
+                n_model_pass=lambda df: gpreserve(
+                    df,
+                    "time_latency",
+                    (gdf(df, "discrepancies_abs", np.inf) < 0.1)
+                    & (gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98),
+                ),
+                n_model_faster=lambda df: gpreserve(
+                    df,
+                    "time_latency",
+                    gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98,
+                ),
+                n_model_faster2x=lambda df: gpreserve(
+                    df,
+                    "time_latency",
+                    gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 1.98,
+                ),
+                n_model_faster3x=lambda df: gpreserve(
+                    df,
+                    "time_latency",
+                    gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 2.98,
+                ),
+                n_model_faster4x=lambda df: gpreserve(
+                    df,
+                    "time_latency",
+                    gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 3.98,
+                ),
+                n_node_attention=lambda df: gpreserve(
+                    df,
+                    "op_onnx_com.microsoft_Attention",
+                    gdf(df, "op_onnx_com.microsoft_Attention")
+                    + gdf(df, "op_onnx_com.microsoft_MultiHeadAttention"),
+                ),
+                n_node_control_flow=lambda df: gpreserve(
+                    df,
+                    "op_onnx__If",
+                    (
+                        gdf(df, "op_onnx__If", 0)
+                        + gdf(df, "op_onnx__Scan", 0)
+                        + gdf(df, "op_onnx__Loop", 0)
+                    ),
+                ),
+                n_node_scatter=lambda df: gpreserve(
+                    df,
+                    "op_onnx__ScatterND",
+                    gdf(df, "op_onnx__ScatterND", 0) + gdf(df, "op_onnx__ScatterElements", 0),
+                ),
+                n_node_function=lambda df: gpreserve(
+                    df, "onnx_n_functions", gdf(df, "onnx_n_functions")
+                ),
+                n_node_initializer_small=lambda df: gpreserve(
+                    df, "op_onnx_initializer_small", gdf(df, "op_onnx_initializer_small")
+                ),
+                n_node_initializer=lambda df: gpreserve(
+                    df, "onnx_n_initializer", gdf(df, "onnx_n_initializer")
+                ),
+                n_node_constant=lambda df: gpreserve(
+                    df, "op_onnx__Constant", gdf(df, "op_onnx__Constant")
+                ),
+                n_node_shape=lambda df: gpreserve(
+                    df, "op_onnx__Shape", gdf(df, "op_onnx__Shape")
+                ),
+                n_node_expand=lambda df: gpreserve(
+                    df, "op_onnx__Expand", gdf(df, "op_onnx__Expand")
+                ),
+            )
+            assert (
+                formula in lambdas
+            ), f"Unexpected formula={formula!r}, should be in {sorted(lambdas)}"
+            return lambdas[formula]
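All the `n_*` formulas produce float indicator columns (1.0 or NaN) rather than booleans, so a later aggregation with `"sum"` (see `agg_args` in the views below) yields counts while missing runs stay excluded. A compact illustration:

```python
import numpy as np
import pandas

df = pandas.DataFrame(
    {
        "exporter": ["custom", "custom", "dynamo", "dynamo"],
        "time_latency": [0.5, np.nan, 0.4, 0.7],  # NaN: the model did not run
    }
)
# n_model_running: 1.0 where time_latency exists, NaN otherwise (ghas_value above)
df["n_model_running"] = np.where(df["time_latency"].isna(), np.nan, 1.0)
print(df.groupby("exporter")["n_model_running"].sum())
# custom    1.0
# dynamo    2.0
```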
+
+        if formula == "onnx_n_nodes_no_cst":
+            return lambda df: gdf(df, "onnx_n_nodes", 0) - gdf(
+                df, "op_onnx__Constant", 0
+            ).fillna(0)
+        if formula == "peak_gpu_torch":
+            return lambda df: gdf(df, "mema_gpu_5_after_export") - gdf(df, "mema_gpu_4_reset")
+        if formula == "peak_gpu_nvidia":
+            return (
+                lambda df: (gdf(df, "memory_gpu0_peak") - gdf(df, "memory_gpu0_begin")) * 2**20
+            )
+        if formula == "time_export_unbiased":
+
+            def unbiased_export(df):
+                if "time_warmup_first_iteration" not in df.columns:
+                    return pandas.Series(np.nan, index=df.index)
+                return pandas.Series(
+                    np.where(
+                        df["exporter"] == "inductor",
+                        df["time_warmup_first_iteration"] + df["time_export_success"],
+                        df["time_export_success"],
+                    ),
+                    index=df.index,
+                )
+
+            return lambda df: gpreserve(df, "time_warmup_first_iteration", unbiased_export(df))
+
+        raise ValueError(
+            f"Unexpected formula {formula!r}, available columns are\n"
+            f"{pprint.pformat(sorted(self.data.columns))}"
+        )
+
+    def view(
+        self,
+        view_def: Optional[Union[str, CubeViewDef]],
+        return_view_def: bool = False,
+        verbose: int = 0,
+    ) -> Union[
+        Optional[pandas.DataFrame], Tuple[Optional[pandas.DataFrame], Optional[CubeViewDef]]
+    ]:
+        """
+        Returns a dataframe, a pivot view.
+
+        If view_def is a string, it is replaced by a predefined view.
+
+        :param view_def: view definition or a string
+        :param return_view_def: returns the view definition as well
+        :param verbose: verbosity level
+        :return: dataframe or a couple (dataframe, view definition),
+            both of them can be None if view_def cannot be interpreted
+        """
+        assert view_def is not None, "view_def is None, this is not allowed."
+        if isinstance(view_def, str):
+            view_def = self.make_view_def(view_def)
+            if view_def is None:
+                return (None, None) if return_view_def else None
+        return super().view(view_def, return_view_def=return_view_def, verbose=verbose)
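With this override, a view can be requested by name, and a name that resolves to no view definition yields `None` instead of a frame. A usage sketch, assuming `cube` is an already loaded `CubeLogsPerformance`:

```python
pivot = cube.view("speedup")                          # predefined view name
pivot, vdef = cube.view("agg-suite", return_view_def=True)
maybe_cmd = cube.view("cmd")                          # None if no CMD column was collected
```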
+
+    def make_view_def(self, name: str) -> Optional[CubeViewDef]:
+        """
+        Returns a view definition.
+
+        :param name: name of the view
+        :return: a CubeViewDef or None if name does not make sense
+
+        Available views:
+
+        * **agg-suite:** aggregation per suite
+        * **disc:** discrepancies
+        * **speedup:** speedup
+        * **bucket-speedup:** speedup in buckets
+        * **time:** latency
+        * **time_export:** time to export
+        * **counts:** status, running, faster, has control flow, ...
+        * **err:** important errors
+        * **cmd:** command lines
+        * **raw-short:** raw data without all the unused columns
+        """
+        fs = ["suite", "model_suite", "task", "model_name", "model_task"]
+        index_cols = self._filter_column(fs, self.keys_time)
+        assert index_cols, (
+            f"No index columns found for {fs!r} in "
+            f"{pprint.pformat(sorted(self.keys_time))}"
+        )
+        index_cols = [c for c in fs if c in set(index_cols)]
+
+        f_speedup = lambda x: (  # noqa: E731
+            CubeViewDef.HighLightKind.NONE
+            if not isinstance(x, (float, int))
+            else (
+                CubeViewDef.HighLightKind.RED
+                if x < 0.9
+                else (
+                    CubeViewDef.HighLightKind.GREEN
+                    if x > 1.1
+                    else CubeViewDef.HighLightKind.NONE
+                )
+            )
+        )
+        f_disc = lambda x: (  # noqa: E731
+            CubeViewDef.HighLightKind.NONE
+            if not isinstance(x, (float, int))
+            else (
+                CubeViewDef.HighLightKind.RED
+                if x > 0.1
+                else (
+                    CubeViewDef.HighLightKind.GREEN
+                    if x < 0.01
+                    else CubeViewDef.HighLightKind.NONE
+                )
+            )
+        )
+        f_bucket = lambda x: (  # noqa: E731
+            CubeViewDef.HighLightKind.NONE
+            if not isinstance(x, str)
+            else (
+                CubeViewDef.HighLightKind.RED
+                if x in {"[-inf, 0.8)", "[0.8, 0.9)", "[0.9, 0.95)"}
+                else (
+                    CubeViewDef.HighLightKind.NONE
+                    if x in {"[0.95, 0.98)", "[0.98, 1.02)", "[1.02, 1.05)"}
+                    else (
+                        CubeViewDef.HighLightKind.GREEN
+                        if "[" in x
+                        else CubeViewDef.HighLightKind.NONE
+                    )
+                )
+            )
+        )
+
+        def mean_weight(gr):
+            weight = gr["time_latency_eager"]
+            x = gr["speedup"]
+            if x.shape[0] == 0:
+                return np.nan
+            div = weight.sum()
+            if div > 0:
+                return (x * weight).sum() / div
+            return np.nan
+
+        def mean_geo(gr):
+            x = gr["speedup"]
+            return np.exp(np.log(x.dropna()).mean())
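The two aggregators weight the speedup differently: `mean_weight` favours models with a large eager latency, while `mean_geo` is the geometric mean, where a 2x gain and a 2x loss cancel out. A numeric check:

```python
import numpy as np
import pandas

gr = pandas.DataFrame({"speedup": [2.0, 0.5], "time_latency_eager": [9.0, 1.0]})
weighted = (gr["speedup"] * gr["time_latency_eager"]).sum() / gr["time_latency_eager"].sum()
geo = np.exp(np.log(gr["speedup"].dropna()).mean())
print(weighted)  # 1.85 -- dominated by the slow (high-latency) model
print(geo)       # 1.0  -- the 2x gain and the 2x loss cancel out
```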
+
+        order = ["model_attn_impl", "exporter", "opt_patterns", "DATE"]
+        implemented_views = {
+            "agg-suite": lambda: CubeViewDef(
+                key_index=index_cols,
+                values=self._filter_column(
+                    [
+                        "TIME_ITER",
+                        "speedup",
+                        "time_latency",
+                        "time_latency_eager",
+                        "time_export_success",
+                        "time_export_unbiased",
+                        "^n_.*",
+                        "target_opset",
+                        "onnx_filesize",
+                        "onnx_weight_size_torch",
+                        "onnx_weight_size_proto",
+                        "onnx_n_nodes",
+                        "onnx_n_nodes_no_cst",
+                        "op_onnx__Constant",
+                        "peak_gpu_torch",
+                        "peak_gpu_nvidia",
+                    ],
+                    self.values,
+                ),
+                ignore_unique=True,
+                key_agg=["model_name", "task", "model_task"],
+                agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
+                agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
+                keep_columns_in_index=["suite"],
+                name="agg-suite",
+                order=order,
+            ),
+            "agg-all": lambda: CubeViewDef(
+                key_index=index_cols,
+                values=self._filter_column(
+                    [
+                        "TIME_ITER",
+                        "speedup",
+                        "time_latency",
+                        "time_latency_eager",
+                        "time_export_success",
+                        "time_export_unbiased",
+                        "^n_.*",
+                        "target_opset",
+                        "onnx_filesize",
+                        "onnx_weight_size_torch",
+                        "onnx_weight_size_proto",
+                        "onnx_n_nodes",
+                        "onnx_n_nodes_no_cst",
+                        "peak_gpu_torch",
+                        "peak_gpu_nvidia",
+                    ],
+                    self.values,
+                ),
+                ignore_unique=True,
+                key_agg=["model_name", "task", "model_task", "suite"],
+                agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
+                agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
+                name="agg-all",
+                order=order,
+                plots=True,
+            ),
+            "disc": lambda: CubeViewDef(
+                key_index=index_cols,
+                values=self._filter_column(["discrepancies_abs"], self.values),
+                ignore_unique=True,
+                keep_columns_in_index=["suite"],
+                f_highlight=f_disc,
+                name="disc",
+                order=order,
+            ),
+            "speedup": lambda: CubeViewDef(
+                key_index=index_cols,
+                values=self._filter_column(["speedup"], self.values),
+                ignore_unique=True,
+                keep_columns_in_index=["suite"],
+                f_highlight=f_speedup,
+                name="speedup",
+                order=order,
+            ),
+            "counts": lambda: CubeViewDef(
+                key_index=index_cols,
+                values=self._filter_column(["^n_.*"], self.values),
+                ignore_unique=True,
+                keep_columns_in_index=["suite"],
+                name="counts",
+                order=order,
+            ),
+            "peak-gpu": lambda: CubeViewDef(
+                key_index=index_cols,
+                values=self._filter_column(["^peak_gpu_.*"], self.values),
+                ignore_unique=True,
+                keep_columns_in_index=["suite"],
+                name="peak-gpu",
+                order=order,
+            ),
+            "time": lambda: CubeViewDef(
+                key_index=index_cols,
+                values=self._filter_column(
+                    ["time_latency", "time_latency_eager"], self.values
+                ),
+                ignore_unique=True,
+                keep_columns_in_index=["suite"],
+                name="time",
+                order=order,
+            ),
+            "time_export": lambda: CubeViewDef(
+                key_index=index_cols,
+                values=self._filter_column(["time_export_unbiased"], self.values),
+                ignore_unique=True,
+                keep_columns_in_index=["suite"],
+                name="time_export",
+                order=order,
+            ),
+            "err": lambda: CubeViewDef(
+                key_index=index_cols,
+                values=self._filter_column(
+                    ["ERR1", "ERR_timeout", "ERR_export", "ERR_crash"], self.values
+                ),
+                ignore_unique=True,
+                keep_columns_in_index=["suite"],
+                name="err",
+                order=order,
+            ),
+            "bucket-speedup": lambda: CubeViewDef(
+                key_index=index_cols,
+                values=self._filter_column(["bucket[speedup]"], self.values),
+                ignore_unique=True,
+                keep_columns_in_index=["suite"],
+                name="bucket-speedup",
+                f_highlight=f_bucket,
+                order=order,
+            ),
+            "onnx": lambda: CubeViewDef(
+                key_index=index_cols,
+                values=self._filter_column(
+                    [
+                        "onnx_filesize",
+                        "onnx_n_nodes",
+                        "onnx_n_nodes_no_cst",
+                        "onnx_weight_size_proto",
+                        "onnx_weight_size_torch",
+                        "op_onnx_initializer_small",
+                    ],
+                    self.values,
+                ),
+                ignore_unique=True,
+                keep_columns_in_index=["suite"],
+                name="onnx",
+                order=order,
+            ),
+            "raw-short": lambda: CubeViewDef(
+                key_index=self.keys_time,
+                values=[c for c in self.values if c not in {"ERR_std", "ERR_stdout"}],
+                ignore_unique=False,
+                keep_columns_in_index=["suite"],
+                name="raw-short",
+                no_index=True,
+            ),
+        }
+
+        cmd_col = self._filter_column(["CMD"], self.values, can_be_empty=True)
+        if cmd_col:
+            implemented_views["cmd"] = lambda: CubeViewDef(
+                key_index=index_cols,
+                values=cmd_col,
+                ignore_unique=True,
+                keep_columns_in_index=["suite"],
+                name="cmd",
+                order=order,
+            )
+
+        assert name in implemented_views or name in {"cmd"}, (
+            f"Unknown view {name!r}, expected a name in {sorted(implemented_views)},"
+            f"\n--\nkeys={pprint.pformat(sorted(self.keys_time))}, "
+            f"\n--\nvalues={pprint.pformat(sorted(self.values))}"
+        )
+        if name not in implemented_views:
+            return None
+        return implemented_views[name]()
+
+    def post_load_process_piece(
+        self, df: pandas.DataFrame, unique: bool = False
+    ) -> pandas.DataFrame:
+        df = super().post_load_process_piece(df, unique=unique)
+        if unique:
+            return df
+        cols = self._filter_column(self._keys, df)
+        res = None
+        for c in cols:
+            if df[c].isna().any():
+                # Missing values for keys are not supposed to happen.
+                uniq = set(df[c].dropna())
+                if len(uniq) == 1:
+                    if res is None:
+                        res = df.copy()
+                    res[c] = res[c].fillna(uniq.pop())
+        return df if res is None else res