onnx-diagnostic 0.7.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onnx_diagnostic/__init__.py +1 -1
- onnx_diagnostic/_command_lines_parser.py +196 -5
- onnx_diagnostic/export/dynamic_shapes.py +48 -20
- onnx_diagnostic/export/shape_helper.py +126 -0
- onnx_diagnostic/helpers/cache_helper.py +19 -8
- onnx_diagnostic/helpers/log_helper.py +1335 -176
- onnx_diagnostic/tasks/image_text_to_text.py +69 -18
- onnx_diagnostic/torch_export_patches/onnx_export_serialization.py +3 -3
- onnx_diagnostic/torch_models/hghub/hub_api.py +61 -4
- onnx_diagnostic/torch_models/hghub/hub_data.py +6 -2
- onnx_diagnostic/torch_models/hghub/model_inputs.py +55 -14
- onnx_diagnostic/torch_models/validate.py +9 -4
- {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.1.dist-info}/METADATA +1 -1
- {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.1.dist-info}/RECORD +17 -16
- {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.1.dist-info}/WHEEL +0 -0
- {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.1.dist-info}/licenses/LICENSE.txt +0 -0
- {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.1.dist-info}/top_level.txt +0 -0
onnx_diagnostic/helpers/log_helper.py

@@ -1,20 +1,32 @@
 import datetime
+import enum
 import glob
+import io
 import os
+import pprint
 import re
+import warnings
 import zipfile
 from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union
 import numpy as np
 import pandas
-from pandas.api.types import is_numeric_dtype
+from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
 from .helper import string_sig

+BUCKET_SCALES_VALUES = np.array(
+    [-np.inf, -20, -10, -5, -2, 0, 2, 5, 10, 20, 100, 200, 300, 400, np.inf], dtype=float
+)
+
+
+BUCKET_SCALES = BUCKET_SCALES_VALUES / 100 + 1
+

 def enumerate_csv_files(
     data: Union[
         pandas.DataFrame, List[Union[str, Tuple[str, str]]], str, Tuple[str, str, str, str]
     ],
     verbose: int = 0,
+    filtering: Optional[Callable[[str], bool]] = None,
 ) -> Iterator[Union[pandas.DataFrame, str, Tuple[str, str, str, str]]]:
     """
     Enumerates files considered for the aggregation.
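The new BUCKET_SCALES constants turn percentage thresholds into speedup ratios; the "bucket[speedup]" formula added further down cuts a speedup column against these edges. A minimal sketch of that bucketing, with made-up speedup values (not part of the diff):

import numpy as np
import pandas

BUCKET_SCALES_VALUES = np.array(
    [-np.inf, -20, -10, -5, -2, 0, 2, 5, 10, 20, 100, 200, 300, 400, np.inf], dtype=float
)
BUCKET_SCALES = BUCKET_SCALES_VALUES / 100 + 1  # -inf, 0.8, 0.9, ..., 1.0, 1.02, ..., 5.0, inf

# Hypothetical speedup values; pandas.cut assigns each value to one of the buckets.
speedup = pandas.Series([0.85, 0.99, 1.03, 1.4, 2.5])
print(pandas.cut(speedup, bins=BUCKET_SCALES, right=False))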
@@ -23,6 +35,10 @@ def enumerate_csv_files(
     loops over csv candidates.

     :param data: dataframe with the raw data or a file or list of files
+    :param vrbose: verbosity
+    :param filtering: function to filter in or out files in zip files,
+        must return true to keep the file, false to skip it.
+    :return: a generator yielding tuples with the filename, date, full path and zip file

     data can contains:
     * a dataframe
@@ -52,13 +68,9 @@ def enumerate_csv_files(
             # We check the first line is ok.
             if verbose:
                 print(f"[enumerate_csv_files] data[{itn}] is a csv file: {filename!r}]")
-
-
-
-                continue
-            dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
-            du = dt.strftime("%Y-%m-%d %H:%M:%S")
-            yield (os.path.split(filename)[-1], du, filename, "")
+            dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
+            du = dt.strftime("%Y-%m-%d %H:%M:%S")
+            yield (os.path.split(filename)[-1], du, filename, "")
             continue

         if ext == ".zip":
@@ -67,8 +79,11 @@ def enumerate_csv_files(
             zf = zipfile.ZipFile(filename, "r")
             for ii, info in enumerate(zf.infolist()):
                 name = info.filename
-
-
+                if filtering is None:
+                    ext = os.path.splitext(name)[-1]
+                    if ext != ".csv":
+                        continue
+                elif not filtering(name):
                     continue
                 if verbose:
                     print(
@@ -96,7 +111,7 @@ def enumerate_csv_files(
         for ii, f in enumerate(found):
             if verbose:
                 print(f"[enumerate_csv_files] data[{itn}][{ii}] {f!r} from {filename!r}")
-            yield from enumerate_csv_files(f, verbose=verbose)
+            yield from enumerate_csv_files(f, verbose=verbose, filtering=filtering)


 def open_dataframe(
@@ -140,10 +155,26 @@ class CubeViewDef:
     :param order: to reorder key in columns index
     :param key_agg: aggregate according to these columns before
         creating the view
-    :param agg_args: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg
+    :param agg_args: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`,
+        it can be also a callable to return a different aggregation
+        method depending on the column name
     :param agg_kwargs: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`
+    :param agg_multi: aggregation over multiple columns
+    :param ignore_columns: ignore the following columns if known to overload the view
+    :param keep_columns_in_index: keeps the columns even if there is only one unique value
+    :param dropna: drops rows with nan if not relevant
+    :param transpose: transpose
+    :param f_highlight: to highlights some values
+    :param name: name of the view, used mostly to debug
+    :param plots: adds plot to the Excel sheet
+    :param no_index: remove the index (but keeps the columns)
     """

+    class HighLightKind(enum.IntEnum):
+        NONE = 0
+        RED = 1
+        GREEN = 2
+
     def __init__(
         self,
         key_index: Sequence[str],
@@ -151,8 +182,19 @@ class CubeViewDef:
         ignore_unique: bool = True,
         order: Optional[Sequence[str]] = None,
         key_agg: Optional[Sequence[str]] = None,
-        agg_args: Sequence[Any] = ("sum",),
+        agg_args: Union[Sequence[Any], Callable[[str], Any]] = ("sum",),
         agg_kwargs: Optional[Dict[str, Any]] = None,
+        agg_multi: Optional[
+            Dict[str, Callable[[pandas.core.groupby.DataFrameGroupBy], pandas.Series]]
+        ] = None,
+        ignore_columns: Optional[Sequence[str]] = None,
+        keep_columns_in_index: Optional[Sequence[str]] = None,
+        dropna: bool = True,
+        transpose: bool = False,
+        f_highlight: Optional[Callable[[Any], "CubeViewDef.HighLightKind"]] = None,
+        name: Optional[str] = None,
+        no_index: bool = False,
+        plots: bool = False,
     ):
         self.key_index = key_index
         self.values = values
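A minimal sketch (not part of the diff) of how the new CubeViewDef arguments might be combined; only the parameter names come from the signature above, the column names and the highlight rule are assumptions.

from onnx_diagnostic.helpers.log_helper import CubeViewDef

view = CubeViewDef(
    key_index=["^model_.*"],                 # assumed key columns
    values=["time_latency", "speedup"],      # assumed value columns
    agg_args=lambda col: "mean" if col.startswith("time_") else "sum",
    keep_columns_in_index=["exporter"],
    f_highlight=lambda v: (
        CubeViewDef.HighLightKind.GREEN
        if isinstance(v, float) and v > 1
        else CubeViewDef.HighLightKind.NONE
    ),
    name="speedup by model",
    plots=True,
)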
@@ -161,11 +203,237 @@ class CubeViewDef:
         self.key_agg = key_agg
         self.agg_args = agg_args
         self.agg_kwargs = agg_kwargs
+        self.agg_multi = agg_multi
+        self.dropna = dropna
+        self.ignore_columns = ignore_columns
+        self.keep_columns_in_index = keep_columns_in_index
+        self.f_highlight = f_highlight
+        self.transpose = transpose
+        self.name = name
+        self.no_index = no_index
+        self.plots = plots
+
+    def __repr__(self) -> str:
+        "usual"
+        return string_sig(self)  # type: ignore[arg-type]
+
+
+def apply_excel_style(
+    filename_or_writer: Any,
+    f_highlights: Optional[Dict[str, Callable[[Any], CubeViewDef.HighLightKind]]] = None,
+):
+    """
+    Applies styles on all sheets in a file unless the sheet is too big.
+
+    :param filename_or_writer: filename, modified inplace
+    :param f_highlight: color function to apply, one per sheet
+    """
+    from openpyxl import load_workbook
+    from openpyxl.styles import Alignment
+    from openpyxl.utils import get_column_letter
+    from openpyxl.styles import Font  # , PatternFill, numbers
+
+    if isinstance(filename_or_writer, str):
+        workbook = load_workbook(filename_or_writer)
+        save = True
+    else:
+        workbook = filename_or_writer.book
+        save = False
+
+    left = Alignment(horizontal="left")
+    left_shrink = Alignment(horizontal="left", shrink_to_fit=True)
+    right = Alignment(horizontal="right")
+    font_colors = {
+        CubeViewDef.HighLightKind.GREEN: Font(color="00AA00"),
+        CubeViewDef.HighLightKind.RED: Font(color="FF0000"),
+    }
+
+    for name in workbook.sheetnames:
+        f_highlight = f_highlights.get(name, None) if f_highlights else None
+        sheet = workbook[name]
+        n_rows = sheet.max_row
+        n_cols = sheet.max_column
+        if n_rows * n_cols > 2**18:
+            # Too big.
+            continue
+        co: Dict[int, int] = {}
+        sizes: Dict[int, int] = {}
+        cols = set()
+        for i in range(1, n_rows):
+            for j, cell in enumerate(sheet[i]):
+                if j > n_cols:
+                    break
+                cols.add(cell.column)
+                if isinstance(cell.value, float):
+                    co[j] = co.get(j, 0) + 1
+                elif isinstance(cell.value, str):
+                    sizes[cell.column] = max(sizes.get(cell.column, 0), len(cell.value))
+
+        for k, v in sizes.items():
+            c = get_column_letter(k)
+            sheet.column_dimensions[c].width = min(max(8, v), 30)
+        for k in cols:
+            if k not in sizes:
+                c = get_column_letter(k)
+                sheet.column_dimensions[c].width = 15
+
+        for i in range(1, n_rows):
+            for j, cell in enumerate(sheet[i]):
+                if j > n_cols:
+                    break
+                if isinstance(cell.value, pandas.Timestamp):
+                    cell.alignment = right
+                    dt = cell.value.to_pydatetime()
+                    cell.value = dt
+                    cell.number_format = (
+                        "YYYY-MM-DD"
+                        if (
+                            dt.hour == 0
+                            and dt.minute == 0
+                            and dt.second == 0
+                            and dt.microsecond == 0
+                        )
+                        else "YYYY-MM-DD 00:00:00"
+                    )
+                elif isinstance(cell.value, (float, int)):
+                    cell.alignment = right
+                    x = abs(cell.value)
+                    if int(x) == x:
+                        cell.number_format = "0"
+                    elif x > 5000:
+                        cell.number_format = "# ##0"
+                    elif x >= 500:
+                        cell.number_format = "0.0"
+                    elif x >= 50:
+                        cell.number_format = "0.00"
+                    elif x >= 5:
+                        cell.number_format = "0.000"
+                    elif x > 0.5:
+                        cell.number_format = "0.0000"
+                    elif x > 0.005:
+                        cell.number_format = "0.00000"
+                    else:
+                        cell.number_format = "0.000E+00"
+                    if f_highlight:
+                        h = f_highlight(cell.value)
+                        if h in font_colors:
+                            cell.font = font_colors[h]
+                elif isinstance(cell.value, str) and len(cell.value) > 70:
+                    cell.alignment = left_shrink
+                else:
+                    cell.alignment = left
+                    if f_highlight:
+                        h = f_highlight(cell.value)
+                        if h in font_colors:
+                            cell.font = font_colors[h]
+    if save:
+        workbook.save(filename_or_writer)
+
+
+class CubePlot:
+    """
+    Creates a plot.
+    """
+
+    def __init__(
+        self, df: pandas.DataFrame, kind: str = "bar", orientation="col", split: bool = True
+    ):
+        self.df = df.copy()
+        self.kind = kind
+        self.orientation = orientation
+        self.split = split
+
+        if isinstance(self.df.columns, pandas.MultiIndex):
+            self.df.columns = ["/".join(map(str, i)) for i in self.df.columns]
+        if isinstance(self.df.index, pandas.MultiIndex):
+            self.df.index = ["/".join(map(str, i)) for i in self.df.index]

     def __repr__(self) -> str:
         "usual"
         return string_sig(self)  # type: ignore[arg-type]

+    def to_images(
+        self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
+    ):
+        """
+        Converts data into plots and images.
+        """
+        import matplotlib.pyplot as plt
+
+        df = self.df.T if self.orientation == "row" else self.df
+        imgs = []
+        if verbose:
+            from tqdm import tqdm
+
+            loop = tqdm(df.columns)
+        else:
+            loop = df.columns
+        title_suffix = f"\n{title_suffix}" if title_suffix else ""
+        if merge:
+            nn = len(df.columns) // 2
+            nn += nn % 2
+            fig, axs = plt.subplots(nn, 2, figsize=(12, 3 * nn * df.shape[0] / 12))
+            pos = 0
+            for c in loop:
+                ax = axs[pos // 2, pos % 2]
+                df[c].plot.barh(title=f"{c}{title_suffix}", ax=ax)
+                ax.tick_params(axis="both", which="major", labelsize=8)
+                ax.grid(True)
+                pos += 1  # noqa: SIM113
+            fig.tight_layout()
+            imgdata = io.BytesIO()
+            fig.savefig(imgdata, format="png")
+            imgs.append(imgdata.getvalue())
+            plt.close()
+        else:
+            for c in loop:
+                fig, ax = plt.subplots(1, 1, figsize=(3, 3))
+                df[c].plot.barh(title=c, ax=ax)
+                ax.tick_params(axis="both", which="major", labelsize=8)
+                ax.grid(True)
+                fig.tight_layout()
+                imgdata = io.BytesIO()
+                fig.savefig(imgdata, format="png")
+                imgs.append(imgdata.getvalue())
+                plt.close()
+        return imgs
+
+    def to_charts(self, writer: pandas.ExcelWriter, sheet, empty_row: int = 1):
+        """
+        Draws plots on a page.
+        The data is copied on this page.
+
+        :param name: sheet name
+        :param writer: writer (from pandas)
+        :param sheet_name: sheet
+        :param graph_index: graph index
+        :return: list of graph
+        """
+        assert self.split, f"Not implemented if split={self.split}"
+        assert self.orientation == "row", f"Not implemented if orientation={self.orientation}"
+        workbook = writer.book
+        labels = list(self.df.columns)
+        sheet.write_row(empty_row, 0, labels)
+
+        charts = []
+        pos = empty_row + 1
+        for i in self.df.index:
+            values = self.df.loc[i, :].tolist()
+            values = [("" if isinstance(v, float) and np.isnan(v) else v) for v in values]
+            sheet.write_row(pos, 0, values)
+            chart = workbook.add_chart({"type": "bar"})
+            chart.add_series(
+                {
+                    "name": i,
+                    "categories": [i, 1, empty_row, len(labels), empty_row],
+                    "values": [i, 1, pos, len(labels), pos],
+                }
+            )
+            chart.set_title({"name": i})
+            charts.append(chart)
+            pos += 1
+        return charts
+

 class CubeLogs:
     """
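A minimal usage sketch for the new apply_excel_style helper, assuming a toy workbook written with pandas; the sheet name and the highlight rule are illustrative only.

import pandas
from onnx_diagnostic.helpers.log_helper import CubeViewDef, apply_excel_style

# One toy sheet, then the styles are applied in place on the saved file.
df = pandas.DataFrame({"model": ["a", "b"], "speedup": [1.2, 0.7]})
with pandas.ExcelWriter("report.xlsx", engine="openpyxl") as writer:
    df.to_excel(writer, sheet_name="speedup", index=False)
apply_excel_style(
    "report.xlsx",
    f_highlights={
        "speedup": lambda v: (
            CubeViewDef.HighLightKind.RED
            if isinstance(v, float) and v < 1
            else CubeViewDef.HighLightKind.NONE
        )
    },
)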
@@ -180,7 +448,14 @@ class CubeLogs:
         values: Sequence[str] = ("time_.*", "disc_.*"),
         ignored: Sequence[str] = (),
         recent: bool = False,
-        formulas: Optional[
+        formulas: Optional[
+            Union[
+                Sequence[str],
+                Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
+            ]
+        ] = None,
+        fill_missing: Optional[Sequence[Tuple[str, Any]]] = None,
+        keep_last_date: bool = False,
     ):
         self._data = data
         self._time = time
@@ -189,24 +464,51 @@ class CubeLogs:
         self._ignored = ignored
         self.recent = recent
         self._formulas = formulas
+        self.fill_missing = fill_missing
+        self.keep_last_date = keep_last_date
+
+    def post_load_process_piece(
+        self, df: pandas.DataFrame, unique: bool = False
+    ) -> pandas.DataFrame:
+        """
+        Postprocesses a piece when a cube is made of multiple pieces
+        before it gets merged.
+        """
+        if not self.fill_missing:
+            return df
+        missing = dict(self.fill_missing)
+        for k, v in missing.items():
+            if k not in df.columns:
+                df[k] = v
+        return df

     def load(self, verbose: int = 0):
         """Loads and preprocesses the data. Returns self."""
         if isinstance(self._data, pandas.DataFrame):
             if verbose:
                 print(f"[CubeLogs.load] load from dataframe, shape={self._data.shape}")
-            self.data = self._data
+            self.data = self.post_load_process_piece(self._data, unique=True)
+            if verbose:
+                print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
         elif isinstance(self._data, list) and all(isinstance(r, dict) for r in self._data):
             if verbose:
                 print(f"[CubeLogs.load] load from list of dicts, n={len(self._data)}")
-            self.data = pandas.DataFrame(self._data)
+            self.data = pandas.DataFrame(self.post_load_process_piece(self._data, unique=True))
+            if verbose:
+                print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
         elif isinstance(self._data, list) and all(
             isinstance(r, pandas.DataFrame) for r in self._data
         ):
             if verbose:
                 print(f"[CubeLogs.load] load from list of DataFrame, n={len(self._data)}")
-            self.data = pandas.concat(
+            self.data = pandas.concat(
+                [self.post_load_process_piece(c) for c in self._data], axis=0
+            )
+            if verbose:
+                print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
         elif isinstance(self._data, list):
+            if verbose:
+                print("[CubeLogs.load] load from list of Cubes")
             cubes = []
             for item in enumerate_csv_files(self._data, verbose=verbose):
                 df = open_dataframe(item)
@@ -219,8 +521,10 @@ class CubeLogs:
                     recent=self.recent,
                 )
                 cube.load()
-                cubes.append(cube.data)
+                cubes.append(self.post_load_process_piece(cube.data))
             self.data = pandas.concat(cubes, axis=0)
+            if verbose:
+                print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
         else:
             raise NotImplementedError(
                 f"Not implemented with the provided data (type={type(self._data)})"
@@ -236,59 +540,101 @@ class CubeLogs:
         self._initialize_columns()
         if verbose:
             print(f"[CubeLogs.load] time={self.time}")
-            print(f"[CubeLogs.load] keys={self.
+            print(f"[CubeLogs.load] keys={self.keys_no_time}")
             print(f"[CubeLogs.load] values={self.values}")
             print(f"[CubeLogs.load] ignored={self.ignored}")
             print(f"[CubeLogs.load] ignored_values={self.ignored_values}")
             print(f"[CubeLogs.load] ignored_keys={self.ignored_keys}")
+        assert self.keys_no_time, f"No keys found with {self._keys} from {self.data.columns}"
+        assert self.values, f"No values found with {self._values} from {self.data.columns}"
         assert not (
-            set(self.
-        ), f"Columns {set(self.
+            set(self.keys_no_time) & set(self.values)
+        ), f"Columns {set(self.keys_no_time) & set(self.values)} cannot be keys and values"
         assert not (
-            set(self.
-        ), f"Columns {set(self.
+            set(self.keys_no_time) & set(self.ignored)
+        ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be keys and ignored"
         assert not (
             set(self.values) & set(self.ignored)
-        ), f"Columns {set(self.
+        ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be values and ignored"
         assert (
-            self.time not in self.
+            self.time not in self.keys_no_time
             and self.time not in self.values
             and self.time not in self.ignored
-        ),
-
+        ), (
+            f"Column {self.time!r} is also a key, a value or ignored, "
+            f"keys={sorted(self.keys_no_time)}, values={sorted(self.values)}, "
+            f"ignored={sorted(self.ignored)}"
+        )
+        self._columns = [self.time, *self.keys_no_time, *self.values, *self.ignored]
         self.dropped = [c for c in self.data.columns if c not in set(self.columns)]
         self.data = self.data[self.columns]
         if verbose:
             print(f"[CubeLogs.load] dropped={self.dropped}")
             print(f"[CubeLogs.load] data.shape={self.data.shape}")

+        shape = self.data.shape
+        if verbose:
+            print(f"[CubeLogs.load] removed columns, shape={self.data.shape}")
         self._preprocess()
+        if verbose:
+            print(f"[CubeLogs.load] preprocess, shape={self.data.shape}")
+        assert (
+            self.data.shape[0] > 0
+        ), f"The preprocessing reduced shape {shape} to {self.data.shape}."
         if self.recent and verbose:
             print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")

         # Let's apply the formulas
         if self._formulas:
-
-
-            if
+            forms = (
+                {k: k for k in self._formulas}
+                if not isinstance(self._formulas, dict)
+                else self._formulas
+            )
+            cols = set(self.values)
+            for k, ff in forms.items():
+                f = self._process_formula(ff)
+                if k in cols or f is None:
                    if verbose:
                        print(f"[CubeLogs.load] skip formula {k!r}")
                else:
                    if verbose:
                        print(f"[CubeLogs.load] apply formula {k!r}")
                    self.data[k] = f(self.data)
-
-
-
+                    self.values.append(k)
+                    cols.add(k)
+        self.values_for_key = {k: set(self.data[k].dropna()) for k in self.keys_time}
+        for k in self.keys_no_time:
+            if self.data[k].isna().max():
+                self.values_for_key[k].add(np.nan)
+        self.keys_with_nans = [
+            c for c in self.keys_time if self.data[c].isna().astype(int).sum() > 0
         ]
-        assert not nans, f"The following keys {nans} have nan values. This is not allowed."
         if verbose:
             print(f"[CubeLogs.load] convert column {self.time!r} into date")
+            if self.keys_with_nans:
+                print(f"[CubeLogs.load] keys_with_nans={self.keys_with_nans}")
         self.data[self.time] = pandas.to_datetime(self.data[self.time])
+
+        if self.keep_last_date:
+            times = self.data[self.time].dropna()
+            mi, mx = times.min(), times.max()
+            if mi != mx:
+                print(f"[CubeLogs.load] setting all dates in column {self.time} to {mx!r}")
+                self.data.loc[~self.data[self.time].isna(), self.time] = mx
+                self.values_for_key[self.time] = {mx}
+                if self.data[self.time].isna().max():
+                    self.values_for_key[self.time].add(np.nan)
         if verbose:
             print(f"[CubeLogs.load] done, shape={self.shape}")
         return self

+    def _process_formula(
+        self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
+    ) -> Callable[[pandas.DataFrame], pandas.Series]:
+        assert callable(formula), f"formula={formula!r} is not supported."
+        return formula
+
     @property
     def shape(self) -> Tuple[int, int]:
         "Returns the shape."
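A minimal construction sketch for the new fill_missing and keep_last_date options, assuming a list of csv files and log columns named model_name, exporter and time_*; none of these names are defined by the diff itself.

from onnx_diagnostic.helpers.log_helper import CubeLogs

# Hypothetical raw logs missing the "model_attn_impl" column: fill_missing adds it
# with a default before the pieces are merged; keep_last_date collapses all dates
# onto the most recent one.
cube = CubeLogs(
    ["logs/*.csv"],                      # assumed location of the csv files
    time="DATE",
    keys=["model_name", "exporter"],
    values=["^time_.*"],
    recent=True,
    fill_missing=[("model_attn_impl", "eager")],
    keep_last_date=True,
).load(verbose=1)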
@@ -303,7 +649,7 @@ class CubeLogs:

     def _preprocess(self):
         last = self.values[0]
-        gr = self.data[[
+        gr = self.data[[*self.keys_time, last]].groupby(self.keys_time, dropna=False).count()
         gr = gr[gr[last] > 1]
         if self.recent:
             cp = self.data.copy()
@@ -312,11 +658,15 @@ class CubeLogs:
             ), f"'__index__' should not be a column in {cp.columns}"
             cp["__index__"] = np.arange(cp.shape[0])
             gr = (
-                cp[[*self.
-                .groupby(self.
+                cp[[*self.keys_time, "__index__"]]
+                .groupby(self.keys_no_time, as_index=False, dropna=False)
                 .max()
             )
-
+            assert gr.shape[0] > 0, (
+                f"Something went wrong after the groupby.\n"
+                f"{cp[[*self.keys, self.time, '__index__']].head().T}"
+            )
+            filtered = pandas.merge(cp, gr, on=["__index__", *self.keys_time])
             assert filtered.shape[0] <= self.data.shape[0], (
                 f"Keeping the latest row brings more row {filtered.shape} "
                 f"(initial is {self.data.shape})."
@@ -324,18 +674,20 @@ class CubeLogs:
             self.data = filtered.drop("__index__", axis=1)
         else:
             assert gr.shape[0] == 0, f"There are duplicated rows:\n{gr}"
-        gr = self.data[[*self.keys, self.time]].groupby(self.keys).count()
-        gr = gr[gr[self.time] > 1]
-        assert (
-            gr.shape[0] == 0
-        ), f"recent should be true to keep the most recent row:\n{gr}"

     @classmethod
     def _filter_column(cls, filters, columns, can_be_empty=False):
+        assert list(columns), "columns is empty"
         set_cols = set()
         for f in filters:
-
-
+            if set(f) & {'"', "^", ".", "*", "+", "{", "}"}:
+                reg = re.compile(f)
+                cols = [c for c in columns if reg.search(c)]
+            elif f in columns:
+                # No regular expression.
+                cols = [f]
+            else:
+                continue
             set_cols |= set(cols)
         assert (
             can_be_empty or set_cols
@@ -343,25 +695,31 @@ class CubeLogs:
         return sorted(set_cols)

     def _initialize_columns(self):
-
+        keys = self._filter_column(self._keys, self.data.columns)
         self.values = self._filter_column(self._values, self.data.columns)
         self.ignored = self._filter_column(self._ignored, self.data.columns, True)
         assert (
             self._time in self.data.columns
-        ), f"Column {self._time} not found in {self.data.columns}"
-        ignored_keys = set(self.ignored) & set(
+        ), f"Column {self._time} not found in {pprint.pformat(sorted(self.data.columns))}"
+        ignored_keys = set(self.ignored) & set(keys)
         ignored_values = set(self.ignored) & set(self.values)
-        self.
+        self.keys_no_time = [c for c in keys if c not in ignored_keys]
         self.values = [c for c in self.values if c not in ignored_values]
         self.ignored_keys = sorted(ignored_keys)
         self.ignored_values = sorted(ignored_values)
         self.time = self._time
+        self.keys_time = [self.time, *[c for c in keys if c not in ignored_keys]]

     def __str__(self) -> str:
         "usual"
         return str(self.data) if hasattr(self, "data") else str(self._data)

-    def view(
+    def view(
+        self,
+        view_def: Union[str, CubeViewDef],
+        return_view_def: bool = False,
+        verbose: int = 0,
+    ) -> Union[pandas.DataFrame, Tuple[pandas.DataFrame, CubeViewDef]]:
         """
         Returns a dataframe, a pivot view.
         `key_index` determines the index, the other key columns determines
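The rewritten _filter_column treats a pattern as a regular expression only when it contains characters such as ^, ., * or +; otherwise it must match a column name exactly. A small sketch of that behaviour (the column names are made up):

from onnx_diagnostic.helpers.log_helper import CubeLogs

columns = ["time_latency", "time_export", "model_name"]
print(CubeLogs._filter_column(["^time_.*"], columns))      # ['time_export', 'time_latency']
print(CubeLogs._filter_column(["model_name"], columns))    # ['model_name']
print(CubeLogs._filter_column(["absent"], columns, True))  # [] allowed when can_be_empty=True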
@@ -369,58 +727,274 @@ class CubeLogs:
         is removed.

         :param view_def: view definition
+        :param return_view_def: returns the view as well
+        :param verbose: verbosity level
         :return: dataframe
         """
-
+        assert isinstance(
+            view_def, CubeViewDef
+        ), f"view_def should be a CubeViewDef, got {type(view_def)}: {view_def!r} instead"
+        if verbose:
+            print(f"[CubeLogs.view] -- start view {view_def.name!r}: {view_def}")
+        key_agg = (
+            self._filter_column(view_def.key_agg, self.keys_time) if view_def.key_agg else []
+        )
         set_key_agg = set(key_agg)
-        assert set_key_agg <= set(
-
-
+        assert set_key_agg <= set(self.keys_time), (
+            f"view_def.name={view_def.name!r}, "
+            f"non existing keys in key_agg {set_key_agg - set(self.keys_time)}",
+            f"keys={sorted(self.keys_time)}",
+        )

         values = self._filter_column(view_def.values, self.values)
-        assert set(values) <= set(
-
-
+        assert set(values) <= set(self.values), (
+            f"view_def.name={view_def.name!r}, "
+            f"non existing columns in values {set(values) - set(self.values)}, "
+            f"values={sorted(self.values)}"
+        )

+        # aggregation
         if key_agg:
+            final_stack = True
             key_index = [
                 c
-                for c in self._filter_column(view_def.key_index, self.
+                for c in self._filter_column(view_def.key_index, self.keys_time)
                 if c not in set_key_agg
             ]
-            keys_no_agg = [c for c in self.
-
-
-            .groupby
-
+            keys_no_agg = [c for c in self.keys_time if c not in set_key_agg]
+            if verbose:
+                print(f"[CubeLogs.view] aggregation of {set_key_agg}")
+                print(f"[CubeLogs.view] groupby {keys_no_agg}")
+
+            data_red = self.data[[*keys_no_agg, *values]]
+            assert set(key_index) <= set(data_red.columns), (
+                f"view_def.name={view_def.name!r}, "
+                f"nnable to find {set(key_index) - set(data_red.columns)}, "
+                f"key_agg={key_agg}, keys_no_agg={keys_no_agg},\n--\n"
+                f"selected={pprint.pformat(sorted(data_red.columns))},\n--\n"
+                f"keys={pprint.pformat(sorted(self.keys_time))}"
             )
+            grouped_data = data_red.groupby(keys_no_agg, as_index=True, dropna=False)
+            if callable(view_def.agg_args):
+                agg_kwargs = view_def.agg_kwargs or {}
+                agg_args = ({c: view_def.agg_args(c) for c in values},)
+            else:
+                agg_args = view_def.agg_args  # type: ignore[assignment]
+                agg_kwargs = view_def.agg_kwargs or {}
+            data = grouped_data.agg(*agg_args, **agg_kwargs)
+            if view_def.agg_multi:
+                append = []
+                for k, f in view_def.agg_multi.items():
+                    cv = grouped_data.apply(f, include_groups=False)
+                    append.append(cv.to_frame(k))
+                data = pandas.concat([data, *append], axis=1)
+            set_all_keys = set(keys_no_agg)
+            values = list(data.columns)
+            data = data.reset_index(drop=False)
         else:
-            key_index = self._filter_column(view_def.key_index, self.
-
+            key_index = self._filter_column(view_def.key_index, self.keys_time)
+            if verbose:
+                print(f"[CubeLogs.view] no aggregation, index={key_index}")
+            data = self.data[[*self.keys_time, *values]]
+            set_all_keys = set(self.keys_time)
+            final_stack = False

-        assert set(key_index) <=
-
-
+        assert set(key_index) <= set_all_keys, (
+            f"view_def.name={view_def.name!r}, "
+            f"Non existing keys in key_index {set(key_index) - set_all_keys}"
+        )

+        # remove unnecessary column
         set_key_columns = {
-            c for c in self.
+            c for c in self.keys_time if c not in key_index and c not in set(key_agg)
         }
+        key_index0 = key_index
         if view_def.ignore_unique:
-
-
+            unique = {
+                k for k, v in self.values_for_key.items() if k in set_all_keys and len(v) <= 1
+            }
+            keep_anyway = (
+                set(view_def.keep_columns_in_index)
+                if view_def.keep_columns_in_index
+                else set()
+            )
+            key_index = [k for k in key_index if k not in unique or k in keep_anyway]
+            key_columns = [k for k in set_key_columns if k not in unique or k in keep_anyway]
+            if verbose:
+                print(f"[CubeLogs.view] unique={unique}, keep_anyway={keep_anyway}")
+                print(
+                    f"[CubeLogs.view] columns with unique values "
+                    f"{set(key_index0) - set(key_index)}"
+                )
         else:
+            if verbose:
+                print("[CubeLogs.view] keep all columns")
             key_columns = sorted(set_key_columns)
+            unique = set()

+        _md = lambda s: {k: v for k, v in self.values_for_key.items() if k in s}  # noqa: E731
+        all_cols = set(key_columns) | set(key_index) | set(key_agg) | unique
+        assert all_cols == set(self.keys_time), (
+            f"view_def.name={view_def.name!r}, "
+            f"key_columns + key_index + key_agg + unique != keys, left="
+            f"{set(self.keys_time) - all_cols}, "
+            f"unique={unique}, index={set(key_index)}, columns={set(key_columns)}, "
+            f"agg={set(key_agg)}, keys={set(self.keys_time)}, values={values}"
+        )
+
+        # reorder
         if view_def.order:
-
-
-
+            subset = self._filter_column(view_def.order, all_cols | {self.time})
+            corder = [o for o in view_def.order if o in subset]
+            assert set(corder) <= set_key_columns, (
+                f"view_def.name={view_def.name!r}, "
+                f"non existing columns from order in key_columns "
+                f"{set(corder) - set_key_columns}"
             )
             key_columns = [
-                *
+                *[o for o in corder if o in key_columns],
                 *[c for c in key_columns if c not in view_def.order],
             ]
-
+        else:
+            corder = None
+
+        if view_def.dropna:
+            data, key_index, key_columns, values = self._dropna(  # type: ignore[assignment]
+                data,
+                key_index,
+                key_columns,
+                values,
+                keep_columns_in_index=view_def.keep_columns_in_index,
+            )
+        if view_def.ignore_columns:
+            if verbose:
+                print(f"[CubeLogs.view] ignore_columns {view_def.ignore_columns}")
+            data = data.drop(view_def.ignore_columns, axis=1)
+            seti = set(view_def.ignore_columns)
+            if view_def.keep_columns_in_index:
+                seti -= set(view_def.keep_columns_in_index)
+            key_index = [c for c in key_index if c not in seti]
+            key_columns = [c for c in key_columns if c not in seti]
+            values = [c for c in values if c not in seti]
+
+        # final verification
+        if verbose:
+            print(f"[CubeLogs.view] key_index={key_index}")
+            print(f"[CubeLogs.view] key_columns={key_columns}")
+        g = data[[*key_index, *key_columns]].copy()
+        g["count"] = 1
+        r = g.groupby([*key_index, *key_columns], dropna=False).sum()
+        not_unique = r[r["count"] > 1]
+        assert not_unique.shape[0] == 0, (
+            f"view_def.name={view_def.name!r}, "
+            f"unable to run the pivot with index={sorted(key_index)}, "
+            f"key={sorted(key_columns)}, key_agg={key_agg}, values={sorted(values)}, "
+            f"columns={sorted(data.columns)}, ignored={view_def.ignore_columns}, "
+            f"not unique={set(data.columns) - unique}"
+            f"\n--\n{not_unique.head()}"
+        )
+
+        # pivot
+        if verbose:
+            print(f"[CubeLogs.view] values={values}")
+        if key_index:
+            piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
+        else:
+            # pivot does return the same rank with it is empty.
+            # Let's add arficially one
+            data = data.copy()
+            data["ALL"] = "ALL"
+            piv = data.pivot(index=["ALL"], columns=key_columns, values=values)
+        if isinstance(piv, pandas.Series):
+            piv = piv.to_frame(name="series")
+        names = list(piv.columns.names)
+        assert (
+            "METRICS" not in names
+        ), f"Not implemented when a level METRICS already exists {names!r}"
+        names[0] = "METRICS"
+        piv.columns = piv.columns.set_names(names)
+        if final_stack:
+            piv = piv.stack("METRICS", future_stack=True)
+        if view_def.transpose:
+            piv = piv.T
+        if isinstance(piv, pandas.Series):
+            piv = piv.to_frame("VALUE")
+        piv.sort_index(inplace=True)
+
+        if isinstance(piv.columns, pandas.MultiIndex):
+            if corder:
+                # reorder the levels for the columns with the view definition
+                new_corder = [c for c in corder if c in piv.columns.names]
+                new_names = [
+                    *[c for c in piv.columns.names if c not in new_corder],
+                    *new_corder,
+                ]
+                piv.columns = piv.columns.reorder_levels(new_names)
+            elif self.time in piv.columns.names:
+                # put time at the end
+                new_names = list(piv.columns.names)
+                ind = new_names.index(self.time)
+                if ind < len(new_names) - 1:
+                    del new_names[ind]
+                    new_names.append(self.time)
+                    piv.columns = piv.columns.reorder_levels(new_names)
+
+        if view_def.no_index:
+            piv = piv.reset_index(drop=False)
+        else:
+            piv.sort_index(inplace=True, axis=1)
+
+        if verbose:
+            print(f"[CubeLogs.view] levels {piv.index.names}, {piv.columns.names}")
+            print(f"[CubeLogs.view] -- done view {view_def.name!r}")
+        return (piv, view_def) if return_view_def else piv
+
+    def _dropna(
+        self,
+        data: pandas.DataFrame,
+        key_index: Sequence[str],
+        key_columns: Sequence[str],
+        values: Sequence[str],
+        keep_columns_in_index: Optional[Sequence[str]] = None,
+    ) -> Tuple[pandas.DataFrame, Sequence[str], Sequence[str], Sequence[str]]:
+        set_keep_columns_in_index = (
+            set(keep_columns_in_index) if keep_columns_in_index else set()
+        )
+        v = data[values]
+        new_data = data[~v.isnull().all(1)]
+        if data.shape == new_data.shape:
+            return data, key_index, key_columns, values
+        new_data = new_data.copy()
+        new_key_index = []
+        for c in key_index:
+            if c in set_keep_columns_in_index:
+                new_key_index.append(c)
+                continue
+            v = new_data[c]
+            sv = set(v.dropna())
+            if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
+                new_key_index.append(c)
+        new_key_columns = []
+        for c in key_columns:
+            if c in set_keep_columns_in_index:
+                new_key_columns.append(c)
+                continue
+            v = new_data[c]
+            sv = set(v.dropna())
+            if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
+                new_key_columns.append(c)
+        for c in set(key_index) | set(key_columns):
+            s = new_data[c]
+            if s.isna().max():
+                if pandas.api.types.is_numeric_dtype(s):
+                    min_v = s.dropna().min()
+                    assert (
+                        min_v >= 0
+                    ), f"Unable to replace nan values in column {c!r}, min_v={min_v}"
+                    new_data[c] = s.fillna(-1)
+                else:
+                    new_data[c] = s.fillna("NAN")
+        return new_data, new_key_index, new_key_columns, values

     def describe(self) -> pandas.DataFrame:
         """Basic description of all variables."""
@@ -433,22 +1007,42 @@ class CubeLogs:
                 name=name,
                 dtype=str(dtype),
                 missing=len(values) - len(nonan),
+                kind=(
+                    "time"
+                    if name == self.time
+                    else (
+                        "keys"
+                        if name in self.keys_no_time
+                        else (
+                            "values"
+                            if name in self.values
+                            else ("ignored" if name in self.ignored else "unused")
+                        )
+                    )
+                ),
             )
             if len(nonan) > 0:
-                obs.update(
-                    dict(
-                        min=nonan.min(),
-                        max=nonan.max(),
-                        count=len(nonan),
-                    )
-                )
+                obs.update(dict(count=len(nonan)))
             if is_numeric_dtype(nonan):
                 obs.update(
                     dict(
+                        min=nonan.min(),
+                        max=nonan.max(),
                         mean=nonan.mean(),
                         sum=nonan.sum(),
+                        n_values=len(set(nonan)),
                     )
                 )
+            elif obs["kind"] == "time":
+                unique = set(nonan)
+                obs["n_values"] = len(unique)
+                o = dict(
+                    min=str(nonan.min()),
+                    max=str(nonan.max()),
+                    n_values=len(set(nonan)),
+                )
+                o["values"] = f"{o['min']} - {o['max']}"
+                obs.update(o)
             else:
                 unique = set(nonan)
                 obs["n_values"] = len(unique)
@@ -460,126 +1054,691 @@ class CubeLogs:
     def to_excel(
         self,
         output: str,
-        views: Dict[str, CubeViewDef],
+        views: Union[Sequence[str], Dict[str, Union[str, CubeViewDef]]],
         main: Optional[str] = "main",
         raw: Optional[str] = "raw",
         verbose: int = 0,
+        csv: Optional[Sequence[str]] = None,
     ):
         """
         Creates an excel file with a list of view.

         :param output: output file to create
-        :param views:
+        :param views: sequence or dictionary of views to append
         :param main: add a page with statitcs on all variables
         :param raw: add a page with the raw data
+        :param csv: views to dump as csv files (same name as outputs + view naw)
         :param verbose: verbosity
         """
-
+        if verbose:
+            print(f"[CubeLogs.to_excel] create Excel file {output}, shape={self.shape}")
+        views = {k: k for k in views} if not isinstance(views, dict) else views
+        f_highlights = {}
+        plots = []
         with pandas.ExcelWriter(output, engine="openpyxl") as writer:
             if main:
                 assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
-                df = self.describe()
+                df = self.describe().sort_values("name")
                 if verbose:
-                    print(f"[CubeLogs.
+                    print(f"[CubeLogs.to_excel] add sheet {main!r} with shape {df.shape}")
                 df.to_excel(writer, sheet_name=main, freeze_panes=(1, 1))
-                self._apply_excel_style(main, writer, df)
-            if raw:
-                assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
-                if verbose:
-                    print(f"[CubeLogs.to_helper] add sheet {raw!r} with shape {self.shape}")
-                self.data.to_excel(writer, sheet_name=raw, freeze_panes=(1, 1), index=True)
-                self._apply_excel_style(raw, writer, self.data)

             for name, view in views.items():
-                df = self.view(view)
+                df, tview = self.view(view, return_view_def=True, verbose=max(verbose - 1, 0))
+                memory = df.memory_usage(deep=True).sum()
                 if verbose:
                     print(
-                        f"[CubeLogs.
-                        f"{df.shape}, index={df.index.names},
+                        f"[CubeLogs.to_excel] add sheet {name!r} with shape "
+                        f"{df.shape} ({memory} bytes), index={df.index.names}, "
+                        f"columns={df.columns.names}"
+                    )
+                if self.time in df.columns.names:
+                    # Let's convert the time into str
+                    fr = df.columns.to_frame()
+                    if is_datetime64_any_dtype(fr[self.time]):
+                        dt = fr[self.time]
+                        has_time = (dt != dt.dt.normalize()).any()
+                        sdt = dt.apply(
+                            lambda t, has_time=has_time: t.strftime(
+                                "%Y-%m-%dT%H-%M-%S" if has_time else "%Y-%m-%d"
+                            )
+                        )
+                        fr[self.time] = sdt
+                        df.columns = pandas.MultiIndex.from_frame(fr)
+                if csv and name in csv:
+                    name_csv = f"{output}.{name}.csv"
+                    if verbose:
+                        print(f"[CubeLogs.to_excel] saving sheet {name!r} in {name_csv!r}")
+                    df.reset_index(drop=False).to_csv(f"{output}.{name}.csv", index=False)
+
+                if memory > 2**22:
+                    msg = (
+                        f"[CubeLogs.to_excel] skipping {name!r}, "
+                        f"too big for excel with {memory} bytes"
+                    )
+                    if verbose:
+                        print(msg)
+                    else:
+                        warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
+                else:
+                    df.to_excel(
+                        writer,
+                        sheet_name=name,
+                        freeze_panes=(df.columns.nlevels + df.index.nlevels, df.index.nlevels),
+                    )
+                f_highlights[name] = tview.f_highlight
+                if tview.plots:
+                    plots.append(CubePlot(df, kind="barh", orientation="row", split=True))
+            if raw:
+                assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
+                # Too long.
+                # self._apply_excel_style(raw, writer, self.data)
+                if csv and "raw" in csv:
+                    df.reset_index(drop=False).to_csv(f"{output}.raw.csv", index=False)
+                memory = df.memory_usage(deep=True).sum()
+                if memory > 2**22:
+                    msg = (
+                        f"[CubeLogs.to_excel] skipping 'raw', "
+                        f"too big for excel with {memory} bytes"
                     )
-
-
-
-
+                    if verbose:
+                        print(msg)
+                    else:
+                        warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
+                else:
+                    if verbose:
+                        print(f"[CubeLogs.to_excel] add sheet 'raw' with shape {self.shape}")
+                    self.data.to_excel(
+                        writer, sheet_name="raw", freeze_panes=(1, 1), index=True
+                    )
+
+            if plots:
+                from openpyxl.drawing.image import Image
+
+                if verbose:
+                    print(f"[CubeLogs.to_excel] plots {len(plots)} plots")
+                sheet = writer.book.create_sheet("plots")
+                pos = 0
+                empty_row = 1
+                times = self.data[self.time].dropna()
+                mini, maxi = times.min(), times.max()
+                title_suffix = (str(mini) if mini == maxi else f"{mini}-{maxi}").replace(
+                    " 00:00:00", ""
                 )
-
-
-
+                for plot in plots:
+                    imgs = plot.to_images(
+                        verbose=verbose, merge=True, title_suffix=title_suffix
+                    )
+                    for img in imgs:
+                        y = (pos // 2) * 16
+                        loc = f"A{y}" if pos % 2 == 0 else f"M{y}"
+                        sheet.add_image(Image(io.BytesIO(img)), loc)
+                        if verbose:
+                            no = f"{output}.png"
+                            print(f"[CubeLogs.to_excel] dump graphs into {no!r}")
+                            with open(no, "wb") as f:
+                                f.write(img)
+                        pos += 1
+                empty_row += len(plots) + 2

-
-
-
+            if verbose:
+                print(f"[CubeLogs.to_excel] applies style to {output!r}")
+            apply_excel_style(writer, f_highlights)  # type: ignore[arg-type]
+            if verbose:
+                print(f"[CubeLogs.to_excel] done with {len(views)} views")

-        # from openpyxl.styles import Font, PatternFill, numbers

-
-
-
-
-        # red = Font(color="FF0000")
-        # yellow = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
-        # redf = PatternFill(start_color="FF0000", end_color="FF0000", fill_type="solid")
+class CubeLogsPerformance(CubeLogs):
+    """
+    Processes logs coming from experiments.
+    """

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def __init__(
+        self,
+        data: Any,
+        time: str = "DATE",
+        keys: Sequence[str] = (
+            "^version_.*",
+            "^model_.*",
+            "device",
+            "opt_patterns",
+            "suite",
+            "memory_peak",
+            "machine",
+            "exporter",
+            "dynamic",
+            "rtopt",
+            "dtype",
+            "device",
+            "architecture",
+        ),
+        values: Sequence[str] = (
+            "^time_.*",
+            "^disc.*",
+            "^ERR_.*",
+            "CMD",
+            "^ITER",
+            "^onnx_.*",
+            "^op_onnx_.*",
+            "^peak_gpu_.*",
+        ),
+        ignored: Sequence[str] = ("version_python",),
+        recent: bool = True,
+        formulas: Optional[
+            Union[
+                Sequence[str],
+                Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
+            ]
+        ] = (
+            "speedup",
+            "bucket[speedup]",
+            "ERR1",
+            "n_models",
+            "n_model_eager",
+            "n_model_running",
+            "n_model_acc01",
+            "n_model_acc001",
+            "n_model_dynamic",
+            "n_model_pass",
+            "n_model_faster",
+            "n_model_faster2x",
+            "n_model_faster3x",
+            "n_model_faster4x",
+            "n_node_attention",
+            "n_node_control_flow",
+            "n_node_scatter",
+            "n_node_function",
+            "n_node_initializer",
+            "n_node_constant",
+            "n_node_shape",
+            "n_node_expand",
+            "peak_gpu_torch",
+            "peak_gpu_nvidia",
+            "time_export_unbiased",
+        ),
+        fill_missing: Optional[Sequence[Tuple[str, Any]]] = (("model_attn_impl", "eager"),),
+        keep_last_date: bool = False,
+    ):
+        super().__init__(
+            data=data,
+            time=time,
+            keys=keys,
+            values=values,
+            ignored=ignored,
+            recent=recent,
+            formulas=formulas,
+            fill_missing=fill_missing,
+            keep_last_date=keep_last_date,
+        )

-
-
-
-
-
-            c = get_column_letter(k)
-            sheet.column_dimensions[c].width = 15
+    def _process_formula(
+        self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
+    ) -> Callable[[pandas.DataFrame], pandas.Series]:
+        """
+        Processes a formula, converting it into a function.

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        :param formula: a formula string
+        :return: a function
+        """
+        if callable(formula):
+            return formula
+        assert isinstance(
+            formula, str
+        ), f"Unexpected type for formula {type(formula)}: {formula!r}"
+
+        def gdf(df, cname, default_value=np.nan):
+            if cname in df.columns:
+                return df[cname]
+            return pandas.Series(default_value, index=df.index)
+
+        def ghas_value(df, cname):
+            if cname not in df.columns:
+                return pandas.Series(np.nan, index=df.index)
+            isna = df[cname].isna()
+            return pandas.Series(np.where(isna, np.nan, 1.0), index=df.index)
+
+        def gpreserve(df, cname, series):
+            if cname not in df.columns:
+                return pandas.Series(np.nan, index=df.index)
+            isna = df[cname].isna()
+            return pandas.Series(np.where(isna, np.nan, series), index=df.index).astype(float)
+
+        if formula == "speedup":
+            columns = set(self._filter_column(["^time_.*"], self.data.columns))
+            assert "time_latency" in columns and "time_latency_eager" in columns, (
+                f"Unable to apply formula {formula!r}, with columns\n"
+                f"{pprint.pformat(sorted(columns))}"
+            )
+            return lambda df: df["time_latency_eager"] / df["time_latency"]
+
+        if formula == "bucket[speedup]":
+            columns = set(self._filter_column(["^time_.*", "speedup"], self.data.columns))
+            assert "speedup" in columns, (
+                f"Unable to apply formula {formula!r}, with columns\n"
+                f"{pprint.pformat(sorted(columns))}"
+            )
+            # return lambda df: df["time_latency_eager"] / df["time_latency"]
+            return lambda df: pandas.cut(
+                df["speedup"], bins=BUCKET_SCALES, right=False, duplicates="raise"
+            )
+
+        if formula == "ERR1":
+            columns = set(self._filter_column(["^ERR_.*"], self.data.columns))
+            if not columns:
+                return lambda df: np.nan
+
+            def first_err(df: pandas.DataFrame) -> pandas.Series:
+                ordered = [
+                    c
+                    for c in [
+                        "ERR_timeout",
+                        "ERR_load",
+                        "ERR_feeds",
+                        "ERR_warmup_eager",
+                        "ERR_export",
+                        "ERR_ort",
+                        "ERR_warmup",
+                        # "ERR_std",
+                        # "ERR_crash",
+                        # "ERR_stdout",
+                    ]
+                    if c in df.columns
+                ]
+                res = None
+                for c in ordered:
+                    if res is None:
+                        res = df[c].fillna("")
                     else:
-
-
-
+                        res = pandas.Series(np.where(res != "", res, df[c].fillna("")))
+                return res
+
+            return first_err
+
+        if formula.startswith("n_"):
+            lambdas = dict(
+                n_models=lambda df: ghas_value(df, "model_name"),
+                n_model_eager=lambda df: ghas_value(df, "time_latency_eager"),
+                n_model_running=lambda df: ghas_value(df, "time_latency"),
+                n_model_acc01=lambda df: gpreserve(
+                    df, "discrepancies_abs", (gdf(df, "discrepancies_abs") <= 0.1)
+                ),
+                n_model_acc001=lambda df: gpreserve(
+                    df, "discrepancies_abs", gdf(df, "discrepancies_abs") <= 0.01
+                ),
+                n_model_dynamic=lambda df: gpreserve(
+                    df,
+                    "discrepancies_dynamic_abs",
+                    (gdf(df, "discrepancies_dynamic_abs") <= 0.1),
+                ),
+                n_model_pass=lambda df: gpreserve(
+                    df,
+                    "time_latency",
+                    (gdf(df, "discrepancies_abs", np.inf) < 0.1)
+                    & (gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98),
+                ),
+                n_model_faster=lambda df: gpreserve(
+                    df,
+                    "time_latency",
+                    gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98,
+                ),
+                n_model_faster2x=lambda df: gpreserve(
+                    df,
+                    "time_latency",
+                    gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 1.98,
+                ),
+                n_model_faster3x=lambda df: gpreserve(
+                    df,
+                    "time_latency",
+                    gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 2.98,
+                ),
+                n_model_faster4x=lambda df: gpreserve(
+                    df,
+                    "time_latency",
+                    gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 3.98,
+                ),
+                n_node_attention=lambda df: gpreserve(
+                    df,
+                    "op_onnx_com.microsoft_Attention",
+                    gdf(df, "op_onnx_com.microsoft_Attention")
+                    + gdf(df, "op_onnx_com.microsoft_MultiHeadAttention"),
+                ),
+                n_node_control_flow=lambda df: gpreserve(
+                    df,
+                    "op_onnx__If",
+                    (
+                        gdf(df, "op_onnx__If", 0)
+                        + gdf(df, "op_onnx__Scan", 0)
+                        + gdf(df, "op_onnx__Loop", 0)
+                    ),
+                ),
+                n_node_scatter=lambda df: gpreserve(
+                    df,
+                    "op_onnx__ScatterND",
+                    gdf(df, "op_onnx__ScatterND", 0) + gdf(df, "op_onnx__ScatterElements", 0),
+                ),
+                n_node_function=lambda df: gpreserve(
+                    df, "onnx_n_functions", gdf(df, "onnx_n_functions")
+                ),
+                n_node_initializer=lambda df: gpreserve(
+                    df, "onnx_n_initializer", gdf(df, "onnx_n_initializer")
+                ),
+                n_node_constant=lambda df: gpreserve(
+                    df, "op_onnx__Constant", gdf(df, "op_onnx__Constant")
+                ),
+                n_node_shape=lambda df: gpreserve(
+                    df, "op_onnx__Shape", gdf(df, "op_onnx__Shape")
+                ),
+                n_node_expand=lambda df: gpreserve(
+                    df, "op_onnx__Expand", gdf(df, "op_onnx__Expand")
+                ),
+            )
+            assert (
+                formula in lambdas
+            ), f"Unexpected formula={formula!r}, should be in {sorted(lambdas)}"
+            return lambdas[formula]
+
+        if formula == "peak_gpu_torch":
+            return lambda df: gdf(df, "mema_gpu_5_after_export") - gdf(df, "mema_gpu_4_reset")
+        if formula == "peak_gpu_nvidia":
+            return (
+                lambda df: (gdf(df, "memory_gpu0_peak") - gdf(df, "memory_gpu0_begin")) * 2**20
|
1439
|
+
|
|
1440
|
+
if formula == "peak_gpu_torch":
|
|
1441
|
+
return lambda df: gdf(df, "mema_gpu_5_after_export") - gdf(df, "mema_gpu_4_reset")
|
|
1442
|
+
if formula == "peak_gpu_nvidia":
|
|
1443
|
+
return (
|
|
1444
|
+
lambda df: (gdf(df, "memory_gpu0_peak") - gdf(df, "memory_gpu0_begin")) * 2**20
|
|
1445
|
+
)
|
|
1446
|
+
if formula == "time_export_unbiased":
|
|
1447
|
+
|
|
1448
|
+
def unbiased_export(df):
|
|
1449
|
+
if "time_warmup_first_iteration" not in df.columns:
|
|
1450
|
+
return pandas.Series(np.nan, index=df.index)
|
|
1451
|
+
return pandas.Series(
|
|
1452
|
+
np.where(
|
|
1453
|
+
df["exporter"] == "inductor",
|
|
1454
|
+
df["time_warmup_first_iteration"] + df["time_export_success"],
|
|
1455
|
+
df["time_export_success"],
|
|
1456
|
+
),
|
|
1457
|
+
index=df.index,
|
|
1458
|
+
)
|
|
1459
|
+
|
|
1460
|
+
return lambda df: gpreserve(df, "time_warmup_first_iteration", unbiased_export(df))
|
|
1461
|
+
|
|
1462
|
+
raise ValueError(
|
|
1463
|
+
f"Unexpected formula {formula!r}, available columns are\n"
|
|
1464
|
+
f"{pprint.pformat(sorted(self.data.columns))}"
|
|
1465
|
+
)
|
|
1466
|
+
|
|
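The block above turns a formula name into a function applied to the raw benchmark dataframe. Below is a minimal standalone sketch of the two simplest formulas, "speedup" and "bucket[speedup]", assuming a toy dataframe with time_latency and time_latency_eager columns; the local BUCKET_SCALES copy only mirrors the constant defined at the top of this module and is shown for illustration, not imported from the package.

    import numpy as np
    import pandas

    # Illustrative copy of BUCKET_SCALES = BUCKET_SCALES_VALUES / 100 + 1
    BUCKET_SCALES = (
        np.array(
            [-np.inf, -20, -10, -5, -2, 0, 2, 5, 10, 20, 100, 200, 300, 400, np.inf],
            dtype=float,
        )
        / 100
        + 1
    )

    df = pandas.DataFrame(
        {"time_latency": [0.8, 1.0, 2.0], "time_latency_eager": [1.0, 1.0, 1.0]}
    )
    # formula "speedup": eager latency divided by exported-model latency
    df["speedup"] = df["time_latency_eager"] / df["time_latency"]
    # formula "bucket[speedup]": each speedup falls into a half-open interval
    df["bucket[speedup]"] = pandas.cut(
        df["speedup"], bins=BUCKET_SCALES, right=False, duplicates="raise"
    )
    print(df)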
1467
|
+
def view(
|
|
1468
|
+
self,
|
|
1469
|
+
view_def: Union[str, CubeViewDef],
|
|
1470
|
+
return_view_def: bool = False,
|
|
1471
|
+
verbose: int = 0,
|
|
1472
|
+
) -> Union[pandas.DataFrame, Tuple[pandas.DataFrame, CubeViewDef]]:
|
|
1473
|
+
"""
|
|
1474
|
+
Returns a dataframe, a pivot view.
|
|
1475
|
+
|
|
1476
|
+
If view_def is a string, it is replaced by a predefined view.
|
|
1477
|
+
|
|
1478
|
+
:param view_def: view definition or a string
|
|
1479
|
+
:param return_view_def: returns the view definition as well
|
|
1480
|
+
:param verbose: verbosity level
|
|
1481
|
+
:return: dataframe
|
|
1482
|
+
"""
|
|
1483
|
+
if isinstance(view_def, str):
|
|
1484
|
+
view_def = self.make_view_def(view_def)
|
|
1485
|
+
return super().view(view_def, return_view_def=return_view_def, verbose=verbose)
|
|
1486
|
+
|
|
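A hypothetical call of the view method above, assuming cube is an instance of this class already filled with benchmark results (the variable name cube is an assumption for the example, not part of the API shown in this diff):

    # Predefined views are resolved through make_view_def below.
    df_speedup = cube.view("speedup")
    # The view definition can be returned along with the pivot table.
    df_disc, view_def = cube.view("disc", return_view_def=True, verbose=1)
    print(view_def.name, df_disc.shape)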
1487
|
+
def make_view_def(self, name: str) -> CubeViewDef:
|
|
1488
|
+
"""
|
|
1489
|
+
Returns a view definition.
|
|
1490
|
+
|
|
1491
|
+
:param name: name of the view
|
|
1492
|
+
:return: a CubeViewDef
|
|
1493
|
+
|
|
1494
|
+
Available views:
|
|
1495
|
+
|
|
1496
|
+
* **agg-suite:** aggregation per suite
|
|
1497
|
+
* **disc:** discrepancies
|
|
1498
|
+
* **speedup:** speedup
|
|
1499
|
+
* **bucket-speedup:** speedup in buckets
|
|
1500
|
+
* **time:** latency
|
|
1501
|
+
* **time_export:** time to export
|
|
1502
|
+
* **counts:** status, running, faster, has control flow, ...
|
|
1503
|
+
* **err:** important errors
|
|
1504
|
+
* **cmd:** command lines
|
|
1505
|
+
* **raw-short:** raw data without the unused columns
|
|
1506
|
+
"""
|
|
1507
|
+
fs = ["suite", "model_suite", "task", "model_name", "model_task"]
|
|
1508
|
+
index_cols = self._filter_column(fs, self.keys_time)
|
|
1509
|
+
assert index_cols, (
|
|
1510
|
+
f"No index columns found for {fs!r} in "
|
|
1511
|
+
f"{pprint.pformat(sorted(self.keys_time))}"
|
|
1512
|
+
)
|
|
1513
|
+
index_cols = [c for c in fs if c in set(index_cols)]
|
|
1514
|
+
|
|
1515
|
+
f_speedup = lambda x: ( # noqa: E731
|
|
1516
|
+
CubeViewDef.HighLightKind.NONE
|
|
1517
|
+
if not isinstance(x, (float, int))
|
|
1518
|
+
else (
|
|
1519
|
+
CubeViewDef.HighLightKind.RED
|
|
1520
|
+
if x < 0.9
|
|
1521
|
+
else (
|
|
1522
|
+
CubeViewDef.HighLightKind.GREEN
|
|
1523
|
+
if x > 1.1
|
|
1524
|
+
else CubeViewDef.HighLightKind.NONE
|
|
1525
|
+
)
|
|
1526
|
+
)
|
|
1527
|
+
)
|
|
1528
|
+
f_disc = lambda x: ( # noqa: E731
|
|
1529
|
+
CubeViewDef.HighLightKind.NONE
|
|
1530
|
+
if not isinstance(x, (float, int))
|
|
1531
|
+
else (
|
|
1532
|
+
CubeViewDef.HighLightKind.RED
|
|
1533
|
+
if x > 0.1
|
|
1534
|
+
else (
|
|
1535
|
+
CubeViewDef.HighLightKind.GREEN
|
|
1536
|
+
if x < 0.01
|
|
1537
|
+
else CubeViewDef.HighLightKind.NONE
|
|
1538
|
+
)
|
|
1539
|
+
)
|
|
1540
|
+
)
|
|
1541
|
+
f_bucket = lambda x: ( # noqa: E731
|
|
1542
|
+
CubeViewDef.HighLightKind.NONE
|
|
1543
|
+
if not isinstance(x, str)
|
|
1544
|
+
else (
|
|
1545
|
+
CubeViewDef.HighLightKind.RED
|
|
1546
|
+
if x in {"[-inf, 0.8)", "[0.8, 0.9)", "[0.9, 0.95)"}
|
|
1547
|
+
else (
|
|
1548
|
+
CubeViewDef.HighLightKind.NONE
|
|
1549
|
+
if x in {"[0.95, 0.98)", "[0.98, 1.02)", "[1.02, 1.05)"}
|
|
1550
|
+
else (
|
|
1551
|
+
CubeViewDef.HighLightKind.GREEN
|
|
1552
|
+
if "[" in x
|
|
1553
|
+
else CubeViewDef.HighLightKind.NONE
|
|
1554
|
+
)
|
|
1555
|
+
)
|
|
1556
|
+
)
|
|
1557
|
+
)
|
|
1558
|
+
|
|
1559
|
+
def mean_weight(gr):
|
|
1560
|
+
weight = gr["time_latency_eager"]
|
|
1561
|
+
x = gr["speedup"]
|
|
1562
|
+
if x.shape[0] == 0:
|
|
1563
|
+
return np.nan
|
|
1564
|
+
div = weight.sum()
|
|
1565
|
+
if div > 0:
|
|
1566
|
+
return (x * weight).sum() / div
|
|
1567
|
+
return np.nan
|
|
1568
|
+
|
|
1569
|
+
def mean_geo(gr):
|
|
1570
|
+
x = gr["speedup"]
|
|
1571
|
+
return np.exp(np.log(x.dropna()).mean())
|
|
1572
|
+
|
|
1573
|
+
order = ["model_attn_impl", "exporter", "opt_patterns", "DATE"]
|
|
1574
|
+
implemented_views = {
|
|
1575
|
+
"agg-suite": lambda: CubeViewDef(
|
|
1576
|
+
key_index=index_cols,
|
|
1577
|
+
values=self._filter_column(
|
|
1578
|
+
[
|
|
1579
|
+
"TIME_ITER",
|
|
1580
|
+
"speedup",
|
|
1581
|
+
"time_latency",
|
|
1582
|
+
"time_latency_eager",
|
|
1583
|
+
"time_export_success",
|
|
1584
|
+
"time_export_unbiased",
|
|
1585
|
+
"^n_.*",
|
|
1586
|
+
"target_opset",
|
|
1587
|
+
"onnx_filesize",
|
|
1588
|
+
"onnx_weight_size_torch",
|
|
1589
|
+
"onnx_weight_size_proto",
|
|
1590
|
+
"onnx_n_nodes",
|
|
1591
|
+
"peak_gpu_torch",
|
|
1592
|
+
"peak_gpu_nvidia",
|
|
1593
|
+
],
|
|
1594
|
+
self.values,
|
|
1595
|
+
),
|
|
1596
|
+
ignore_unique=True,
|
|
1597
|
+
key_agg=["model_name", "task", "model_task"],
|
|
1598
|
+
agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
|
|
1599
|
+
agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
|
|
1600
|
+
keep_columns_in_index=["suite"],
|
|
1601
|
+
name="agg-suite",
|
|
1602
|
+
order=order,
|
|
1603
|
+
),
|
|
1604
|
+
"agg-all": lambda: CubeViewDef(
|
|
1605
|
+
key_index=index_cols,
|
|
1606
|
+
values=self._filter_column(
|
|
1607
|
+
[
|
|
1608
|
+
"TIME_ITER",
|
|
1609
|
+
"speedup",
|
|
1610
|
+
"time_latency",
|
|
1611
|
+
"time_latency_eager",
|
|
1612
|
+
"time_export_success",
|
|
1613
|
+
"time_export_unbiased",
|
|
1614
|
+
"^n_.*",
|
|
1615
|
+
"target_opset",
|
|
1616
|
+
"onnx_filesize",
|
|
1617
|
+
"onnx_weight_size_torch",
|
|
1618
|
+
"onnx_weight_size_proto",
|
|
1619
|
+
"onnx_n_nodes",
|
|
1620
|
+
"peak_gpu_torch",
|
|
1621
|
+
"peak_gpu_nvidia",
|
|
1622
|
+
],
|
|
1623
|
+
self.values,
|
|
1624
|
+
),
|
|
1625
|
+
ignore_unique=True,
|
|
1626
|
+
key_agg=["model_name", "task", "model_task", "suite"],
|
|
1627
|
+
agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
|
|
1628
|
+
agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
|
|
1629
|
+
name="agg-all",
|
|
1630
|
+
order=order,
|
|
1631
|
+
plots=True,
|
|
1632
|
+
),
|
|
1633
|
+
"disc": lambda: CubeViewDef(
|
|
1634
|
+
key_index=index_cols,
|
|
1635
|
+
values=self._filter_column(["discrepancies_abs"], self.values),
|
|
1636
|
+
ignore_unique=True,
|
|
1637
|
+
keep_columns_in_index=["suite"],
|
|
1638
|
+
f_highlight=f_disc,
|
|
1639
|
+
name="disc",
|
|
1640
|
+
order=order,
|
|
1641
|
+
),
|
|
1642
|
+
"speedup": lambda: CubeViewDef(
|
|
1643
|
+
key_index=index_cols,
|
|
1644
|
+
values=self._filter_column(["speedup"], self.values),
|
|
1645
|
+
ignore_unique=True,
|
|
1646
|
+
keep_columns_in_index=["suite"],
|
|
1647
|
+
f_highlight=f_speedup,
|
|
1648
|
+
name="speedup",
|
|
1649
|
+
order=order,
|
|
1650
|
+
),
|
|
1651
|
+
"counts": lambda: CubeViewDef(
|
|
1652
|
+
key_index=index_cols,
|
|
1653
|
+
values=self._filter_column(["^n_.*"], self.values),
|
|
1654
|
+
ignore_unique=True,
|
|
1655
|
+
keep_columns_in_index=["suite"],
|
|
1656
|
+
name="counts",
|
|
1657
|
+
order=order,
|
|
1658
|
+
),
|
|
1659
|
+
"peak-gpu": lambda: CubeViewDef(
|
|
1660
|
+
key_index=index_cols,
|
|
1661
|
+
values=self._filter_column(["^peak_gpu_.*"], self.values),
|
|
1662
|
+
ignore_unique=True,
|
|
1663
|
+
keep_columns_in_index=["suite"],
|
|
1664
|
+
name="peak-gpu",
|
|
1665
|
+
order=order,
|
|
1666
|
+
),
|
|
1667
|
+
"time": lambda: CubeViewDef(
|
|
1668
|
+
key_index=index_cols,
|
|
1669
|
+
values=self._filter_column(
|
|
1670
|
+
["time_latency", "time_latency_eager"], self.values
|
|
1671
|
+
),
|
|
1672
|
+
ignore_unique=True,
|
|
1673
|
+
keep_columns_in_index=["suite"],
|
|
1674
|
+
name="time",
|
|
1675
|
+
order=order,
|
|
1676
|
+
),
|
|
1677
|
+
"time_export": lambda: CubeViewDef(
|
|
1678
|
+
key_index=index_cols,
|
|
1679
|
+
values=self._filter_column(["time_export_unbiased"], self.values),
|
|
1680
|
+
ignore_unique=True,
|
|
1681
|
+
keep_columns_in_index=["suite"],
|
|
1682
|
+
name="time_export",
|
|
1683
|
+
order=order,
|
|
1684
|
+
),
|
|
1685
|
+
"err": lambda: CubeViewDef(
|
|
1686
|
+
key_index=index_cols,
|
|
1687
|
+
values=self._filter_column(
|
|
1688
|
+
["ERR1", "ERR_timeout", "ERR_export", "ERR_crash"], self.values
|
|
1689
|
+
),
|
|
1690
|
+
ignore_unique=True,
|
|
1691
|
+
keep_columns_in_index=["suite"],
|
|
1692
|
+
name="err",
|
|
1693
|
+
order=order,
|
|
1694
|
+
),
|
|
1695
|
+
"bucket-speedup": lambda: CubeViewDef(
|
|
1696
|
+
key_index=index_cols,
|
|
1697
|
+
values=self._filter_column(["bucket[speedup]"], self.values),
|
|
1698
|
+
ignore_unique=True,
|
|
1699
|
+
keep_columns_in_index=["suite"],
|
|
1700
|
+
name="bucket-speedup",
|
|
1701
|
+
f_highlight=f_bucket,
|
|
1702
|
+
order=order,
|
|
1703
|
+
),
|
|
1704
|
+
"cmd": lambda: CubeViewDef(
|
|
1705
|
+
key_index=index_cols,
|
|
1706
|
+
values=self._filter_column(["CMD"], self.values),
|
|
1707
|
+
ignore_unique=True,
|
|
1708
|
+
keep_columns_in_index=["suite"],
|
|
1709
|
+
name="cmd",
|
|
1710
|
+
order=order,
|
|
1711
|
+
),
|
|
1712
|
+
"raw-short": lambda: CubeViewDef(
|
|
1713
|
+
key_index=self.keys_time,
|
|
1714
|
+
values=[c for c in self.values if c not in {"ERR_std", "ERR_stdout"}],
|
|
1715
|
+
ignore_unique=False,
|
|
1716
|
+
keep_columns_in_index=["suite"],
|
|
1717
|
+
name="raw-short",
|
|
1718
|
+
no_index=True,
|
|
1719
|
+
),
|
|
1720
|
+
}
|
|
1721
|
+
assert name in implemented_views, (
|
|
1722
|
+
f"Unknown view {name!r}, expected a name in {sorted(implemented_views)},"
|
|
1723
|
+
f"\n--\nkeys={pprint.pformat(sorted(self.keys_time))}, "
|
|
1724
|
+
f"\n--\nvalues={pprint.pformat(sorted(self.values))}"
|
|
1725
|
+
)
|
|
1726
|
+
return implemented_views[name]()
|
|
1727
|
+
|
|
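The agg-suite and agg-all views above aggregate speedup with a latency-weighted mean (mean_weight) and a geometric mean (mean_geo). A small numeric sketch of both aggregations, written outside the class with illustrative values only:

    import numpy as np
    import pandas

    gr = pandas.DataFrame(
        {"speedup": [1.2, 0.9, 2.0], "time_latency_eager": [10.0, 1.0, 5.0]}
    )
    # weighted mean: models with a longer eager latency weigh more
    weighted = (gr["speedup"] * gr["time_latency_eager"]).sum() / gr["time_latency_eager"].sum()
    # geometric mean: less sensitive to a single very large speedup
    geo = np.exp(np.log(gr["speedup"].dropna()).mean())
    print(weighted, geo)  # ~1.431 and ~1.293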
1728
|
+
def post_load_process_piece(
|
|
1729
|
+
self, df: pandas.DataFrame, unique: bool = False
|
|
1730
|
+
) -> pandas.DataFrame:
|
|
1731
|
+
df = super().post_load_process_piece(df, unique=unique)
|
|
1732
|
+
if unique:
|
|
1733
|
+
return df
|
|
1734
|
+
cols = self._filter_column(self._keys, df)
|
|
1735
|
+
res = None
|
|
1736
|
+
for c in cols:
|
|
1737
|
+
if df[c].isna().any():
|
|
1738
|
+
# Missing values for keys are not supposed to happen.
|
|
1739
|
+
uniq = set(df[c].dropna())
|
|
1740
|
+
if len(uniq) == 1:
|
|
1741
|
+
if res is None:
|
|
1742
|
+
res = df.copy()
|
|
1743
|
+
res[c] = res[c].fillna(uniq.pop())
|
|
1744
|
+
return df if res is None else res
|
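post_load_process_piece above fills missing values in a key column only when every observed value in that column is identical, so the fill cannot introduce ambiguity. A standalone illustration of that rule on a toy dataframe (not using the class itself):

    import numpy as np
    import pandas

    df = pandas.DataFrame(
        {"exporter": ["onnx", np.nan, "onnx"], "model_name": ["a", "b", np.nan]}
    )
    for c in ["exporter", "model_name"]:
        if df[c].isna().any():
            uniq = set(df[c].dropna())
            if len(uniq) == 1:  # only one distinct value: safe to fill
                df[c] = df[c].fillna(uniq.pop())
    print(df)  # exporter's NaN becomes "onnx"; model_name keeps its NaN (two distinct values)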