onnx-diagnostic 0.6.3__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. onnx_diagnostic/__init__.py +1 -1
  2. onnx_diagnostic/_command_lines_parser.py +281 -80
  3. onnx_diagnostic/doc.py +22 -0
  4. onnx_diagnostic/export/dynamic_shapes.py +48 -20
  5. onnx_diagnostic/export/shape_helper.py +126 -0
  6. onnx_diagnostic/ext_test_case.py +1 -1
  7. onnx_diagnostic/helpers/cache_helper.py +78 -8
  8. onnx_diagnostic/helpers/config_helper.py +8 -4
  9. onnx_diagnostic/helpers/helper.py +30 -3
  10. onnx_diagnostic/helpers/log_helper.py +1744 -0
  11. onnx_diagnostic/helpers/mini_onnx_builder.py +4 -1
  12. onnx_diagnostic/helpers/model_builder_helper.py +54 -73
  13. onnx_diagnostic/helpers/torch_helper.py +18 -2
  14. onnx_diagnostic/reference/__init__.py +1 -0
  15. onnx_diagnostic/reference/ort_evaluator.py +29 -4
  16. onnx_diagnostic/reference/report_results_comparison.py +95 -0
  17. onnx_diagnostic/reference/torch_evaluator.py +21 -0
  18. onnx_diagnostic/tasks/automatic_speech_recognition.py +3 -0
  19. onnx_diagnostic/tasks/feature_extraction.py +3 -0
  20. onnx_diagnostic/tasks/fill_mask.py +3 -0
  21. onnx_diagnostic/tasks/image_classification.py +7 -1
  22. onnx_diagnostic/tasks/image_text_to_text.py +72 -18
  23. onnx_diagnostic/tasks/mixture_of_expert.py +3 -0
  24. onnx_diagnostic/tasks/object_detection.py +3 -0
  25. onnx_diagnostic/tasks/sentence_similarity.py +3 -0
  26. onnx_diagnostic/tasks/summarization.py +3 -0
  27. onnx_diagnostic/tasks/text2text_generation.py +3 -0
  28. onnx_diagnostic/tasks/text_classification.py +3 -0
  29. onnx_diagnostic/tasks/text_generation.py +90 -43
  30. onnx_diagnostic/tasks/zero_shot_image_classification.py +3 -0
  31. onnx_diagnostic/torch_export_patches/onnx_export_errors.py +78 -25
  32. onnx_diagnostic/torch_export_patches/onnx_export_serialization.py +37 -0
  33. onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +365 -17
  34. onnx_diagnostic/torch_models/hghub/hub_api.py +81 -8
  35. onnx_diagnostic/torch_models/hghub/hub_data.py +6 -2
  36. onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py +209 -0
  37. onnx_diagnostic/torch_models/hghub/model_inputs.py +58 -14
  38. onnx_diagnostic/torch_models/untrained/llm_tiny_llm.py +23 -50
  39. onnx_diagnostic/torch_models/{test_helper.py → validate.py} +166 -106
  40. {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.1.dist-info}/METADATA +2 -2
  41. {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.1.dist-info}/RECORD +44 -41
  42. {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.1.dist-info}/WHEEL +0 -0
  43. {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.1.dist-info}/licenses/LICENSE.txt +0 -0
  44. {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.1.dist-info}/top_level.txt +0 -0
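
The largest addition is the new module onnx_diagnostic/helpers/log_helper.py (+1744 lines), whose full content is the hunk shown below. A minimal usage sketch, assuming benchmark CSV logs that carry the columns the class expects by default (a DATE column plus model_*, time_* and disc* columns):

    from onnx_diagnostic.helpers.log_helper import CubeLogsPerformance

    # Load raw benchmark logs: csv files, zip archives or glob patterns.
    cube = CubeLogsPerformance(["logs/*.csv"]).load(verbose=1)
    print(cube.describe())      # one row per column: kind, min/max, number of values
    df = cube.view("speedup")   # predefined pivot view, see make_view_def below
    # Dump several predefined views into a single Excel report.
    cube.to_excel("report.xlsx", views=["agg-suite", "speedup", "disc"], csv=["speedup"])
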
@@ -0,0 +1,1744 @@
1
+ import datetime
2
+ import enum
3
+ import glob
4
+ import io
5
+ import os
6
+ import pprint
7
+ import re
8
+ import warnings
9
+ import zipfile
10
+ from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union
11
+ import numpy as np
12
+ import pandas
13
+ from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
14
+ from .helper import string_sig
15
+
16
+ BUCKET_SCALES_VALUES = np.array(
17
+ [-np.inf, -20, -10, -5, -2, 0, 2, 5, 10, 20, 100, 200, 300, 400, np.inf], dtype=float
18
+ )
19
+
20
+
21
+ BUCKET_SCALES = BUCKET_SCALES_VALUES / 100 + 1
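# With the default scales above, the speedup bucket edges become
# [-inf, 0.8, 0.9, 0.95, 0.98, 1.0, 1.02, 1.05, 1.1, 1.2, 2.0, 3.0, 4.0, 5.0, inf],
# i.e. the percentage deltas are turned into multiplicative factors around 1.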
22
+
23
+
24
+ def enumerate_csv_files(
25
+ data: Union[
26
+ pandas.DataFrame, List[Union[str, Tuple[str, str]]], str, Tuple[str, str, str, str]
27
+ ],
28
+ verbose: int = 0,
29
+ filtering: Optional[Callable[[str], bool]] = None,
30
+ ) -> Iterator[Union[pandas.DataFrame, str, Tuple[str, str, str, str]]]:
31
+ """
32
+ Enumerates files considered for the aggregation.
33
+ Only csv files are considered.
34
+ If a zip file is given, the function digs into the zip files and
35
+ loops over csv candidates.
36
+
37
+ :param data: dataframe with the raw data or a file or list of files
38
+ :param verbose: verbosity
39
+ :param filtering: function to filter in or out files in zip files,
40
+ must return true to keep the file, false to skip it.
41
+ :return: a generator yielding tuples with the filename, date, full path and zip file
42
+
43
+ data can contain:
44
+ * a dataframe
45
+ * a string for a filename, zip or csv
46
+ * a list of strings
47
+ * a tuple
48
+ """
49
+ if not isinstance(data, list):
50
+ data = [data]
51
+ for itn, filename in enumerate(data):
52
+ if isinstance(filename, pandas.DataFrame):
53
+ if verbose:
54
+ print(f"[enumerate_csv_files] data[{itn}] is a dataframe")
55
+ yield filename
56
+ continue
57
+
58
+ if isinstance(filename, tuple):
59
+ # A file in a zipfile
60
+ if verbose:
61
+ print(f"[enumerate_csv_files] data[{itn}] is {filename!r}")
62
+ yield filename
63
+ continue
64
+
65
+ if os.path.exists(filename):
66
+ ext = os.path.splitext(filename)[-1]
67
+ if ext == ".csv":
68
+ # We check the first line is ok.
69
+ if verbose:
70
+ print(f"[enumerate_csv_files] data[{itn}] is a csv file: {filename!r}]")
71
+ dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
72
+ du = dt.strftime("%Y-%m-%d %H:%M:%S")
73
+ yield (os.path.split(filename)[-1], du, filename, "")
74
+ continue
75
+
76
+ if ext == ".zip":
77
+ if verbose:
78
+ print(f"[enumerate_csv_files] data[{itn}] is a zip file: {filename!r}]")
79
+ zf = zipfile.ZipFile(filename, "r")
80
+ for ii, info in enumerate(zf.infolist()):
81
+ name = info.filename
82
+ if filtering is None:
83
+ ext = os.path.splitext(name)[-1]
84
+ if ext != ".csv":
85
+ continue
86
+ elif not filtering(name):
87
+ continue
88
+ if verbose:
89
+ print(
90
+ f"[enumerate_csv_files] data[{itn}][{ii}] is a csv file: {name!r}]"
91
+ )
92
+ with zf.open(name) as zzf:
93
+ first_line = zzf.readline()
94
+ if b"," not in first_line:
95
+ continue
96
+ yield (
97
+ os.path.split(name)[-1],
98
+ "%04d-%02d-%02d %02d:%02d:%02d" % info.date_time,
99
+ name,
100
+ filename,
101
+ )
102
+ zf.close()
103
+ continue
104
+
105
+ raise AssertionError(f"Unexpected format {filename!r}, cannot read it.")
106
+
107
+ # filename is a pattern.
108
+ found = glob.glob(filename)
109
+ if verbose and not found:
110
+ print(f"[enumerate_csv_files] unable to find file in {filename!r}")
111
+ for ii, f in enumerate(found):
112
+ if verbose:
113
+ print(f"[enumerate_csv_files] data[{itn}][{ii}] {f!r} from {filename!r}")
114
+ yield from enumerate_csv_files(f, verbose=verbose, filtering=filtering)
115
+
116
+
117
+ def open_dataframe(
118
+ data: Union[str, Tuple[str, str, str, str], pandas.DataFrame],
119
+ ) -> pandas.DataFrame:
120
+ """
121
+ Opens a filename.
122
+
123
+ :param data: a dataframe, a filename, a tuple indicating the file is coming
124
+ from a zip file
125
+ :return: a dataframe
126
+ """
127
+ if isinstance(data, pandas.DataFrame):
128
+ return data
129
+ if isinstance(data, str):
130
+ df = pandas.read_csv(data)
131
+ df["RAWFILENAME"] = data
132
+ return df
133
+ if isinstance(data, tuple):
134
+ if not data[-1]:
135
+ df = pandas.read_csv(data[2])
136
+ df["RAWFILENAME"] = data[2]
137
+ return df
138
+ zf = zipfile.ZipFile(data[-1])
139
+ with zf.open(data[2]) as f:
140
+ df = pandas.read_csv(f)
141
+ df["RAWFILENAME"] = f"{data[-1]}/{data[2]}"
142
+ zf.close()
143
+ return df
144
+
145
+ raise ValueError(f"Unexpected value for data: {data!r}")
146
+
147
+
148
+ class CubeViewDef:
149
+ """
150
+ Defines how to compute a view.
151
+
152
+ :param key_index: keys to put in the row index
153
+ :param values: values to show
154
+ :param ignore_unique: ignore keys with a unique value
155
+ :param order: to reorder keys in the columns index
156
+ :param key_agg: aggregate according to these columns before
157
+ creating the view
158
+ :param agg_args: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`,
159
+ it can also be a callable returning a different aggregation
160
+ method depending on the column name
161
+ :param agg_kwargs: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`
162
+ :param agg_multi: aggregation over multiple columns
163
+ :param ignore_columns: ignore these columns, if known to overload the view
164
+ :param keep_columns_in_index: keeps these columns in the index even if they hold a single unique value
165
+ :param dropna: drops rows with nan values if they are not relevant
166
+ :param transpose: transposes the view
167
+ :param f_highlight: function used to highlight some values
168
+ :param name: name of the view, used mostly to debug
169
+ :param plots: adds plots to the Excel sheet
170
+ :param no_index: removes the index (but keeps the columns)
171
+ """
172
+
173
+ class HighLightKind(enum.IntEnum):
174
+ NONE = 0
175
+ RED = 1
176
+ GREEN = 2
177
+
178
+ def __init__(
179
+ self,
180
+ key_index: Sequence[str],
181
+ values: Sequence[str],
182
+ ignore_unique: bool = True,
183
+ order: Optional[Sequence[str]] = None,
184
+ key_agg: Optional[Sequence[str]] = None,
185
+ agg_args: Union[Sequence[Any], Callable[[str], Any]] = ("sum",),
186
+ agg_kwargs: Optional[Dict[str, Any]] = None,
187
+ agg_multi: Optional[
188
+ Dict[str, Callable[[pandas.core.groupby.DataFrameGroupBy], pandas.Series]]
189
+ ] = None,
190
+ ignore_columns: Optional[Sequence[str]] = None,
191
+ keep_columns_in_index: Optional[Sequence[str]] = None,
192
+ dropna: bool = True,
193
+ transpose: bool = False,
194
+ f_highlight: Optional[Callable[[Any], "CubeViewDef.HighLightKind"]] = None,
195
+ name: Optional[str] = None,
196
+ no_index: bool = False,
197
+ plots: bool = False,
198
+ ):
199
+ self.key_index = key_index
200
+ self.values = values
201
+ self.ignore_unique = ignore_unique
202
+ self.order = order
203
+ self.key_agg = key_agg
204
+ self.agg_args = agg_args
205
+ self.agg_kwargs = agg_kwargs
206
+ self.agg_multi = agg_multi
207
+ self.dropna = dropna
208
+ self.ignore_columns = ignore_columns
209
+ self.keep_columns_in_index = keep_columns_in_index
210
+ self.f_highlight = f_highlight
211
+ self.transpose = transpose
212
+ self.name = name
213
+ self.no_index = no_index
214
+ self.plots = plots
215
+
216
+ def __repr__(self) -> str:
217
+ "usual"
218
+ return string_sig(self) # type: ignore[arg-type]
219
+
220
+
221
+ def apply_excel_style(
222
+ filename_or_writer: Any,
223
+ f_highlights: Optional[Dict[str, Callable[[Any], CubeViewDef.HighLightKind]]] = None,
224
+ ):
225
+ """
226
+ Applies styles on all sheets in a file unless the sheet is too big.
227
+
228
+ :param filename_or_writer: filename (modified in place) or an ExcelWriter
229
+ :param f_highlights: color functions to apply, one per sheet
230
+ """
231
+ from openpyxl import load_workbook
232
+ from openpyxl.styles import Alignment
233
+ from openpyxl.utils import get_column_letter
234
+ from openpyxl.styles import Font # , PatternFill, numbers
235
+
236
+ if isinstance(filename_or_writer, str):
237
+ workbook = load_workbook(filename_or_writer)
238
+ save = True
239
+ else:
240
+ workbook = filename_or_writer.book
241
+ save = False
242
+
243
+ left = Alignment(horizontal="left")
244
+ left_shrink = Alignment(horizontal="left", shrink_to_fit=True)
245
+ right = Alignment(horizontal="right")
246
+ font_colors = {
247
+ CubeViewDef.HighLightKind.GREEN: Font(color="00AA00"),
248
+ CubeViewDef.HighLightKind.RED: Font(color="FF0000"),
249
+ }
250
+
251
+ for name in workbook.sheetnames:
252
+ f_highlight = f_highlights.get(name, None) if f_highlights else None
253
+ sheet = workbook[name]
254
+ n_rows = sheet.max_row
255
+ n_cols = sheet.max_column
256
+ if n_rows * n_cols > 2**18:
257
+ # Too big.
258
+ continue
259
+ co: Dict[int, int] = {}
260
+ sizes: Dict[int, int] = {}
261
+ cols = set()
262
+ for i in range(1, n_rows):
263
+ for j, cell in enumerate(sheet[i]):
264
+ if j > n_cols:
265
+ break
266
+ cols.add(cell.column)
267
+ if isinstance(cell.value, float):
268
+ co[j] = co.get(j, 0) + 1
269
+ elif isinstance(cell.value, str):
270
+ sizes[cell.column] = max(sizes.get(cell.column, 0), len(cell.value))
271
+
272
+ for k, v in sizes.items():
273
+ c = get_column_letter(k)
274
+ sheet.column_dimensions[c].width = min(max(8, v), 30)
275
+ for k in cols:
276
+ if k not in sizes:
277
+ c = get_column_letter(k)
278
+ sheet.column_dimensions[c].width = 15
279
+
280
+ for i in range(1, n_rows):
281
+ for j, cell in enumerate(sheet[i]):
282
+ if j > n_cols:
283
+ break
284
+ if isinstance(cell.value, pandas.Timestamp):
285
+ cell.alignment = right
286
+ dt = cell.value.to_pydatetime()
287
+ cell.value = dt
288
+ cell.number_format = (
289
+ "YYYY-MM-DD"
290
+ if (
291
+ dt.hour == 0
292
+ and dt.minute == 0
293
+ and dt.second == 0
294
+ and dt.microsecond == 0
295
+ )
296
+ else "YYYY-MM-DD 00:00:00"
297
+ )
298
+ elif isinstance(cell.value, (float, int)):
299
+ cell.alignment = right
300
+ x = abs(cell.value)
301
+ if int(x) == x:
302
+ cell.number_format = "0"
303
+ elif x > 5000:
304
+ cell.number_format = "# ##0"
305
+ elif x >= 500:
306
+ cell.number_format = "0.0"
307
+ elif x >= 50:
308
+ cell.number_format = "0.00"
309
+ elif x >= 5:
310
+ cell.number_format = "0.000"
311
+ elif x > 0.5:
312
+ cell.number_format = "0.0000"
313
+ elif x > 0.005:
314
+ cell.number_format = "0.00000"
315
+ else:
316
+ cell.number_format = "0.000E+00"
317
+ if f_highlight:
318
+ h = f_highlight(cell.value)
319
+ if h in font_colors:
320
+ cell.font = font_colors[h]
321
+ elif isinstance(cell.value, str) and len(cell.value) > 70:
322
+ cell.alignment = left_shrink
323
+ else:
324
+ cell.alignment = left
325
+ if f_highlight:
326
+ h = f_highlight(cell.value)
327
+ if h in font_colors:
328
+ cell.font = font_colors[h]
329
+ if save:
330
+ workbook.save(filename_or_writer)
331
+
332
+
333
+ class CubePlot:
334
+ """
335
+ Creates a plot.
336
+ """
337
+
338
+ def __init__(
339
+ self, df: pandas.DataFrame, kind: str = "bar", orientation="col", split: bool = True
340
+ ):
341
+ self.df = df.copy()
342
+ self.kind = kind
343
+ self.orientation = orientation
344
+ self.split = split
345
+
346
+ if isinstance(self.df.columns, pandas.MultiIndex):
347
+ self.df.columns = ["/".join(map(str, i)) for i in self.df.columns]
348
+ if isinstance(self.df.index, pandas.MultiIndex):
349
+ self.df.index = ["/".join(map(str, i)) for i in self.df.index]
350
+
351
+ def __repr__(self) -> str:
352
+ "usual"
353
+ return string_sig(self) # type: ignore[arg-type]
354
+
355
+ def to_images(
356
+ self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
357
+ ):
358
+ """
359
+ Converts data into plots and images.
360
+ """
361
+ import matplotlib.pyplot as plt
362
+
363
+ df = self.df.T if self.orientation == "row" else self.df
364
+ imgs = []
365
+ if verbose:
366
+ from tqdm import tqdm
367
+
368
+ loop = tqdm(df.columns)
369
+ else:
370
+ loop = df.columns
371
+ title_suffix = f"\n{title_suffix}" if title_suffix else ""
372
+ if merge:
373
+ nn = len(df.columns) // 2
374
+ nn += nn % 2
375
+ fig, axs = plt.subplots(nn, 2, figsize=(12, 3 * nn * df.shape[0] / 12))
376
+ pos = 0
377
+ for c in loop:
378
+ ax = axs[pos // 2, pos % 2]
379
+ df[c].plot.barh(title=f"{c}{title_suffix}", ax=ax)
380
+ ax.tick_params(axis="both", which="major", labelsize=8)
381
+ ax.grid(True)
382
+ pos += 1 # noqa: SIM113
383
+ fig.tight_layout()
384
+ imgdata = io.BytesIO()
385
+ fig.savefig(imgdata, format="png")
386
+ imgs.append(imgdata.getvalue())
387
+ plt.close()
388
+ else:
389
+ for c in loop:
390
+ fig, ax = plt.subplots(1, 1, figsize=(3, 3))
391
+ df[c].plot.barh(title=c, ax=ax)
392
+ ax.tick_params(axis="both", which="major", labelsize=8)
393
+ ax.grid(True)
394
+ fig.tight_layout()
395
+ imgdata = io.BytesIO()
396
+ fig.savefig(imgdata, format="png")
397
+ imgs.append(imgdata.getvalue())
398
+ plt.close()
399
+ return imgs
400
+
401
+ def to_charts(self, writer: pandas.ExcelWriter, sheet, empty_row: int = 1):
402
+ """
403
+ Draws plots on a page.
404
+ The data is copied on this page.
405
+
406
+ :param writer: ExcelWriter (from pandas)
407
+ :param sheet: worksheet receiving the copied data and the charts
408
+ :param empty_row: first empty row where the data is copied
409
+ :return: list of charts
411
+ """
412
+ assert self.split, f"Not implemented if split={self.split}"
413
+ assert self.orientation == "row", f"Not implemented if orientation={self.orientation}"
414
+ workbook = writer.book
415
+ labels = list(self.df.columns)
416
+ sheet.write_row(empty_row, 0, labels)
417
+
418
+ charts = []
419
+ pos = empty_row + 1
420
+ for i in self.df.index:
421
+ values = self.df.loc[i, :].tolist()
422
+ values = [("" if isinstance(v, float) and np.isnan(v) else v) for v in values]
423
+ sheet.write_row(pos, 0, values)
424
+ chart = workbook.add_chart({"type": "bar"})
425
+ chart.add_series(
426
+ {
427
+ "name": i,
428
+ "categories": [i, 1, empty_row, len(labels), empty_row],
429
+ "values": [i, 1, pos, len(labels), pos],
430
+ }
431
+ )
432
+ chart.set_title({"name": i})
433
+ charts.append(chart)
434
+ pos += 1
435
+ return charts
436
+
437
+
438
+ class CubeLogs:
439
+ """
440
+ Processes logs coming from experiments.
441
+ """
442
+
443
+ def __init__(
444
+ self,
445
+ data: Any,
446
+ time: str = "date",
447
+ keys: Sequence[str] = ("version_.*", "model_.*"),
448
+ values: Sequence[str] = ("time_.*", "disc_.*"),
449
+ ignored: Sequence[str] = (),
450
+ recent: bool = False,
451
+ formulas: Optional[
452
+ Union[
453
+ Sequence[str],
454
+ Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
455
+ ]
456
+ ] = None,
457
+ fill_missing: Optional[Sequence[Tuple[str, Any]]] = None,
458
+ keep_last_date: bool = False,
459
+ ):
460
+ self._data = data
461
+ self._time = time
462
+ self._keys = keys
463
+ self._values = values
464
+ self._ignored = ignored
465
+ self.recent = recent
466
+ self._formulas = formulas
467
+ self.fill_missing = fill_missing
468
+ self.keep_last_date = keep_last_date
469
+
470
+ def post_load_process_piece(
471
+ self, df: pandas.DataFrame, unique: bool = False
472
+ ) -> pandas.DataFrame:
473
+ """
474
+ Postprocesses a piece when a cube is made of multiple pieces
475
+ before it gets merged.
476
+ """
477
+ if not self.fill_missing:
478
+ return df
479
+ missing = dict(self.fill_missing)
480
+ for k, v in missing.items():
481
+ if k not in df.columns:
482
+ df[k] = v
483
+ return df
484
+
485
+ def load(self, verbose: int = 0):
486
+ """Loads and preprocesses the data. Returns self."""
487
+ if isinstance(self._data, pandas.DataFrame):
488
+ if verbose:
489
+ print(f"[CubeLogs.load] load from dataframe, shape={self._data.shape}")
490
+ self.data = self.post_load_process_piece(self._data, unique=True)
491
+ if verbose:
492
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
493
+ elif isinstance(self._data, list) and all(isinstance(r, dict) for r in self._data):
494
+ if verbose:
495
+ print(f"[CubeLogs.load] load from list of dicts, n={len(self._data)}")
496
+ self.data = pandas.DataFrame(self.post_load_process_piece(self._data, unique=True))
497
+ if verbose:
498
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
499
+ elif isinstance(self._data, list) and all(
500
+ isinstance(r, pandas.DataFrame) for r in self._data
501
+ ):
502
+ if verbose:
503
+ print(f"[CubeLogs.load] load from list of DataFrame, n={len(self._data)}")
504
+ self.data = pandas.concat(
505
+ [self.post_load_process_piece(c) for c in self._data], axis=0
506
+ )
507
+ if verbose:
508
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
509
+ elif isinstance(self._data, list):
510
+ if verbose:
511
+ print("[CubeLogs.load] load from list of Cubes")
512
+ cubes = []
513
+ for item in enumerate_csv_files(self._data, verbose=verbose):
514
+ df = open_dataframe(item)
515
+ cube = CubeLogs(
516
+ df,
517
+ time=self._time,
518
+ keys=self._keys,
519
+ values=self._values,
520
+ ignored=self._ignored,
521
+ recent=self.recent,
522
+ )
523
+ cube.load()
524
+ cubes.append(self.post_load_process_piece(cube.data))
525
+ self.data = pandas.concat(cubes, axis=0)
526
+ if verbose:
527
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
528
+ else:
529
+ raise NotImplementedError(
530
+ f"Not implemented with the provided data (type={type(self._data)})"
531
+ )
532
+
533
+ assert all(isinstance(c, str) for c in self.data.columns), (
534
+ f"The class only supports string as column names "
535
+ f"but found {[c for c in self.data.columns if not isinstance(c, str)]}"
536
+ )
537
+ if verbose:
538
+ print(f"[CubeLogs.load] loaded with shape={self.data.shape}")
539
+
540
+ self._initialize_columns()
541
+ if verbose:
542
+ print(f"[CubeLogs.load] time={self.time}")
543
+ print(f"[CubeLogs.load] keys={self.keys_no_time}")
544
+ print(f"[CubeLogs.load] values={self.values}")
545
+ print(f"[CubeLogs.load] ignored={self.ignored}")
546
+ print(f"[CubeLogs.load] ignored_values={self.ignored_values}")
547
+ print(f"[CubeLogs.load] ignored_keys={self.ignored_keys}")
548
+ assert self.keys_no_time, f"No keys found with {self._keys} from {self.data.columns}"
549
+ assert self.values, f"No values found with {self._values} from {self.data.columns}"
550
+ assert not (
551
+ set(self.keys_no_time) & set(self.values)
552
+ ), f"Columns {set(self.keys_no_time) & set(self.values)} cannot be keys and values"
553
+ assert not (
554
+ set(self.keys_no_time) & set(self.ignored)
555
+ ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be keys and ignored"
556
+ assert not (
557
+ set(self.values) & set(self.ignored)
558
+ ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be values and ignored"
559
+ assert (
560
+ self.time not in self.keys_no_time
561
+ and self.time not in self.values
562
+ and self.time not in self.ignored
563
+ ), (
564
+ f"Column {self.time!r} is also a key, a value or ignored, "
565
+ f"keys={sorted(self.keys_no_time)}, values={sorted(self.values)}, "
566
+ f"ignored={sorted(self.ignored)}"
567
+ )
568
+ self._columns = [self.time, *self.keys_no_time, *self.values, *self.ignored]
569
+ self.dropped = [c for c in self.data.columns if c not in set(self.columns)]
570
+ self.data = self.data[self.columns]
571
+ if verbose:
572
+ print(f"[CubeLogs.load] dropped={self.dropped}")
573
+ print(f"[CubeLogs.load] data.shape={self.data.shape}")
574
+
575
+ shape = self.data.shape
576
+ if verbose:
577
+ print(f"[CubeLogs.load] removed columns, shape={self.data.shape}")
578
+ self._preprocess()
579
+ if verbose:
580
+ print(f"[CubeLogs.load] preprocess, shape={self.data.shape}")
581
+ assert (
582
+ self.data.shape[0] > 0
583
+ ), f"The preprocessing reduced shape {shape} to {self.data.shape}."
584
+ if self.recent and verbose:
585
+ print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")
586
+
587
+ # Let's apply the formulas
588
+ if self._formulas:
589
+ forms = (
590
+ {k: k for k in self._formulas}
591
+ if not isinstance(self._formulas, dict)
592
+ else self._formulas
593
+ )
594
+ cols = set(self.values)
595
+ for k, ff in forms.items():
596
+ f = self._process_formula(ff)
597
+ if k in cols or f is None:
598
+ if verbose:
599
+ print(f"[CubeLogs.load] skip formula {k!r}")
600
+ else:
601
+ if verbose:
602
+ print(f"[CubeLogs.load] apply formula {k!r}")
603
+ self.data[k] = f(self.data)
604
+ self.values.append(k)
605
+ cols.add(k)
606
+ self.values_for_key = {k: set(self.data[k].dropna()) for k in self.keys_time}
607
+ for k in self.keys_no_time:
608
+ if self.data[k].isna().max():
609
+ self.values_for_key[k].add(np.nan)
610
+ self.keys_with_nans = [
611
+ c for c in self.keys_time if self.data[c].isna().astype(int).sum() > 0
612
+ ]
613
+ if verbose:
614
+ print(f"[CubeLogs.load] convert column {self.time!r} into date")
615
+ if self.keys_with_nans:
616
+ print(f"[CubeLogs.load] keys_with_nans={self.keys_with_nans}")
617
+ self.data[self.time] = pandas.to_datetime(self.data[self.time])
618
+
619
+ if self.keep_last_date:
620
+ times = self.data[self.time].dropna()
621
+ mi, mx = times.min(), times.max()
622
+ if mi != mx:
623
+ print(f"[CubeLogs.load] setting all dates in column {self.time} to {mx!r}")
624
+ self.data.loc[~self.data[self.time].isna(), self.time] = mx
625
+ self.values_for_key[self.time] = {mx}
626
+ if self.data[self.time].isna().max():
627
+ self.values_for_key[self.time].add(np.nan)
628
+ if verbose:
629
+ print(f"[CubeLogs.load] done, shape={self.shape}")
630
+ return self
631
+
632
+ def _process_formula(
633
+ self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
634
+ ) -> Callable[[pandas.DataFrame], pandas.Series]:
635
+ assert callable(formula), f"formula={formula!r} is not supported."
636
+ return formula
637
+
638
+ @property
639
+ def shape(self) -> Tuple[int, int]:
640
+ "Returns the shape."
641
+ assert hasattr(self, "data"), "Method load was not called"
642
+ return self.data.shape
643
+
644
+ @property
645
+ def columns(self) -> Sequence[str]:
646
+ "Returns the columns."
647
+ assert hasattr(self, "data"), "Method load was not called"
648
+ return self.data.columns
649
+
650
+ def _preprocess(self):
651
+ last = self.values[0]
652
+ gr = self.data[[*self.keys_time, last]].groupby(self.keys_time, dropna=False).count()
653
+ gr = gr[gr[last] > 1]
654
+ if self.recent:
655
+ cp = self.data.copy()
656
+ assert (
657
+ "__index__" not in cp.columns
658
+ ), f"'__index__' should not be a column in {cp.columns}"
659
+ cp["__index__"] = np.arange(cp.shape[0])
660
+ gr = (
661
+ cp[[*self.keys_time, "__index__"]]
662
+ .groupby(self.keys_no_time, as_index=False, dropna=False)
663
+ .max()
664
+ )
665
+ assert gr.shape[0] > 0, (
666
+ f"Something went wrong after the groupby.\n"
667
+ f"{cp[[*self.keys, self.time, '__index__']].head().T}"
668
+ )
669
+ filtered = pandas.merge(cp, gr, on=["__index__", *self.keys_time])
670
+ assert filtered.shape[0] <= self.data.shape[0], (
671
+ f"Keeping the latest row brings more row {filtered.shape} "
672
+ f"(initial is {self.data.shape})."
673
+ )
674
+ self.data = filtered.drop("__index__", axis=1)
675
+ else:
676
+ assert gr.shape[0] == 0, f"There are duplicated rows:\n{gr}"
677
+
678
+ @classmethod
679
+ def _filter_column(cls, filters, columns, can_be_empty=False):
680
+ assert list(columns), "columns is empty"
681
+ set_cols = set()
682
+ for f in filters:
683
+ if set(f) & {'"', "^", ".", "*", "+", "{", "}"}:
684
+ reg = re.compile(f)
685
+ cols = [c for c in columns if reg.search(c)]
686
+ elif f in columns:
687
+ # No regular expression.
688
+ cols = [f]
689
+ else:
690
+ continue
691
+ set_cols |= set(cols)
692
+ assert (
693
+ can_be_empty or set_cols
694
+ ), f"Filters {filters} returns an empty set from {columns}"
695
+ return sorted(set_cols)
696
+
697
+ def _initialize_columns(self):
698
+ keys = self._filter_column(self._keys, self.data.columns)
699
+ self.values = self._filter_column(self._values, self.data.columns)
700
+ self.ignored = self._filter_column(self._ignored, self.data.columns, True)
701
+ assert (
702
+ self._time in self.data.columns
703
+ ), f"Column {self._time} not found in {pprint.pformat(sorted(self.data.columns))}"
704
+ ignored_keys = set(self.ignored) & set(keys)
705
+ ignored_values = set(self.ignored) & set(self.values)
706
+ self.keys_no_time = [c for c in keys if c not in ignored_keys]
707
+ self.values = [c for c in self.values if c not in ignored_values]
708
+ self.ignored_keys = sorted(ignored_keys)
709
+ self.ignored_values = sorted(ignored_values)
710
+ self.time = self._time
711
+ self.keys_time = [self.time, *[c for c in keys if c not in ignored_keys]]
712
+
713
+ def __str__(self) -> str:
714
+ "usual"
715
+ return str(self.data) if hasattr(self, "data") else str(self._data)
716
+
717
+ def view(
718
+ self,
719
+ view_def: Union[str, CubeViewDef],
720
+ return_view_def: bool = False,
721
+ verbose: int = 0,
722
+ ) -> Union[pandas.DataFrame, Tuple[pandas.DataFrame, CubeViewDef]]:
723
+ """
724
+ Returns a dataframe, a pivot view.
725
+ `key_index` determines the index, the other key columns determine
726
+ the columns. If `ignore_unique` is True, every column with a unique value
727
+ is removed.
728
+
729
+ :param view_def: view definition
730
+ :param return_view_def: returns the view definition as well
731
+ :param verbose: verbosity level
732
+ :return: dataframe
733
+ """
734
+ assert isinstance(
735
+ view_def, CubeViewDef
736
+ ), f"view_def should be a CubeViewDef, got {type(view_def)}: {view_def!r} instead"
737
+ if verbose:
738
+ print(f"[CubeLogs.view] -- start view {view_def.name!r}: {view_def}")
739
+ key_agg = (
740
+ self._filter_column(view_def.key_agg, self.keys_time) if view_def.key_agg else []
741
+ )
742
+ set_key_agg = set(key_agg)
743
+ assert set_key_agg <= set(self.keys_time), (
744
+ f"view_def.name={view_def.name!r}, "
745
+ f"non existing keys in key_agg {set_key_agg - set(self.keys_time)}",
746
+ f"keys={sorted(self.keys_time)}",
747
+ )
748
+
749
+ values = self._filter_column(view_def.values, self.values)
750
+ assert set(values) <= set(self.values), (
751
+ f"view_def.name={view_def.name!r}, "
752
+ f"non existing columns in values {set(values) - set(self.values)}, "
753
+ f"values={sorted(self.values)}"
754
+ )
755
+
756
+ # aggregation
757
+ if key_agg:
758
+ final_stack = True
759
+ key_index = [
760
+ c
761
+ for c in self._filter_column(view_def.key_index, self.keys_time)
762
+ if c not in set_key_agg
763
+ ]
764
+ keys_no_agg = [c for c in self.keys_time if c not in set_key_agg]
765
+ if verbose:
766
+ print(f"[CubeLogs.view] aggregation of {set_key_agg}")
767
+ print(f"[CubeLogs.view] groupby {keys_no_agg}")
768
+
769
+ data_red = self.data[[*keys_no_agg, *values]]
770
+ assert set(key_index) <= set(data_red.columns), (
771
+ f"view_def.name={view_def.name!r}, "
772
+ f"nnable to find {set(key_index) - set(data_red.columns)}, "
773
+ f"key_agg={key_agg}, keys_no_agg={keys_no_agg},\n--\n"
774
+ f"selected={pprint.pformat(sorted(data_red.columns))},\n--\n"
775
+ f"keys={pprint.pformat(sorted(self.keys_time))}"
776
+ )
777
+ grouped_data = data_red.groupby(keys_no_agg, as_index=True, dropna=False)
778
+ if callable(view_def.agg_args):
779
+ agg_kwargs = view_def.agg_kwargs or {}
780
+ agg_args = ({c: view_def.agg_args(c) for c in values},)
781
+ else:
782
+ agg_args = view_def.agg_args # type: ignore[assignment]
783
+ agg_kwargs = view_def.agg_kwargs or {}
784
+ data = grouped_data.agg(*agg_args, **agg_kwargs)
785
+ if view_def.agg_multi:
786
+ append = []
787
+ for k, f in view_def.agg_multi.items():
788
+ cv = grouped_data.apply(f, include_groups=False)
789
+ append.append(cv.to_frame(k))
790
+ data = pandas.concat([data, *append], axis=1)
791
+ set_all_keys = set(keys_no_agg)
792
+ values = list(data.columns)
793
+ data = data.reset_index(drop=False)
794
+ else:
795
+ key_index = self._filter_column(view_def.key_index, self.keys_time)
796
+ if verbose:
797
+ print(f"[CubeLogs.view] no aggregation, index={key_index}")
798
+ data = self.data[[*self.keys_time, *values]]
799
+ set_all_keys = set(self.keys_time)
800
+ final_stack = False
801
+
802
+ assert set(key_index) <= set_all_keys, (
803
+ f"view_def.name={view_def.name!r}, "
804
+ f"Non existing keys in key_index {set(key_index) - set_all_keys}"
805
+ )
806
+
807
+ # remove unnecessary column
808
+ set_key_columns = {
809
+ c for c in self.keys_time if c not in key_index and c not in set(key_agg)
810
+ }
811
+ key_index0 = key_index
812
+ if view_def.ignore_unique:
813
+ unique = {
814
+ k for k, v in self.values_for_key.items() if k in set_all_keys and len(v) <= 1
815
+ }
816
+ keep_anyway = (
817
+ set(view_def.keep_columns_in_index)
818
+ if view_def.keep_columns_in_index
819
+ else set()
820
+ )
821
+ key_index = [k for k in key_index if k not in unique or k in keep_anyway]
822
+ key_columns = [k for k in set_key_columns if k not in unique or k in keep_anyway]
823
+ if verbose:
824
+ print(f"[CubeLogs.view] unique={unique}, keep_anyway={keep_anyway}")
825
+ print(
826
+ f"[CubeLogs.view] columns with unique values "
827
+ f"{set(key_index0) - set(key_index)}"
828
+ )
829
+ else:
830
+ if verbose:
831
+ print("[CubeLogs.view] keep all columns")
832
+ key_columns = sorted(set_key_columns)
833
+ unique = set()
834
+
835
+ _md = lambda s: {k: v for k, v in self.values_for_key.items() if k in s} # noqa: E731
836
+ all_cols = set(key_columns) | set(key_index) | set(key_agg) | unique
837
+ assert all_cols == set(self.keys_time), (
838
+ f"view_def.name={view_def.name!r}, "
839
+ f"key_columns + key_index + key_agg + unique != keys, left="
840
+ f"{set(self.keys_time) - all_cols}, "
841
+ f"unique={unique}, index={set(key_index)}, columns={set(key_columns)}, "
842
+ f"agg={set(key_agg)}, keys={set(self.keys_time)}, values={values}"
843
+ )
844
+
845
+ # reorder
846
+ if view_def.order:
847
+ subset = self._filter_column(view_def.order, all_cols | {self.time})
848
+ corder = [o for o in view_def.order if o in subset]
849
+ assert set(corder) <= set_key_columns, (
850
+ f"view_def.name={view_def.name!r}, "
851
+ f"non existing columns from order in key_columns "
852
+ f"{set(corder) - set_key_columns}"
853
+ )
854
+ key_columns = [
855
+ *[o for o in corder if o in key_columns],
856
+ *[c for c in key_columns if c not in view_def.order],
857
+ ]
858
+ else:
859
+ corder = None
860
+
861
+ if view_def.dropna:
862
+ data, key_index, key_columns, values = self._dropna( # type: ignore[assignment]
863
+ data,
864
+ key_index,
865
+ key_columns,
866
+ values,
867
+ keep_columns_in_index=view_def.keep_columns_in_index,
868
+ )
869
+ if view_def.ignore_columns:
870
+ if verbose:
871
+ print(f"[CubeLogs.view] ignore_columns {view_def.ignore_columns}")
872
+ data = data.drop(view_def.ignore_columns, axis=1)
873
+ seti = set(view_def.ignore_columns)
874
+ if view_def.keep_columns_in_index:
875
+ seti -= set(view_def.keep_columns_in_index)
876
+ key_index = [c for c in key_index if c not in seti]
877
+ key_columns = [c for c in key_columns if c not in seti]
878
+ values = [c for c in values if c not in seti]
879
+
880
+ # final verification
881
+ if verbose:
882
+ print(f"[CubeLogs.view] key_index={key_index}")
883
+ print(f"[CubeLogs.view] key_columns={key_columns}")
884
+ g = data[[*key_index, *key_columns]].copy()
885
+ g["count"] = 1
886
+ r = g.groupby([*key_index, *key_columns], dropna=False).sum()
887
+ not_unique = r[r["count"] > 1]
888
+ assert not_unique.shape[0] == 0, (
889
+ f"view_def.name={view_def.name!r}, "
890
+ f"unable to run the pivot with index={sorted(key_index)}, "
891
+ f"key={sorted(key_columns)}, key_agg={key_agg}, values={sorted(values)}, "
892
+ f"columns={sorted(data.columns)}, ignored={view_def.ignore_columns}, "
893
+ f"not unique={set(data.columns) - unique}"
894
+ f"\n--\n{not_unique.head()}"
895
+ )
896
+
897
+ # pivot
898
+ if verbose:
899
+ print(f"[CubeLogs.view] values={values}")
900
+ if key_index:
901
+ piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
902
+ else:
903
+ # pivot does not return the same rank when the index is empty.
904
+ # Let's artificially add one.
905
+ data = data.copy()
906
+ data["ALL"] = "ALL"
907
+ piv = data.pivot(index=["ALL"], columns=key_columns, values=values)
908
+ if isinstance(piv, pandas.Series):
909
+ piv = piv.to_frame(name="series")
910
+ names = list(piv.columns.names)
911
+ assert (
912
+ "METRICS" not in names
913
+ ), f"Not implemented when a level METRICS already exists {names!r}"
914
+ names[0] = "METRICS"
915
+ piv.columns = piv.columns.set_names(names)
916
+ if final_stack:
917
+ piv = piv.stack("METRICS", future_stack=True)
918
+ if view_def.transpose:
919
+ piv = piv.T
920
+ if isinstance(piv, pandas.Series):
921
+ piv = piv.to_frame("VALUE")
922
+ piv.sort_index(inplace=True)
923
+
924
+ if isinstance(piv.columns, pandas.MultiIndex):
925
+ if corder:
926
+ # reorder the levels for the columns with the view definition
927
+ new_corder = [c for c in corder if c in piv.columns.names]
928
+ new_names = [
929
+ *[c for c in piv.columns.names if c not in new_corder],
930
+ *new_corder,
931
+ ]
932
+ piv.columns = piv.columns.reorder_levels(new_names)
933
+ elif self.time in piv.columns.names:
934
+ # put time at the end
935
+ new_names = list(piv.columns.names)
936
+ ind = new_names.index(self.time)
937
+ if ind < len(new_names) - 1:
938
+ del new_names[ind]
939
+ new_names.append(self.time)
940
+ piv.columns = piv.columns.reorder_levels(new_names)
941
+
942
+ if view_def.no_index:
943
+ piv = piv.reset_index(drop=False)
944
+ else:
945
+ piv.sort_index(inplace=True, axis=1)
946
+
947
+ if verbose:
948
+ print(f"[CubeLogs.view] levels {piv.index.names}, {piv.columns.names}")
949
+ print(f"[CubeLogs.view] -- done view {view_def.name!r}")
950
+ return (piv, view_def) if return_view_def else piv
951
+
952
+ def _dropna(
953
+ self,
954
+ data: pandas.DataFrame,
955
+ key_index: Sequence[str],
956
+ key_columns: Sequence[str],
957
+ values: Sequence[str],
958
+ keep_columns_in_index: Optional[Sequence[str]] = None,
959
+ ) -> Tuple[pandas.DataFrame, Sequence[str], Sequence[str], Sequence[str]]:
960
+ set_keep_columns_in_index = (
961
+ set(keep_columns_in_index) if keep_columns_in_index else set()
962
+ )
963
+ v = data[values]
964
+ new_data = data[~v.isnull().all(1)]
965
+ if data.shape == new_data.shape:
966
+ return data, key_index, key_columns, values
967
+ new_data = new_data.copy()
968
+ new_key_index = []
969
+ for c in key_index:
970
+ if c in set_keep_columns_in_index:
971
+ new_key_index.append(c)
972
+ continue
973
+ v = new_data[c]
974
+ sv = set(v.dropna())
975
+ if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
976
+ new_key_index.append(c)
977
+ new_key_columns = []
978
+ for c in key_columns:
979
+ if c in set_keep_columns_in_index:
980
+ new_key_columns.append(c)
981
+ continue
982
+ v = new_data[c]
983
+ sv = set(v.dropna())
984
+ if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
985
+ new_key_columns.append(c)
986
+ for c in set(key_index) | set(key_columns):
987
+ s = new_data[c]
988
+ if s.isna().max():
989
+ if pandas.api.types.is_numeric_dtype(s):
990
+ min_v = s.dropna().min()
991
+ assert (
992
+ min_v >= 0
993
+ ), f"Unable to replace nan values in column {c!r}, min_v={min_v}"
994
+ new_data[c] = s.fillna(-1)
995
+ else:
996
+ new_data[c] = s.fillna("NAN")
997
+ return new_data, new_key_index, new_key_columns, values
998
+
999
+ def describe(self) -> pandas.DataFrame:
1000
+ """Basic description of all variables."""
1001
+ rows = []
1002
+ for name in self.data.columns:
1003
+ values = self.data[name]
1004
+ dtype = values.dtype
1005
+ nonan = values.dropna()
1006
+ obs = dict(
1007
+ name=name,
1008
+ dtype=str(dtype),
1009
+ missing=len(values) - len(nonan),
1010
+ kind=(
1011
+ "time"
1012
+ if name == self.time
1013
+ else (
1014
+ "keys"
1015
+ if name in self.keys_no_time
1016
+ else (
1017
+ "values"
1018
+ if name in self.values
1019
+ else ("ignored" if name in self.ignored else "unused")
1020
+ )
1021
+ )
1022
+ ),
1023
+ )
1024
+ if len(nonan) > 0:
1025
+ obs.update(dict(count=len(nonan)))
1026
+ if is_numeric_dtype(nonan):
1027
+ obs.update(
1028
+ dict(
1029
+ min=nonan.min(),
1030
+ max=nonan.max(),
1031
+ mean=nonan.mean(),
1032
+ sum=nonan.sum(),
1033
+ n_values=len(set(nonan)),
1034
+ )
1035
+ )
1036
+ elif obs["kind"] == "time":
1037
+ unique = set(nonan)
1038
+ obs["n_values"] = len(unique)
1039
+ o = dict(
1040
+ min=str(nonan.min()),
1041
+ max=str(nonan.max()),
1042
+ n_values=len(set(nonan)),
1043
+ )
1044
+ o["values"] = f"{o['min']} - {o['max']}"
1045
+ obs.update(o)
1046
+ else:
1047
+ unique = set(nonan)
1048
+ obs["n_values"] = len(unique)
1049
+ if len(unique) < 20:
1050
+ obs["values"] = ",".join(map(str, sorted(unique)))
1051
+ rows.append(obs)
1052
+ return pandas.DataFrame(rows).set_index("name")
1053
+
1054
+ def to_excel(
1055
+ self,
1056
+ output: str,
1057
+ views: Union[Sequence[str], Dict[str, Union[str, CubeViewDef]]],
1058
+ main: Optional[str] = "main",
1059
+ raw: Optional[str] = "raw",
1060
+ verbose: int = 0,
1061
+ csv: Optional[Sequence[str]] = None,
1062
+ ):
1063
+ """
1064
+ Creates an Excel file with a list of views.
1065
+
1066
+ :param output: output file to create
1067
+ :param views: sequence or dictionary of views to append
1068
+ :param main: add a page with statistics on all variables
1069
+ :param raw: add a page with the raw data
1070
+ :param csv: views to dump as csv files (named after the output file plus the view name)
1071
+ :param verbose: verbosity
1072
+ """
1073
+ if verbose:
1074
+ print(f"[CubeLogs.to_excel] create Excel file {output}, shape={self.shape}")
1075
+ views = {k: k for k in views} if not isinstance(views, dict) else views
1076
+ f_highlights = {}
1077
+ plots = []
1078
+ with pandas.ExcelWriter(output, engine="openpyxl") as writer:
1079
+ if main:
1080
+ assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
1081
+ df = self.describe().sort_values("name")
1082
+ if verbose:
1083
+ print(f"[CubeLogs.to_excel] add sheet {main!r} with shape {df.shape}")
1084
+ df.to_excel(writer, sheet_name=main, freeze_panes=(1, 1))
1085
+
1086
+ for name, view in views.items():
1087
+ df, tview = self.view(view, return_view_def=True, verbose=max(verbose - 1, 0))
1088
+ memory = df.memory_usage(deep=True).sum()
1089
+ if verbose:
1090
+ print(
1091
+ f"[CubeLogs.to_excel] add sheet {name!r} with shape "
1092
+ f"{df.shape} ({memory} bytes), index={df.index.names}, "
1093
+ f"columns={df.columns.names}"
1094
+ )
1095
+ if self.time in df.columns.names:
1096
+ # Let's convert the time into str
1097
+ fr = df.columns.to_frame()
1098
+ if is_datetime64_any_dtype(fr[self.time]):
1099
+ dt = fr[self.time]
1100
+ has_time = (dt != dt.dt.normalize()).any()
1101
+ sdt = dt.apply(
1102
+ lambda t, has_time=has_time: t.strftime(
1103
+ "%Y-%m-%dT%H-%M-%S" if has_time else "%Y-%m-%d"
1104
+ )
1105
+ )
1106
+ fr[self.time] = sdt
1107
+ df.columns = pandas.MultiIndex.from_frame(fr)
1108
+ if csv and name in csv:
1109
+ name_csv = f"{output}.{name}.csv"
1110
+ if verbose:
1111
+ print(f"[CubeLogs.to_excel] saving sheet {name!r} in {name_csv!r}")
1112
+ df.reset_index(drop=False).to_csv(f"{output}.{name}.csv", index=False)
1113
+
1114
+ if memory > 2**22:
1115
+ msg = (
1116
+ f"[CubeLogs.to_excel] skipping {name!r}, "
1117
+ f"too big for excel with {memory} bytes"
1118
+ )
1119
+ if verbose:
1120
+ print(msg)
1121
+ else:
1122
+ warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
1123
+ else:
1124
+ df.to_excel(
1125
+ writer,
1126
+ sheet_name=name,
1127
+ freeze_panes=(df.columns.nlevels + df.index.nlevels, df.index.nlevels),
1128
+ )
1129
+ f_highlights[name] = tview.f_highlight
1130
+ if tview.plots:
1131
+ plots.append(CubePlot(df, kind="barh", orientation="row", split=True))
1132
+ if raw:
1133
+ assert raw not in views, f"{raw!r} is duplicated in views {sorted(views)}"
1134
+ # Too long.
1135
+ # self._apply_excel_style(raw, writer, self.data)
1136
+ if csv and "raw" in csv:
1137
+ df.reset_index(drop=False).to_csv(f"{output}.raw.csv", index=False)
1138
+ memory = df.memory_usage(deep=True).sum()
1139
+ if memory > 2**22:
1140
+ msg = (
1141
+ f"[CubeLogs.to_excel] skipping 'raw', "
1142
+ f"too big for excel with {memory} bytes"
1143
+ )
1144
+ if verbose:
1145
+ print(msg)
1146
+ else:
1147
+ warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
1148
+ else:
1149
+ if verbose:
1150
+ print(f"[CubeLogs.to_excel] add sheet 'raw' with shape {self.shape}")
1151
+ self.data.to_excel(
1152
+ writer, sheet_name="raw", freeze_panes=(1, 1), index=True
1153
+ )
1154
+
1155
+ if plots:
1156
+ from openpyxl.drawing.image import Image
1157
+
1158
+ if verbose:
1159
+ print(f"[CubeLogs.to_excel] plots {len(plots)} plots")
1160
+ sheet = writer.book.create_sheet("plots")
1161
+ pos = 0
1162
+ empty_row = 1
1163
+ times = self.data[self.time].dropna()
1164
+ mini, maxi = times.min(), times.max()
1165
+ title_suffix = (str(mini) if mini == maxi else f"{mini}-{maxi}").replace(
1166
+ " 00:00:00", ""
1167
+ )
1168
+ for plot in plots:
1169
+ imgs = plot.to_images(
1170
+ verbose=verbose, merge=True, title_suffix=title_suffix
1171
+ )
1172
+ for img in imgs:
1173
+ y = (pos // 2) * 16
1174
+ loc = f"A{y}" if pos % 2 == 0 else f"M{y}"
1175
+ sheet.add_image(Image(io.BytesIO(img)), loc)
1176
+ if verbose:
1177
+ no = f"{output}.png"
1178
+ print(f"[CubeLogs.to_excel] dump graphs into {no!r}")
1179
+ with open(no, "wb") as f:
1180
+ f.write(img)
1181
+ pos += 1
1182
+ empty_row += len(plots) + 2
1183
+
1184
+ if verbose:
1185
+ print(f"[CubeLogs.to_excel] applies style to {output!r}")
1186
+ apply_excel_style(writer, f_highlights) # type: ignore[arg-type]
1187
+ if verbose:
1188
+ print(f"[CubeLogs.to_excel] done with {len(views)} views")
1189
+
1190
+
1191
+ class CubeLogsPerformance(CubeLogs):
1192
+ """
1193
+ Processes logs coming from experiments, with defaults tuned for performance benchmarks.
1194
+ """
1195
+
1196
+ def __init__(
1197
+ self,
1198
+ data: Any,
1199
+ time: str = "DATE",
1200
+ keys: Sequence[str] = (
1201
+ "^version_.*",
1202
+ "^model_.*",
1203
+ "device",
1204
+ "opt_patterns",
1205
+ "suite",
1206
+ "memory_peak",
1207
+ "machine",
1208
+ "exporter",
1209
+ "dynamic",
1210
+ "rtopt",
1211
+ "dtype",
1212
+ "device",
1213
+ "architecture",
1214
+ ),
1215
+ values: Sequence[str] = (
1216
+ "^time_.*",
1217
+ "^disc.*",
1218
+ "^ERR_.*",
1219
+ "CMD",
1220
+ "^ITER",
1221
+ "^onnx_.*",
1222
+ "^op_onnx_.*",
1223
+ "^peak_gpu_.*",
1224
+ ),
1225
+ ignored: Sequence[str] = ("version_python",),
1226
+ recent: bool = True,
1227
+ formulas: Optional[
1228
+ Union[
1229
+ Sequence[str],
1230
+ Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
1231
+ ]
1232
+ ] = (
1233
+ "speedup",
1234
+ "bucket[speedup]",
1235
+ "ERR1",
1236
+ "n_models",
1237
+ "n_model_eager",
1238
+ "n_model_running",
1239
+ "n_model_acc01",
1240
+ "n_model_acc001",
1241
+ "n_model_dynamic",
1242
+ "n_model_pass",
1243
+ "n_model_faster",
1244
+ "n_model_faster2x",
1245
+ "n_model_faster3x",
1246
+ "n_model_faster4x",
1247
+ "n_node_attention",
1248
+ "n_node_control_flow",
1249
+ "n_node_scatter",
1250
+ "n_node_function",
1251
+ "n_node_initializer",
1252
+ "n_node_constant",
1253
+ "n_node_shape",
1254
+ "n_node_expand",
1255
+ "peak_gpu_torch",
1256
+ "peak_gpu_nvidia",
1257
+ "time_export_unbiased",
1258
+ ),
1259
+ fill_missing: Optional[Sequence[Tuple[str, Any]]] = (("model_attn_impl", "eager"),),
1260
+ keep_last_date: bool = False,
1261
+ ):
1262
+ super().__init__(
1263
+ data=data,
1264
+ time=time,
1265
+ keys=keys,
1266
+ values=values,
1267
+ ignored=ignored,
1268
+ recent=recent,
1269
+ formulas=formulas,
1270
+ fill_missing=fill_missing,
1271
+ keep_last_date=keep_last_date,
1272
+ )
1273
+
1274
+ def _process_formula(
1275
+ self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
1276
+ ) -> Callable[[pandas.DataFrame], pandas.Series]:
1277
+ """
1278
+ Processes a formula, converting it into a function.
1279
+
1280
+ :param formula: a formula string
1281
+ :return: a function
1282
+ """
1283
+ if callable(formula):
1284
+ return formula
1285
+ assert isinstance(
1286
+ formula, str
1287
+ ), f"Unexpected type for formula {type(formula)}: {formula!r}"
1288
+
1289
+ def gdf(df, cname, default_value=np.nan):
1290
+ if cname in df.columns:
1291
+ return df[cname]
1292
+ return pandas.Series(default_value, index=df.index)
1293
+
1294
+ def ghas_value(df, cname):
1295
+ if cname not in df.columns:
1296
+ return pandas.Series(np.nan, index=df.index)
1297
+ isna = df[cname].isna()
1298
+ return pandas.Series(np.where(isna, np.nan, 1.0), index=df.index)
1299
+
1300
+ def gpreserve(df, cname, series):
1301
+ if cname not in df.columns:
1302
+ return pandas.Series(np.nan, index=df.index)
1303
+ isna = df[cname].isna()
1304
+ return pandas.Series(np.where(isna, np.nan, series), index=df.index).astype(float)
1305
+
1306
+ if formula == "speedup":
1307
+ columns = set(self._filter_column(["^time_.*"], self.data.columns))
1308
+ assert "time_latency" in columns and "time_latency_eager" in columns, (
1309
+ f"Unable to apply formula {formula!r}, with columns\n"
1310
+ f"{pprint.pformat(sorted(columns))}"
1311
+ )
1312
+ return lambda df: df["time_latency_eager"] / df["time_latency"]
1313
+
1314
+ if formula == "bucket[speedup]":
1315
+ columns = set(self._filter_column(["^time_.*", "speedup"], self.data.columns))
1316
+ assert "speedup" in columns, (
1317
+ f"Unable to apply formula {formula!r}, with columns\n"
1318
+ f"{pprint.pformat(sorted(columns))}"
1319
+ )
1320
+ # return lambda df: df["time_latency_eager"] / df["time_latency"]
1321
+ return lambda df: pandas.cut(
1322
+ df["speedup"], bins=BUCKET_SCALES, right=False, duplicates="raise"
1323
+ )
1324
+
1325
+ if formula == "ERR1":
1326
+ columns = set(self._filter_column(["^ERR_.*"], self.data.columns))
1327
+ if not columns:
1328
+ return lambda df: np.nan
1329
+
1330
+ def first_err(df: pandas.DataFrame) -> pandas.Series:
1331
+ ordered = [
1332
+ c
1333
+ for c in [
1334
+ "ERR_timeout",
1335
+ "ERR_load",
1336
+ "ERR_feeds",
1337
+ "ERR_warmup_eager",
1338
+ "ERR_export",
1339
+ "ERR_ort",
1340
+ "ERR_warmup",
1341
+ # "ERR_std",
1342
+ # "ERR_crash",
1343
+ # "ERR_stdout",
1344
+ ]
1345
+ if c in df.columns
1346
+ ]
1347
+ res = None
1348
+ for c in ordered:
1349
+ if res is None:
1350
+ res = df[c].fillna("")
1351
+ else:
1352
+ res = pandas.Series(np.where(res != "", res, df[c].fillna("")))
1353
+ return res
1354
+
1355
+ return first_err
1356
+
1357
+ if formula.startswith("n_"):
1358
+ lambdas = dict(
1359
+ n_models=lambda df: ghas_value(df, "model_name"),
1360
+ n_model_eager=lambda df: ghas_value(df, "time_latency_eager"),
1361
+ n_model_running=lambda df: ghas_value(df, "time_latency"),
1362
+ n_model_acc01=lambda df: gpreserve(
1363
+ df, "discrepancies_abs", (gdf(df, "discrepancies_abs") <= 0.1)
1364
+ ),
1365
+ n_model_acc001=lambda df: gpreserve(
1366
+ df, "discrepancies_abs", gdf(df, "discrepancies_abs") <= 0.01
1367
+ ),
1368
+ n_model_dynamic=lambda df: gpreserve(
1369
+ df,
1370
+ "discrepancies_dynamic_abs",
1371
+ (gdf(df, "discrepancies_dynamic_abs") <= 0.1),
1372
+ ),
1373
+ n_model_pass=lambda df: gpreserve(
1374
+ df,
1375
+ "time_latency",
1376
+ (gdf(df, "discrepancies_abs", np.inf) < 0.1)
1377
+ & (gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98),
1378
+ ),
1379
+ n_model_faster=lambda df: gpreserve(
1380
+ df,
1381
+ "time_latency",
1382
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98,
1383
+ ),
1384
+ n_model_faster2x=lambda df: gpreserve(
1385
+ df,
1386
+ "time_latency",
1387
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 1.98,
1388
+ ),
1389
+ n_model_faster3x=lambda df: gpreserve(
1390
+ df,
1391
+ "time_latency",
1392
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 2.98,
1393
+ ),
1394
+ n_model_faster4x=lambda df: gpreserve(
1395
+ df,
1396
+ "time_latency",
1397
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 3.98,
1398
+ ),
1399
+ n_node_attention=lambda df: gpreserve(
1400
+ df,
1401
+ "op_onnx_com.microsoft_Attention",
1402
+ gdf(df, "op_onnx_com.microsoft_Attention")
1403
+ + gdf(df, "op_onnx_com.microsoft_MultiHeadAttention"),
1404
+ ),
1405
+ n_node_control_flow=lambda df: gpreserve(
1406
+ df,
1407
+ "op_onnx__If",
1408
+ (
1409
+ gdf(df, "op_onnx__If", 0)
1410
+ + gdf(df, "op_onnx__Scan", 0)
1411
+ + gdf(df, "op_onnx__Loop", 0)
1412
+ ),
1413
+ ),
1414
+ n_node_scatter=lambda df: gpreserve(
1415
+ df,
1416
+ "op_onnx__ScatterND",
1417
+ gdf(df, "op_onnx__ScatterND", 0) + gdf(df, "op_onnx__ScatterElements", 0),
1418
+ ),
1419
+ n_node_function=lambda df: gpreserve(
1420
+ df, "onnx_n_functions", gdf(df, "onnx_n_functions")
1421
+ ),
1422
+ n_node_initializer=lambda df: gpreserve(
1423
+ df, "onnx_n_initializer", gdf(df, "onnx_n_initializer")
1424
+ ),
1425
+ n_node_constant=lambda df: gpreserve(
1426
+ df, "op_onnx__Constant", gdf(df, "op_onnx__Constant")
1427
+ ),
1428
+ n_node_shape=lambda df: gpreserve(
1429
+ df, "op_onnx__Shape", gdf(df, "op_onnx__Shape")
1430
+ ),
1431
+ n_node_expand=lambda df: gpreserve(
1432
+ df, "op_onnx__Expand", gdf(df, "op_onnx__Expand")
1433
+ ),
1434
+ )
1435
+ assert (
1436
+ formula in lambdas
1437
+ ), f"Unexpected formula={formula!r}, should be in {sorted(lambdas)}"
1438
+ return lambdas[formula]
1439
+
1440
+ if formula == "peak_gpu_torch":
1441
+ return lambda df: gdf(df, "mema_gpu_5_after_export") - gdf(df, "mema_gpu_4_reset")
1442
+ if formula == "peak_gpu_nvidia":
1443
+ return (
1444
+ lambda df: (gdf(df, "memory_gpu0_peak") - gdf(df, "memory_gpu0_begin")) * 2**20
1445
+ )
1446
+ if formula == "time_export_unbiased":
1447
+
1448
+ def unbiased_export(df):
1449
+ if "time_warmup_first_iteration" not in df.columns:
1450
+ return pandas.Series(np.nan, index=df.index)
1451
+ return pandas.Series(
1452
+ np.where(
1453
+ df["exporter"] == "inductor",
1454
+ df["time_warmup_first_iteration"] + df["time_export_success"],
1455
+ df["time_export_success"],
1456
+ ),
1457
+ index=df.index,
1458
+ )
1459
+
1460
+ return lambda df: gpreserve(df, "time_warmup_first_iteration", unbiased_export(df))
1461
+
1462
+ raise ValueError(
1463
+ f"Unexpected formula {formula!r}, available columns are\n"
1464
+ f"{pprint.pformat(sorted(self.data.columns))}"
1465
+ )
1466
+
1467
+ def view(
1468
+ self,
1469
+ view_def: Union[str, CubeViewDef],
1470
+ return_view_def: bool = False,
1471
+ verbose: int = 0,
1472
+ ) -> Union[pandas.DataFrame, Tuple[pandas.DataFrame, CubeViewDef]]:
1473
+ """
1474
+ Returns a dataframe, a pivot view.
1475
+
1476
+ If view_def is a string, it is replaced by a predefined view.
1477
+
1478
+ :param view_def: view definition or a string
1479
+ :param return_view_def: returns the view definition as well
1480
+ :param verbose: verbosity level
1481
+ :return: dataframe
1482
+ """
1483
+ if isinstance(view_def, str):
1484
+ view_def = self.make_view_def(view_def)
1485
+ return super().view(view_def, return_view_def=return_view_def, verbose=verbose)
1486
+
1487
+ def make_view_def(self, name: str) -> CubeViewDef:
1488
+ """
1489
+ Returns a view definition.
1490
+
1491
+ :param name: name of the view
1492
+ :return: a CubeViewDef
1493
+
1494
+ Available views:
1495
+
1496
+ * **agg-suite:** aggregation per suite
1497
+ * **disc:** discrepancies
1498
+ * **speedup:** speedup
1499
+ * **bucket-speedup:** speedup in buckets
1500
+ * **time:** latency
1501
+ * **time_export:** time to export
1502
+ * **counts:** status, running, faster, has control flow, ...
1503
+ * **err:** important errors
1504
+ * **cmd:** command lines
1505
+ * **raw-short:** raw data without all the unused columns
1506
+ """
1507
+ fs = ["suite", "model_suite", "task", "model_name", "model_task"]
1508
+ index_cols = self._filter_column(fs, self.keys_time)
1509
+ assert index_cols, (
1510
+ f"No index columns found for {fs!r} in "
1511
+ f"{pprint.pformat(sorted(self.keys_time))}"
1512
+ )
1513
+ index_cols = [c for c in fs if c in set(index_cols)]
1514
+
1515
+ f_speedup = lambda x: ( # noqa: E731
1516
+ CubeViewDef.HighLightKind.NONE
1517
+ if not isinstance(x, (float, int))
1518
+ else (
1519
+ CubeViewDef.HighLightKind.RED
1520
+ if x < 0.9
1521
+ else (
1522
+ CubeViewDef.HighLightKind.GREEN
1523
+ if x > 1.1
1524
+ else CubeViewDef.HighLightKind.NONE
1525
+ )
1526
+ )
1527
+ )
1528
+ f_disc = lambda x: ( # noqa: E731
1529
+ CubeViewDef.HighLightKind.NONE
1530
+ if not isinstance(x, (float, int))
1531
+ else (
1532
+ CubeViewDef.HighLightKind.RED
1533
+ if x > 0.1
1534
+ else (
1535
+ CubeViewDef.HighLightKind.GREEN
1536
+ if x < 0.01
1537
+ else CubeViewDef.HighLightKind.NONE
1538
+ )
1539
+ )
1540
+ )
1541
+ f_bucket = lambda x: ( # noqa: E731
1542
+ CubeViewDef.HighLightKind.NONE
1543
+ if not isinstance(x, str)
1544
+ else (
1545
+ CubeViewDef.HighLightKind.RED
1546
+ if x in {"[-inf, 0.8)", "[0.8, 0.9)", "[0.9, 0.95)"}
1547
+ else (
1548
+ CubeViewDef.HighLightKind.NONE
1549
+ if x in {"[0.95, 0.98)", "[0.98, 1.02)", "[1.02, 1.05)"}
1550
+ else (
1551
+ CubeViewDef.HighLightKind.GREEN
1552
+ if "[" in x
1553
+ else CubeViewDef.HighLightKind.NONE
1554
+ )
1555
+ )
1556
+ )
1557
+ )
1558
+
1559
+ def mean_weight(gr):
1560
+ weight = gr["time_latency_eager"]
1561
+ x = gr["speedup"]
1562
+ if x.shape[0] == 0:
1563
+ return np.nan
1564
+ div = weight.sum()
1565
+ if div > 0:
1566
+ return (x * weight).sum() / div
1567
+ return np.nan
1568
+
1569
+ def mean_geo(gr):
1570
+ x = gr["speedup"]
1571
+ return np.exp(np.log(x.dropna()).mean())
1572
+
1573
+ order = ["model_attn_impl", "exporter", "opt_patterns", "DATE"]
1574
+ implemented_views = {
1575
+ "agg-suite": lambda: CubeViewDef(
1576
+ key_index=index_cols,
1577
+ values=self._filter_column(
1578
+ [
1579
+ "TIME_ITER",
1580
+ "speedup",
1581
+ "time_latency",
1582
+ "time_latency_eager",
1583
+ "time_export_success",
1584
+ "time_export_unbiased",
1585
+ "^n_.*",
1586
+ "target_opset",
1587
+ "onnx_filesize",
1588
+ "onnx_weight_size_torch",
1589
+ "onnx_weight_size_proto",
1590
+ "onnx_n_nodes",
1591
+ "peak_gpu_torch",
1592
+ "peak_gpu_nvidia",
1593
+ ],
1594
+ self.values,
1595
+ ),
1596
+ ignore_unique=True,
1597
+ key_agg=["model_name", "task", "model_task"],
1598
+ agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
1599
+ agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
1600
+ keep_columns_in_index=["suite"],
1601
+ name="agg-suite",
1602
+ order=order,
1603
+ ),
1604
+ "agg-all": lambda: CubeViewDef(
1605
+ key_index=index_cols,
1606
+ values=self._filter_column(
1607
+ [
1608
+ "TIME_ITER",
1609
+ "speedup",
1610
+ "time_latency",
1611
+ "time_latency_eager",
1612
+ "time_export_success",
1613
+ "time_export_unbiased",
1614
+ "^n_.*",
1615
+ "target_opset",
1616
+ "onnx_filesize",
1617
+ "onnx_weight_size_torch",
1618
+ "onnx_weight_size_proto",
1619
+ "onnx_n_nodes",
1620
+ "peak_gpu_torch",
1621
+ "peak_gpu_nvidia",
1622
+ ],
1623
+ self.values,
1624
+ ),
1625
+ ignore_unique=True,
1626
+ key_agg=["model_name", "task", "model_task", "suite"],
1627
+ agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
1628
+ agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
1629
+ name="agg-all",
1630
+ order=order,
1631
+ plots=True,
1632
+ ),
1633
+ "disc": lambda: CubeViewDef(
1634
+ key_index=index_cols,
1635
+ values=self._filter_column(["discrepancies_abs"], self.values),
1636
+ ignore_unique=True,
1637
+ keep_columns_in_index=["suite"],
1638
+ f_highlight=f_disc,
1639
+ name="disc",
1640
+ order=order,
1641
+ ),
1642
+ "speedup": lambda: CubeViewDef(
1643
+ key_index=index_cols,
1644
+ values=self._filter_column(["speedup"], self.values),
1645
+ ignore_unique=True,
1646
+ keep_columns_in_index=["suite"],
1647
+ f_highlight=f_speedup,
1648
+ name="speedup",
1649
+ order=order,
1650
+ ),
1651
+ "counts": lambda: CubeViewDef(
1652
+ key_index=index_cols,
1653
+ values=self._filter_column(["^n_.*"], self.values),
1654
+ ignore_unique=True,
1655
+ keep_columns_in_index=["suite"],
1656
+ name="counts",
1657
+ order=order,
1658
+ ),
1659
+ "peak-gpu": lambda: CubeViewDef(
1660
+ key_index=index_cols,
1661
+ values=self._filter_column(["^peak_gpu_.*"], self.values),
1662
+ ignore_unique=True,
1663
+ keep_columns_in_index=["suite"],
1664
+ name="peak-gpu",
1665
+ order=order,
1666
+ ),
1667
+ "time": lambda: CubeViewDef(
1668
+ key_index=index_cols,
1669
+ values=self._filter_column(
1670
+ ["time_latency", "time_latency_eager"], self.values
1671
+ ),
1672
+ ignore_unique=True,
1673
+ keep_columns_in_index=["suite"],
1674
+ name="time",
1675
+ order=order,
1676
+ ),
1677
+ "time_export": lambda: CubeViewDef(
1678
+ key_index=index_cols,
1679
+ values=self._filter_column(["time_export_unbiased"], self.values),
1680
+ ignore_unique=True,
1681
+ keep_columns_in_index=["suite"],
1682
+ name="time_export",
1683
+ order=order,
1684
+ ),
1685
+ "err": lambda: CubeViewDef(
1686
+ key_index=index_cols,
1687
+ values=self._filter_column(
1688
+ ["ERR1", "ERR_timeout", "ERR_export", "ERR_crash"], self.values
1689
+ ),
1690
+ ignore_unique=True,
1691
+ keep_columns_in_index=["suite"],
1692
+ name="err",
1693
+ order=order,
1694
+ ),
1695
+ "bucket-speedup": lambda: CubeViewDef(
1696
+ key_index=index_cols,
1697
+ values=self._filter_column(["bucket[speedup]"], self.values),
1698
+ ignore_unique=True,
1699
+ keep_columns_in_index=["suite"],
1700
+ name="bucket-speedup",
1701
+ f_highlight=f_bucket,
1702
+ order=order,
1703
+ ),
1704
+ "cmd": lambda: CubeViewDef(
1705
+ key_index=index_cols,
1706
+ values=self._filter_column(["CMD"], self.values),
1707
+ ignore_unique=True,
1708
+ keep_columns_in_index=["suite"],
1709
+ name="cmd",
1710
+ order=order,
1711
+ ),
1712
+ "raw-short": lambda: CubeViewDef(
1713
+ key_index=self.keys_time,
1714
+ values=[c for c in self.values if c not in {"ERR_std", "ERR_stdout"}],
1715
+ ignore_unique=False,
1716
+ keep_columns_in_index=["suite"],
1717
+ name="raw-short",
1718
+ no_index=True,
1719
+ ),
1720
+ }
1721
+ assert name in implemented_views, (
1722
+ f"Unknown view {name!r}, expected a name in {sorted(implemented_views)},"
1723
+ f"\n--\nkeys={pprint.pformat(sorted(self.keys_time))}, "
1724
+ f"\n--\nvalues={pprint.pformat(sorted(self.values))}"
1725
+ )
1726
+ return implemented_views[name]()
1727
+
1728
+ def post_load_process_piece(
1729
+ self, df: pandas.DataFrame, unique: bool = False
1730
+ ) -> pandas.DataFrame:
1731
+ df = super().post_load_process_piece(df, unique=unique)
1732
+ if unique:
1733
+ return df
1734
+ cols = self._filter_column(self._keys, df)
1735
+ res = None
1736
+ for c in cols:
1737
+ if df[c].isna().any():
1738
+ # Missing values for keys are not supposed to happen.
1739
+ uniq = set(df[c].dropna())
1740
+ if len(uniq) == 1:
1741
+ if res is None:
1742
+ res = df.copy()
1743
+ res[c] = res[c].fillna(uniq.pop())
1744
+ return df if res is None else res