onnx-diagnostic 0.7.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,20 +1,32 @@
1
1
  import datetime
2
+ import enum
2
3
  import glob
4
+ import io
3
5
  import os
6
+ import pprint
4
7
  import re
8
+ import warnings
5
9
  import zipfile
6
10
  from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union
7
11
  import numpy as np
8
12
  import pandas
9
- from pandas.api.types import is_numeric_dtype
13
+ from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
10
14
  from .helper import string_sig
11
15
 
16
+ BUCKET_SCALES_VALUES = np.array(
17
+ [-np.inf, -20, -10, -5, -2, 0, 2, 5, 10, 20, 100, 200, 300, 400, np.inf], dtype=float
18
+ )
19
+
20
+
21
+ BUCKET_SCALES = BUCKET_SCALES_VALUES / 100 + 1
22
+
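
For reference, dividing the scale by 100 and adding 1 turns these percentage deltas into multiplicative bucket edges. A quick check (illustrative only, not part of the diff):

    import numpy as np

    BUCKET_SCALES_VALUES = np.array(
        [-np.inf, -20, -10, -5, -2, 0, 2, 5, 10, 20, 100, 200, 300, 400, np.inf], dtype=float
    )
    # -> [-inf, 0.8, 0.9, 0.95, 0.98, 1.0, 1.02, 1.05, 1.1, 1.2, 2.0, 3.0, 4.0, 5.0, inf]
    print(BUCKET_SCALES_VALUES / 100 + 1)
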
12
23
 
13
24
  def enumerate_csv_files(
14
25
  data: Union[
15
26
  pandas.DataFrame, List[Union[str, Tuple[str, str]]], str, Tuple[str, str, str, str]
16
27
  ],
17
28
  verbose: int = 0,
29
+ filtering: Optional[Callable[[str], bool]] = None,
18
30
  ) -> Iterator[Union[pandas.DataFrame, str, Tuple[str, str, str, str]]]:
19
31
  """
20
32
  Enumerates files considered for the aggregation.
@@ -23,6 +35,10 @@ def enumerate_csv_files(
23
35
  loops over csv candidates.
24
36
 
25
37
  :param data: dataframe with the raw data or a file or list of files
38
+ :param verbose: verbosity
39
+ :param filtering: function used to filter files inside zip archives in or out,
39
+ must return True to keep a file, False to skip it.
41
+ :return: a generator yielding tuples with the filename, date, full path and zip file
26
42
 
27
43
  data can contains:
28
44
  * a dataframe
@@ -52,13 +68,9 @@ def enumerate_csv_files(
52
68
  # We check the first line is ok.
53
69
  if verbose:
54
70
  print(f"[enumerate_csv_files] data[{itn}] is a csv file: {filename!r}]")
55
- with open(filename, "r", encoding="utf-8") as f:
56
- line = f.readline()
57
- if "~help" in line or (",CMD" not in line and ",DATE" not in line):
58
- continue
59
- dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
60
- du = dt.strftime("%Y-%m-%d %H:%M:%S")
61
- yield (os.path.split(filename)[-1], du, filename, "")
71
+ dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
72
+ du = dt.strftime("%Y-%m-%d %H:%M:%S")
73
+ yield (os.path.split(filename)[-1], du, filename, "")
62
74
  continue
63
75
 
64
76
  if ext == ".zip":
@@ -67,8 +79,11 @@ def enumerate_csv_files(
67
79
  zf = zipfile.ZipFile(filename, "r")
68
80
  for ii, info in enumerate(zf.infolist()):
69
81
  name = info.filename
70
- ext = os.path.splitext(name)[-1]
71
- if ext != ".csv":
82
+ if filtering is None:
83
+ ext = os.path.splitext(name)[-1]
84
+ if ext != ".csv":
85
+ continue
86
+ elif not filtering(name):
72
87
  continue
73
88
  if verbose:
74
89
  print(
@@ -96,7 +111,7 @@ def enumerate_csv_files(
96
111
  for ii, f in enumerate(found):
97
112
  if verbose:
98
113
  print(f"[enumerate_csv_files] data[{itn}][{ii}] {f!r} from {filename!r}")
99
- yield from enumerate_csv_files(f, verbose=verbose)
114
+ yield from enumerate_csv_files(f, verbose=verbose, filtering=filtering)
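
A minimal sketch of the new filtering argument (hypothetical usage, the archive name and the predicate are made up):

    # Only keep csv entries of a zip archive whose name mentions "benchmark".
    for name, date, path, zip_name in enumerate_csv_files(
        ["results.zip"],
        filtering=lambda n: n.endswith(".csv") and "benchmark" in n,
    ):
        print(name, date, path, zip_name)
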
100
115
 
101
116
 
102
117
  def open_dataframe(
@@ -140,10 +155,26 @@ class CubeViewDef:
140
155
  :param order: to reorder key in columns index
141
156
  :param key_agg: aggregate according to these columns before
142
157
  creating the view
143
- :param agg_args: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`
158
+ :param agg_args: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`,
159
+ it can also be a callable returning a different aggregation
160
+ method depending on the column name
144
161
  :param agg_kwargs: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`
162
+ :param agg_multi: aggregation over multiple columns
163
+ :param ignore_columns: columns to ignore, typically because they overload the view
164
+ :param keep_columns_in_index: keeps these columns in the index even if they have only one unique value
165
+ :param dropna: drops rows whose values are all nan
166
+ :param transpose: transposes the final view
167
+ :param f_highlight: function used to highlight some values
168
+ :param name: name of the view, mostly used for debugging
169
+ :param plots: adds plots to the Excel sheet
170
+ :param no_index: removes the index (but keeps the columns)
145
171
  """
146
172
 
173
+ class HighLightKind(enum.IntEnum):
174
+ NONE = 0
175
+ RED = 1
176
+ GREEN = 2
177
+
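
An illustrative way to build a view with the parameters documented above (a sketch, assuming the logs expose columns such as model_name, time_latency and speedup):

    view = CubeViewDef(
        key_index=["model_name"],            # rows of the pivot
        values=["time_latency", "speedup"],  # measured columns
        agg_args=("mean",),                  # forwarded to DataFrameGroupBy.agg
        name="latency-per-model",
    )
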
147
178
  def __init__(
148
179
  self,
149
180
  key_index: Sequence[str],
@@ -151,8 +182,19 @@ class CubeViewDef:
151
182
  ignore_unique: bool = True,
152
183
  order: Optional[Sequence[str]] = None,
153
184
  key_agg: Optional[Sequence[str]] = None,
154
- agg_args: Sequence[Any] = ("sum",),
185
+ agg_args: Union[Sequence[Any], Callable[[str], Any]] = ("sum",),
155
186
  agg_kwargs: Optional[Dict[str, Any]] = None,
187
+ agg_multi: Optional[
188
+ Dict[str, Callable[[pandas.core.groupby.DataFrameGroupBy], pandas.Series]]
189
+ ] = None,
190
+ ignore_columns: Optional[Sequence[str]] = None,
191
+ keep_columns_in_index: Optional[Sequence[str]] = None,
192
+ dropna: bool = True,
193
+ transpose: bool = False,
194
+ f_highlight: Optional[Callable[[Any], "CubeViewDef.HighLightKind"]] = None,
195
+ name: Optional[str] = None,
196
+ no_index: bool = False,
197
+ plots: bool = False,
156
198
  ):
157
199
  self.key_index = key_index
158
200
  self.values = values
@@ -161,11 +203,237 @@ class CubeViewDef:
161
203
  self.key_agg = key_agg
162
204
  self.agg_args = agg_args
163
205
  self.agg_kwargs = agg_kwargs
206
+ self.agg_multi = agg_multi
207
+ self.dropna = dropna
208
+ self.ignore_columns = ignore_columns
209
+ self.keep_columns_in_index = keep_columns_in_index
210
+ self.f_highlight = f_highlight
211
+ self.transpose = transpose
212
+ self.name = name
213
+ self.no_index = no_index
214
+ self.plots = plots
215
+
216
+ def __repr__(self) -> str:
217
+ "usual"
218
+ return string_sig(self) # type: ignore[arg-type]
219
+
220
+
221
+ def apply_excel_style(
222
+ filename_or_writer: Any,
223
+ f_highlights: Optional[Dict[str, Callable[[Any], CubeViewDef.HighLightKind]]] = None,
224
+ ):
225
+ """
226
+ Applies styles on all sheets in a file unless the sheet is too big.
227
+
228
+ :param filename_or_writer: filename or ExcelWriter, modified in place
229
+ :param f_highlights: color functions to apply, one per sheet
230
+ """
231
+ from openpyxl import load_workbook
232
+ from openpyxl.styles import Alignment
233
+ from openpyxl.utils import get_column_letter
234
+ from openpyxl.styles import Font # , PatternFill, numbers
235
+
236
+ if isinstance(filename_or_writer, str):
237
+ workbook = load_workbook(filename_or_writer)
238
+ save = True
239
+ else:
240
+ workbook = filename_or_writer.book
241
+ save = False
242
+
243
+ left = Alignment(horizontal="left")
244
+ left_shrink = Alignment(horizontal="left", shrink_to_fit=True)
245
+ right = Alignment(horizontal="right")
246
+ font_colors = {
247
+ CubeViewDef.HighLightKind.GREEN: Font(color="00AA00"),
248
+ CubeViewDef.HighLightKind.RED: Font(color="FF0000"),
249
+ }
250
+
251
+ for name in workbook.sheetnames:
252
+ f_highlight = f_highlights.get(name, None) if f_highlights else None
253
+ sheet = workbook[name]
254
+ n_rows = sheet.max_row
255
+ n_cols = sheet.max_column
256
+ if n_rows * n_cols > 2**18:
257
+ # Too big.
258
+ continue
259
+ co: Dict[int, int] = {}
260
+ sizes: Dict[int, int] = {}
261
+ cols = set()
262
+ for i in range(1, n_rows):
263
+ for j, cell in enumerate(sheet[i]):
264
+ if j > n_cols:
265
+ break
266
+ cols.add(cell.column)
267
+ if isinstance(cell.value, float):
268
+ co[j] = co.get(j, 0) + 1
269
+ elif isinstance(cell.value, str):
270
+ sizes[cell.column] = max(sizes.get(cell.column, 0), len(cell.value))
271
+
272
+ for k, v in sizes.items():
273
+ c = get_column_letter(k)
274
+ sheet.column_dimensions[c].width = min(max(8, v), 30)
275
+ for k in cols:
276
+ if k not in sizes:
277
+ c = get_column_letter(k)
278
+ sheet.column_dimensions[c].width = 15
279
+
280
+ for i in range(1, n_rows):
281
+ for j, cell in enumerate(sheet[i]):
282
+ if j > n_cols:
283
+ break
284
+ if isinstance(cell.value, pandas.Timestamp):
285
+ cell.alignment = right
286
+ dt = cell.value.to_pydatetime()
287
+ cell.value = dt
288
+ cell.number_format = (
289
+ "YYYY-MM-DD"
290
+ if (
291
+ dt.hour == 0
292
+ and dt.minute == 0
293
+ and dt.second == 0
294
+ and dt.microsecond == 0
295
+ )
296
+ else "YYYY-MM-DD 00:00:00"
297
+ )
298
+ elif isinstance(cell.value, (float, int)):
299
+ cell.alignment = right
300
+ x = abs(cell.value)
301
+ if int(x) == x:
302
+ cell.number_format = "0"
303
+ elif x > 5000:
304
+ cell.number_format = "# ##0"
305
+ elif x >= 500:
306
+ cell.number_format = "0.0"
307
+ elif x >= 50:
308
+ cell.number_format = "0.00"
309
+ elif x >= 5:
310
+ cell.number_format = "0.000"
311
+ elif x > 0.5:
312
+ cell.number_format = "0.0000"
313
+ elif x > 0.005:
314
+ cell.number_format = "0.00000"
315
+ else:
316
+ cell.number_format = "0.000E+00"
317
+ if f_highlight:
318
+ h = f_highlight(cell.value)
319
+ if h in font_colors:
320
+ cell.font = font_colors[h]
321
+ elif isinstance(cell.value, str) and len(cell.value) > 70:
322
+ cell.alignment = left_shrink
323
+ else:
324
+ cell.alignment = left
325
+ if f_highlight:
326
+ h = f_highlight(cell.value)
327
+ if h in font_colors:
328
+ cell.font = font_colors[h]
329
+ if save:
330
+ workbook.save(filename_or_writer)
331
+
332
+
333
+ class CubePlot:
334
+ """
335
+ Creates a plot.
336
+ """
337
+
338
+ def __init__(
339
+ self, df: pandas.DataFrame, kind: str = "bar", orientation="col", split: bool = True
340
+ ):
341
+ self.df = df.copy()
342
+ self.kind = kind
343
+ self.orientation = orientation
344
+ self.split = split
345
+
346
+ if isinstance(self.df.columns, pandas.MultiIndex):
347
+ self.df.columns = ["/".join(map(str, i)) for i in self.df.columns]
348
+ if isinstance(self.df.index, pandas.MultiIndex):
349
+ self.df.index = ["/".join(map(str, i)) for i in self.df.index]
164
350
 
165
351
  def __repr__(self) -> str:
166
352
  "usual"
167
353
  return string_sig(self) # type: ignore[arg-type]
168
354
 
355
+ def to_images(
356
+ self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
357
+ ):
358
+ """
359
+ Converts data into plots and images.
360
+ """
361
+ import matplotlib.pyplot as plt
362
+
363
+ df = self.df.T if self.orientation == "row" else self.df
364
+ imgs = []
365
+ if verbose:
366
+ from tqdm import tqdm
367
+
368
+ loop = tqdm(df.columns)
369
+ else:
370
+ loop = df.columns
371
+ title_suffix = f"\n{title_suffix}" if title_suffix else ""
372
+ if merge:
373
+ nn = len(df.columns) // 2
374
+ nn += nn % 2
375
+ fig, axs = plt.subplots(nn, 2, figsize=(12, 3 * nn * df.shape[0] / 12))
376
+ pos = 0
377
+ for c in loop:
378
+ ax = axs[pos // 2, pos % 2]
379
+ df[c].plot.barh(title=f"{c}{title_suffix}", ax=ax)
380
+ ax.tick_params(axis="both", which="major", labelsize=8)
381
+ ax.grid(True)
382
+ pos += 1 # noqa: SIM113
383
+ fig.tight_layout()
384
+ imgdata = io.BytesIO()
385
+ fig.savefig(imgdata, format="png")
386
+ imgs.append(imgdata.getvalue())
387
+ plt.close()
388
+ else:
389
+ for c in loop:
390
+ fig, ax = plt.subplots(1, 1, figsize=(3, 3))
391
+ df[c].plot.barh(title=c, ax=ax)
392
+ ax.tick_params(axis="both", which="major", labelsize=8)
393
+ ax.grid(True)
394
+ fig.tight_layout()
395
+ imgdata = io.BytesIO()
396
+ fig.savefig(imgdata, format="png")
397
+ imgs.append(imgdata.getvalue())
398
+ plt.close()
399
+ return imgs
400
+
401
+ def to_charts(self, writer: pandas.ExcelWriter, sheet, empty_row: int = 1):
402
+ """
403
+ Draws plots on a page.
404
+ The data is copied on this page.
405
+
406
+ :param name: sheet name
407
+ :param writer: writer (from pandas)
408
+ :param sheet_name: sheet
409
+ :param graph_index: graph index
410
+ :return: list of graph
411
+ """
412
+ assert self.split, f"Not implemented if split={self.split}"
413
+ assert self.orientation == "row", f"Not implemented if orientation={self.orientation}"
414
+ workbook = writer.book
415
+ labels = list(self.df.columns)
416
+ sheet.write_row(empty_row, 0, labels)
417
+
418
+ charts = []
419
+ pos = empty_row + 1
420
+ for i in self.df.index:
421
+ values = self.df.loc[i, :].tolist()
422
+ values = [("" if isinstance(v, float) and np.isnan(v) else v) for v in values]
423
+ sheet.write_row(pos, 0, values)
424
+ chart = workbook.add_chart({"type": "bar"})
425
+ chart.add_series(
426
+ {
427
+ "name": i,
428
+ "categories": [i, 1, empty_row, len(labels), empty_row],
429
+ "values": [i, 1, pos, len(labels), pos],
430
+ }
431
+ )
432
+ chart.set_title({"name": i})
433
+ charts.append(chart)
434
+ pos += 1
435
+ return charts
436
+
169
437
 
170
438
  class CubeLogs:
171
439
  """
@@ -180,7 +448,14 @@ class CubeLogs:
180
448
  values: Sequence[str] = ("time_.*", "disc_.*"),
181
449
  ignored: Sequence[str] = (),
182
450
  recent: bool = False,
183
- formulas: Optional[Dict[str, Callable[[pandas.DataFrame], pandas.Series]]] = None,
451
+ formulas: Optional[
452
+ Union[
453
+ Sequence[str],
454
+ Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
455
+ ]
456
+ ] = None,
457
+ fill_missing: Optional[Sequence[Tuple[str, Any]]] = None,
458
+ keep_last_date: bool = False,
184
459
  ):
185
460
  self._data = data
186
461
  self._time = time
@@ -189,24 +464,51 @@ class CubeLogs:
189
464
  self._ignored = ignored
190
465
  self.recent = recent
191
466
  self._formulas = formulas
467
+ self.fill_missing = fill_missing
468
+ self.keep_last_date = keep_last_date
469
+
470
+ def post_load_process_piece(
471
+ self, df: pandas.DataFrame, unique: bool = False
472
+ ) -> pandas.DataFrame:
473
+ """
474
+ Postprocesses a piece when a cube is made of multiple pieces
475
+ before it gets merged.
476
+ """
477
+ if not self.fill_missing:
478
+ return df
479
+ missing = dict(self.fill_missing)
480
+ for k, v in missing.items():
481
+ if k not in df.columns:
482
+ df[k] = v
483
+ return df
192
484
 
193
485
  def load(self, verbose: int = 0):
194
486
  """Loads and preprocesses the data. Returns self."""
195
487
  if isinstance(self._data, pandas.DataFrame):
196
488
  if verbose:
197
489
  print(f"[CubeLogs.load] load from dataframe, shape={self._data.shape}")
198
- self.data = self._data
490
+ self.data = self.post_load_process_piece(self._data, unique=True)
491
+ if verbose:
492
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
199
493
  elif isinstance(self._data, list) and all(isinstance(r, dict) for r in self._data):
200
494
  if verbose:
201
495
  print(f"[CubeLogs.load] load from list of dicts, n={len(self._data)}")
202
- self.data = pandas.DataFrame(self._data)
496
+ self.data = pandas.DataFrame(self.post_load_process_piece(self._data, unique=True))
497
+ if verbose:
498
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
203
499
  elif isinstance(self._data, list) and all(
204
500
  isinstance(r, pandas.DataFrame) for r in self._data
205
501
  ):
206
502
  if verbose:
207
503
  print(f"[CubeLogs.load] load from list of DataFrame, n={len(self._data)}")
208
- self.data = pandas.concat(self._data, axis=0)
504
+ self.data = pandas.concat(
505
+ [self.post_load_process_piece(c) for c in self._data], axis=0
506
+ )
507
+ if verbose:
508
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
209
509
  elif isinstance(self._data, list):
510
+ if verbose:
511
+ print("[CubeLogs.load] load from list of Cubes")
210
512
  cubes = []
211
513
  for item in enumerate_csv_files(self._data, verbose=verbose):
212
514
  df = open_dataframe(item)
@@ -219,8 +521,10 @@ class CubeLogs:
219
521
  recent=self.recent,
220
522
  )
221
523
  cube.load()
222
- cubes.append(cube.data)
524
+ cubes.append(self.post_load_process_piece(cube.data))
223
525
  self.data = pandas.concat(cubes, axis=0)
526
+ if verbose:
527
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
224
528
  else:
225
529
  raise NotImplementedError(
226
530
  f"Not implemented with the provided data (type={type(self._data)})"
@@ -236,59 +540,101 @@ class CubeLogs:
236
540
  self._initialize_columns()
237
541
  if verbose:
238
542
  print(f"[CubeLogs.load] time={self.time}")
239
- print(f"[CubeLogs.load] keys={self.keys}")
543
+ print(f"[CubeLogs.load] keys={self.keys_no_time}")
240
544
  print(f"[CubeLogs.load] values={self.values}")
241
545
  print(f"[CubeLogs.load] ignored={self.ignored}")
242
546
  print(f"[CubeLogs.load] ignored_values={self.ignored_values}")
243
547
  print(f"[CubeLogs.load] ignored_keys={self.ignored_keys}")
548
+ assert self.keys_no_time, f"No keys found with {self._keys} from {self.data.columns}"
549
+ assert self.values, f"No values found with {self._values} from {self.data.columns}"
244
550
  assert not (
245
- set(self.keys) & set(self.values)
246
- ), f"Columns {set(self.keys) & set(self.values)} cannot be keys and values"
551
+ set(self.keys_no_time) & set(self.values)
552
+ ), f"Columns {set(self.keys_no_time) & set(self.values)} cannot be keys and values"
247
553
  assert not (
248
- set(self.keys) & set(self.ignored)
249
- ), f"Columns {set(self.keys) & set(self.ignored)} cannot be keys and ignored"
554
+ set(self.keys_no_time) & set(self.ignored)
555
+ ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be keys and ignored"
250
556
  assert not (
251
557
  set(self.values) & set(self.ignored)
252
- ), f"Columns {set(self.keys) & set(self.ignored)} cannot be values and ignored"
558
+ ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be values and ignored"
253
559
  assert (
254
- self.time not in self.keys
560
+ self.time not in self.keys_no_time
255
561
  and self.time not in self.values
256
562
  and self.time not in self.ignored
257
- ), f"Column {self.time!r} is also a key, a value or ignored"
258
- self._columns = [self.time, *self.keys, *self.values, *self.ignored]
563
+ ), (
564
+ f"Column {self.time!r} is also a key, a value or ignored, "
565
+ f"keys={sorted(self.keys_no_time)}, values={sorted(self.values)}, "
566
+ f"ignored={sorted(self.ignored)}"
567
+ )
568
+ self._columns = [self.time, *self.keys_no_time, *self.values, *self.ignored]
259
569
  self.dropped = [c for c in self.data.columns if c not in set(self.columns)]
260
570
  self.data = self.data[self.columns]
261
571
  if verbose:
262
572
  print(f"[CubeLogs.load] dropped={self.dropped}")
263
573
  print(f"[CubeLogs.load] data.shape={self.data.shape}")
264
574
 
575
+ shape = self.data.shape
576
+ if verbose:
577
+ print(f"[CubeLogs.load] removed columns, shape={self.data.shape}")
265
578
  self._preprocess()
579
+ if verbose:
580
+ print(f"[CubeLogs.load] preprocess, shape={self.data.shape}")
581
+ assert (
582
+ self.data.shape[0] > 0
583
+ ), f"The preprocessing reduced shape {shape} to {self.data.shape}."
266
584
  if self.recent and verbose:
267
585
  print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")
268
586
 
269
587
  # Let's apply the formulas
270
588
  if self._formulas:
271
- cols = set(self.data.columns)
272
- for k, f in self._formulas.items():
273
- if k in cols:
589
+ forms = (
590
+ {k: k for k in self._formulas}
591
+ if not isinstance(self._formulas, dict)
592
+ else self._formulas
593
+ )
594
+ cols = set(self.values)
595
+ for k, ff in forms.items():
596
+ f = self._process_formula(ff)
597
+ if k in cols or f is None:
274
598
  if verbose:
275
599
  print(f"[CubeLogs.load] skip formula {k!r}")
276
600
  else:
277
601
  if verbose:
278
602
  print(f"[CubeLogs.load] apply formula {k!r}")
279
603
  self.data[k] = f(self.data)
280
- self.values_for_key = {k: set(self.data[k]) for k in self.keys}
281
- nans = [
282
- c for c in [self.time, *self.keys] if self.data[c].isna().astype(int).sum() > 0
604
+ self.values.append(k)
605
+ cols.add(k)
606
+ self.values_for_key = {k: set(self.data[k].dropna()) for k in self.keys_time}
607
+ for k in self.keys_no_time:
608
+ if self.data[k].isna().max():
609
+ self.values_for_key[k].add(np.nan)
610
+ self.keys_with_nans = [
611
+ c for c in self.keys_time if self.data[c].isna().astype(int).sum() > 0
283
612
  ]
284
- assert not nans, f"The following keys {nans} have nan values. This is not allowed."
285
613
  if verbose:
286
614
  print(f"[CubeLogs.load] convert column {self.time!r} into date")
615
+ if self.keys_with_nans:
616
+ print(f"[CubeLogs.load] keys_with_nans={self.keys_with_nans}")
287
617
  self.data[self.time] = pandas.to_datetime(self.data[self.time])
618
+
619
+ if self.keep_last_date:
620
+ times = self.data[self.time].dropna()
621
+ mi, mx = times.min(), times.max()
622
+ if mi != mx:
623
+ print(f"[CubeLogs.load] setting all dates in column {self.time} to {mx!r}")
624
+ self.data.loc[~self.data[self.time].isna(), self.time] = mx
625
+ self.values_for_key[self.time] = {mx}
626
+ if self.data[self.time].isna().max():
627
+ self.values_for_key[self.time].add(np.nan)
288
628
  if verbose:
289
629
  print(f"[CubeLogs.load] done, shape={self.shape}")
290
630
  return self
291
631
 
632
+ def _process_formula(
633
+ self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
634
+ ) -> Callable[[pandas.DataFrame], pandas.Series]:
635
+ assert callable(formula), f"formula={formula!r} is not supported."
636
+ return formula
637
+
292
638
  @property
293
639
  def shape(self) -> Tuple[int, int]:
294
640
  "Returns the shape."
@@ -303,7 +649,7 @@ class CubeLogs:
303
649
 
304
650
  def _preprocess(self):
305
651
  last = self.values[0]
306
- gr = self.data[[self.time, *self.keys, last]].groupby([self.time, *self.keys]).count()
652
+ gr = self.data[[*self.keys_time, last]].groupby(self.keys_time, dropna=False).count()
307
653
  gr = gr[gr[last] > 1]
308
654
  if self.recent:
309
655
  cp = self.data.copy()
@@ -312,11 +658,15 @@ class CubeLogs:
312
658
  ), f"'__index__' should not be a column in {cp.columns}"
313
659
  cp["__index__"] = np.arange(cp.shape[0])
314
660
  gr = (
315
- cp[[*self.keys, self.time, "__index__"]]
316
- .groupby(self.keys, as_index=False)
661
+ cp[[*self.keys_time, "__index__"]]
662
+ .groupby(self.keys_no_time, as_index=False, dropna=False)
317
663
  .max()
318
664
  )
319
- filtered = pandas.merge(cp, gr, on=[self.time, "__index__", *self.keys])
665
+ assert gr.shape[0] > 0, (
666
+ f"Something went wrong after the groupby.\n"
667
+ f"{cp[[*self.keys, self.time, '__index__']].head().T}"
668
+ )
669
+ filtered = pandas.merge(cp, gr, on=["__index__", *self.keys_time])
320
670
  assert filtered.shape[0] <= self.data.shape[0], (
321
671
  f"Keeping the latest row brings more row {filtered.shape} "
322
672
  f"(initial is {self.data.shape})."
@@ -324,18 +674,20 @@ class CubeLogs:
324
674
  self.data = filtered.drop("__index__", axis=1)
325
675
  else:
326
676
  assert gr.shape[0] == 0, f"There are duplicated rows:\n{gr}"
327
- gr = self.data[[*self.keys, self.time]].groupby(self.keys).count()
328
- gr = gr[gr[self.time] > 1]
329
- assert (
330
- gr.shape[0] == 0
331
- ), f"recent should be true to keep the most recent row:\n{gr}"
332
677
 
333
678
  @classmethod
334
679
  def _filter_column(cls, filters, columns, can_be_empty=False):
680
+ assert list(columns), "columns is empty"
335
681
  set_cols = set()
336
682
  for f in filters:
337
- reg = re.compile(f)
338
- cols = [c for c in columns if reg.search(c)]
683
+ if set(f) & {'"', "^", ".", "*", "+", "{", "}"}:
684
+ reg = re.compile(f)
685
+ cols = [c for c in columns if reg.search(c)]
686
+ elif f in columns:
687
+ # No regular expression.
688
+ cols = [f]
689
+ else:
690
+ continue
339
691
  set_cols |= set(cols)
340
692
  assert (
341
693
  can_be_empty or set_cols
@@ -343,25 +695,31 @@ class CubeLogs:
343
695
  return sorted(set_cols)
344
696
 
345
697
  def _initialize_columns(self):
346
- self.keys = self._filter_column(self._keys, self.data.columns)
698
+ keys = self._filter_column(self._keys, self.data.columns)
347
699
  self.values = self._filter_column(self._values, self.data.columns)
348
700
  self.ignored = self._filter_column(self._ignored, self.data.columns, True)
349
701
  assert (
350
702
  self._time in self.data.columns
351
- ), f"Column {self._time} not found in {self.data.columns}"
352
- ignored_keys = set(self.ignored) & set(self.keys)
703
+ ), f"Column {self._time} not found in {pprint.pformat(sorted(self.data.columns))}"
704
+ ignored_keys = set(self.ignored) & set(keys)
353
705
  ignored_values = set(self.ignored) & set(self.values)
354
- self.keys = [c for c in self.keys if c not in ignored_keys]
706
+ self.keys_no_time = [c for c in keys if c not in ignored_keys]
355
707
  self.values = [c for c in self.values if c not in ignored_values]
356
708
  self.ignored_keys = sorted(ignored_keys)
357
709
  self.ignored_values = sorted(ignored_values)
358
710
  self.time = self._time
711
+ self.keys_time = [self.time, *[c for c in keys if c not in ignored_keys]]
359
712
 
360
713
  def __str__(self) -> str:
361
714
  "usual"
362
715
  return str(self.data) if hasattr(self, "data") else str(self._data)
363
716
 
364
- def view(self, view_def: CubeViewDef) -> pandas.DataFrame:
717
+ def view(
718
+ self,
719
+ view_def: Union[str, CubeViewDef],
720
+ return_view_def: bool = False,
721
+ verbose: int = 0,
722
+ ) -> Union[pandas.DataFrame, Tuple[pandas.DataFrame, CubeViewDef]]:
365
723
  """
366
724
  Returns a dataframe, a pivot view.
367
725
  `key_index` determines the index, the other key columns determines
@@ -369,58 +727,274 @@ class CubeLogs:
369
727
  is removed.
370
728
 
371
729
  :param view_def: view definition
730
+ :param return_view_def: returns the view as well
731
+ :param verbose: verbosity level
372
732
  :return: dataframe
373
733
  """
374
- key_agg = self._filter_column(view_def.key_agg, self.keys) if view_def.key_agg else []
734
+ assert isinstance(
735
+ view_def, CubeViewDef
736
+ ), f"view_def should be a CubeViewDef, got {type(view_def)}: {view_def!r} instead"
737
+ if verbose:
738
+ print(f"[CubeLogs.view] -- start view {view_def.name!r}: {view_def}")
739
+ key_agg = (
740
+ self._filter_column(view_def.key_agg, self.keys_time) if view_def.key_agg else []
741
+ )
375
742
  set_key_agg = set(key_agg)
376
- assert set_key_agg <= set(
377
- self.keys
378
- ), f"Non existing keys in key_agg {set_key_agg - set(self.keys)}"
743
+ assert set_key_agg <= set(self.keys_time), (
744
+ f"view_def.name={view_def.name!r}, "
745
+ f"non existing keys in key_agg {set_key_agg - set(self.keys_time)}",
746
+ f"keys={sorted(self.keys_time)}",
747
+ )
379
748
 
380
749
  values = self._filter_column(view_def.values, self.values)
381
- assert set(values) <= set(
382
- self.values
383
- ), f"Non existing columns in values {set(values) - set(self.values)}"
750
+ assert set(values) <= set(self.values), (
751
+ f"view_def.name={view_def.name!r}, "
752
+ f"non existing columns in values {set(values) - set(self.values)}, "
753
+ f"values={sorted(self.values)}"
754
+ )
384
755
 
756
+ # aggregation
385
757
  if key_agg:
758
+ final_stack = True
386
759
  key_index = [
387
760
  c
388
- for c in self._filter_column(view_def.key_index, self.keys)
761
+ for c in self._filter_column(view_def.key_index, self.keys_time)
389
762
  if c not in set_key_agg
390
763
  ]
391
- keys_no_agg = [c for c in self.keys if c not in set_key_agg]
392
- data = (
393
- self.data[[*keys_no_agg, *values]]
394
- .groupby(key_index, as_index=False)
395
- .agg(*view_def.agg_args, **(view_def.agg_kwargs or {}))
764
+ keys_no_agg = [c for c in self.keys_time if c not in set_key_agg]
765
+ if verbose:
766
+ print(f"[CubeLogs.view] aggregation of {set_key_agg}")
767
+ print(f"[CubeLogs.view] groupby {keys_no_agg}")
768
+
769
+ data_red = self.data[[*keys_no_agg, *values]]
770
+ assert set(key_index) <= set(data_red.columns), (
771
+ f"view_def.name={view_def.name!r}, "
772
+ f"nnable to find {set(key_index) - set(data_red.columns)}, "
773
+ f"key_agg={key_agg}, keys_no_agg={keys_no_agg},\n--\n"
774
+ f"selected={pprint.pformat(sorted(data_red.columns))},\n--\n"
775
+ f"keys={pprint.pformat(sorted(self.keys_time))}"
396
776
  )
777
+ grouped_data = data_red.groupby(keys_no_agg, as_index=True, dropna=False)
778
+ if callable(view_def.agg_args):
779
+ agg_kwargs = view_def.agg_kwargs or {}
780
+ agg_args = ({c: view_def.agg_args(c) for c in values},)
781
+ else:
782
+ agg_args = view_def.agg_args # type: ignore[assignment]
783
+ agg_kwargs = view_def.agg_kwargs or {}
784
+ data = grouped_data.agg(*agg_args, **agg_kwargs)
785
+ if view_def.agg_multi:
786
+ append = []
787
+ for k, f in view_def.agg_multi.items():
788
+ cv = grouped_data.apply(f, include_groups=False)
789
+ append.append(cv.to_frame(k))
790
+ data = pandas.concat([data, *append], axis=1)
791
+ set_all_keys = set(keys_no_agg)
792
+ values = list(data.columns)
793
+ data = data.reset_index(drop=False)
397
794
  else:
398
- key_index = self._filter_column(view_def.key_index, self.keys)
399
- data = self.data[[*self.keys, *values]]
795
+ key_index = self._filter_column(view_def.key_index, self.keys_time)
796
+ if verbose:
797
+ print(f"[CubeLogs.view] no aggregation, index={key_index}")
798
+ data = self.data[[*self.keys_time, *values]]
799
+ set_all_keys = set(self.keys_time)
800
+ final_stack = False
400
801
 
401
- assert set(key_index) <= set(
402
- self.keys
403
- ), f"Non existing keys in key_index {set(key_index) - set(self.keys)}"
802
+ assert set(key_index) <= set_all_keys, (
803
+ f"view_def.name={view_def.name!r}, "
804
+ f"Non existing keys in key_index {set(key_index) - set_all_keys}"
805
+ )
404
806
 
807
+ # remove unnecessary column
405
808
  set_key_columns = {
406
- c for c in self.keys if c not in key_index and c not in set(key_agg)
809
+ c for c in self.keys_time if c not in key_index and c not in set(key_agg)
407
810
  }
811
+ key_index0 = key_index
408
812
  if view_def.ignore_unique:
409
- key_index = [k for k in key_index if len(self.values_for_key[k]) > 1]
410
- key_columns = [k for k in set_key_columns if len(self.values_for_key[k]) > 1]
813
+ unique = {
814
+ k for k, v in self.values_for_key.items() if k in set_all_keys and len(v) <= 1
815
+ }
816
+ keep_anyway = (
817
+ set(view_def.keep_columns_in_index)
818
+ if view_def.keep_columns_in_index
819
+ else set()
820
+ )
821
+ key_index = [k for k in key_index if k not in unique or k in keep_anyway]
822
+ key_columns = [k for k in set_key_columns if k not in unique or k in keep_anyway]
823
+ if verbose:
824
+ print(f"[CubeLogs.view] unique={unique}, keep_anyway={keep_anyway}")
825
+ print(
826
+ f"[CubeLogs.view] columns with unique values "
827
+ f"{set(key_index0) - set(key_index)}"
828
+ )
411
829
  else:
830
+ if verbose:
831
+ print("[CubeLogs.view] keep all columns")
412
832
  key_columns = sorted(set_key_columns)
833
+ unique = set()
413
834
 
835
+ _md = lambda s: {k: v for k, v in self.values_for_key.items() if k in s} # noqa: E731
836
+ all_cols = set(key_columns) | set(key_index) | set(key_agg) | unique
837
+ assert all_cols == set(self.keys_time), (
838
+ f"view_def.name={view_def.name!r}, "
839
+ f"key_columns + key_index + key_agg + unique != keys, left="
840
+ f"{set(self.keys_time) - all_cols}, "
841
+ f"unique={unique}, index={set(key_index)}, columns={set(key_columns)}, "
842
+ f"agg={set(key_agg)}, keys={set(self.keys_time)}, values={values}"
843
+ )
844
+
845
+ # reorder
414
846
  if view_def.order:
415
- assert set(view_def.order) <= set_key_columns, (
416
- f"Non existing columns from order in key_columns "
417
- f"{set(view_def.order) - set_key_columns}"
847
+ subset = self._filter_column(view_def.order, all_cols | {self.time})
848
+ corder = [o for o in view_def.order if o in subset]
849
+ assert set(corder) <= set_key_columns, (
850
+ f"view_def.name={view_def.name!r}, "
851
+ f"non existing columns from order in key_columns "
852
+ f"{set(corder) - set_key_columns}"
418
853
  )
419
854
  key_columns = [
420
- *view_def.order,
855
+ *[o for o in corder if o in key_columns],
421
856
  *[c for c in key_columns if c not in view_def.order],
422
857
  ]
423
- return data.pivot(index=key_index[::-1], columns=key_columns, values=values)
858
+ else:
859
+ corder = None
860
+
861
+ if view_def.dropna:
862
+ data, key_index, key_columns, values = self._dropna( # type: ignore[assignment]
863
+ data,
864
+ key_index,
865
+ key_columns,
866
+ values,
867
+ keep_columns_in_index=view_def.keep_columns_in_index,
868
+ )
869
+ if view_def.ignore_columns:
870
+ if verbose:
871
+ print(f"[CubeLogs.view] ignore_columns {view_def.ignore_columns}")
872
+ data = data.drop(view_def.ignore_columns, axis=1)
873
+ seti = set(view_def.ignore_columns)
874
+ if view_def.keep_columns_in_index:
875
+ seti -= set(view_def.keep_columns_in_index)
876
+ key_index = [c for c in key_index if c not in seti]
877
+ key_columns = [c for c in key_columns if c not in seti]
878
+ values = [c for c in values if c not in seti]
879
+
880
+ # final verification
881
+ if verbose:
882
+ print(f"[CubeLogs.view] key_index={key_index}")
883
+ print(f"[CubeLogs.view] key_columns={key_columns}")
884
+ g = data[[*key_index, *key_columns]].copy()
885
+ g["count"] = 1
886
+ r = g.groupby([*key_index, *key_columns], dropna=False).sum()
887
+ not_unique = r[r["count"] > 1]
888
+ assert not_unique.shape[0] == 0, (
889
+ f"view_def.name={view_def.name!r}, "
890
+ f"unable to run the pivot with index={sorted(key_index)}, "
891
+ f"key={sorted(key_columns)}, key_agg={key_agg}, values={sorted(values)}, "
892
+ f"columns={sorted(data.columns)}, ignored={view_def.ignore_columns}, "
893
+ f"not unique={set(data.columns) - unique}"
894
+ f"\n--\n{not_unique.head()}"
895
+ )
896
+
897
+ # pivot
898
+ if verbose:
899
+ print(f"[CubeLogs.view] values={values}")
900
+ if key_index:
901
+ piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
902
+ else:
903
+ # pivot does not return the same rank when the index is empty.
904
+ # Let's artificially add one.
905
+ data = data.copy()
906
+ data["ALL"] = "ALL"
907
+ piv = data.pivot(index=["ALL"], columns=key_columns, values=values)
908
+ if isinstance(piv, pandas.Series):
909
+ piv = piv.to_frame(name="series")
910
+ names = list(piv.columns.names)
911
+ assert (
912
+ "METRICS" not in names
913
+ ), f"Not implemented when a level METRICS already exists {names!r}"
914
+ names[0] = "METRICS"
915
+ piv.columns = piv.columns.set_names(names)
916
+ if final_stack:
917
+ piv = piv.stack("METRICS", future_stack=True)
918
+ if view_def.transpose:
919
+ piv = piv.T
920
+ if isinstance(piv, pandas.Series):
921
+ piv = piv.to_frame("VALUE")
922
+ piv.sort_index(inplace=True)
923
+
924
+ if isinstance(piv.columns, pandas.MultiIndex):
925
+ if corder:
926
+ # reorder the levels for the columns with the view definition
927
+ new_corder = [c for c in corder if c in piv.columns.names]
928
+ new_names = [
929
+ *[c for c in piv.columns.names if c not in new_corder],
930
+ *new_corder,
931
+ ]
932
+ piv.columns = piv.columns.reorder_levels(new_names)
933
+ elif self.time in piv.columns.names:
934
+ # put time at the end
935
+ new_names = list(piv.columns.names)
936
+ ind = new_names.index(self.time)
937
+ if ind < len(new_names) - 1:
938
+ del new_names[ind]
939
+ new_names.append(self.time)
940
+ piv.columns = piv.columns.reorder_levels(new_names)
941
+
942
+ if view_def.no_index:
943
+ piv = piv.reset_index(drop=False)
944
+ else:
945
+ piv.sort_index(inplace=True, axis=1)
946
+
947
+ if verbose:
948
+ print(f"[CubeLogs.view] levels {piv.index.names}, {piv.columns.names}")
949
+ print(f"[CubeLogs.view] -- done view {view_def.name!r}")
950
+ return (piv, view_def) if return_view_def else piv
951
+
952
+ def _dropna(
953
+ self,
954
+ data: pandas.DataFrame,
955
+ key_index: Sequence[str],
956
+ key_columns: Sequence[str],
957
+ values: Sequence[str],
958
+ keep_columns_in_index: Optional[Sequence[str]] = None,
959
+ ) -> Tuple[pandas.DataFrame, Sequence[str], Sequence[str], Sequence[str]]:
960
+ set_keep_columns_in_index = (
961
+ set(keep_columns_in_index) if keep_columns_in_index else set()
962
+ )
963
+ v = data[values]
964
+ new_data = data[~v.isnull().all(1)]
965
+ if data.shape == new_data.shape:
966
+ return data, key_index, key_columns, values
967
+ new_data = new_data.copy()
968
+ new_key_index = []
969
+ for c in key_index:
970
+ if c in set_keep_columns_in_index:
971
+ new_key_index.append(c)
972
+ continue
973
+ v = new_data[c]
974
+ sv = set(v.dropna())
975
+ if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
976
+ new_key_index.append(c)
977
+ new_key_columns = []
978
+ for c in key_columns:
979
+ if c in set_keep_columns_in_index:
980
+ new_key_columns.append(c)
981
+ continue
982
+ v = new_data[c]
983
+ sv = set(v.dropna())
984
+ if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
985
+ new_key_columns.append(c)
986
+ for c in set(key_index) | set(key_columns):
987
+ s = new_data[c]
988
+ if s.isna().max():
989
+ if pandas.api.types.is_numeric_dtype(s):
990
+ min_v = s.dropna().min()
991
+ assert (
992
+ min_v >= 0
993
+ ), f"Unable to replace nan values in column {c!r}, min_v={min_v}"
994
+ new_data[c] = s.fillna(-1)
995
+ else:
996
+ new_data[c] = s.fillna("NAN")
997
+ return new_data, new_key_index, new_key_columns, values
424
998
 
425
999
  def describe(self) -> pandas.DataFrame:
426
1000
  """Basic description of all variables."""
@@ -433,22 +1007,42 @@ class CubeLogs:
433
1007
  name=name,
434
1008
  dtype=str(dtype),
435
1009
  missing=len(values) - len(nonan),
1010
+ kind=(
1011
+ "time"
1012
+ if name == self.time
1013
+ else (
1014
+ "keys"
1015
+ if name in self.keys_no_time
1016
+ else (
1017
+ "values"
1018
+ if name in self.values
1019
+ else ("ignored" if name in self.ignored else "unused")
1020
+ )
1021
+ )
1022
+ ),
436
1023
  )
437
1024
  if len(nonan) > 0:
438
- obs.update(
439
- dict(
440
- min=nonan.min(),
441
- max=nonan.max(),
442
- count=len(nonan),
443
- )
444
- )
1025
+ obs.update(dict(count=len(nonan)))
445
1026
  if is_numeric_dtype(nonan):
446
1027
  obs.update(
447
1028
  dict(
1029
+ min=nonan.min(),
1030
+ max=nonan.max(),
448
1031
  mean=nonan.mean(),
449
1032
  sum=nonan.sum(),
1033
+ n_values=len(set(nonan)),
450
1034
  )
451
1035
  )
1036
+ elif obs["kind"] == "time":
1037
+ unique = set(nonan)
1038
+ obs["n_values"] = len(unique)
1039
+ o = dict(
1040
+ min=str(nonan.min()),
1041
+ max=str(nonan.max()),
1042
+ n_values=len(set(nonan)),
1043
+ )
1044
+ o["values"] = f"{o['min']} - {o['max']}"
1045
+ obs.update(o)
452
1046
  else:
453
1047
  unique = set(nonan)
454
1048
  obs["n_values"] = len(unique)
@@ -460,126 +1054,691 @@ class CubeLogs:
460
1054
  def to_excel(
461
1055
  self,
462
1056
  output: str,
463
- views: Dict[str, CubeViewDef],
1057
+ views: Union[Sequence[str], Dict[str, Union[str, CubeViewDef]]],
464
1058
  main: Optional[str] = "main",
465
1059
  raw: Optional[str] = "raw",
466
1060
  verbose: int = 0,
1061
+ csv: Optional[Sequence[str]] = None,
467
1062
  ):
468
1063
  """
469
1064
  Creates an excel file with a list of view.
470
1065
 
471
1066
  :param output: output file to create
472
- :param views: list of views to append
1067
+ :param views: sequence or dictionary of views to append
473
1068
  :param main: adds a page with statistics on all variables
474
1069
  :param raw: add a page with the raw data
1070
+ :param csv: views to dump as csv files (same name as outputs + view naw)
475
1071
  :param verbose: verbosity
476
1072
  """
477
-
1073
+ if verbose:
1074
+ print(f"[CubeLogs.to_excel] create Excel file {output}, shape={self.shape}")
1075
+ views = {k: k for k in views} if not isinstance(views, dict) else views
1076
+ f_highlights = {}
1077
+ plots = []
478
1078
  with pandas.ExcelWriter(output, engine="openpyxl") as writer:
479
1079
  if main:
480
1080
  assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
481
- df = self.describe()
1081
+ df = self.describe().sort_values("name")
482
1082
  if verbose:
483
- print(f"[CubeLogs.to_helper] add sheet {main!r} with shape {df.shape}")
1083
+ print(f"[CubeLogs.to_excel] add sheet {main!r} with shape {df.shape}")
484
1084
  df.to_excel(writer, sheet_name=main, freeze_panes=(1, 1))
485
- self._apply_excel_style(main, writer, df)
486
- if raw:
487
- assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
488
- if verbose:
489
- print(f"[CubeLogs.to_helper] add sheet {raw!r} with shape {self.shape}")
490
- self.data.to_excel(writer, sheet_name=raw, freeze_panes=(1, 1), index=True)
491
- self._apply_excel_style(raw, writer, self.data)
492
1085
 
493
1086
  for name, view in views.items():
494
- df = self.view(view)
1087
+ df, tview = self.view(view, return_view_def=True, verbose=max(verbose - 1, 0))
1088
+ memory = df.memory_usage(deep=True).sum()
495
1089
  if verbose:
496
1090
  print(
497
- f"[CubeLogs.to_helper] add sheet {name!r} with shape "
498
- f"{df.shape}, index={df.index.names}, columns={df.columns.names}"
1091
+ f"[CubeLogs.to_excel] add sheet {name!r} with shape "
1092
+ f"{df.shape} ({memory} bytes), index={df.index.names}, "
1093
+ f"columns={df.columns.names}"
1094
+ )
1095
+ if self.time in df.columns.names:
1096
+ # Let's convert the time into str
1097
+ fr = df.columns.to_frame()
1098
+ if is_datetime64_any_dtype(fr[self.time]):
1099
+ dt = fr[self.time]
1100
+ has_time = (dt != dt.dt.normalize()).any()
1101
+ sdt = dt.apply(
1102
+ lambda t, has_time=has_time: t.strftime(
1103
+ "%Y-%m-%dT%H-%M-%S" if has_time else "%Y-%m-%d"
1104
+ )
1105
+ )
1106
+ fr[self.time] = sdt
1107
+ df.columns = pandas.MultiIndex.from_frame(fr)
1108
+ if csv and name in csv:
1109
+ name_csv = f"{output}.{name}.csv"
1110
+ if verbose:
1111
+ print(f"[CubeLogs.to_excel] saving sheet {name!r} in {name_csv!r}")
1112
+ df.reset_index(drop=False).to_csv(f"{output}.{name}.csv", index=False)
1113
+
1114
+ if memory > 2**22:
1115
+ msg = (
1116
+ f"[CubeLogs.to_excel] skipping {name!r}, "
1117
+ f"too big for excel with {memory} bytes"
1118
+ )
1119
+ if verbose:
1120
+ print(msg)
1121
+ else:
1122
+ warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
1123
+ else:
1124
+ df.to_excel(
1125
+ writer,
1126
+ sheet_name=name,
1127
+ freeze_panes=(df.columns.nlevels + df.index.nlevels, df.index.nlevels),
1128
+ )
1129
+ f_highlights[name] = tview.f_highlight
1130
+ if tview.plots:
1131
+ plots.append(CubePlot(df, kind="barh", orientation="row", split=True))
1132
+ if raw:
1133
+ assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
1134
+ # Too long.
1135
+ # self._apply_excel_style(raw, writer, self.data)
1136
+ if csv and "raw" in csv:
1137
+ df.reset_index(drop=False).to_csv(f"{output}.raw.csv", index=False)
1138
+ memory = df.memory_usage(deep=True).sum()
1139
+ if memory > 2**22:
1140
+ msg = (
1141
+ f"[CubeLogs.to_excel] skipping 'raw', "
1142
+ f"too big for excel with {memory} bytes"
499
1143
  )
500
- df.to_excel(
501
- writer,
502
- sheet_name=name,
503
- freeze_panes=(df.index.nlevels, df.columns.nlevels),
1144
+ if verbose:
1145
+ print(msg)
1146
+ else:
1147
+ warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
1148
+ else:
1149
+ if verbose:
1150
+ print(f"[CubeLogs.to_excel] add sheet 'raw' with shape {self.shape}")
1151
+ self.data.to_excel(
1152
+ writer, sheet_name="raw", freeze_panes=(1, 1), index=True
1153
+ )
1154
+
1155
+ if plots:
1156
+ from openpyxl.drawing.image import Image
1157
+
1158
+ if verbose:
1159
+ print(f"[CubeLogs.to_excel] plots {len(plots)} plots")
1160
+ sheet = writer.book.create_sheet("plots")
1161
+ pos = 0
1162
+ empty_row = 1
1163
+ times = self.data[self.time].dropna()
1164
+ mini, maxi = times.min(), times.max()
1165
+ title_suffix = (str(mini) if mini == maxi else f"{mini}-{maxi}").replace(
1166
+ " 00:00:00", ""
504
1167
  )
505
- self._apply_excel_style(name, writer, df)
506
- if verbose:
507
- print(f"[CubeLogs.to_helper] done with {len(views)} views")
1168
+ for plot in plots:
1169
+ imgs = plot.to_images(
1170
+ verbose=verbose, merge=True, title_suffix=title_suffix
1171
+ )
1172
+ for img in imgs:
1173
+ y = (pos // 2) * 16
1174
+ loc = f"A{y}" if pos % 2 == 0 else f"M{y}"
1175
+ sheet.add_image(Image(io.BytesIO(img)), loc)
1176
+ if verbose:
1177
+ no = f"{output}.png"
1178
+ print(f"[CubeLogs.to_excel] dump graphs into {no!r}")
1179
+ with open(no, "wb") as f:
1180
+ f.write(img)
1181
+ pos += 1
1182
+ empty_row += len(plots) + 2
508
1183
 
509
- def _apply_excel_style(self, name: str, writer: pandas.ExcelWriter, df: pandas.DataFrame):
510
- from openpyxl.styles import Alignment
511
- from openpyxl.utils import get_column_letter
1184
+ if verbose:
1185
+ print(f"[CubeLogs.to_excel] applies style to {output!r}")
1186
+ apply_excel_style(writer, f_highlights) # type: ignore[arg-type]
1187
+ if verbose:
1188
+ print(f"[CubeLogs.to_excel] done with {len(views)} views")
512
1189
 
513
- # from openpyxl.styles import Font, PatternFill, numbers
514
1190
 
515
- left = Alignment(horizontal="left")
516
- right = Alignment(horizontal="right")
517
- # center = Alignment(horizontal="center")
518
- # bold_font = Font(bold=True)
519
- # red = Font(color="FF0000")
520
- # yellow = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
521
- # redf = PatternFill(start_color="FF0000", end_color="FF0000", fill_type="solid")
1191
+ class CubeLogsPerformance(CubeLogs):
1192
+ """
1193
+ Processes logs coming from experiments.
1194
+ """
522
1195
 
523
- sheet = writer.sheets[name]
524
- n_rows = df.shape[0] + df.columns.nlevels + df.index.nlevels
525
- n_cols = df.shape[1] + df.index.nlevels
526
- co: Dict[int, int] = {}
527
- sizes: Dict[int, int] = {}
528
- cols = set()
529
- for i in range(1, n_rows):
530
- for j, cell in enumerate(sheet[i]):
531
- if j > n_cols:
532
- break
533
- cols.add(cell.column)
534
- if isinstance(cell.value, float):
535
- co[j] = co.get(j, 0) + 1
536
- elif isinstance(cell.value, str):
537
- sizes[cell.column] = max(sizes.get(cell.column, 0), len(cell.value))
1196
+ def __init__(
1197
+ self,
1198
+ data: Any,
1199
+ time: str = "DATE",
1200
+ keys: Sequence[str] = (
1201
+ "^version_.*",
1202
+ "^model_.*",
1203
+ "device",
1204
+ "opt_patterns",
1205
+ "suite",
1206
+ "memory_peak",
1207
+ "machine",
1208
+ "exporter",
1209
+ "dynamic",
1210
+ "rtopt",
1211
+ "dtype",
1212
+ "device",
1213
+ "architecture",
1214
+ ),
1215
+ values: Sequence[str] = (
1216
+ "^time_.*",
1217
+ "^disc.*",
1218
+ "^ERR_.*",
1219
+ "CMD",
1220
+ "^ITER",
1221
+ "^onnx_.*",
1222
+ "^op_onnx_.*",
1223
+ "^peak_gpu_.*",
1224
+ ),
1225
+ ignored: Sequence[str] = ("version_python",),
1226
+ recent: bool = True,
1227
+ formulas: Optional[
1228
+ Union[
1229
+ Sequence[str],
1230
+ Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
1231
+ ]
1232
+ ] = (
1233
+ "speedup",
1234
+ "bucket[speedup]",
1235
+ "ERR1",
1236
+ "n_models",
1237
+ "n_model_eager",
1238
+ "n_model_running",
1239
+ "n_model_acc01",
1240
+ "n_model_acc001",
1241
+ "n_model_dynamic",
1242
+ "n_model_pass",
1243
+ "n_model_faster",
1244
+ "n_model_faster2x",
1245
+ "n_model_faster3x",
1246
+ "n_model_faster4x",
1247
+ "n_node_attention",
1248
+ "n_node_control_flow",
1249
+ "n_node_scatter",
1250
+ "n_node_function",
1251
+ "n_node_initializer",
1252
+ "n_node_constant",
1253
+ "n_node_shape",
1254
+ "n_node_expand",
1255
+ "peak_gpu_torch",
1256
+ "peak_gpu_nvidia",
1257
+ "time_export_unbiased",
1258
+ ),
1259
+ fill_missing: Optional[Sequence[Tuple[str, Any]]] = (("model_attn_impl", "eager"),),
1260
+ keep_last_date: bool = False,
1261
+ ):
1262
+ super().__init__(
1263
+ data=data,
1264
+ time=time,
1265
+ keys=keys,
1266
+ values=values,
1267
+ ignored=ignored,
1268
+ recent=recent,
1269
+ formulas=formulas,
1270
+ fill_missing=fill_missing,
1271
+ keep_last_date=keep_last_date,
1272
+ )
538
1273
 
539
- for k, v in sizes.items():
540
- c = get_column_letter(k)
541
- sheet.column_dimensions[c].width = max(15, v)
542
- for k in cols:
543
- if k not in sizes:
544
- c = get_column_letter(k)
545
- sheet.column_dimensions[c].width = 15
1274
+ def _process_formula(
1275
+ self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
1276
+ ) -> Callable[[pandas.DataFrame], pandas.Series]:
1277
+ """
1278
+ Processes a formula, converting it into a function.
546
1279
 
547
- for i in range(1, n_rows):
548
- for j, cell in enumerate(sheet[i]):
549
- if j > n_cols:
550
- break
551
- if isinstance(cell.value, pandas.Timestamp):
552
- cell.alignment = right
553
- dt = cell.value.to_pydatetime()
554
- cell.value = dt
555
- cell.number_format = (
556
- "YYYY-MM-DD"
557
- if (
558
- dt.hour == 0
559
- and dt.minute == 0
560
- and dt.second == 0
561
- and dt.microsecond == 0
562
- )
563
- else "YYYY-MM-DD 00:00:00"
564
- )
565
- elif isinstance(cell.value, (float, int)):
566
- cell.alignment = right
567
- x = abs(cell.value)
568
- if int(x) == x:
569
- cell.number_format = "0"
570
- elif x > 5000:
571
- cell.number_format = "# ##0"
572
- elif x >= 500:
573
- cell.number_format = "0.0"
574
- elif x >= 50:
575
- cell.number_format = "0.00"
576
- elif x >= 5:
577
- cell.number_format = "0.000"
578
- elif x > 0.5:
579
- cell.number_format = "0.0000"
580
- elif x > 0.005:
581
- cell.number_format = "0.00000"
1280
+ :param formula: a formula string
1281
+ :return: a function
1282
+ """
1283
+ if callable(formula):
1284
+ return formula
1285
+ assert isinstance(
1286
+ formula, str
1287
+ ), f"Unexpected type for formula {type(formula)}: {formula!r}"
1288
+
1289
+ def gdf(df, cname, default_value=np.nan):
1290
+ if cname in df.columns:
1291
+ return df[cname]
1292
+ return pandas.Series(default_value, index=df.index)
1293
+
1294
+ def ghas_value(df, cname):
1295
+ if cname not in df.columns:
1296
+ return pandas.Series(np.nan, index=df.index)
1297
+ isna = df[cname].isna()
1298
+ return pandas.Series(np.where(isna, np.nan, 1.0), index=df.index)
1299
+
1300
+ def gpreserve(df, cname, series):
1301
+ if cname not in df.columns:
1302
+ return pandas.Series(np.nan, index=df.index)
1303
+ isna = df[cname].isna()
1304
+ return pandas.Series(np.where(isna, np.nan, series), index=df.index).astype(float)
1305
+
1306
+ if formula == "speedup":
1307
+ columns = set(self._filter_column(["^time_.*"], self.data.columns))
1308
+ assert "time_latency" in columns and "time_latency_eager" in columns, (
1309
+ f"Unable to apply formula {formula!r}, with columns\n"
1310
+ f"{pprint.pformat(sorted(columns))}"
1311
+ )
1312
+ return lambda df: df["time_latency_eager"] / df["time_latency"]
1313
+
1314
+ if formula == "bucket[speedup]":
1315
+ columns = set(self._filter_column(["^time_.*", "speedup"], self.data.columns))
1316
+ assert "speedup" in columns, (
1317
+ f"Unable to apply formula {formula!r}, with columns\n"
1318
+ f"{pprint.pformat(sorted(columns))}"
1319
+ )
1320
+ # return lambda df: df["time_latency_eager"] / df["time_latency"]
1321
+ return lambda df: pandas.cut(
1322
+ df["speedup"], bins=BUCKET_SCALES, right=False, duplicates="raise"
1323
+ )
1324
+
1325
+ if formula == "ERR1":
1326
+ columns = set(self._filter_column(["^ERR_.*"], self.data.columns))
1327
+ if not columns:
1328
+ return lambda df: np.nan
1329
+
1330
+ def first_err(df: pandas.DataFrame) -> pandas.Series:
1331
+ ordered = [
1332
+ c
1333
+ for c in [
1334
+ "ERR_timeout",
1335
+ "ERR_load",
1336
+ "ERR_feeds",
1337
+ "ERR_warmup_eager",
1338
+ "ERR_export",
1339
+ "ERR_ort",
1340
+ "ERR_warmup",
1341
+ # "ERR_std",
1342
+ # "ERR_crash",
1343
+ # "ERR_stdout",
1344
+ ]
1345
+ if c in df.columns
1346
+ ]
1347
+ res = None
1348
+ for c in ordered:
1349
+ if res is None:
1350
+ res = df[c].fillna("")
582
1351
  else:
583
- cell.number_format = "0.000E+00"
584
- else:
585
- cell.alignment = left
1352
+ res = pandas.Series(np.where(res != "", res, df[c].fillna("")))
1353
+ return res
1354
+
1355
+ return first_err
1356
+
1357
+ if formula.startswith("n_"):
1358
+ lambdas = dict(
1359
+ n_models=lambda df: ghas_value(df, "model_name"),
1360
+ n_model_eager=lambda df: ghas_value(df, "time_latency_eager"),
1361
+ n_model_running=lambda df: ghas_value(df, "time_latency"),
1362
+ n_model_acc01=lambda df: gpreserve(
1363
+ df, "discrepancies_abs", (gdf(df, "discrepancies_abs") <= 0.1)
1364
+ ),
1365
+ n_model_acc001=lambda df: gpreserve(
1366
+ df, "discrepancies_abs", gdf(df, "discrepancies_abs") <= 0.01
1367
+ ),
1368
+ n_model_dynamic=lambda df: gpreserve(
1369
+ df,
1370
+ "discrepancies_dynamic_abs",
1371
+ (gdf(df, "discrepancies_dynamic_abs") <= 0.1),
1372
+ ),
1373
+ n_model_pass=lambda df: gpreserve(
1374
+ df,
1375
+ "time_latency",
1376
+ (gdf(df, "discrepancies_abs", np.inf) < 0.1)
1377
+ & (gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98),
1378
+ ),
1379
+ n_model_faster=lambda df: gpreserve(
1380
+ df,
1381
+ "time_latency",
1382
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98,
1383
+ ),
1384
+ n_model_faster2x=lambda df: gpreserve(
1385
+ df,
1386
+ "time_latency",
1387
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 1.98,
1388
+ ),
1389
+ n_model_faster3x=lambda df: gpreserve(
1390
+ df,
1391
+ "time_latency",
1392
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 2.98,
1393
+ ),
1394
+ n_model_faster4x=lambda df: gpreserve(
1395
+ df,
1396
+ "time_latency",
1397
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 3.98,
1398
+ ),
1399
+ n_node_attention=lambda df: gpreserve(
1400
+ df,
1401
+ "op_onnx_com.microsoft_Attention",
1402
+ gdf(df, "op_onnx_com.microsoft_Attention")
1403
+ + gdf(df, "op_onnx_com.microsoft_MultiHeadAttention"),
1404
+ ),
1405
+ n_node_control_flow=lambda df: gpreserve(
1406
+ df,
1407
+ "op_onnx__If",
1408
+ (
1409
+ gdf(df, "op_onnx__If", 0)
1410
+ + gdf(df, "op_onnx__Scan", 0)
1411
+ + gdf(df, "op_onnx__Loop", 0)
1412
+ ),
1413
+ ),
1414
+ n_node_scatter=lambda df: gpreserve(
1415
+ df,
1416
+ "op_onnx__ScatterND",
1417
+ gdf(df, "op_onnx__ScatterND", 0) + gdf(df, "op_onnx__ScatterElements", 0),
1418
+ ),
1419
+ n_node_function=lambda df: gpreserve(
1420
+ df, "onnx_n_functions", gdf(df, "onnx_n_functions")
1421
+ ),
1422
+ n_node_initializer=lambda df: gpreserve(
1423
+ df, "onnx_n_initializer", gdf(df, "onnx_n_initializer")
1424
+ ),
1425
+ n_node_constant=lambda df: gpreserve(
1426
+ df, "op_onnx__Constant", gdf(df, "op_onnx__Constant")
1427
+ ),
1428
+ n_node_shape=lambda df: gpreserve(
1429
+ df, "op_onnx__Shape", gdf(df, "op_onnx__Shape")
1430
+ ),
1431
+ n_node_expand=lambda df: gpreserve(
1432
+ df, "op_onnx__Expand", gdf(df, "op_onnx__Expand")
1433
+ ),
1434
+ )
1435
+ assert (
1436
+ formula in lambdas
1437
+ ), f"Unexpected formula={formula!r}, should be in {sorted(lambdas)}"
1438
+ return lambdas[formula]
1439
+
1440
+ if formula == "peak_gpu_torch":
1441
+ return lambda df: gdf(df, "mema_gpu_5_after_export") - gdf(df, "mema_gpu_4_reset")
1442
+ if formula == "peak_gpu_nvidia":
1443
+ return (
1444
+ lambda df: (gdf(df, "memory_gpu0_peak") - gdf(df, "memory_gpu0_begin")) * 2**20
1445
+ )
1446
+ if formula == "time_export_unbiased":
1447
+
1448
+ def unbiased_export(df):
1449
+ if "time_warmup_first_iteration" not in df.columns:
1450
+ return pandas.Series(np.nan, index=df.index)
1451
+ return pandas.Series(
1452
+ np.where(
1453
+ df["exporter"] == "inductor",
1454
+ df["time_warmup_first_iteration"] + df["time_export_success"],
1455
+ df["time_export_success"],
1456
+ ),
1457
+ index=df.index,
1458
+ )
1459
+
1460
+ return lambda df: gpreserve(df, "time_warmup_first_iteration", unbiased_export(df))
1461
+
1462
+ raise ValueError(
1463
+ f"Unexpected formula {formula!r}, available columns are\n"
1464
+ f"{pprint.pformat(sorted(self.data.columns))}"
1465
+ )
1466
+
1467
+ def view(
1468
+ self,
1469
+ view_def: Union[str, CubeViewDef],
1470
+ return_view_def: bool = False,
1471
+ verbose: int = 0,
1472
+ ) -> Union[pandas.DataFrame, Tuple[pandas.DataFrame, CubeViewDef]]:
1473
+ """
1474
+ Returns a dataframe, a pivot view.
1475
+
1476
+ If view_def is a string, it is replaced by a predefined view.
1477
+
1478
+ :param view_def: view definition or a string
1479
+ :param return_view_def: returns the view definition as well
1480
+ :param verbose: verbosity level
1481
+ :return: dataframe
1482
+ """
1483
+ if isinstance(view_def, str):
1484
+ view_def = self.make_view_def(view_def)
1485
+ return super().view(view_def, return_view_def=return_view_def, verbose=verbose)
1486
+
1487
+ def make_view_def(self, name: str) -> CubeViewDef:
1488
+ """
1489
+ Returns a view definition.
1490
+
1491
+ :param name: name of the view
1492
+ :return: a CubeViewDef
1493
+
1494
+ Available views:
1495
+
1496
+ * **agg-suite**, **agg-all:** aggregation per suite or over all models
1497
+ * **disc:** discrepancies
1498
+ * **speedup:** speedup of the exported model over eager mode
1499
+ * **bucket-speedup:** speedup in buckets
1500
+ * **time:** latency
1501
+ * **time_export:** time to export
1502
+ * **counts:** status, running, faster, has control flow, ...
1503
+ * **err:** important errors
1504
+ * **cmd:** command lines
1505
+ * **raw-short:** raw data without all the unused columns
1506
+ """
1507
+ fs = ["suite", "model_suite", "task", "model_name", "model_task"]
1508
+ index_cols = self._filter_column(fs, self.keys_time)
1509
+ assert index_cols, (
1510
+ f"No index columns found for {fs!r} in "
1511
+ f"{pprint.pformat(sorted(self.keys_time))}"
1512
+ )
1513
+ index_cols = [c for c in fs if c in set(index_cols)]
1514
+
1515
+ f_speedup = lambda x: ( # noqa: E731
1516
+ CubeViewDef.HighLightKind.NONE
1517
+ if not isinstance(x, (float, int))
1518
+ else (
1519
+ CubeViewDef.HighLightKind.RED
1520
+ if x < 0.9
1521
+ else (
1522
+ CubeViewDef.HighLightKind.GREEN
1523
+ if x > 1.1
1524
+ else CubeViewDef.HighLightKind.NONE
1525
+ )
1526
+ )
1527
+ )
1528
+ f_disc = lambda x: ( # noqa: E731
1529
+ CubeViewDef.HighLightKind.NONE
1530
+ if not isinstance(x, (float, int))
1531
+ else (
1532
+ CubeViewDef.HighLightKind.RED
1533
+ if x > 0.1
1534
+ else (
1535
+ CubeViewDef.HighLightKind.GREEN
1536
+ if x < 0.01
1537
+ else CubeViewDef.HighLightKind.NONE
1538
+ )
1539
+ )
1540
+ )
1541
+ f_bucket = lambda x: ( # noqa: E731
1542
+ CubeViewDef.HighLightKind.NONE
1543
+ if not isinstance(x, str)
1544
+ else (
1545
+ CubeViewDef.HighLightKind.RED
1546
+ if x in {"[-inf, 0.8)", "[0.8, 0.9)", "[0.9, 0.95)"}
1547
+ else (
1548
+ CubeViewDef.HighLightKind.NONE
1549
+ if x in {"[0.95, 0.98)", "[0.98, 1.02)", "[1.02, 1.05)"}
1550
+ else (
1551
+ CubeViewDef.HighLightKind.GREEN
1552
+ if "[" in x
1553
+ else CubeViewDef.HighLightKind.NONE
1554
+ )
1555
+ )
1556
+ )
1557
+ )
1558
+
1559
+ def mean_weight(gr):
1560
+ weight = gr["time_latency_eager"]
1561
+ x = gr["speedup"]
1562
+ if x.shape[0] == 0:
1563
+ return np.nan
1564
+ div = weight.sum()
1565
+ if div > 0:
1566
+ return (x * weight).sum() / div
1567
+ return np.nan
1568
+
1569
+ def mean_geo(gr):
1570
+ x = gr["speedup"]
1571
+ return np.exp(np.log(x.dropna()).mean())
1572
+
1573
+ order = ["model_attn_impl", "exporter", "opt_patterns", "DATE"]
1574
+ implemented_views = {
1575
+ "agg-suite": lambda: CubeViewDef(
1576
+ key_index=index_cols,
1577
+ values=self._filter_column(
1578
+ [
1579
+ "TIME_ITER",
1580
+ "speedup",
1581
+ "time_latency",
1582
+ "time_latency_eager",
1583
+ "time_export_success",
1584
+ "time_export_unbiased",
1585
+ "^n_.*",
1586
+ "target_opset",
1587
+ "onnx_filesize",
1588
+ "onnx_weight_size_torch",
1589
+ "onnx_weight_size_proto",
1590
+ "onnx_n_nodes",
1591
+ "peak_gpu_torch",
1592
+ "peak_gpu_nvidia",
1593
+ ],
1594
+ self.values,
1595
+ ),
1596
+ ignore_unique=True,
1597
+ key_agg=["model_name", "task", "model_task"],
1598
+ agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
1599
+ agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
1600
+ keep_columns_in_index=["suite"],
1601
+ name="agg-suite",
1602
+ order=order,
1603
+ ),
1604
+ "agg-all": lambda: CubeViewDef(
1605
+ key_index=index_cols,
1606
+ values=self._filter_column(
1607
+ [
1608
+ "TIME_ITER",
1609
+ "speedup",
1610
+ "time_latency",
1611
+ "time_latency_eager",
1612
+ "time_export_success",
1613
+ "time_export_unbiased",
1614
+ "^n_.*",
1615
+ "target_opset",
1616
+ "onnx_filesize",
1617
+ "onnx_weight_size_torch",
1618
+ "onnx_weight_size_proto",
1619
+ "onnx_n_nodes",
1620
+ "peak_gpu_torch",
1621
+ "peak_gpu_nvidia",
1622
+ ],
1623
+ self.values,
1624
+ ),
1625
+ ignore_unique=True,
1626
+ key_agg=["model_name", "task", "model_task", "suite"],
1627
+ agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
1628
+ agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
1629
+ name="agg-all",
1630
+ order=order,
1631
+ plots=True,
1632
+ ),
1633
+ "disc": lambda: CubeViewDef(
1634
+ key_index=index_cols,
1635
+ values=self._filter_column(["discrepancies_abs"], self.values),
1636
+ ignore_unique=True,
1637
+ keep_columns_in_index=["suite"],
1638
+ f_highlight=f_disc,
1639
+ name="disc",
1640
+ order=order,
1641
+ ),
1642
+ "speedup": lambda: CubeViewDef(
1643
+ key_index=index_cols,
1644
+ values=self._filter_column(["speedup"], self.values),
1645
+ ignore_unique=True,
1646
+ keep_columns_in_index=["suite"],
1647
+ f_highlight=f_speedup,
1648
+ name="speedup",
1649
+ order=order,
1650
+ ),
1651
+ "counts": lambda: CubeViewDef(
1652
+ key_index=index_cols,
1653
+ values=self._filter_column(["^n_.*"], self.values),
1654
+ ignore_unique=True,
1655
+ keep_columns_in_index=["suite"],
1656
+ name="counts",
1657
+ order=order,
1658
+ ),
1659
+ "peak-gpu": lambda: CubeViewDef(
1660
+ key_index=index_cols,
1661
+ values=self._filter_column(["^peak_gpu_.*"], self.values),
1662
+ ignore_unique=True,
1663
+ keep_columns_in_index=["suite"],
1664
+ name="peak-gpu",
1665
+ order=order,
1666
+ ),
1667
+ "time": lambda: CubeViewDef(
1668
+ key_index=index_cols,
1669
+ values=self._filter_column(
1670
+ ["time_latency", "time_latency_eager"], self.values
1671
+ ),
1672
+ ignore_unique=True,
1673
+ keep_columns_in_index=["suite"],
1674
+ name="time",
1675
+ order=order,
1676
+ ),
1677
+ "time_export": lambda: CubeViewDef(
1678
+ key_index=index_cols,
1679
+ values=self._filter_column(["time_export_unbiased"], self.values),
1680
+ ignore_unique=True,
1681
+ keep_columns_in_index=["suite"],
1682
+ name="time_export",
1683
+ order=order,
1684
+ ),
1685
+ "err": lambda: CubeViewDef(
1686
+ key_index=index_cols,
1687
+ values=self._filter_column(
1688
+ ["ERR1", "ERR_timeout", "ERR_export", "ERR_crash"], self.values
1689
+ ),
1690
+ ignore_unique=True,
1691
+ keep_columns_in_index=["suite"],
1692
+ name="err",
1693
+ order=order,
1694
+ ),
1695
+ "bucket-speedup": lambda: CubeViewDef(
1696
+ key_index=index_cols,
1697
+ values=self._filter_column(["bucket[speedup]"], self.values),
1698
+ ignore_unique=True,
1699
+ keep_columns_in_index=["suite"],
1700
+ name="bucket-speedup",
1701
+ f_highlight=f_bucket,
1702
+ order=order,
1703
+ ),
1704
+ "cmd": lambda: CubeViewDef(
1705
+ key_index=index_cols,
1706
+ values=self._filter_column(["CMD"], self.values),
1707
+ ignore_unique=True,
1708
+ keep_columns_in_index=["suite"],
1709
+ name="cmd",
1710
+ order=order,
1711
+ ),
1712
+ "raw-short": lambda: CubeViewDef(
1713
+ key_index=self.keys_time,
1714
+ values=[c for c in self.values if c not in {"ERR_std", "ERR_stdout"}],
1715
+ ignore_unique=False,
1716
+ keep_columns_in_index=["suite"],
1717
+ name="raw-short",
1718
+ no_index=True,
1719
+ ),
1720
+ }
1721
+ assert name in implemented_views, (
1722
+ f"Unknown view {name!r}, expected a name in {sorted(implemented_views)},"
1723
+ f"\n--\nkeys={pprint.pformat(sorted(self.keys_time))}, "
1724
+ f"\n--\nvalues={pprint.pformat(sorted(self.values))}"
1725
+ )
1726
+ return implemented_views[name]()
1727
+
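The agg-suite and agg-all views reduce per-model results with a latency-weighted mean and a geometric mean of the speedup (mean_weight and mean_geo above). A standalone rendition on made-up numbers, showing how a missing speedup affects each reducer:

import numpy as np
import pandas

# One aggregation group: three models, one of them with no speedup (failed run).
gr = pandas.DataFrame(
    {
        "speedup": [2.0, 0.5, np.nan],
        "time_latency_eager": [10.0, 1.0, 5.0],
    }
)

# Latency-weighted mean (mean_weight): slow eager models weigh more. pandas
# skips the NaN product in the numerator while the weight stays in the
# denominator, so failed runs pull the weighted speedup down.
weight = gr["time_latency_eager"]
x = gr["speedup"]
weighted = (x * weight).sum() / weight.sum()  # (2*10 + 0.5*1) / 16 = 1.28125

# Geometric mean (mean_geo): NaN speedups are dropped before averaging the logs.
geo = np.exp(np.log(x.dropna()).mean())  # exp((ln 2 + ln 0.5) / 2) = 1.0

print(weighted, geo)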
1728
+ def post_load_process_piece(
1729
+ self, df: pandas.DataFrame, unique: bool = False
1730
+ ) -> pandas.DataFrame:
1731
+ df = super().post_load_process_piece(df, unique=unique)
1732
+ if unique:
1733
+ return df
1734
+ cols = self._filter_column(self._keys, df)
1735
+ res = None
1736
+ for c in cols:
1737
+ if df[c].isna().any():
1738
+ # Missing values for keys are not supposed to happen; fill them when a single distinct value exists.
1739
+ uniq = set(df[c].dropna())
1740
+ if len(uniq) == 1:
1741
+ if res is None:
1742
+ res = df.copy()
1743
+ res[c] = res[c].fillna(uniq.pop())
1744
+ return df if res is None else res
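post_load_process_piece repairs key columns that come back with holes: when every non-missing value of a key column is identical, the missing entries are filled with that value (on a copy of the frame); otherwise the column is left untouched. A standalone rendition of that rule, with a made-up 'exporter' key:

import numpy as np
import pandas

# Toy piece of raw logs: the key column has gaps but a single distinct value.
df = pandas.DataFrame(
    {
        "exporter": ["onnx-dynamo", np.nan, "onnx-dynamo"],
        "speedup": [1.1, 0.9, 1.3],
    }
)

res = df.copy()  # the method above fills a copy, never the original frame
for c in ["exporter"]:
    if res[c].isna().any():
        uniq = set(res[c].dropna())
        if len(uniq) == 1:  # unambiguous: fill, otherwise leave the NaN alone
            res[c] = res[c].fillna(uniq.pop())

print(res["exporter"].tolist())  # ['onnx-dynamo', 'onnx-dynamo', 'onnx-dynamo']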