onnx-diagnostic 0.7.0__py3-none-any.whl → 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. onnx_diagnostic/__init__.py +1 -1
  2. onnx_diagnostic/_command_lines_parser.py +213 -5
  3. onnx_diagnostic/export/dynamic_shapes.py +48 -20
  4. onnx_diagnostic/export/shape_helper.py +126 -0
  5. onnx_diagnostic/ext_test_case.py +31 -0
  6. onnx_diagnostic/helpers/cache_helper.py +42 -20
  7. onnx_diagnostic/helpers/config_helper.py +16 -1
  8. onnx_diagnostic/helpers/log_helper.py +1561 -177
  9. onnx_diagnostic/helpers/torch_helper.py +6 -2
  10. onnx_diagnostic/tasks/__init__.py +2 -0
  11. onnx_diagnostic/tasks/image_text_to_text.py +69 -18
  12. onnx_diagnostic/tasks/text_generation.py +17 -8
  13. onnx_diagnostic/tasks/text_to_image.py +91 -0
  14. onnx_diagnostic/torch_export_patches/onnx_export_errors.py +24 -7
  15. onnx_diagnostic/torch_export_patches/onnx_export_serialization.py +144 -349
  16. onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +87 -7
  17. onnx_diagnostic/torch_export_patches/serialization/__init__.py +46 -0
  18. onnx_diagnostic/torch_export_patches/serialization/diffusers_impl.py +34 -0
  19. onnx_diagnostic/torch_export_patches/serialization/transformers_impl.py +259 -0
  20. onnx_diagnostic/torch_models/hghub/hub_api.py +73 -5
  21. onnx_diagnostic/torch_models/hghub/hub_data.py +7 -2
  22. onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py +28 -0
  23. onnx_diagnostic/torch_models/hghub/model_inputs.py +74 -14
  24. onnx_diagnostic/torch_models/validate.py +45 -16
  25. {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.2.dist-info}/METADATA +1 -1
  26. {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.2.dist-info}/RECORD +29 -24
  27. {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.2.dist-info}/WHEEL +0 -0
  28. {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.2.dist-info}/licenses/LICENSE.txt +0 -0
  29. {onnx_diagnostic-0.7.0.dist-info → onnx_diagnostic-0.7.2.dist-info}/top_level.txt +0 -0
@@ -1,20 +1,89 @@
1
1
  import datetime
2
+ import enum
2
3
  import glob
4
+ import io
3
5
  import os
6
+ import pprint
4
7
  import re
8
+ import warnings
5
9
  import zipfile
6
10
  from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union
7
11
  import numpy as np
8
12
  import pandas
9
- from pandas.api.types import is_numeric_dtype
13
+ from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
10
14
  from .helper import string_sig
11
15
 
16
+ BUCKET_SCALES_VALUES = np.array(
17
+ [-np.inf, -20, -10, -5, -2, 0, 2, 5, 10, 20, 100, 200, 300, 400, np.inf], dtype=float
18
+ )
19
+
20
+
21
+ BUCKET_SCALES = BUCKET_SCALES_VALUES / 100 + 1
22
+
23
+
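A quick illustration of how these bucket edges behave, mirroring the `bucket[speedup]` formula defined later in this file (the sample speedups below are invented):

import numpy as np
import pandas

# same construction as above: percentage offsets turned into speedup ratios
BUCKET_SCALES = np.array(
    [-np.inf, -20, -10, -5, -2, 0, 2, 5, 10, 20, 100, 200, 300, 400, np.inf], dtype=float
) / 100 + 1

speedups = pandas.Series([0.7, 0.99, 1.01, 1.4, 2.5])
print(pandas.cut(speedups, bins=BUCKET_SCALES, right=False))
# 0.99 falls in [0.98, 1.0), 1.01 in [1.0, 1.02), 2.5 in [2.0, 3.0)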
24
+ def filter_data(
25
+ df: pandas.DataFrame,
26
+ filter_in: Optional[str] = None,
27
+ filter_out: Optional[str] = None,
28
+ verbose: int = 0,
29
+ ) -> pandas.DataFrame:
30
+ """
31
+ Argument `filter` follows the syntax
32
+ ``<column1>:<fmt1>//<column2>:<fmt2>``.
33
+
34
+ The format is the following:
35
+
36
+ * a value or a set of values separated by ``;``
37
+ """
38
+ if not filter_in and not filter_out:
39
+ return df
40
+
41
+ def _f(fmt):
42
+ cond = {}
43
+ if isinstance(fmt, str):
44
+ cols = fmt.split("//")
45
+ for c in cols:
46
+ assert ":" in c, f"Unexpected value {c!r} in fmt={fmt!r}"
47
+ spl = c.split(":")
48
+ assert len(spl) == 2, f"Unexpected value {c!r} in fmt={fmt!r}"
49
+ name, fil = spl
50
+ cond[name] = set(fil.split(";"))
51
+ return cond
52
+
53
+ if filter_in:
54
+ cond = _f(filter_in)
55
+ assert isinstance(cond, dict), f"Unexpected type {type(cond)} for fmt={filter_in!r}"
56
+ for k, v in cond.items():
57
+ if k not in df.columns:
58
+ continue
59
+ if verbose:
60
+ print(
61
+ f"[_filter_data] filter in column {k!r}, "
62
+ f"values {v!r} among {set(df[k].astype(str))}"
63
+ )
64
+ df = df[df[k].astype(str).isin(v)]
65
+
66
+ if filter_out:
67
+ cond = _f(filter_out)
68
+ assert isinstance(cond, dict), f"Unexpected type {type(cond)} for fmt={filter_out!r}"
69
+ for k, v in cond.items():
70
+ if k not in df.columns:
71
+ continue
72
+ if verbose:
73
+ print(
74
+ f"[_filter_data] filter out column {k!r}, "
75
+ f"values {v!r} among {set(df[k].astype(str))}"
76
+ )
77
+ df = df[~df[k].astype(str).isin(v)]
78
+ return df
79
+
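For instance, a hedged sketch of the filter syntax documented above (column names and values are invented):

import pandas

df = pandas.DataFrame(
    dict(exporter=["onnx", "eager", "onnx"], device=["cpu", "cuda", "cuda"])
)
# keep rows whose exporter is 'onnx' and whose device is 'cpu' or 'cuda'
kept = filter_data(df, filter_in="exporter:onnx//device:cpu;cuda")
# then drop the remaining rows running on 'cuda'
kept = filter_data(kept, filter_out="device:cuda")
print(kept)  # only the (onnx, cpu) row remains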
12
80
 
13
81
  def enumerate_csv_files(
14
82
  data: Union[
15
83
  pandas.DataFrame, List[Union[str, Tuple[str, str]]], str, Tuple[str, str, str, str]
16
84
  ],
17
85
  verbose: int = 0,
86
+ filtering: Optional[Callable[[str], bool]] = None,
18
87
  ) -> Iterator[Union[pandas.DataFrame, str, Tuple[str, str, str, str]]]:
19
88
  """
20
89
  Enumerates files considered for the aggregation.
@@ -23,6 +92,10 @@ def enumerate_csv_files(
23
92
  loops over csv candidates.
24
93
 
25
94
  :param data: dataframe with the raw data or a file or list of files
95
+ :param verbose: verbosity
96
+ :param filtering: function to filter in or out files in zip files,
97
+ must return true to keep the file, false to skip it.
98
+ :return: a generator yielding tuples with the filename, date, full path and zip file
26
99
 
27
100
  data can contain:
28
101
  * a dataframe
@@ -52,13 +125,9 @@ def enumerate_csv_files(
52
125
  # We check the first line is ok.
53
126
  if verbose:
54
127
  print(f"[enumerate_csv_files] data[{itn}] is a csv file: {filename!r}]")
55
- with open(filename, "r", encoding="utf-8") as f:
56
- line = f.readline()
57
- if "~help" in line or (",CMD" not in line and ",DATE" not in line):
58
- continue
59
- dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
60
- du = dt.strftime("%Y-%m-%d %H:%M:%S")
61
- yield (os.path.split(filename)[-1], du, filename, "")
128
+ dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
129
+ du = dt.strftime("%Y-%m-%d %H:%M:%S")
130
+ yield (os.path.split(filename)[-1], du, filename, "")
62
131
  continue
63
132
 
64
133
  if ext == ".zip":
@@ -67,8 +136,11 @@ def enumerate_csv_files(
67
136
  zf = zipfile.ZipFile(filename, "r")
68
137
  for ii, info in enumerate(zf.infolist()):
69
138
  name = info.filename
70
- ext = os.path.splitext(name)[-1]
71
- if ext != ".csv":
139
+ if filtering is None:
140
+ ext = os.path.splitext(name)[-1]
141
+ if ext != ".csv":
142
+ continue
143
+ elif not filtering(name):
72
144
  continue
73
145
  if verbose:
74
146
  print(
@@ -96,14 +168,15 @@ def enumerate_csv_files(
96
168
  for ii, f in enumerate(found):
97
169
  if verbose:
98
170
  print(f"[enumerate_csv_files] data[{itn}][{ii}] {f!r} from {filename!r}")
99
- yield from enumerate_csv_files(f, verbose=verbose)
171
+ yield from enumerate_csv_files(f, verbose=verbose, filtering=filtering)
100
172
 
101
173
 
102
174
  def open_dataframe(
103
175
  data: Union[str, Tuple[str, str, str, str], pandas.DataFrame],
104
176
  ) -> pandas.DataFrame:
105
177
  """
106
- Opens a filename.
178
+ Opens a filename produced by the function
179
+ :func:`onnx_diagnostic.helpers.log_helper.enumerate_csv_files`.
107
180
 
108
181
  :param data: a dataframe, a filename, a tuple indicating the file is coming
109
182
  from a zip file
@@ -140,10 +213,26 @@ class CubeViewDef:
140
213
  :param order: to reorder key in columns index
141
214
  :param key_agg: aggregate according to these columns before
142
215
  creating the view
143
- :param agg_args: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`
216
+ :param agg_args: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`,
217
+ it can also be a callable returning a different aggregation
218
+ method depending on the column name
144
219
  :param agg_kwargs: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`
220
+ :param agg_multi: aggregation over multiple columns
221
+ :param ignore_columns: ignores these columns (if present) to avoid overloading the view
222
+ :param keep_columns_in_index: keeps these columns even if they have only one unique value
223
+ :param dropna: drops rows whose values are all nan (not relevant for the view)
224
+ :param transpose: transposes the view
225
+ :param f_highlight: function used to highlight some values
226
+ :param name: name of the view, used mostly for debugging
227
+ :param plots: adds plots to the Excel sheet
228
+ :param no_index: removes the index (but keeps the columns)
145
229
  """
146
230
 
231
+ class HighLightKind(enum.IntEnum):
232
+ NONE = 0
233
+ RED = 1
234
+ GREEN = 2
235
+
147
236
  def __init__(
148
237
  self,
149
238
  key_index: Sequence[str],
@@ -151,8 +240,19 @@ class CubeViewDef:
151
240
  ignore_unique: bool = True,
152
241
  order: Optional[Sequence[str]] = None,
153
242
  key_agg: Optional[Sequence[str]] = None,
154
- agg_args: Sequence[Any] = ("sum",),
243
+ agg_args: Union[Sequence[Any], Callable[[str], Any]] = ("sum",),
155
244
  agg_kwargs: Optional[Dict[str, Any]] = None,
245
+ agg_multi: Optional[
246
+ Dict[str, Callable[[pandas.core.groupby.DataFrameGroupBy], pandas.Series]]
247
+ ] = None,
248
+ ignore_columns: Optional[Sequence[str]] = None,
249
+ keep_columns_in_index: Optional[Sequence[str]] = None,
250
+ dropna: bool = True,
251
+ transpose: bool = False,
252
+ f_highlight: Optional[Callable[[Any], "CubeViewDef.HighLightKind"]] = None,
253
+ name: Optional[str] = None,
254
+ no_index: bool = False,
255
+ plots: bool = False,
156
256
  ):
157
257
  self.key_index = key_index
158
258
  self.values = values
@@ -161,12 +261,349 @@ class CubeViewDef:
161
261
  self.key_agg = key_agg
162
262
  self.agg_args = agg_args
163
263
  self.agg_kwargs = agg_kwargs
264
+ self.agg_multi = agg_multi
265
+ self.dropna = dropna
266
+ self.ignore_columns = ignore_columns
267
+ self.keep_columns_in_index = keep_columns_in_index
268
+ self.f_highlight = f_highlight
269
+ self.transpose = transpose
270
+ self.name = name
271
+ self.no_index = no_index
272
+ self.plots = plots
164
273
 
165
274
  def __repr__(self) -> str:
166
275
  "usual"
167
276
  return string_sig(self) # type: ignore[arg-type]
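A hedged usage sketch for the new options (the column names and the `cube` instance are illustrative; `CubeLogs.view`, which consumes this definition, appears later in this file):

view = CubeViewDef(
    key_index=["model_name", "exporter"],
    values=["time_latency", "speedup"],
    key_agg=["machine"],
    # callable form of agg_args: pick the aggregation per value column
    agg_args=lambda col: "max" if col.startswith("time_") else "mean",
    keep_columns_in_index=["exporter"],
    f_highlight=lambda v: (
        CubeViewDef.HighLightKind.GREEN
        if isinstance(v, float) and v >= 1
        else CubeViewDef.HighLightKind.NONE
    ),
    name="latency_view",
    plots=True,
)
# piv = cube.view(view)  # returns the pivoted dataframe for this definition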
168
277
 
169
278
 
279
+ def apply_excel_style(
280
+ filename_or_writer: Any,
281
+ f_highlights: Optional[Dict[str, Callable[[Any], CubeViewDef.HighLightKind]]] = None,
282
+ ):
283
+ """
284
+ Applies styles on all sheets in a file unless the sheet is too big.
285
+
286
+ :param filename_or_writer: filename or ExcelWriter, modified in place
287
+ :param f_highlights: highlight functions to apply, one per sheet
288
+ """
289
+ from openpyxl import load_workbook
290
+ from openpyxl.styles import Alignment
291
+ from openpyxl.utils import get_column_letter
292
+ from openpyxl.styles import Font # , PatternFill, numbers
293
+
294
+ if isinstance(filename_or_writer, str):
295
+ workbook = load_workbook(filename_or_writer)
296
+ save = True
297
+ else:
298
+ workbook = filename_or_writer.book
299
+ save = False
300
+
301
+ left = Alignment(horizontal="left")
302
+ left_shrink = Alignment(horizontal="left", shrink_to_fit=True)
303
+ right = Alignment(horizontal="right")
304
+ font_colors = {
305
+ CubeViewDef.HighLightKind.GREEN: Font(color="00AA00"),
306
+ CubeViewDef.HighLightKind.RED: Font(color="FF0000"),
307
+ }
308
+
309
+ for name in workbook.sheetnames:
310
+ f_highlight = f_highlights.get(name, None) if f_highlights else None
311
+ sheet = workbook[name]
312
+ n_rows = sheet.max_row
313
+ n_cols = sheet.max_column
314
+ if n_rows * n_cols > 2**18:
315
+ # Too big.
316
+ continue
317
+ co: Dict[int, int] = {}
318
+ sizes: Dict[int, int] = {}
319
+ cols = set()
320
+ for i in range(1, n_rows + 1):
321
+ for j, cell in enumerate(sheet[i]):
322
+ if j > n_cols:
323
+ break
324
+ cols.add(cell.column)
325
+ if isinstance(cell.value, float):
326
+ co[j] = co.get(j, 0) + 1
327
+ elif isinstance(cell.value, str):
328
+ sizes[cell.column] = max(sizes.get(cell.column, 0), len(cell.value))
329
+
330
+ for k, v in sizes.items():
331
+ c = get_column_letter(k)
332
+ sheet.column_dimensions[c].width = min(max(8, v), 30)
333
+ for k in cols:
334
+ if k not in sizes:
335
+ c = get_column_letter(k)
336
+ sheet.column_dimensions[c].width = 15
337
+
338
+ for i in range(1, n_rows + 1):
339
+ for j, cell in enumerate(sheet[i]):
340
+ if j > n_cols:
341
+ break
342
+ if isinstance(cell.value, pandas.Timestamp):
343
+ cell.alignment = right
344
+ dt = cell.value.to_pydatetime()
345
+ cell.value = dt
346
+ cell.number_format = (
347
+ "YYYY-MM-DD"
348
+ if (
349
+ dt.hour == 0
350
+ and dt.minute == 0
351
+ and dt.second == 0
352
+ and dt.microsecond == 0
353
+ )
354
+ else "YYYY-MM-DD 00:00:00"
355
+ )
356
+ elif isinstance(cell.value, (float, int)):
357
+ cell.alignment = right
358
+ x = abs(cell.value)
359
+ if int(x) == x:
360
+ cell.number_format = "0"
361
+ elif x > 5000:
362
+ cell.number_format = "# ##0"
363
+ elif x >= 500:
364
+ cell.number_format = "0.0"
365
+ elif x >= 50:
366
+ cell.number_format = "0.00"
367
+ elif x >= 5:
368
+ cell.number_format = "0.000"
369
+ elif x > 0.5:
370
+ cell.number_format = "0.0000"
371
+ elif x > 0.005:
372
+ cell.number_format = "0.00000"
373
+ else:
374
+ cell.number_format = "0.000E+00"
375
+ if f_highlight:
376
+ h = f_highlight(cell.value)
377
+ if h in font_colors:
378
+ cell.font = font_colors[h]
379
+ elif isinstance(cell.value, str) and len(cell.value) > 70:
380
+ cell.alignment = left_shrink
381
+ else:
382
+ cell.alignment = left
383
+ if f_highlight:
384
+ h = f_highlight(cell.value)
385
+ if h in font_colors:
386
+ cell.font = font_colors[h]
387
+ if save:
388
+ workbook.save(filename_or_writer)
389
+
390
+
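A minimal sketch of how this helper could be called on an existing workbook, assuming it stays a module-level function of log_helper (the file and sheet names are assumptions):

def highlight_speedup(value):
    # green above 1, red below 1, nothing for non-numeric cells
    if isinstance(value, float):
        return (
            CubeViewDef.HighLightKind.GREEN
            if value >= 1
            else CubeViewDef.HighLightKind.RED
        )
    return CubeViewDef.HighLightKind.NONE

apply_excel_style("report.xlsx", f_highlights={"speedup_view": highlight_speedup})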
391
+ class CubePlot:
392
+ """
393
+ Creates a plot.
394
+
395
+ :param df: dataframe
396
+ :param kind: kind of graph to plot, bar, barh, line
397
+ :param split: draw a graph per line in the dataframe
398
+ :param timeseries: this assumes the time is one level of the columns,
399
+ this argument indicates the level name
400
+ """
401
+
402
+ KINDS = {"bar", "barh", "line"}
403
+
404
+ @classmethod
405
+ def group_columns(
406
+ cls, columns: List[str], sep: str = "/", depth: int = 2
407
+ ) -> List[List[str]]:
408
+ """Groups columns to have nice display."""
409
+ res: Dict[str, List[str]] = {}
410
+ for c in columns:
411
+ p = c.split("/")
412
+ k = "/".join(p[:depth])
413
+ if k not in res:
414
+ res[k] = []
415
+ res[k].append(c)
416
+ new_res: Dict[str, List[str]] = {}
417
+ for k, v in res.items():
418
+ if len(v) >= 3:
419
+ new_res[k] = v
420
+ else:
421
+ if "0" not in new_res:
422
+ new_res["0"] = []
423
+ new_res["0"].extend(v)
424
+ groups: List[List[str]] = [sorted(v) for k, v in sorted(new_res.items())]
425
+ if depth <= 1:
426
+ return groups
427
+ new_groups: List[List[str]] = []
428
+ for v in groups:
429
+ if len(v) >= 6:
430
+ new_groups.extend(cls.group_columns(v, depth=1, sep=sep))
431
+ else:
432
+ new_groups.append(v)
433
+ return new_groups
434
+
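A small illustration of the grouping heuristic (the column names are invented):

cols = [
    "time_export/cpu/a", "time_export/cpu/b", "time_export/cpu/c",
    "speedup/cuda", "disc/cuda",
]
groups = CubePlot.group_columns(cols)
# prefixes seen at least 3 times keep their own group, the leftovers are gathered together:
# [['disc/cuda', 'speedup/cuda'],
#  ['time_export/cpu/a', 'time_export/cpu/b', 'time_export/cpu/c']]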
435
+ def __init__(
436
+ self,
437
+ df: pandas.DataFrame,
438
+ kind: str = "bar",
439
+ orientation="col",
440
+ split: bool = True,
441
+ timeseries: Optional[str] = None,
442
+ ):
443
+ assert (
444
+ not timeseries or timeseries in df.columns.names
445
+ ), f"Level {timeseries!r} is not part of the columns levels {df.columns.names}"
446
+ assert (
447
+ kind in self.__class__.KINDS
448
+ ), f"Unexpected kind={kind!r} not in {self.__class__.KINDS}"
449
+ assert split, f"split={split} not implemented"
450
+ assert (
451
+ not timeseries or orientation == "row"
452
+ ), f"orientation={orientation!r} must be 'row' for timeseries"
453
+ self.df = df.copy()
454
+ self.kind = kind
455
+ self.orientation = orientation
456
+ self.split = split
457
+ self.timeseries = timeseries
458
+
459
+ if timeseries:
460
+ if isinstance(self.df.columns, pandas.MultiIndex):
461
+ index_time = list(self.df.columns.names).index(self.timeseries)
462
+
463
+ def _drop(t, i=index_time):
464
+ return (*t[:i], *t[i + 1 :])
465
+
466
+ self.df.columns = pandas.MultiIndex.from_tuples(
467
+ [("/".join(map(str, _drop(i))), i[index_time]) for i in self.df.columns],
468
+ names=["metric", timeseries],
469
+ )
470
+ else:
471
+ if isinstance(self.df.columns, pandas.MultiIndex):
472
+ self.df.columns = ["/".join(map(str, i)) for i in self.df.columns]
473
+ if isinstance(self.df.index, pandas.MultiIndex):
474
+ self.df.index = ["/".join(map(str, i)) for i in self.df.index]
475
+
476
+ def __repr__(self) -> str:
477
+ "usual"
478
+ return string_sig(self) # type: ignore[arg-type]
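A hedged usage sketch (the dataframe stands in for a pivoted view such as the one produced by `CubeLogs.view`; its values are invented):

import pandas

piv = pandas.DataFrame(
    {"time_latency": [0.5, 0.7, 0.9, 1.1], "speedup": [1.2, 0.9, 1.5, 1.1]},
    index=["model_a", "model_b", "model_c", "model_d"],
)
plot = CubePlot(piv, kind="barh", orientation="row", split=True)
images = plot.to_images(merge=True, title_suffix="2024-01-01")
with open("view.png", "wb") as f:
    f.write(images[0])  # PNG bytes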
479
+
480
+ def to_images(
481
+ self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
482
+ ) -> List[bytes]:
483
+ """
484
+ Converts data into plots and images.
485
+
486
+ :param verbose: verbosity
487
+ :param merge: returns all graphs in a single image (True)
488
+ or an image for every graph (False)
489
+ :param title_suffix: suffix appended to the title of every graph
490
+ :return: list of binary images (format PNG)
491
+ """
492
+ if self.kind in ("barh", "bar"):
493
+ return self._to_images_bar(verbose=verbose, merge=merge, title_suffix=title_suffix)
494
+ if self.kind == "line":
495
+ return self._to_images_line(
496
+ verbose=verbose, merge=merge, title_suffix=title_suffix
497
+ )
498
+ raise AssertionError(f"self.kind={self.kind!r} not implemented")
499
+
500
+ @classmethod
501
+ def _make_loop(cls, ensemble, verbose):
502
+ if verbose:
503
+ from tqdm import tqdm
504
+
505
+ loop = tqdm(ensemble)
506
+ else:
507
+ loop = ensemble
508
+ return loop
509
+
510
+ def _to_images_bar(
511
+ self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
512
+ ) -> List[bytes]:
513
+ assert merge, f"merge={merge} not implemented yet"
514
+ import matplotlib.pyplot as plt
515
+
516
+ df = self.df.T if self.orientation == "row" else self.df
517
+ title_suffix = f"\n{title_suffix}" if title_suffix else ""
518
+
519
+ n_cols = 3
520
+ nn = df.shape[1] // n_cols
521
+ nn += int(df.shape[1] % n_cols != 0)
522
+ fig, axs = plt.subplots(nn, n_cols, figsize=(6 * n_cols, nn * df.shape[0] / 5))
523
+ pos = 0
524
+ imgs = []
525
+ for c in self._make_loop(df.columns, verbose):
526
+ ax = axs[pos // n_cols, pos % n_cols]
527
+ (
528
+ df[c].plot.barh(title=f"{c}{title_suffix}", ax=ax)
529
+ if self.kind == "barh"
530
+ else df[c].plot.bar(title=f"{c}{title_suffix}", ax=ax)
531
+ )
532
+ ax.tick_params(axis="both", which="major", labelsize=8)
533
+ ax.grid(True)
534
+ pos += 1 # noqa: SIM113
535
+ fig.tight_layout()
536
+ imgdata = io.BytesIO()
537
+ fig.savefig(imgdata, format="png")
538
+ imgs.append(imgdata.getvalue())
539
+ plt.close()
540
+ return imgs
541
+
542
+ def _to_images_line(
543
+ self, verbose: int = 0, merge: bool = True, title_suffix: Optional[str] = None
544
+ ) -> List[bytes]:
545
+ assert merge, f"merge={merge} not implemented yet"
546
+ assert (
547
+ self.orientation == "row"
548
+ ), f"self.orientation={self.orientation!r} not implemented for this kind of graph."
549
+
550
+ def rotate_align(ax, angle=15, align="right"):
551
+ for label in ax.get_xticklabels():
552
+ label.set_rotation(angle)
553
+ label.set_horizontalalignment(align)
554
+ ax.tick_params(axis="both", which="major", labelsize=8)
555
+ ax.grid(True)
556
+ ax.legend()
557
+ ax.tick_params(labelleft=True)
558
+ return ax
559
+
560
+ import matplotlib.pyplot as plt
561
+
562
+ df = self.df.T
563
+
564
+ confs = list(df.unstack(self.timeseries).index)
565
+ groups = self.group_columns(confs)
566
+ n_cols = len(groups)
567
+
568
+ title_suffix = f"\n{title_suffix}" if title_suffix else ""
569
+ fig, axs = plt.subplots(
570
+ df.shape[1],
571
+ n_cols,
572
+ figsize=(5 * n_cols, max(len(g) for g in groups) * df.shape[1] / 2),
573
+ sharex=True,
574
+ sharey="row" if n_cols > 1 else False,
575
+ )
576
+ imgs = []
577
+ row = 0
578
+ for c in self._make_loop(df.columns, verbose):
579
+ dfc = df[[c]]
580
+ dfc = dfc.unstack(self.timeseries).T.droplevel(0)
581
+ if n_cols == 1:
582
+ dfc.plot(title=f"{c}{title_suffix}", ax=axs[row], linewidth=3)
583
+ axs[row].grid(True)
584
+ rotate_align(axs[row])
585
+ else:
586
+ x = list(range(dfc.shape[0]))
587
+ ticks = list(dfc.index)
588
+ for ii, group in enumerate(groups):
589
+ ddd = dfc.loc[:, group].copy()
590
+ axs[row, ii].set_xticks(x)
591
+ axs[row, ii].set_xticklabels(ticks)
592
+ # This is very slow
593
+ # ddd.plot(ax=axs[row, ii],linewidth=3)
594
+ for jj in range(ddd.shape[1]):
595
+ axs[row, ii].plot(x, ddd.iloc[:, jj], lw=3, label=ddd.columns[jj])
596
+ axs[row, ii].set_title(f"{c}{title_suffix}")
597
+ rotate_align(axs[row, ii])
598
+ row += 1 # noqa: SIM113
599
+ fig.tight_layout()
600
+ imgdata = io.BytesIO()
601
+ fig.savefig(imgdata, format="png")
602
+ imgs.append(imgdata.getvalue())
603
+ plt.close()
604
+ return imgs
605
+
606
+
170
607
  class CubeLogs:
171
608
  """
172
609
  Processes logs coming from experiments.
@@ -180,7 +617,14 @@ class CubeLogs:
180
617
  values: Sequence[str] = ("time_.*", "disc_.*"),
181
618
  ignored: Sequence[str] = (),
182
619
  recent: bool = False,
183
- formulas: Optional[Dict[str, Callable[[pandas.DataFrame], pandas.Series]]] = None,
620
+ formulas: Optional[
621
+ Union[
622
+ Sequence[str],
623
+ Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
624
+ ]
625
+ ] = None,
626
+ fill_missing: Optional[Sequence[Tuple[str, Any]]] = None,
627
+ keep_last_date: bool = False,
184
628
  ):
185
629
  self._data = data
186
630
  self._time = time
@@ -189,24 +633,51 @@ class CubeLogs:
189
633
  self._ignored = ignored
190
634
  self.recent = recent
191
635
  self._formulas = formulas
636
+ self.fill_missing = fill_missing
637
+ self.keep_last_date = keep_last_date
638
+
639
+ def post_load_process_piece(
640
+ self, df: pandas.DataFrame, unique: bool = False
641
+ ) -> pandas.DataFrame:
642
+ """
643
+ Postprocesses a piece when a cube is made of multiple pieces
644
+ before it gets merged.
645
+ """
646
+ if not self.fill_missing:
647
+ return df
648
+ missing = dict(self.fill_missing)
649
+ for k, v in missing.items():
650
+ if k not in df.columns:
651
+ df[k] = v
652
+ return df
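As an illustration, assuming two CSV pieces where only the newer one carries an extra column (the file names are invented; CubeLogsPerformance, defined later in this file, fills 'model_attn_impl' with 'eager' by default):

cube = CubeLogsPerformance(
    ["logs_old.csv", "logs_new.csv"],
    fill_missing=(("model_attn_impl", "eager"),),
)
cube.load(verbose=1)
# pieces lacking the column 'model_attn_impl' get it added with the value 'eager'
# before the concatenation, so the key is usable in every view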
192
653
 
193
654
  def load(self, verbose: int = 0):
194
655
  """Loads and preprocesses the data. Returns self."""
195
656
  if isinstance(self._data, pandas.DataFrame):
196
657
  if verbose:
197
658
  print(f"[CubeLogs.load] load from dataframe, shape={self._data.shape}")
198
- self.data = self._data
659
+ self.data = self.post_load_process_piece(self._data, unique=True)
660
+ if verbose:
661
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
199
662
  elif isinstance(self._data, list) and all(isinstance(r, dict) for r in self._data):
200
663
  if verbose:
201
664
  print(f"[CubeLogs.load] load from list of dicts, n={len(self._data)}")
202
- self.data = pandas.DataFrame(self._data)
665
+ self.data = pandas.DataFrame(self.post_load_process_piece(self._data, unique=True))
666
+ if verbose:
667
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
203
668
  elif isinstance(self._data, list) and all(
204
669
  isinstance(r, pandas.DataFrame) for r in self._data
205
670
  ):
206
671
  if verbose:
207
672
  print(f"[CubeLogs.load] load from list of DataFrame, n={len(self._data)}")
208
- self.data = pandas.concat(self._data, axis=0)
673
+ self.data = pandas.concat(
674
+ [self.post_load_process_piece(c) for c in self._data], axis=0
675
+ )
676
+ if verbose:
677
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
209
678
  elif isinstance(self._data, list):
679
+ if verbose:
680
+ print("[CubeLogs.load] load from list of Cubes")
210
681
  cubes = []
211
682
  for item in enumerate_csv_files(self._data, verbose=verbose):
212
683
  df = open_dataframe(item)
@@ -219,8 +690,10 @@ class CubeLogs:
219
690
  recent=self.recent,
220
691
  )
221
692
  cube.load()
222
- cubes.append(cube.data)
693
+ cubes.append(self.post_load_process_piece(cube.data))
223
694
  self.data = pandas.concat(cubes, axis=0)
695
+ if verbose:
696
+ print(f"[CubeLogs.load] after postprocessing shape={self.data.shape}")
224
697
  else:
225
698
  raise NotImplementedError(
226
699
  f"Not implemented with the provided data (type={type(self._data)})"
@@ -236,59 +709,101 @@ class CubeLogs:
236
709
  self._initialize_columns()
237
710
  if verbose:
238
711
  print(f"[CubeLogs.load] time={self.time}")
239
- print(f"[CubeLogs.load] keys={self.keys}")
712
+ print(f"[CubeLogs.load] keys={self.keys_no_time}")
240
713
  print(f"[CubeLogs.load] values={self.values}")
241
714
  print(f"[CubeLogs.load] ignored={self.ignored}")
242
715
  print(f"[CubeLogs.load] ignored_values={self.ignored_values}")
243
716
  print(f"[CubeLogs.load] ignored_keys={self.ignored_keys}")
717
+ assert self.keys_no_time, f"No keys found with {self._keys} from {self.data.columns}"
718
+ assert self.values, f"No values found with {self._values} from {self.data.columns}"
244
719
  assert not (
245
- set(self.keys) & set(self.values)
246
- ), f"Columns {set(self.keys) & set(self.values)} cannot be keys and values"
720
+ set(self.keys_no_time) & set(self.values)
721
+ ), f"Columns {set(self.keys_no_time) & set(self.values)} cannot be keys and values"
247
722
  assert not (
248
- set(self.keys) & set(self.ignored)
249
- ), f"Columns {set(self.keys) & set(self.ignored)} cannot be keys and ignored"
723
+ set(self.keys_no_time) & set(self.ignored)
724
+ ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be keys and ignored"
250
725
  assert not (
251
726
  set(self.values) & set(self.ignored)
252
- ), f"Columns {set(self.keys) & set(self.ignored)} cannot be values and ignored"
727
+ ), f"Columns {set(self.keys_no_time) & set(self.ignored)} cannot be values and ignored"
253
728
  assert (
254
- self.time not in self.keys
729
+ self.time not in self.keys_no_time
255
730
  and self.time not in self.values
256
731
  and self.time not in self.ignored
257
- ), f"Column {self.time!r} is also a key, a value or ignored"
258
- self._columns = [self.time, *self.keys, *self.values, *self.ignored]
732
+ ), (
733
+ f"Column {self.time!r} is also a key, a value or ignored, "
734
+ f"keys={sorted(self.keys_no_time)}, values={sorted(self.values)}, "
735
+ f"ignored={sorted(self.ignored)}"
736
+ )
737
+ self._columns = [self.time, *self.keys_no_time, *self.values, *self.ignored]
259
738
  self.dropped = [c for c in self.data.columns if c not in set(self.columns)]
260
739
  self.data = self.data[self.columns]
261
740
  if verbose:
262
741
  print(f"[CubeLogs.load] dropped={self.dropped}")
263
742
  print(f"[CubeLogs.load] data.shape={self.data.shape}")
264
743
 
744
+ shape = self.data.shape
745
+ if verbose:
746
+ print(f"[CubeLogs.load] removed columns, shape={self.data.shape}")
265
747
  self._preprocess()
748
+ if verbose:
749
+ print(f"[CubeLogs.load] preprocess, shape={self.data.shape}")
750
+ assert (
751
+ self.data.shape[0] > 0
752
+ ), f"The preprocessing reduced shape {shape} to {self.data.shape}."
266
753
  if self.recent and verbose:
267
754
  print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")
268
755
 
269
756
  # Let's apply the formulas
270
757
  if self._formulas:
271
- cols = set(self.data.columns)
272
- for k, f in self._formulas.items():
273
- if k in cols:
758
+ forms = (
759
+ {k: k for k in self._formulas}
760
+ if not isinstance(self._formulas, dict)
761
+ else self._formulas
762
+ )
763
+ cols = set(self.values)
764
+ for k, ff in forms.items():
765
+ f = self._process_formula(ff)
766
+ if k in cols or f is None:
274
767
  if verbose:
275
768
  print(f"[CubeLogs.load] skip formula {k!r}")
276
769
  else:
277
770
  if verbose:
278
771
  print(f"[CubeLogs.load] apply formula {k!r}")
279
772
  self.data[k] = f(self.data)
280
- self.values_for_key = {k: set(self.data[k]) for k in self.keys}
281
- nans = [
282
- c for c in [self.time, *self.keys] if self.data[c].isna().astype(int).sum() > 0
773
+ self.values.append(k)
774
+ cols.add(k)
775
+ self.values_for_key = {k: set(self.data[k].dropna()) for k in self.keys_time}
776
+ for k in self.keys_no_time:
777
+ if self.data[k].isna().max():
778
+ self.values_for_key[k].add(np.nan)
779
+ self.keys_with_nans = [
780
+ c for c in self.keys_time if self.data[c].isna().astype(int).sum() > 0
283
781
  ]
284
- assert not nans, f"The following keys {nans} have nan values. This is not allowed."
285
782
  if verbose:
286
783
  print(f"[CubeLogs.load] convert column {self.time!r} into date")
784
+ if self.keys_with_nans:
785
+ print(f"[CubeLogs.load] keys_with_nans={self.keys_with_nans}")
287
786
  self.data[self.time] = pandas.to_datetime(self.data[self.time])
787
+
788
+ if self.keep_last_date:
789
+ times = self.data[self.time].dropna()
790
+ mi, mx = times.min(), times.max()
791
+ if mi != mx:
792
+ print(f"[CubeLogs.load] setting all dates in column {self.time} to {mx!r}")
793
+ self.data.loc[~self.data[self.time].isna(), self.time] = mx
794
+ self.values_for_key[self.time] = {mx}
795
+ if self.data[self.time].isna().max():
796
+ self.values_for_key[self.time].add(np.nan)
288
797
  if verbose:
289
798
  print(f"[CubeLogs.load] done, shape={self.shape}")
290
799
  return self
291
800
 
801
+ def _process_formula(
802
+ self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
803
+ ) -> Callable[[pandas.DataFrame], pandas.Series]:
804
+ assert callable(formula), f"formula={formula!r} is not supported."
805
+ return formula
806
+
292
807
  @property
293
808
  def shape(self) -> Tuple[int, int]:
294
809
  "Returns the shape."
@@ -303,7 +818,7 @@ class CubeLogs:
303
818
 
304
819
  def _preprocess(self):
305
820
  last = self.values[0]
306
- gr = self.data[[self.time, *self.keys, last]].groupby([self.time, *self.keys]).count()
821
+ gr = self.data[[*self.keys_time, last]].groupby(self.keys_time, dropna=False).count()
307
822
  gr = gr[gr[last] > 1]
308
823
  if self.recent:
309
824
  cp = self.data.copy()
@@ -312,11 +827,15 @@ class CubeLogs:
312
827
  ), f"'__index__' should not be a column in {cp.columns}"
313
828
  cp["__index__"] = np.arange(cp.shape[0])
314
829
  gr = (
315
- cp[[*self.keys, self.time, "__index__"]]
316
- .groupby(self.keys, as_index=False)
830
+ cp[[*self.keys_time, "__index__"]]
831
+ .groupby(self.keys_no_time, as_index=False, dropna=False)
317
832
  .max()
318
833
  )
319
- filtered = pandas.merge(cp, gr, on=[self.time, "__index__", *self.keys])
834
+ assert gr.shape[0] > 0, (
835
+ f"Something went wrong after the groupby.\n"
836
+ f"{cp[[*self.keys, self.time, '__index__']].head().T}"
837
+ )
838
+ filtered = pandas.merge(cp, gr, on=["__index__", *self.keys_time])
320
839
  assert filtered.shape[0] <= self.data.shape[0], (
321
840
  f"Keeping the latest row brings more row {filtered.shape} "
322
841
  f"(initial is {self.data.shape})."
@@ -324,18 +843,20 @@ class CubeLogs:
324
843
  self.data = filtered.drop("__index__", axis=1)
325
844
  else:
326
845
  assert gr.shape[0] == 0, f"There are duplicated rows:\n{gr}"
327
- gr = self.data[[*self.keys, self.time]].groupby(self.keys).count()
328
- gr = gr[gr[self.time] > 1]
329
- assert (
330
- gr.shape[0] == 0
331
- ), f"recent should be true to keep the most recent row:\n{gr}"
332
846
 
333
847
  @classmethod
334
848
  def _filter_column(cls, filters, columns, can_be_empty=False):
849
+ assert list(columns), "columns is empty"
335
850
  set_cols = set()
336
851
  for f in filters:
337
- reg = re.compile(f)
338
- cols = [c for c in columns if reg.search(c)]
852
+ if set(f) & {'"', "^", ".", "*", "+", "{", "}"}:
853
+ reg = re.compile(f)
854
+ cols = [c for c in columns if reg.search(c)]
855
+ elif f in columns:
856
+ # No regular expression.
857
+ cols = [f]
858
+ else:
859
+ continue
339
860
  set_cols |= set(cols)
340
861
  assert (
341
862
  can_be_empty or set_cols
@@ -343,25 +864,31 @@ class CubeLogs:
343
864
  return sorted(set_cols)
344
865
 
345
866
  def _initialize_columns(self):
346
- self.keys = self._filter_column(self._keys, self.data.columns)
867
+ keys = self._filter_column(self._keys, self.data.columns)
347
868
  self.values = self._filter_column(self._values, self.data.columns)
348
869
  self.ignored = self._filter_column(self._ignored, self.data.columns, True)
349
870
  assert (
350
871
  self._time in self.data.columns
351
- ), f"Column {self._time} not found in {self.data.columns}"
352
- ignored_keys = set(self.ignored) & set(self.keys)
872
+ ), f"Column {self._time} not found in {pprint.pformat(sorted(self.data.columns))}"
873
+ ignored_keys = set(self.ignored) & set(keys)
353
874
  ignored_values = set(self.ignored) & set(self.values)
354
- self.keys = [c for c in self.keys if c not in ignored_keys]
875
+ self.keys_no_time = [c for c in keys if c not in ignored_keys]
355
876
  self.values = [c for c in self.values if c not in ignored_values]
356
877
  self.ignored_keys = sorted(ignored_keys)
357
878
  self.ignored_values = sorted(ignored_values)
358
879
  self.time = self._time
880
+ self.keys_time = [self.time, *[c for c in keys if c not in ignored_keys]]
359
881
 
360
882
  def __str__(self) -> str:
361
883
  "usual"
362
884
  return str(self.data) if hasattr(self, "data") else str(self._data)
363
885
 
364
- def view(self, view_def: CubeViewDef) -> pandas.DataFrame:
886
+ def view(
887
+ self,
888
+ view_def: Union[str, CubeViewDef],
889
+ return_view_def: bool = False,
890
+ verbose: int = 0,
891
+ ) -> Union[pandas.DataFrame, Tuple[pandas.DataFrame, CubeViewDef]]:
365
892
  """
366
893
  Returns a dataframe, a pivot view.
367
894
  `key_index` determines the index, the other key columns determines
@@ -369,58 +896,274 @@ class CubeLogs:
369
896
  is removed.
370
897
 
371
898
  :param view_def: view definition
899
+ :param return_view_def: returns the view as well
900
+ :param verbose: verbosity level
372
901
  :return: dataframe
373
902
  """
374
- key_agg = self._filter_column(view_def.key_agg, self.keys) if view_def.key_agg else []
903
+ assert isinstance(
904
+ view_def, CubeViewDef
905
+ ), f"view_def should be a CubeViewDef, got {type(view_def)}: {view_def!r} instead"
906
+ if verbose:
907
+ print(f"[CubeLogs.view] -- start view {view_def.name!r}: {view_def}")
908
+ key_agg = (
909
+ self._filter_column(view_def.key_agg, self.keys_time) if view_def.key_agg else []
910
+ )
375
911
  set_key_agg = set(key_agg)
376
- assert set_key_agg <= set(
377
- self.keys
378
- ), f"Non existing keys in key_agg {set_key_agg - set(self.keys)}"
912
+ assert set_key_agg <= set(self.keys_time), (
913
+ f"view_def.name={view_def.name!r}, "
914
+ f"non existing keys in key_agg {set_key_agg - set(self.keys_time)}",
915
+ f"keys={sorted(self.keys_time)}",
916
+ )
379
917
 
380
918
  values = self._filter_column(view_def.values, self.values)
381
- assert set(values) <= set(
382
- self.values
383
- ), f"Non existing columns in values {set(values) - set(self.values)}"
919
+ assert set(values) <= set(self.values), (
920
+ f"view_def.name={view_def.name!r}, "
921
+ f"non existing columns in values {set(values) - set(self.values)}, "
922
+ f"values={sorted(self.values)}"
923
+ )
384
924
 
925
+ # aggregation
385
926
  if key_agg:
927
+ final_stack = True
386
928
  key_index = [
387
929
  c
388
- for c in self._filter_column(view_def.key_index, self.keys)
930
+ for c in self._filter_column(view_def.key_index, self.keys_time)
389
931
  if c not in set_key_agg
390
932
  ]
391
- keys_no_agg = [c for c in self.keys if c not in set_key_agg]
392
- data = (
393
- self.data[[*keys_no_agg, *values]]
394
- .groupby(key_index, as_index=False)
395
- .agg(*view_def.agg_args, **(view_def.agg_kwargs or {}))
933
+ keys_no_agg = [c for c in self.keys_time if c not in set_key_agg]
934
+ if verbose:
935
+ print(f"[CubeLogs.view] aggregation of {set_key_agg}")
936
+ print(f"[CubeLogs.view] groupby {keys_no_agg}")
937
+
938
+ data_red = self.data[[*keys_no_agg, *values]]
939
+ assert set(key_index) <= set(data_red.columns), (
940
+ f"view_def.name={view_def.name!r}, "
941
+ f"nnable to find {set(key_index) - set(data_red.columns)}, "
942
+ f"key_agg={key_agg}, keys_no_agg={keys_no_agg},\n--\n"
943
+ f"selected={pprint.pformat(sorted(data_red.columns))},\n--\n"
944
+ f"keys={pprint.pformat(sorted(self.keys_time))}"
396
945
  )
946
+ grouped_data = data_red.groupby(keys_no_agg, as_index=True, dropna=False)
947
+ if callable(view_def.agg_args):
948
+ agg_kwargs = view_def.agg_kwargs or {}
949
+ agg_args = ({c: view_def.agg_args(c) for c in values},)
950
+ else:
951
+ agg_args = view_def.agg_args # type: ignore[assignment]
952
+ agg_kwargs = view_def.agg_kwargs or {}
953
+ data = grouped_data.agg(*agg_args, **agg_kwargs)
954
+ if view_def.agg_multi:
955
+ append = []
956
+ for k, f in view_def.agg_multi.items():
957
+ cv = grouped_data.apply(f, include_groups=False)
958
+ append.append(cv.to_frame(k))
959
+ data = pandas.concat([data, *append], axis=1)
960
+ set_all_keys = set(keys_no_agg)
961
+ values = list(data.columns)
962
+ data = data.reset_index(drop=False)
397
963
  else:
398
- key_index = self._filter_column(view_def.key_index, self.keys)
399
- data = self.data[[*self.keys, *values]]
964
+ key_index = self._filter_column(view_def.key_index, self.keys_time)
965
+ if verbose:
966
+ print(f"[CubeLogs.view] no aggregation, index={key_index}")
967
+ data = self.data[[*self.keys_time, *values]]
968
+ set_all_keys = set(self.keys_time)
969
+ final_stack = False
400
970
 
401
- assert set(key_index) <= set(
402
- self.keys
403
- ), f"Non existing keys in key_index {set(key_index) - set(self.keys)}"
971
+ assert set(key_index) <= set_all_keys, (
972
+ f"view_def.name={view_def.name!r}, "
973
+ f"Non existing keys in key_index {set(key_index) - set_all_keys}"
974
+ )
404
975
 
976
+ # remove unnecessary column
405
977
  set_key_columns = {
406
- c for c in self.keys if c not in key_index and c not in set(key_agg)
978
+ c for c in self.keys_time if c not in key_index and c not in set(key_agg)
407
979
  }
980
+ key_index0 = key_index
408
981
  if view_def.ignore_unique:
409
- key_index = [k for k in key_index if len(self.values_for_key[k]) > 1]
410
- key_columns = [k for k in set_key_columns if len(self.values_for_key[k]) > 1]
982
+ unique = {
983
+ k for k, v in self.values_for_key.items() if k in set_all_keys and len(v) <= 1
984
+ }
985
+ keep_anyway = (
986
+ set(view_def.keep_columns_in_index)
987
+ if view_def.keep_columns_in_index
988
+ else set()
989
+ )
990
+ key_index = [k for k in key_index if k not in unique or k in keep_anyway]
991
+ key_columns = [k for k in set_key_columns if k not in unique or k in keep_anyway]
992
+ if verbose:
993
+ print(f"[CubeLogs.view] unique={unique}, keep_anyway={keep_anyway}")
994
+ print(
995
+ f"[CubeLogs.view] columns with unique values "
996
+ f"{set(key_index0) - set(key_index)}"
997
+ )
411
998
  else:
999
+ if verbose:
1000
+ print("[CubeLogs.view] keep all columns")
412
1001
  key_columns = sorted(set_key_columns)
1002
+ unique = set()
413
1003
 
1004
+ _md = lambda s: {k: v for k, v in self.values_for_key.items() if k in s} # noqa: E731
1005
+ all_cols = set(key_columns) | set(key_index) | set(key_agg) | unique
1006
+ assert all_cols == set(self.keys_time), (
1007
+ f"view_def.name={view_def.name!r}, "
1008
+ f"key_columns + key_index + key_agg + unique != keys, left="
1009
+ f"{set(self.keys_time) - all_cols}, "
1010
+ f"unique={unique}, index={set(key_index)}, columns={set(key_columns)}, "
1011
+ f"agg={set(key_agg)}, keys={set(self.keys_time)}, values={values}"
1012
+ )
1013
+
1014
+ # reorder
414
1015
  if view_def.order:
415
- assert set(view_def.order) <= set_key_columns, (
416
- f"Non existing columns from order in key_columns "
417
- f"{set(view_def.order) - set_key_columns}"
1016
+ subset = self._filter_column(view_def.order, all_cols | {self.time})
1017
+ corder = [o for o in view_def.order if o in subset]
1018
+ assert set(corder) <= set_key_columns, (
1019
+ f"view_def.name={view_def.name!r}, "
1020
+ f"non existing columns from order in key_columns "
1021
+ f"{set(corder) - set_key_columns}"
418
1022
  )
419
1023
  key_columns = [
420
- *view_def.order,
1024
+ *[o for o in corder if o in key_columns],
421
1025
  *[c for c in key_columns if c not in view_def.order],
422
1026
  ]
423
- return data.pivot(index=key_index[::-1], columns=key_columns, values=values)
1027
+ else:
1028
+ corder = None
1029
+
1030
+ if view_def.dropna:
1031
+ data, key_index, key_columns, values = self._dropna( # type: ignore[assignment]
1032
+ data,
1033
+ key_index,
1034
+ key_columns,
1035
+ values,
1036
+ keep_columns_in_index=view_def.keep_columns_in_index,
1037
+ )
1038
+ if view_def.ignore_columns:
1039
+ if verbose:
1040
+ print(f"[CubeLogs.view] ignore_columns {view_def.ignore_columns}")
1041
+ data = data.drop(view_def.ignore_columns, axis=1)
1042
+ seti = set(view_def.ignore_columns)
1043
+ if view_def.keep_columns_in_index:
1044
+ seti -= set(view_def.keep_columns_in_index)
1045
+ key_index = [c for c in key_index if c not in seti]
1046
+ key_columns = [c for c in key_columns if c not in seti]
1047
+ values = [c for c in values if c not in seti]
1048
+
1049
+ # final verification
1050
+ if verbose:
1051
+ print(f"[CubeLogs.view] key_index={key_index}")
1052
+ print(f"[CubeLogs.view] key_columns={key_columns}")
1053
+ g = data[[*key_index, *key_columns]].copy()
1054
+ g["count"] = 1
1055
+ r = g.groupby([*key_index, *key_columns], dropna=False).sum()
1056
+ not_unique = r[r["count"] > 1]
1057
+ assert not_unique.shape[0] == 0, (
1058
+ f"view_def.name={view_def.name!r}, "
1059
+ f"unable to run the pivot with index={sorted(key_index)}, "
1060
+ f"key={sorted(key_columns)}, key_agg={key_agg}, values={sorted(values)}, "
1061
+ f"columns={sorted(data.columns)}, ignored={view_def.ignore_columns}, "
1062
+ f"not unique={set(data.columns) - unique}"
1063
+ f"\n--\n{not_unique.head()}"
1064
+ )
1065
+
1066
+ # pivot
1067
+ if verbose:
1068
+ print(f"[CubeLogs.view] values={values}")
1069
+ if key_index:
1070
+ piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
1071
+ else:
1072
+ # pivot does return the same rank with it is empty.
1073
+ # Let's add arficially one
1074
+ data = data.copy()
1075
+ data["ALL"] = "ALL"
1076
+ piv = data.pivot(index=["ALL"], columns=key_columns, values=values)
1077
+ if isinstance(piv, pandas.Series):
1078
+ piv = piv.to_frame(name="series")
1079
+ names = list(piv.columns.names)
1080
+ assert (
1081
+ "METRICS" not in names
1082
+ ), f"Not implemented when a level METRICS already exists {names!r}"
1083
+ names[0] = "METRICS"
1084
+ piv.columns = piv.columns.set_names(names)
1085
+ if final_stack:
1086
+ piv = piv.stack("METRICS", future_stack=True)
1087
+ if view_def.transpose:
1088
+ piv = piv.T
1089
+ if isinstance(piv, pandas.Series):
1090
+ piv = piv.to_frame("VALUE")
1091
+ piv.sort_index(inplace=True)
1092
+
1093
+ if isinstance(piv.columns, pandas.MultiIndex):
1094
+ if corder:
1095
+ # reorder the levels for the columns with the view definition
1096
+ new_corder = [c for c in corder if c in piv.columns.names]
1097
+ new_names = [
1098
+ *[c for c in piv.columns.names if c not in new_corder],
1099
+ *new_corder,
1100
+ ]
1101
+ piv.columns = piv.columns.reorder_levels(new_names)
1102
+ elif self.time in piv.columns.names:
1103
+ # put time at the end
1104
+ new_names = list(piv.columns.names)
1105
+ ind = new_names.index(self.time)
1106
+ if ind < len(new_names) - 1:
1107
+ del new_names[ind]
1108
+ new_names.append(self.time)
1109
+ piv.columns = piv.columns.reorder_levels(new_names)
1110
+
1111
+ if view_def.no_index:
1112
+ piv = piv.reset_index(drop=False)
1113
+ else:
1114
+ piv.sort_index(inplace=True, axis=1)
1115
+
1116
+ if verbose:
1117
+ print(f"[CubeLogs.view] levels {piv.index.names}, {piv.columns.names}")
1118
+ print(f"[CubeLogs.view] -- done view {view_def.name!r}")
1119
+ return (piv, view_def) if return_view_def else piv
1120
+
1121
+ def _dropna(
1122
+ self,
1123
+ data: pandas.DataFrame,
1124
+ key_index: Sequence[str],
1125
+ key_columns: Sequence[str],
1126
+ values: Sequence[str],
1127
+ keep_columns_in_index: Optional[Sequence[str]] = None,
1128
+ ) -> Tuple[pandas.DataFrame, Sequence[str], Sequence[str], Sequence[str]]:
1129
+ set_keep_columns_in_index = (
1130
+ set(keep_columns_in_index) if keep_columns_in_index else set()
1131
+ )
1132
+ v = data[values]
1133
+ new_data = data[~v.isnull().all(1)]
1134
+ if data.shape == new_data.shape:
1135
+ return data, key_index, key_columns, values
1136
+ new_data = new_data.copy()
1137
+ new_key_index = []
1138
+ for c in key_index:
1139
+ if c in set_keep_columns_in_index:
1140
+ new_key_index.append(c)
1141
+ continue
1142
+ v = new_data[c]
1143
+ sv = set(v.dropna())
1144
+ if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
1145
+ new_key_index.append(c)
1146
+ new_key_columns = []
1147
+ for c in key_columns:
1148
+ if c in set_keep_columns_in_index:
1149
+ new_key_columns.append(c)
1150
+ continue
1151
+ v = new_data[c]
1152
+ sv = set(v.dropna())
1153
+ if len(sv) > 1 or (v.isna().max() and len(sv) > 0):
1154
+ new_key_columns.append(c)
1155
+ for c in set(key_index) | set(key_columns):
1156
+ s = new_data[c]
1157
+ if s.isna().max():
1158
+ if pandas.api.types.is_numeric_dtype(s):
1159
+ min_v = s.dropna().min()
1160
+ assert (
1161
+ min_v >= 0
1162
+ ), f"Unable to replace nan values in column {c!r}, min_v={min_v}"
1163
+ new_data[c] = s.fillna(-1)
1164
+ else:
1165
+ new_data[c] = s.fillna("NAN")
1166
+ return new_data, new_key_index, new_key_columns, values
424
1167
 
425
1168
  def describe(self) -> pandas.DataFrame:
426
1169
  """Basic description of all variables."""
@@ -433,22 +1176,42 @@ class CubeLogs:
433
1176
  name=name,
434
1177
  dtype=str(dtype),
435
1178
  missing=len(values) - len(nonan),
1179
+ kind=(
1180
+ "time"
1181
+ if name == self.time
1182
+ else (
1183
+ "keys"
1184
+ if name in self.keys_no_time
1185
+ else (
1186
+ "values"
1187
+ if name in self.values
1188
+ else ("ignored" if name in self.ignored else "unused")
1189
+ )
1190
+ )
1191
+ ),
436
1192
  )
437
1193
  if len(nonan) > 0:
438
- obs.update(
439
- dict(
440
- min=nonan.min(),
441
- max=nonan.max(),
442
- count=len(nonan),
443
- )
444
- )
1194
+ obs.update(dict(count=len(nonan)))
445
1195
  if is_numeric_dtype(nonan):
446
1196
  obs.update(
447
1197
  dict(
1198
+ min=nonan.min(),
1199
+ max=nonan.max(),
448
1200
  mean=nonan.mean(),
449
1201
  sum=nonan.sum(),
1202
+ n_values=len(set(nonan)),
450
1203
  )
451
1204
  )
1205
+ elif obs["kind"] == "time":
1206
+ unique = set(nonan)
1207
+ obs["n_values"] = len(unique)
1208
+ o = dict(
1209
+ min=str(nonan.min()),
1210
+ max=str(nonan.max()),
1211
+ n_values=len(set(nonan)),
1212
+ )
1213
+ o["values"] = f"{o['min']} - {o['max']}"
1214
+ obs.update(o)
452
1215
  else:
453
1216
  unique = set(nonan)
454
1217
  obs["n_values"] = len(unique)
@@ -460,126 +1223,747 @@ class CubeLogs:
460
1223
  def to_excel(
461
1224
  self,
462
1225
  output: str,
463
- views: Dict[str, CubeViewDef],
1226
+ views: Union[Sequence[str], Dict[str, Union[str, CubeViewDef]]],
464
1227
  main: Optional[str] = "main",
465
1228
  raw: Optional[str] = "raw",
466
1229
  verbose: int = 0,
1230
+ csv: Optional[Sequence[str]] = None,
467
1231
  ):
468
1232
  """
469
1233
  Creates an excel file with a list of view.
470
1234
 
471
1235
  :param output: output file to create
472
- :param views: list of views to append
1236
+ :param views: sequence or dictionary of views to append
473
1237
  :param main: add a page with statistics on all variables
474
1238
  :param raw: add a page with the raw data
1239
+ :param csv: views to dump as csv files (same name as the output plus the view name)
475
1240
  :param verbose: verbosity
476
1241
  """
477
-
1242
+ if verbose:
1243
+ print(f"[CubeLogs.to_excel] create Excel file {output}, shape={self.shape}")
1244
+ views = {k: k for k in views} if not isinstance(views, dict) else views
1245
+ f_highlights = {}
1246
+ plots = []
478
1247
  with pandas.ExcelWriter(output, engine="openpyxl") as writer:
479
1248
  if main:
480
1249
  assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
481
- df = self.describe()
1250
+ df = self.describe().sort_values("name")
482
1251
  if verbose:
483
- print(f"[CubeLogs.to_helper] add sheet {main!r} with shape {df.shape}")
1252
+ print(f"[CubeLogs.to_excel] add sheet {main!r} with shape {df.shape}")
484
1253
  df.to_excel(writer, sheet_name=main, freeze_panes=(1, 1))
485
- self._apply_excel_style(main, writer, df)
486
- if raw:
487
- assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
488
- if verbose:
489
- print(f"[CubeLogs.to_helper] add sheet {raw!r} with shape {self.shape}")
490
- self.data.to_excel(writer, sheet_name=raw, freeze_panes=(1, 1), index=True)
491
- self._apply_excel_style(raw, writer, self.data)
492
1254
 
493
1255
  for name, view in views.items():
494
- df = self.view(view)
1256
+ if view is None:
1257
+ continue
1258
+ df, tview = self.view(view, return_view_def=True, verbose=max(verbose - 1, 0))
1259
+ if tview is None:
1260
+ continue
1261
+ memory = df.memory_usage(deep=True).sum()
495
1262
  if verbose:
496
1263
  print(
497
- f"[CubeLogs.to_helper] add sheet {name!r} with shape "
498
- f"{df.shape}, index={df.index.names}, columns={df.columns.names}"
1264
+ f"[CubeLogs.to_excel] add sheet {name!r} with shape "
1265
+ f"{df.shape} ({memory} bytes), index={df.index.names}, "
1266
+ f"columns={df.columns.names}"
1267
+ )
1268
+ if self.time in df.columns.names:
1269
+ # Let's convert the time into str
1270
+ fr = df.columns.to_frame()
1271
+ if is_datetime64_any_dtype(fr[self.time]):
1272
+ dt = fr[self.time]
1273
+ has_time = (dt != dt.dt.normalize()).any()
1274
+ sdt = dt.apply(
1275
+ lambda t, has_time=has_time: t.strftime(
1276
+ "%Y-%m-%dT%H-%M-%S" if has_time else "%Y-%m-%d"
1277
+ )
1278
+ )
1279
+ fr[self.time] = sdt
1280
+ df.columns = pandas.MultiIndex.from_frame(fr)
1281
+ if csv and name in csv:
1282
+ name_csv = f"{output}.{name}.csv"
1283
+ if verbose:
1284
+ print(f"[CubeLogs.to_excel] saving sheet {name!r} in {name_csv!r}")
1285
+ df.reset_index(drop=False).to_csv(f"{output}.{name}.csv", index=False)
1286
+
1287
+ if memory > 2**22:
1288
+ msg = (
1289
+ f"[CubeLogs.to_excel] skipping {name!r}, "
1290
+ f"too big for excel with {memory} bytes"
1291
+ )
1292
+ if verbose:
1293
+ print(msg)
1294
+ else:
1295
+ warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
1296
+ else:
1297
+ df.to_excel(
1298
+ writer,
1299
+ sheet_name=name,
1300
+ freeze_panes=(df.columns.nlevels + df.index.nlevels, df.index.nlevels),
499
1301
  )
500
- df.to_excel(
501
- writer,
502
- sheet_name=name,
503
- freeze_panes=(df.index.nlevels, df.columns.nlevels),
1302
+ f_highlights[name] = tview.f_highlight
1303
+ if tview.plots:
1304
+ plots.append(
1305
+ CubePlot(
1306
+ df,
1307
+ kind="line",
1308
+ orientation="row",
1309
+ split=True,
1310
+ timeseries=self.time,
1311
+ )
1312
+ if self.time in df.columns.names
1313
+ else CubePlot(df, kind="barh", orientation="row", split=True)
1314
+ )
1315
+ if raw:
1316
+ assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
1317
+ # Too long.
1318
+ # self._apply_excel_style(raw, writer, self.data)
1319
+ if csv and "raw" in csv:
1320
+ df.reset_index(drop=False).to_csv(f"{output}.raw.csv", index=False)
1321
+ memory = df.memory_usage(deep=True).sum()
1322
+ if memory > 2**22:
1323
+ msg = (
1324
+ f"[CubeLogs.to_excel] skipping 'raw', "
1325
+ f"too big for excel with {memory} bytes"
1326
+ )
1327
+ if verbose:
1328
+ print(msg)
1329
+ else:
1330
+ warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
1331
+ else:
1332
+ if verbose:
1333
+ print(f"[CubeLogs.to_excel] add sheet 'raw' with shape {self.shape}")
1334
+ self.data.to_excel(
1335
+ writer, sheet_name="raw", freeze_panes=(1, 1), index=True
1336
+ )
1337
+
1338
+ if plots:
1339
+ from openpyxl.drawing.image import Image
1340
+
1341
+ if verbose:
1342
+ print(f"[CubeLogs.to_excel] plots {len(plots)} plots")
1343
+ sheet = writer.book.create_sheet("plots")
1344
+ pos = 0
1345
+ empty_row = 1
1346
+ times = self.data[self.time].dropna()
1347
+ mini, maxi = times.min(), times.max()
1348
+ title_suffix = (str(mini) if mini == maxi else f"{mini}-{maxi}").replace(
1349
+ " 00:00:00", ""
504
1350
  )
505
- self._apply_excel_style(name, writer, df)
506
- if verbose:
507
- print(f"[CubeLogs.to_helper] done with {len(views)} views")
1351
+ for plot in plots:
1352
+ imgs = plot.to_images(
1353
+ verbose=verbose, merge=True, title_suffix=title_suffix
1354
+ )
1355
+ for img in imgs:
1356
+ y = (pos // 2) * 16
1357
+ loc = f"A{y}" if pos % 2 == 0 else f"M{y}"
1358
+ sheet.add_image(Image(io.BytesIO(img)), loc)
1359
+ if verbose:
1360
+ no = f"{output}.png"
1361
+ print(f"[CubeLogs.to_excel] dump graphs into {no!r}")
1362
+ with open(no, "wb") as f:
1363
+ f.write(img)
1364
+ pos += 1
1365
+ empty_row += len(plots) + 2
508
1366
 
509
- def _apply_excel_style(self, name: str, writer: pandas.ExcelWriter, df: pandas.DataFrame):
510
- from openpyxl.styles import Alignment
511
- from openpyxl.utils import get_column_letter
1367
+ if verbose:
1368
+ print(f"[CubeLogs.to_excel] applies style to {output!r}")
1369
+ apply_excel_style(writer, f_highlights) # type: ignore[arg-type]
1370
+ if verbose:
1371
+ print(f"[CubeLogs.to_excel] done with {len(views)} views")
512
1372
 
513
- # from openpyxl.styles import Font, PatternFill, numbers
514
1373
 
515
- left = Alignment(horizontal="left")
516
- right = Alignment(horizontal="right")
517
- # center = Alignment(horizontal="center")
518
- # bold_font = Font(bold=True)
519
- # red = Font(color="FF0000")
520
- # yellow = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
521
- # redf = PatternFill(start_color="FF0000", end_color="FF0000", fill_type="solid")
1374
+ class CubeLogsPerformance(CubeLogs):
1375
+ """
1376
+ Processes logs coming from experiments.
1377
+ """
522
1378
 
523
- sheet = writer.sheets[name]
524
- n_rows = df.shape[0] + df.columns.nlevels + df.index.nlevels
525
- n_cols = df.shape[1] + df.index.nlevels
526
- co: Dict[int, int] = {}
527
- sizes: Dict[int, int] = {}
528
- cols = set()
529
- for i in range(1, n_rows):
530
- for j, cell in enumerate(sheet[i]):
531
- if j > n_cols:
532
- break
533
- cols.add(cell.column)
534
- if isinstance(cell.value, float):
535
- co[j] = co.get(j, 0) + 1
536
- elif isinstance(cell.value, str):
537
- sizes[cell.column] = max(sizes.get(cell.column, 0), len(cell.value))
1379
+ def __init__(
1380
+ self,
1381
+ data: Any,
1382
+ time: str = "DATE",
1383
+ keys: Sequence[str] = (
1384
+ "^version_.*",
1385
+ "^model_.*",
1386
+ "device",
1387
+ "opt_patterns",
1388
+ "suite",
1389
+ "memory_peak",
1390
+ "machine",
1391
+ "exporter",
1392
+ "dynamic",
1393
+ "rtopt",
1394
+ "dtype",
1395
+ "device",
1396
+ "architecture",
1397
+ ),
1398
+ values: Sequence[str] = (
1399
+ "^time_.*",
1400
+ "^disc.*",
1401
+ "^ERR_.*",
1402
+ "CMD",
1403
+ "^ITER",
1404
+ "^onnx_.*",
1405
+ "^op_onnx_.*",
1406
+ "^peak_gpu_.*",
1407
+ ),
1408
+ ignored: Sequence[str] = ("version_python",),
1409
+ recent: bool = True,
1410
+ formulas: Optional[
1411
+ Union[
1412
+ Sequence[str],
1413
+ Dict[str, Union[str, Callable[[pandas.DataFrame], pandas.Series]]],
1414
+ ]
1415
+ ] = (
1416
+ "speedup",
1417
+ "bucket[speedup]",
1418
+ "ERR1",
1419
+ "n_models",
1420
+ "n_model_eager",
1421
+ "n_model_running",
1422
+ "n_model_acc01",
1423
+ "n_model_acc001",
1424
+ "n_model_dynamic",
1425
+ "n_model_pass",
1426
+ "n_model_faster",
1427
+ "n_model_faster2x",
1428
+ "n_model_faster3x",
1429
+ "n_model_faster4x",
1430
+ "n_node_attention",
1431
+ "n_node_control_flow",
1432
+ "n_node_scatter",
1433
+ "n_node_function",
1434
+ "n_node_initializer",
1435
+ "n_node_initializer_small",
1436
+ "n_node_constant",
1437
+ "n_node_shape",
1438
+ "n_node_expand",
1439
+ "onnx_n_nodes_no_cst",
1440
+ "peak_gpu_torch",
1441
+ "peak_gpu_nvidia",
1442
+ "time_export_unbiased",
1443
+ ),
1444
+ fill_missing: Optional[Sequence[Tuple[str, Any]]] = (("model_attn_impl", "eager"),),
1445
+ keep_last_date: bool = False,
1446
+ ):
1447
+ super().__init__(
1448
+ data=data,
1449
+ time=time,
1450
+ keys=keys,
1451
+ values=values,
1452
+ ignored=ignored,
1453
+ recent=recent,
1454
+ formulas=formulas,
1455
+ fill_missing=fill_missing,
1456
+ keep_last_date=keep_last_date,
1457
+ )
538
1458
 
539
- for k, v in sizes.items():
540
- c = get_column_letter(k)
541
- sheet.column_dimensions[c].width = max(15, v)
542
- for k in cols:
543
- if k not in sizes:
544
- c = get_column_letter(k)
545
- sheet.column_dimensions[c].width = 15
1459
+ def _process_formula(
1460
+ self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
1461
+ ) -> Callable[[pandas.DataFrame], pandas.Series]:
1462
+ """
1463
+ Processes a formula, converting it into a function.
546
1464
 
547
- for i in range(1, n_rows):
548
- for j, cell in enumerate(sheet[i]):
549
- if j > n_cols:
550
- break
551
- if isinstance(cell.value, pandas.Timestamp):
552
- cell.alignment = right
553
- dt = cell.value.to_pydatetime()
554
- cell.value = dt
555
- cell.number_format = (
556
- "YYYY-MM-DD"
557
- if (
558
- dt.hour == 0
559
- and dt.minute == 0
560
- and dt.second == 0
561
- and dt.microsecond == 0
562
- )
563
- else "YYYY-MM-DD 00:00:00"
564
- )
565
- elif isinstance(cell.value, (float, int)):
566
- cell.alignment = right
567
- x = abs(cell.value)
568
- if int(x) == x:
569
- cell.number_format = "0"
570
- elif x > 5000:
571
- cell.number_format = "# ##0"
572
- elif x >= 500:
573
- cell.number_format = "0.0"
574
- elif x >= 50:
575
- cell.number_format = "0.00"
576
- elif x >= 5:
577
- cell.number_format = "0.000"
578
- elif x > 0.5:
579
- cell.number_format = "0.0000"
580
- elif x > 0.005:
581
- cell.number_format = "0.00000"
1465
+ :param formula: a formula string
1466
+ :return: a function
1467
+ """
1468
+ if callable(formula):
1469
+ return formula
1470
+ assert isinstance(
1471
+ formula, str
1472
+ ), f"Unexpected type for formula {type(formula)}: {formula!r}"
1473
+
1474
+ def gdf(df, cname, default_value=np.nan):
1475
+ if cname in df.columns:
1476
+ return df[cname]
1477
+ return pandas.Series(default_value, index=df.index)
1478
+
1479
+ def ghas_value(df, cname):
1480
+ if cname not in df.columns:
1481
+ return pandas.Series(np.nan, index=df.index)
1482
+ isna = df[cname].isna()
1483
+ return pandas.Series(np.where(isna, np.nan, 1.0), index=df.index)
1484
+
1485
+ def gpreserve(df, cname, series):
1486
+ if cname not in df.columns:
1487
+ return pandas.Series(np.nan, index=df.index)
1488
+ isna = df[cname].isna()
1489
+ return pandas.Series(np.where(isna, np.nan, series), index=df.index).astype(float)
1490
+
1491
+ if formula == "speedup":
1492
+ columns = set(self._filter_column(["^time_.*"], self.data.columns))
1493
+ assert "time_latency" in columns and "time_latency_eager" in columns, (
1494
+ f"Unable to apply formula {formula!r}, with columns\n"
1495
+ f"{pprint.pformat(sorted(columns))}"
1496
+ )
1497
+ return lambda df: df["time_latency_eager"] / df["time_latency"]
1498
+
1499
+ if formula == "bucket[speedup]":
1500
+ columns = set(self._filter_column(["^time_.*", "speedup"], self.data.columns))
1501
+ assert "speedup" in columns, (
1502
+ f"Unable to apply formula {formula!r}, with columns\n"
1503
+ f"{pprint.pformat(sorted(columns))}"
1504
+ )
1505
+ # return lambda df: df["time_latency_eager"] / df["time_latency"]
1506
+ return lambda df: pandas.cut(
1507
+ df["speedup"], bins=BUCKET_SCALES, right=False, duplicates="raise"
1508
+ )
1509
+
1510
+ if formula == "ERR1":
1511
+ columns = set(self._filter_column(["^ERR_.*"], self.data.columns))
1512
+ if not columns:
1513
+ return lambda df: np.nan
1514
+
1515
+ def first_err(df: pandas.DataFrame) -> pandas.Series:
1516
+ ordered = [
1517
+ c
1518
+ for c in [
1519
+ "ERR_timeout",
1520
+ "ERR_load",
1521
+ "ERR_feeds",
1522
+ "ERR_warmup_eager",
1523
+ "ERR_export",
1524
+ "ERR_ort",
1525
+ "ERR_warmup",
1526
+ # "ERR_std",
1527
+ # "ERR_crash",
1528
+ # "ERR_stdout",
1529
+ ]
1530
+ if c in df.columns
1531
+ ]
1532
+ res = None
1533
+ for c in ordered:
1534
+ if res is None:
1535
+ res = df[c].fillna("")
582
1536
  else:
583
- cell.number_format = "0.000E+00"
584
- else:
585
- cell.alignment = left
1537
+ res = pandas.Series(np.where(res != "", res, df[c].fillna("")))
1538
+ return res
1539
+
1540
+ return first_err
1541
+
1542
+ if formula.startswith("n_"):
1543
+ lambdas = dict(
1544
+ n_models=lambda df: ghas_value(df, "model_name"),
1545
+ n_model_eager=lambda df: ghas_value(df, "time_latency_eager"),
1546
+ n_model_running=lambda df: ghas_value(df, "time_latency"),
1547
+ n_model_acc01=lambda df: gpreserve(
1548
+ df, "discrepancies_abs", (gdf(df, "discrepancies_abs") <= 0.1)
1549
+ ),
1550
+ n_model_acc001=lambda df: gpreserve(
1551
+ df, "discrepancies_abs", gdf(df, "discrepancies_abs") <= 0.01
1552
+ ),
1553
+ n_model_dynamic=lambda df: gpreserve(
1554
+ df,
1555
+ "discrepancies_dynamic_abs",
1556
+ (gdf(df, "discrepancies_dynamic_abs") <= 0.1),
1557
+ ),
1558
+ n_model_pass=lambda df: gpreserve(
1559
+ df,
1560
+ "time_latency",
1561
+ (gdf(df, "discrepancies_abs", np.inf) < 0.1)
1562
+ & (gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98),
1563
+ ),
1564
+ n_model_faster=lambda df: gpreserve(
1565
+ df,
1566
+ "time_latency",
1567
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 0.98,
1568
+ ),
1569
+ n_model_faster2x=lambda df: gpreserve(
1570
+ df,
1571
+ "time_latency",
1572
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 1.98,
1573
+ ),
1574
+ n_model_faster3x=lambda df: gpreserve(
1575
+ df,
1576
+ "time_latency",
1577
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 2.98,
1578
+ ),
1579
+ n_model_faster4x=lambda df: gpreserve(
1580
+ df,
1581
+ "time_latency",
1582
+ gdf(df, "time_latency_eager") > gdf(df, "time_latency", np.inf) * 3.98,
1583
+ ),
1584
+ n_node_attention=lambda df: gpreserve(
1585
+ df,
1586
+ "op_onnx_com.microsoft_Attention",
1587
+ gdf(df, "op_onnx_com.microsoft_Attention")
1588
+ + gdf(df, "op_onnx_com.microsoft_MultiHeadAttention"),
1589
+ ),
1590
+ n_node_control_flow=lambda df: gpreserve(
1591
+ df,
1592
+ "op_onnx__If",
1593
+ (
1594
+ gdf(df, "op_onnx__If", 0)
1595
+ + gdf(df, "op_onnx__Scan", 0)
1596
+ + gdf(df, "op_onnx__Loop", 0)
1597
+ ),
1598
+ ),
1599
+ n_node_scatter=lambda df: gpreserve(
1600
+ df,
1601
+ "op_onnx__ScatterND",
1602
+ gdf(df, "op_onnx__ScatterND", 0) + gdf(df, "op_onnx__ScatterElements", 0),
1603
+ ),
1604
+ n_node_function=lambda df: gpreserve(
1605
+ df, "onnx_n_functions", gdf(df, "onnx_n_functions")
1606
+ ),
1607
+ n_node_initializer_small=lambda df: gpreserve(
1608
+ df, "op_onnx_initializer_small", gdf(df, "op_onnx_initializer_small")
1609
+ ),
1610
+ n_node_initializer=lambda df: gpreserve(
1611
+ df, "onnx_n_initializer", gdf(df, "onnx_n_initializer")
1612
+ ),
1613
+ n_node_constant=lambda df: gpreserve(
1614
+ df, "op_onnx__Constant", gdf(df, "op_onnx__Constant")
1615
+ ),
1616
+ n_node_shape=lambda df: gpreserve(
1617
+ df, "op_onnx__Shape", gdf(df, "op_onnx__Shape")
1618
+ ),
1619
+ n_node_expand=lambda df: gpreserve(
1620
+ df, "op_onnx__Expand", gdf(df, "op_onnx__Expand")
1621
+ ),
1622
+ )
1623
+ assert (
1624
+ formula in lambdas
1625
+ ), f"Unexpected formula={formula!r}, should be in {sorted(lambdas)}"
1626
+ return lambdas[formula]
1627
+
1628
+ if formula == "onnx_n_nodes_no_cst":
1629
+ return lambda df: gdf(df, "onnx_n_nodes", 0) - gdf(
1630
+ df, "op_onnx__Constant", 0
1631
+ ).fillna(0)
1632
+ if formula == "peak_gpu_torch":
1633
+ return lambda df: gdf(df, "mema_gpu_5_after_export") - gdf(df, "mema_gpu_4_reset")
1634
+ if formula == "peak_gpu_nvidia":
1635
+ return (
1636
+ lambda df: (gdf(df, "memory_gpu0_peak") - gdf(df, "memory_gpu0_begin")) * 2**20
1637
+ )
1638
+ if formula == "time_export_unbiased":
1639
+
1640
+ def unbiased_export(df):
1641
+ if "time_warmup_first_iteration" not in df.columns:
1642
+ return pandas.Series(np.nan, index=df.index)
1643
+ return pandas.Series(
1644
+ np.where(
1645
+ df["exporter"] == "inductor",
1646
+ df["time_warmup_first_iteration"] + df["time_export_success"],
1647
+ df["time_export_success"],
1648
+ ),
1649
+ index=df.index,
1650
+ )
1651
+
1652
+ return lambda df: gpreserve(df, "time_warmup_first_iteration", unbiased_export(df))
1653
+
1654
+ raise ValueError(
1655
+ f"Unexpected formula {formula!r}, available columns are\n"
1656
+ f"{pprint.pformat(sorted(self.data.columns))}"
1657
+ )
1658
+
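The "speedup" and "bucket[speedup]" formulas above reduce to plain pandas operations; a standalone sketch with the bucket boundaries inlined from BUCKET_SCALES at the top of this module:

    import numpy as np
    import pandas

    # Same boundaries as BUCKET_SCALES defined at the top of log_helper.py.
    bucket_scales = np.array(
        [-np.inf, -20, -10, -5, -2, 0, 2, 5, 10, 20, 100, 200, 300, 400, np.inf],
        dtype=float,
    ) / 100 + 1

    df = pandas.DataFrame(dict(time_latency=[0.5, 1.0, 2.0], time_latency_eager=[1.0, 1.0, 1.0]))
    df["speedup"] = df["time_latency_eager"] / df["time_latency"]   # formula "speedup"
    df["bucket[speedup]"] = pandas.cut(                             # formula "bucket[speedup]"
        df["speedup"], bins=bucket_scales, right=False, duplicates="raise"
    )
    # speedups 2.0, 1.0, 0.5 fall into buckets [2.0, 3.0), [1.0, 1.02) and [-inf, 0.8)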
1659
+ def view(
1660
+ self,
1661
+ view_def: Optional[Union[str, CubeViewDef]],
1662
+ return_view_def: bool = False,
1663
+ verbose: int = 0,
1664
+ ) -> Union[
1665
+ Optional[pandas.DataFrame], Tuple[Optional[pandas.DataFrame], Optional[CubeViewDef]]
1666
+ ]:
1667
+ """
1668
+ Returns a dataframe, a pivot view.
1669
+
1670
+ If view_def is a string, it is replaced by a predefined view.
1671
+
1672
+ :param view_def: view definition or a string
1673
+ :param return_view_def: returns the view definition as well
1674
+ :param verbose: verbosity level
1675
+ :return: dataframe or a couple (dataframe, view definition),
1676
+ both of them can be None if view_def cannot be interpreted
1677
+ """
1678
+ assert view_def is not None, "view_def is None, this is not allowed."
1679
+ if isinstance(view_def, str):
1680
+ view_def = self.make_view_def(view_def)
1681
+ if view_def is None:
1682
+ return (None, None) if return_view_def else None
1683
+ return super().view(view_def, return_view_def=return_view_def, verbose=verbose)
1684
+
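Hypothetical usage of view, continuing the construction sketch shown after the constructor; string names are resolved through make_view_def below:

    # cube: a CubeLogsPerformance instance as built in the earlier sketch.
    piv = cube.view("speedup")                 # pivot of the speedup values
    piv, vdef = cube.view("agg-suite", return_view_def=True, verbose=1)
    # an unrecognized name fails make_view_def's assert; "cmd" without a CMD column yields None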
1685
+ def make_view_def(self, name: str) -> Optional[CubeViewDef]:
1686
+ """
1687
+ Returns a view definition.
1688
+
1689
+ :param name: name of the view
1690
+ :return: a CubeViewDef or None if name does not make sense
1691
+
1692
+ Available views:
1693
+
1694
+ * **agg-suite:** aggregation per suite
1695
+ * **disc:** discrepancies
1696
+ * **speedup:** speedup
1697
+ * **bucket-speedup:** speedup in buckets
1698
+ * **time:** latency
1699
+ * **time_export:** time to export
1700
+ * **counts:** status, running, faster, has control flow, ...
1701
+ * **err:** important errors
1702
+ * **cmd:** command lines
1703
+ * **raw-short:** raw data without all the unused columns
+ * **agg-all:** aggregation over all suites
+ * **peak-gpu:** peak GPU memory
+ * **onnx:** ONNX model statistics (size, node counts)
1704
+ """
1705
+ fs = ["suite", "model_suite", "task", "model_name", "model_task"]
1706
+ index_cols = self._filter_column(fs, self.keys_time)
1707
+ assert index_cols, (
1708
+ f"No index columns found for {fs!r} in "
1709
+ f"{pprint.pformat(sorted(self.keys_time))}"
1710
+ )
1711
+ index_cols = [c for c in fs if c in set(index_cols)]
1712
+
1713
+ f_speedup = lambda x: ( # noqa: E731
1714
+ CubeViewDef.HighLightKind.NONE
1715
+ if not isinstance(x, (float, int))
1716
+ else (
1717
+ CubeViewDef.HighLightKind.RED
1718
+ if x < 0.9
1719
+ else (
1720
+ CubeViewDef.HighLightKind.GREEN
1721
+ if x > 1.1
1722
+ else CubeViewDef.HighLightKind.NONE
1723
+ )
1724
+ )
1725
+ )
1726
+ f_disc = lambda x: ( # noqa: E731
1727
+ CubeViewDef.HighLightKind.NONE
1728
+ if not isinstance(x, (float, int))
1729
+ else (
1730
+ CubeViewDef.HighLightKind.RED
1731
+ if x > 0.1
1732
+ else (
1733
+ CubeViewDef.HighLightKind.GREEN
1734
+ if x < 0.01
1735
+ else CubeViewDef.HighLightKind.NONE
1736
+ )
1737
+ )
1738
+ )
1739
+ f_bucket = lambda x: ( # noqa: E731
1740
+ CubeViewDef.HighLightKind.NONE
1741
+ if not isinstance(x, str)
1742
+ else (
1743
+ CubeViewDef.HighLightKind.RED
1744
+ if x in {"[-inf, 0.8)", "[0.8, 0.9)", "[0.9, 0.95)"}
1745
+ else (
1746
+ CubeViewDef.HighLightKind.NONE
1747
+ if x in {"[0.95, 0.98)", "[0.98, 1.02)", "[1.02, 1.05)"}
1748
+ else (
1749
+ CubeViewDef.HighLightKind.GREEN
1750
+ if "[" in x
1751
+ else CubeViewDef.HighLightKind.NONE
1752
+ )
1753
+ )
1754
+ )
1755
+ )
1756
+
1757
+ def mean_weight(gr):
1758
+ weight = gr["time_latency_eager"]
1759
+ x = gr["speedup"]
1760
+ if x.shape[0] == 0:
1761
+ return np.nan
1762
+ div = weight.sum()
1763
+ if div > 0:
1764
+ return (x * weight).sum() / div
1765
+ return np.nan
1766
+
1767
+ def mean_geo(gr):
1768
+ x = gr["speedup"]
1769
+ return np.exp(np.log(x.dropna()).mean())
1770
+
1771
+ order = ["model_attn_impl", "exporter", "opt_patterns", "DATE"]
1772
+ implemented_views = {
1773
+ "agg-suite": lambda: CubeViewDef(
1774
+ key_index=index_cols,
1775
+ values=self._filter_column(
1776
+ [
1777
+ "TIME_ITER",
1778
+ "speedup",
1779
+ "time_latency",
1780
+ "time_latency_eager",
1781
+ "time_export_success",
1782
+ "time_export_unbiased",
1783
+ "^n_.*",
1784
+ "target_opset",
1785
+ "onnx_filesize",
1786
+ "onnx_weight_size_torch",
1787
+ "onnx_weight_size_proto",
1788
+ "onnx_n_nodes",
1789
+ "onnx_n_nodes_no_cst",
1790
+ "op_onnx__Constant",
1791
+ "peak_gpu_torch",
1792
+ "peak_gpu_nvidia",
1793
+ ],
1794
+ self.values,
1795
+ ),
1796
+ ignore_unique=True,
1797
+ key_agg=["model_name", "task", "model_task"],
1798
+ agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
1799
+ agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
1800
+ keep_columns_in_index=["suite"],
1801
+ name="agg-suite",
1802
+ order=order,
1803
+ ),
1804
+ "agg-all": lambda: CubeViewDef(
1805
+ key_index=index_cols,
1806
+ values=self._filter_column(
1807
+ [
1808
+ "TIME_ITER",
1809
+ "speedup",
1810
+ "time_latency",
1811
+ "time_latency_eager",
1812
+ "time_export_success",
1813
+ "time_export_unbiased",
1814
+ "^n_.*",
1815
+ "target_opset",
1816
+ "onnx_filesize",
1817
+ "onnx_weight_size_torch",
1818
+ "onnx_weight_size_proto",
1819
+ "onnx_n_nodes",
1820
+ "onnx_n_nodes_no_cst",
1821
+ "peak_gpu_torch",
1822
+ "peak_gpu_nvidia",
1823
+ ],
1824
+ self.values,
1825
+ ),
1826
+ ignore_unique=True,
1827
+ key_agg=["model_name", "task", "model_task", "suite"],
1828
+ agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
1829
+ agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
1830
+ name="agg-all",
1831
+ order=order,
1832
+ plots=True,
1833
+ ),
1834
+ "disc": lambda: CubeViewDef(
1835
+ key_index=index_cols,
1836
+ values=self._filter_column(["discrepancies_abs"], self.values),
1837
+ ignore_unique=True,
1838
+ keep_columns_in_index=["suite"],
1839
+ f_highlight=f_disc,
1840
+ name="disc",
1841
+ order=order,
1842
+ ),
1843
+ "speedup": lambda: CubeViewDef(
1844
+ key_index=index_cols,
1845
+ values=self._filter_column(["speedup"], self.values),
1846
+ ignore_unique=True,
1847
+ keep_columns_in_index=["suite"],
1848
+ f_highlight=f_speedup,
1849
+ name="speedup",
1850
+ order=order,
1851
+ ),
1852
+ "counts": lambda: CubeViewDef(
1853
+ key_index=index_cols,
1854
+ values=self._filter_column(["^n_.*"], self.values),
1855
+ ignore_unique=True,
1856
+ keep_columns_in_index=["suite"],
1857
+ name="counts",
1858
+ order=order,
1859
+ ),
1860
+ "peak-gpu": lambda: CubeViewDef(
1861
+ key_index=index_cols,
1862
+ values=self._filter_column(["^peak_gpu_.*"], self.values),
1863
+ ignore_unique=True,
1864
+ keep_columns_in_index=["suite"],
1865
+ name="peak-gpu",
1866
+ order=order,
1867
+ ),
1868
+ "time": lambda: CubeViewDef(
1869
+ key_index=index_cols,
1870
+ values=self._filter_column(
1871
+ ["time_latency", "time_latency_eager"], self.values
1872
+ ),
1873
+ ignore_unique=True,
1874
+ keep_columns_in_index=["suite"],
1875
+ name="time",
1876
+ order=order,
1877
+ ),
1878
+ "time_export": lambda: CubeViewDef(
1879
+ key_index=index_cols,
1880
+ values=self._filter_column(["time_export_unbiased"], self.values),
1881
+ ignore_unique=True,
1882
+ keep_columns_in_index=["suite"],
1883
+ name="time_export",
1884
+ order=order,
1885
+ ),
1886
+ "err": lambda: CubeViewDef(
1887
+ key_index=index_cols,
1888
+ values=self._filter_column(
1889
+ ["ERR1", "ERR_timeout", "ERR_export", "ERR_crash"], self.values
1890
+ ),
1891
+ ignore_unique=True,
1892
+ keep_columns_in_index=["suite"],
1893
+ name="err",
1894
+ order=order,
1895
+ ),
1896
+ "bucket-speedup": lambda: CubeViewDef(
1897
+ key_index=index_cols,
1898
+ values=self._filter_column(["bucket[speedup]"], self.values),
1899
+ ignore_unique=True,
1900
+ keep_columns_in_index=["suite"],
1901
+ name="bucket-speedup",
1902
+ f_highlight=f_bucket,
1903
+ order=order,
1904
+ ),
1905
+ "onnx": lambda: CubeViewDef(
1906
+ key_index=index_cols,
1907
+ values=self._filter_column(
1908
+ [
1909
+ "onnx_filesize",
1910
+ "onnx_n_nodes",
1911
+ "onnx_n_nodes_no_cst",
1912
+ "onnx_weight_size_proto",
1913
+ "onnx_weight_size_torch",
1914
+ "op_onnx_initializer_small",
1915
+ ],
1916
+ self.values,
1917
+ ),
1918
+ ignore_unique=True,
1919
+ keep_columns_in_index=["suite"],
1920
+ name="onnx",
1921
+ order=order,
1922
+ ),
1923
+ "raw-short": lambda: CubeViewDef(
1924
+ key_index=self.keys_time,
1925
+ values=[c for c in self.values if c not in {"ERR_std", "ERR_stdout"}],
1926
+ ignore_unique=False,
1927
+ keep_columns_in_index=["suite"],
1928
+ name="raw-short",
1929
+ no_index=True,
1930
+ ),
1931
+ }
1932
+
1933
+ cmd_col = self._filter_column(["CMD"], self.values, can_be_empty=True)
1934
+ if cmd_col:
1935
+ implemented_views["cmd"] = lambda: CubeViewDef(
1936
+ key_index=index_cols,
1937
+ values=cmd_col,
1938
+ ignore_unique=True,
1939
+ keep_columns_in_index=["suite"],
1940
+ name="cmd",
1941
+ order=order,
1942
+ )
1943
+
1944
+ assert name in implemented_views or name in {"cmd"}, (
1945
+ f"Unknown view {name!r}, expected a name in {sorted(implemented_views)},"
1946
+ f"\n--\nkeys={pprint.pformat(sorted(self.keys_time))}, "
1947
+ f"\n--\nvalues={pprint.pformat(sorted(self.values))}"
1948
+ )
1949
+ if name not in implemented_views:
1950
+ return None
1951
+ return implemented_views[name]()
1952
+
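The predefined views above are all assembled from CubeViewDef; a custom view can be built the same way. A sketch using only constructor arguments that appear in this hunk (CubeViewDef itself is defined earlier in log_helper.py, so the full set of accepted arguments is an assumption):

    # Hypothetical custom view: latencies per model, with the suite kept in the index.
    latency_view = CubeViewDef(
        key_index=["suite", "model_name"],
        values=["time_latency", "time_latency_eager", "speedup"],
        ignore_unique=True,
        keep_columns_in_index=["suite"],
        name="latency-custom",
        order=["exporter", "opt_patterns", "DATE"],
    )
    # piv = cube.view(latency_view)   # view() also accepts a CubeViewDef directly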
1953
+ def post_load_process_piece(
1954
+ self, df: pandas.DataFrame, unique: bool = False
1955
+ ) -> pandas.DataFrame:
1956
+ df = super().post_load_process_piece(df, unique=unique)
1957
+ if unique:
1958
+ return df
1959
+ cols = self._filter_column(self._keys, df)
1960
+ res = None
1961
+ for c in cols:
1962
+ if df[c].isna().any():
1963
+ # Missing values for keys are not supposed to happen.
1964
+ uniq = set(df[c].dropna())
1965
+ if len(uniq) == 1:
1966
+ if res is None:
1967
+ res = df.copy()
1968
+ res[c] = res[c].fillna(uniq.pop())
1969
+ return df if res is None else res
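The fill-in logic at the end of post_load_process_piece only completes a key column when all its observed values agree; a standalone illustration of that rule with plain pandas:

    import pandas

    piece = pandas.DataFrame(
        dict(machine=["a100", None, "a100"], exporter=["onnx", "torch", None])
    )
    for c in ["machine", "exporter"]:
        if piece[c].isna().any():
            uniq = set(piece[c].dropna())
            if len(uniq) == 1:                        # only unambiguous gaps are filled
                piece[c] = piece[c].fillna(uniq.pop())
    # machine -> ["a100", "a100", "a100"]; exporter keeps its gap (two distinct values)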