onnx-diagnostic 0.7.3__py3-none-any.whl → 0.7.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two package versions.
Files changed (30)
  1. onnx_diagnostic/__init__.py +1 -1
  2. onnx_diagnostic/_command_lines_parser.py +82 -12
  3. onnx_diagnostic/export/shape_helper.py +71 -0
  4. onnx_diagnostic/helpers/_log_helper.py +461 -0
  5. onnx_diagnostic/helpers/cache_helper.py +11 -1
  6. onnx_diagnostic/helpers/log_helper.py +404 -315
  7. onnx_diagnostic/reference/ops/op_cast_like.py +12 -8
  8. onnx_diagnostic/tasks/automatic_speech_recognition.py +6 -2
  9. onnx_diagnostic/tasks/feature_extraction.py +92 -7
  10. onnx_diagnostic/tasks/fill_mask.py +6 -2
  11. onnx_diagnostic/tasks/image_classification.py +7 -3
  12. onnx_diagnostic/tasks/image_text_to_text.py +6 -2
  13. onnx_diagnostic/tasks/mixture_of_expert.py +1 -1
  14. onnx_diagnostic/tasks/object_detection.py +7 -3
  15. onnx_diagnostic/tasks/sentence_similarity.py +6 -2
  16. onnx_diagnostic/tasks/summarization.py +6 -2
  17. onnx_diagnostic/tasks/text2text_generation.py +8 -4
  18. onnx_diagnostic/tasks/text_classification.py +6 -2
  19. onnx_diagnostic/tasks/text_generation.py +5 -3
  20. onnx_diagnostic/tasks/text_to_image.py +6 -2
  21. onnx_diagnostic/tasks/zero_shot_image_classification.py +6 -2
  22. onnx_diagnostic/torch_export_patches/onnx_export_errors.py +63 -7
  23. onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +188 -51
  24. onnx_diagnostic/torch_models/hghub/model_inputs.py +6 -1
  25. onnx_diagnostic/torch_models/validate.py +49 -10
  26. {onnx_diagnostic-0.7.3.dist-info → onnx_diagnostic-0.7.5.dist-info}/METADATA +1 -1
  27. {onnx_diagnostic-0.7.3.dist-info → onnx_diagnostic-0.7.5.dist-info}/RECORD +30 -29
  28. {onnx_diagnostic-0.7.3.dist-info → onnx_diagnostic-0.7.5.dist-info}/WHEEL +0 -0
  29. {onnx_diagnostic-0.7.3.dist-info → onnx_diagnostic-0.7.5.dist-info}/licenses/LICENSE.txt +0 -0
  30. {onnx_diagnostic-0.7.3.dist-info → onnx_diagnostic-0.7.5.dist-info}/top_level.txt +0 -0
@@ -1,208 +1,23 @@
-import datetime
 import enum
-import glob
 import io
-import os
 import pprint
 import re
 import warnings
-import zipfile
-from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 import numpy as np
 import pandas
 from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
 from .helper import string_sig
-
-BUCKET_SCALES_VALUES = np.array(
-    [-np.inf, -20, -10, -5, -2, 0, 2, 5, 10, 20, 100, 200, 300, 400, np.inf], dtype=float
+from ._log_helper import (
+    BUCKET_SCALES,
+    breaking_last_point,
+    apply_excel_style,
+    align_dataframe_with,
+    open_dataframe,
+    enumerate_csv_files,
 )


-BUCKET_SCALES = BUCKET_SCALES_VALUES / 100 + 1
-
-
-def filter_data(
-    df: pandas.DataFrame,
-    filter_in: Optional[str] = None,
-    filter_out: Optional[str] = None,
-    verbose: int = 0,
-) -> pandas.DataFrame:
-    """
-    Argument `filter` follows the syntax
-    ``<column1>:<fmt1>//<column2>:<fmt2>``.
-
-    The format is the following:
-
-    * a value or a set of values separated by ``;``
-    """
-    if not filter_in and not filter_out:
-        return df
-
-    def _f(fmt):
-        cond = {}
-        if isinstance(fmt, str):
-            cols = fmt.split("//")
-            for c in cols:
-                assert ":" in c, f"Unexpected value {c!r} in fmt={fmt!r}"
-                spl = c.split(":")
-                assert len(spl) == 2, f"Unexpected value {c!r} in fmt={fmt!r}"
-                name, fil = spl
-                cond[name] = set(fil.split(";"))
-        return cond
-
-    if filter_in:
-        cond = _f(filter_in)
-        assert isinstance(cond, dict), f"Unexpected type {type(cond)} for fmt={filter_in!r}"
-        for k, v in cond.items():
-            if k not in df.columns:
-                continue
-            if verbose:
-                print(
-                    f"[_filter_data] filter in column {k!r}, "
-                    f"values {v!r} among {set(df[k].astype(str))}"
-                )
-            df = df[df[k].astype(str).isin(v)]
-
-    if filter_out:
-        cond = _f(filter_out)
-        assert isinstance(cond, dict), f"Unexpected type {type(cond)} for fmt={filter_out!r}"
-        for k, v in cond.items():
-            if k not in df.columns:
-                continue
-            if verbose:
-                print(
-                    f"[_filter_data] filter out column {k!r}, "
-                    f"values {v!r} among {set(df[k].astype(str))}"
-                )
-            df = df[~df[k].astype(str).isin(v)]
-    return df
-
-
-def enumerate_csv_files(
-    data: Union[
-        pandas.DataFrame, List[Union[str, Tuple[str, str]]], str, Tuple[str, str, str, str]
-    ],
-    verbose: int = 0,
-    filtering: Optional[Callable[[str], bool]] = None,
-) -> Iterator[Union[pandas.DataFrame, str, Tuple[str, str, str, str]]]:
-    """
-    Enumerates files considered for the aggregation.
-    Only csv files are considered.
-    If a zip file is given, the function digs into the zip files and
-    loops over csv candidates.
-
-    :param data: dataframe with the raw data or a file or list of files
-    :param vrbose: verbosity
-    :param filtering: function to filter in or out files in zip files,
-        must return true to keep the file, false to skip it.
-    :return: a generator yielding tuples with the filename, date, full path and zip file
-
-    data can contains:
-    * a dataframe
-    * a string for a filename, zip or csv
-    * a list of string
-    * a tuple
-    """
-    if not isinstance(data, list):
-        data = [data]
-    for itn, filename in enumerate(data):
-        if isinstance(filename, pandas.DataFrame):
-            if verbose:
-                print(f"[enumerate_csv_files] data[{itn}] is a dataframe")
-            yield filename
-            continue
-
-        if isinstance(filename, tuple):
-            # A file in a zipfile
-            if verbose:
-                print(f"[enumerate_csv_files] data[{itn}] is {filename!r}")
-            yield filename
-            continue
-
-        if os.path.exists(filename):
-            ext = os.path.splitext(filename)[-1]
-            if ext == ".csv":
-                # We check the first line is ok.
-                if verbose:
-                    print(f"[enumerate_csv_files] data[{itn}] is a csv file: {filename!r}]")
-                dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
-                du = dt.strftime("%Y-%m-%d %H:%M:%S")
-                yield (os.path.split(filename)[-1], du, filename, "")
-                continue
-
-            if ext == ".zip":
-                if verbose:
-                    print(f"[enumerate_csv_files] data[{itn}] is a zip file: {filename!r}]")
-                zf = zipfile.ZipFile(filename, "r")
-                for ii, info in enumerate(zf.infolist()):
-                    name = info.filename
-                    if filtering is None:
-                        ext = os.path.splitext(name)[-1]
-                        if ext != ".csv":
-                            continue
-                    elif not filtering(name):
-                        continue
-                    if verbose:
-                        print(
-                            f"[enumerate_csv_files] data[{itn}][{ii}] is a csv file: {name!r}]"
-                        )
-                    with zf.open(name) as zzf:
-                        first_line = zzf.readline()
-                    if b"," not in first_line:
-                        continue
-                    yield (
-                        os.path.split(name)[-1],
-                        "%04d-%02d-%02d %02d:%02d:%02d" % info.date_time,
-                        name,
-                        filename,
-                    )
-                zf.close()
-                continue
-
-            raise AssertionError(f"Unexpected format {filename!r}, cannot read it.")
-
-        # filename is a pattern.
-        found = glob.glob(filename)
-        if verbose and not found:
-            print(f"[enumerate_csv_files] unable to find file in {filename!r}")
-        for ii, f in enumerate(found):
-            if verbose:
-                print(f"[enumerate_csv_files] data[{itn}][{ii}] {f!r} from {filename!r}")
-            yield from enumerate_csv_files(f, verbose=verbose, filtering=filtering)
-
-
-def open_dataframe(
-    data: Union[str, Tuple[str, str, str, str], pandas.DataFrame],
-) -> pandas.DataFrame:
-    """
-    Opens a filename defined by function
-    :func:`onnx_diagnostic.helpers.log_helper.enumerate_csv_files`.
-
-    :param data: a dataframe, a filename, a tuple indicating the file is coming
-        from a zip file
-    :return: a dataframe
-    """
-    if isinstance(data, pandas.DataFrame):
-        return data
-    if isinstance(data, str):
-        df = pandas.read_csv(data)
-        df["RAWFILENAME"] = data
-        return df
-    if isinstance(data, tuple):
-        if not data[-1]:
-            df = pandas.read_csv(data[2])
-            df["RAWFILENAME"] = data[2]
-            return df
-        zf = zipfile.ZipFile(data[-1])
-        with zf.open(data[2]) as f:
-            df = pandas.read_csv(f)
-            df["RAWFILENAME"] = f"{data[-1]}/{data[2]}"
-        zf.close()
-        return df

-    raise ValueError(f"Unexpected value for data: {data!r}")
-
-
 class CubeViewDef:
     """
     Defines how to compute a view.
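
The hunk above is the core of the 0.7.5 refactoring: BUCKET_SCALES and the file and Excel helpers (enumerate_csv_files, open_dataframe, apply_excel_style, align_dataframe_with, breaking_last_point) now live in the new onnx_diagnostic/helpers/_log_helper.py (+461 lines in the file list) and are re-imported by log_helper. A minimal sketch of what that means for callers, following the re-import block above; the CSV pattern is an invented placeholder:

    import pandas

    # log_helper re-imports the helpers from the new private module _log_helper,
    # so the historical import path should keep working after the split.
    from onnx_diagnostic.helpers.log_helper import enumerate_csv_files, open_dataframe

    # "bench*.csv" is a hypothetical glob pattern, not a file shipped with the package.
    frames = [open_dataframe(item) for item in enumerate_csv_files("bench*.csv")]
    raw = pandas.concat(frames) if frames else pandas.DataFrame()
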
@@ -226,9 +41,46 @@ class CubeViewDef:
     :param name: name of the view, used mostly to debug
     :param plots: adds plot to the Excel sheet
     :param no_index: remove the index (but keeps the columns)
+
+    Some examples of views. First example is an aggregated view
+    for many metrics.
+
+    .. code-block:: python
+
+        cube = CubeLogs(...)
+
+        CubeViewDef(
+            key_index=cube._filter_column(fs, cube.keys_time),
+            values=cube._filter_column(
+                ["TIME_ITER", "speedup", "time_latency.*", "onnx_n_nodes"],
+                cube.values,
+            ),
+            ignore_unique=True,
+            key_agg=["model_name", "task", "model_task", "suite"],
+            agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
+            agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
+            name="agg-all",
+            plots=True,
+        )
+
+    Next one focuses on a couple of metrics.
+
+    .. code-block:: python
+
+        cube = CubeLogs(...)
+
+        CubeViewDef(
+            key_index=cube._filter_column(fs, cube.keys_time),
+            values=cube._filter_column(["speedup"], cube.values),
+            ignore_unique=True,
+            keep_columns_in_index=["suite"],
+            name="speedup",
+        )
     """

     class HighLightKind(enum.IntEnum):
+        "Codes to highlight values."
+
         NONE = 0
         RED = 1
         GREEN = 2
@@ -276,118 +128,6 @@ class CubeViewDef:
         return string_sig(self)  # type: ignore[arg-type]


-def apply_excel_style(
-    filename_or_writer: Any,
-    f_highlights: Optional[Dict[str, Callable[[Any], CubeViewDef.HighLightKind]]] = None,
-):
-    """
-    Applies styles on all sheets in a file unless the sheet is too big.
-
-    :param filename_or_writer: filename, modified inplace
-    :param f_highlight: color function to apply, one per sheet
-    """
-    from openpyxl import load_workbook
-    from openpyxl.styles import Alignment
-    from openpyxl.utils import get_column_letter
-    from openpyxl.styles import Font  # , PatternFill, numbers
-
-    if isinstance(filename_or_writer, str):
-        workbook = load_workbook(filename_or_writer)
-        save = True
-    else:
-        workbook = filename_or_writer.book
-        save = False
-
-    left = Alignment(horizontal="left")
-    left_shrink = Alignment(horizontal="left", shrink_to_fit=True)
-    right = Alignment(horizontal="right")
-    font_colors = {
-        CubeViewDef.HighLightKind.GREEN: Font(color="00AA00"),
-        CubeViewDef.HighLightKind.RED: Font(color="FF0000"),
-    }
-
-    for name in workbook.sheetnames:
-        f_highlight = f_highlights.get(name, None) if f_highlights else None
-        sheet = workbook[name]
-        n_rows = sheet.max_row
-        n_cols = sheet.max_column
-        if n_rows * n_cols > 2**18:
-            # Too big.
-            continue
-        co: Dict[int, int] = {}
-        sizes: Dict[int, int] = {}
-        cols = set()
-        for i in range(1, n_rows + 1):
-            for j, cell in enumerate(sheet[i]):
-                if j > n_cols:
-                    break
-                cols.add(cell.column)
-                if isinstance(cell.value, float):
-                    co[j] = co.get(j, 0) + 1
-                elif isinstance(cell.value, str):
-                    sizes[cell.column] = max(sizes.get(cell.column, 0), len(cell.value))
-
-        for k, v in sizes.items():
-            c = get_column_letter(k)
-            sheet.column_dimensions[c].width = min(max(8, v), 30)
-        for k in cols:
-            if k not in sizes:
-                c = get_column_letter(k)
-                sheet.column_dimensions[c].width = 15
-
-        for i in range(1, n_rows + 1):
-            for j, cell in enumerate(sheet[i]):
-                if j > n_cols:
-                    break
-                if isinstance(cell.value, pandas.Timestamp):
-                    cell.alignment = right
-                    dt = cell.value.to_pydatetime()
-                    cell.value = dt
-                    cell.number_format = (
-                        "YYYY-MM-DD"
-                        if (
-                            dt.hour == 0
-                            and dt.minute == 0
-                            and dt.second == 0
-                            and dt.microsecond == 0
-                        )
-                        else "YYYY-MM-DD 00:00:00"
-                    )
-                elif isinstance(cell.value, (float, int)):
-                    cell.alignment = right
-                    x = abs(cell.value)
-                    if int(x) == x:
-                        cell.number_format = "0"
-                    elif x > 5000:
-                        cell.number_format = "# ##0"
-                    elif x >= 500:
-                        cell.number_format = "0.0"
-                    elif x >= 50:
-                        cell.number_format = "0.00"
-                    elif x >= 5:
-                        cell.number_format = "0.000"
-                    elif x > 0.5:
-                        cell.number_format = "0.0000"
-                    elif x > 0.005:
-                        cell.number_format = "0.00000"
-                    else:
-                        cell.number_format = "0.000E+00"
-                    if f_highlight:
-                        h = f_highlight(cell.value)
-                        if h in font_colors:
-                            cell.font = font_colors[h]
-                elif isinstance(cell.value, str) and len(cell.value) > 70:
-                    cell.alignment = left_shrink
-                else:
-                    cell.alignment = left
-                    if f_highlight:
-                        h = f_highlight(cell.value)
-                        if h in font_colors:
-                            cell.font = font_colors[h]
-    if save:
-        workbook.save(filename_or_writer)
-
-
 class CubePlot:
     """
     Creates a plot.
@@ -397,6 +137,26 @@ class CubePlot:
     :param split: draw a graph per line in the dataframe
     :param timeseries: this assumes the time is one level of the columns,
         this argument indices the level name
+
+    It defines a graph. Usually *bar* or *barh* is used to
+    compare experiments for every metric, a subplot by metric.
+
+    .. code-block:: python
+
+        CubePlot(df, kind="barh", orientation="row", split=True)
+
+    *line* is usually used to plot timeseries showing the
+    evolution of metrics over time.
+
+    .. code-block:: python
+
+        CubePlot(
+            df,
+            kind="line",
+            orientation="row",
+            split=True,
+            timeseries="time",
+        )
     """

     KINDS = {"bar", "barh", "line"}
@@ -607,6 +367,35 @@ class CubePlot:
 class CubeLogs:
     """
     Processes logs coming from experiments.
+    A cube is basically a database with certain columns
+    playing specific roles.
+
+    * time: only one column, it is not mandatory but it is recommended
+      to have one
+    * keys: they are somehow coordinates, they cannot be aggregated,
+      they are not numbers, more like categories, `(time, *keys)`
+      identifies an element of the database in an unique way,
+      there cannot be more than one row sharing the same key and time
+      values
+    * values: they are not necessary numerical, but if they are,
+      they can be aggregated
+
+    Every other columns is ignored. More columns can be added
+    by using formulas.
+
+    :param data: the raw data
+    :param time: the time column
+    :param keys: the keys, can include regular expressions
+    :param values: the values, can include regular expressions
+    :param ignored: ignores some column, acts as negative regular
+        expressions for the other two
+    :param recent: if more than one rows share the same keys,
+        the cube only keeps the most recent one
+    :param formulas: columns to add, defined with formulas
+    :param fill_missing: a dictionary, defines values replacing missing one
+        for some columns
+    :param keep_last_date: overwrites all the times with the most recent
+        one, it makes things easier for timeseries
     """

     def __init__(
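
The docstring added above fixes the data model: one optional time column, key columns acting as coordinates, and value columns that can be aggregated. A rough illustration with invented column names (not part of the package), assuming the constructor arguments behave as documented:

    import pandas
    from onnx_diagnostic.helpers.log_helper import CubeLogs

    # Hypothetical benchmark logs: "date" plays the time role, "model" and
    # "exporter" are keys (categories), "speedup" is an aggregatable value.
    df = pandas.DataFrame(
        {
            "date": pandas.to_datetime(
                ["2024-01-01", "2024-01-01", "2024-01-02", "2024-01-02"]
            ),
            "model": ["m1", "m2", "m1", "m2"],
            "exporter": ["onnx", "onnx", "onnx", "onnx"],
            "speedup": [1.1, 0.9, 1.2, 1.0],
        }
    )
    cube = CubeLogs(df, time="date", keys=["model", "exporter"], values=["speedup"])
    cube.load()
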
@@ -636,6 +425,22 @@ class CubeLogs:
         self.fill_missing = fill_missing
         self.keep_last_date = keep_last_date

+    def clone(
+        self, data: Optional[pandas.DataFrame] = None, keys: Optional[Sequence[str]] = None
+    ) -> "CubeLogs":
+        """
+        Makes a copy of the dataframe.
+        It copies the processed data not the original one.
+        """
+        cube = self.__class__(
+            data if data is not None else self.data.copy(),
+            time=self.time,
+            keys=keys or self.keys_no_time,
+            values=self.values,
+        )
+        cube.load()
+        return cube
+
     def post_load_process_piece(
         self, df: pandas.DataFrame, unique: bool = False
     ) -> pandas.DataFrame:
@@ -741,17 +546,13 @@ class CubeLogs:
             print(f"[CubeLogs.load] dropped={self.dropped}")
             print(f"[CubeLogs.load] data.shape={self.data.shape}")

-        shape = self.data.shape
         if verbose:
             print(f"[CubeLogs.load] removed columns, shape={self.data.shape}")
         self._preprocess()
         if verbose:
             print(f"[CubeLogs.load] preprocess, shape={self.data.shape}")
-        assert (
-            self.data.shape[0] > 0
-        ), f"The preprocessing reduced shape {shape} to {self.data.shape}."
-        if self.recent and verbose:
-            print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")
+        if self.recent:
+            print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")

         # Let's apply the formulas
         if self._formulas:
@@ -883,6 +684,18 @@ class CubeLogs:
         "usual"
         return str(self.data) if hasattr(self, "data") else str(self._data)

+    def make_view_def(self, name: str) -> Optional[CubeViewDef]:
+        """
+        Returns a view definition.
+
+        :param name: name of a value
+        :return: a CubeViewDef or None if name does not make sense
+        """
+        assert name in self.values, f"{name!r} is not one of the values {self.values}"
+        keys = sorted(self.keys_no_time)
+        index = len(keys) // 2 + (len(keys) % 2)
+        return CubeViewDef(key_index=keys[:index], values=[name], name=name)
+
     def view(
         self,
         view_def: Union[str, CubeViewDef],
@@ -900,6 +713,12 @@ class CubeLogs:
         :param verbose: verbosity level
         :return: dataframe
         """
+        if isinstance(view_def, str):
+            # We automatically create a view for a metric
+            view_def_ = self.make_view_def(view_def)
+            assert view_def_ is not None, f"Unable to create a view from {view_def!r}"
+            view_def = view_def_
+
         assert isinstance(
             view_def, CubeViewDef
         ), f"view_def should be a CubeViewDef, got {type(view_def)}: {view_def!r} instead"
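
Together, make_view_def and the new branch in view() let a plain value name stand in for a full CubeViewDef: half of the keys go to the index, the rest to the columns. A hedged sketch, reusing the toy cube built earlier:

    # Both calls should be equivalent; the string is expanded through make_view_def.
    piv_from_name = cube.view("speedup")
    piv_from_def = cube.view(cube.make_view_def("speedup"))
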
@@ -1113,6 +932,17 @@ class CubeLogs:
         else:
             piv.sort_index(inplace=True, axis=1)

+        # final step, force columns with numerical values to be float
+        for c in list(piv.columns):
+            s = piv[c]
+            if not pandas.api.types.is_object_dtype(s):
+                continue
+            try:
+                sf = s.astype(float)
+            except (ValueError, TypeError):
+                continue
+            piv[c] = sf
+
         if verbose:
             print(f"[CubeLogs.view] levels {piv.index.names}, {piv.columns.names}")
             print(f"[CubeLogs.view] -- done view {view_def.name!r}")
@@ -1155,7 +985,9 @@ class CubeLogs:
         for c in set(key_index) | set(key_columns):
             s = new_data[c]
             if s.isna().max():
-                if pandas.api.types.is_numeric_dtype(s):
+                if pandas.api.types.is_numeric_dtype(
+                    s
+                ) and not pandas.api.types.is_object_dtype(s):
                     min_v = s.dropna().min()
                     assert (
                         min_v >= 0
@@ -1192,7 +1024,7 @@ class CubeLogs:
             )
             if len(nonan) > 0:
                 obs.update(dict(count=len(nonan)))
-                if is_numeric_dtype(nonan):
+                if is_numeric_dtype(nonan) and not pandas.api.types.is_object_dtype(nonan):
                     obs.update(
                         dict(
                             min=nonan.min(),
@@ -1228,9 +1060,11 @@ class CubeLogs:
         raw: Optional[str] = "raw",
         verbose: int = 0,
         csv: Optional[Sequence[str]] = None,
+        time_mask: bool = False,
+        sbs: Optional[Dict[str, Dict[str, Any]]] = None,
     ):
         """
-        Creates an excel file with a list of view.
+        Creates an excel file with a list of views.

         :param output: output file to create
         :param views: sequence or dictionary of views to append
@@ -1238,9 +1072,17 @@ class CubeLogs:
         :param raw: add a page with the raw data
         :param csv: views to dump as csv files (same name as outputs + view naw)
         :param verbose: verbosity
+        :param time_mask: color the background of the cells if one
+            of the value for the last date is unexpected,
+            assuming they should remain stale
+        :param sbs: configurations to compare side-by-side, this adds two tabs,
+            one gathering raw data about the two configurations, the other one
+            is aggregated by metrics
         """
         if verbose:
             print(f"[CubeLogs.to_excel] create Excel file {output}, shape={self.shape}")
+        time_mask &= len(self.data[self.time].unique()) > 2
+        cube_time = self.cube_time(fill_other_dates=True) if time_mask else None
         views = {k: k for k in views} if not isinstance(views, dict) else views
         f_highlights = {}
         plots = []
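
to_excel() gains two options in 0.7.5: time_mask (only active when the cube holds more than two distinct dates) flags last-date values that break the recent trend, and sbs adds two side-by-side sheets. A sketch of a call, with an invented file name and configuration values; each configuration must select existing rows through key columns:

    cube.to_excel(
        "report.xlsx",                     # hypothetical output file
        views=["speedup"],                 # value names are expanded into default views
        time_mask=True,                    # needs more than two distinct dates to kick in
        sbs={"CFA": {"exporter": "E1"}, "CFB": {"exporter": "E2"}},  # placeholder values
    )
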
@@ -1252,10 +1094,25 @@ class CubeLogs:
                 print(f"[CubeLogs.to_excel] add sheet {main!r} with shape {df.shape}")
             df.to_excel(writer, sheet_name=main, freeze_panes=(1, 1))

+        time_mask_view: Dict[str, pandas.DataFrame] = {}
         for name, view in views.items():
             if view is None:
                 continue
             df, tview = self.view(view, return_view_def=True, verbose=max(verbose - 1, 0))
+            if cube_time is not None:
+                cube_mask = cube_time.view(view)
+                aligned = align_dataframe_with(cube_mask, df)
+                if aligned is not None:
+                    assert aligned.shape == df.shape, (
+                        f"Shape mismatch between the view {df.shape} and the mask "
+                        f"{time_mask_view[name].shape}"
+                    )
+                    time_mask_view[name] = aligned
+                    if verbose:
+                        print(
+                            f"[CubeLogs.to_excel] compute mask for view {name!r} "
+                            f"with shape {aligned.shape}"
+                        )
             if tview is None:
                 continue
             memory = df.memory_usage(deep=True).sum()
@@ -1335,6 +1192,36 @@ class CubeLogs:
                 writer, sheet_name="raw", freeze_panes=(1, 1), index=True
             )

+        if sbs:
+            if verbose:
+                for k, v in sbs.items():
+                    print(f"[CubeLogs.to_excel] sbs {k}: {v}")
+            name = "∧".join(sbs)
+            sbs_raw, sbs_agg = self.sbs(sbs)
+            if verbose:
+                print(f"[CubeLogs.to_excel] add sheet {name!r} with shape {sbs_raw.shape}")
+                print(
+                    f"[CubeLogs.to_excel] add sheet '{name}-AGG' "
+                    f"with shape {sbs_agg.shape}"
+                )
+            sbs_raw = sbs_raw.reset_index(drop=False)
+            sbs_raw.to_excel(
+                writer,
+                sheet_name=name,
+                freeze_panes=(
+                    sbs_raw.columns.nlevels + sbs_raw.index.nlevels,
+                    sbs_raw.index.nlevels,
+                ),
+            )
+            sbs_agg.to_excel(
+                writer,
+                sheet_name=f"{name}-AGG",
+                freeze_panes=(
+                    sbs_agg.columns.nlevels + sbs_agg.index.nlevels,
+                    sbs_agg.index.nlevels,
+                ),
+            )
+
         if plots:
             from openpyxl.drawing.image import Image

@@ -1366,10 +1253,194 @@ class CubeLogs:

         if verbose:
             print(f"[CubeLogs.to_excel] applies style to {output!r}")
-        apply_excel_style(writer, f_highlights)  # type: ignore[arg-type]
+        apply_excel_style(
+            writer, f_highlights, time_mask_view=time_mask_view, verbose=verbose  # type: ignore[arg-type]
+        )
         if verbose:
             print(f"[CubeLogs.to_excel] done with {len(views)} views")

+    def cube_time(self, fill_other_dates: bool = False, threshold: float = 1.2) -> "CubeLogs":
+        """
+        Aggregates the data over time to detect changes on the last value.
+        If *fill_other_dates* is True, all dates are kept, but values
+        are filled with 0.
+        *threshold* determines the bandwidth within the values are expected,
+        should be a factor of the standard deviation.
+        """
+        unique_time = self.data[self.time].unique()
+        assert len(unique_time) > 2, f"Not enough dates to proceed: unique_time={unique_time}"
+        gr = self.data[[*self.keys_no_time, *self.values]].groupby(
+            self.keys_no_time, dropna=False
+        )
+        dgr = gr.agg(
+            lambda series, th=threshold: int(breaking_last_point(series, threshold=th)[0])
+        )
+        tm = unique_time.max()
+        assert dgr.shape[0] > 0, (
+            f"Unexpected output shape={dgr.shape}, unique_time={unique_time}, "
+            f"data.shape={self.data.shape}"
+        )
+        dgr[self.time] = tm
+        if fill_other_dates:
+            other_df = []
+            other_dates = [t for t in unique_time if t != tm]
+            for t in other_dates:
+                df = dgr.copy()
+                df[self.time] = t
+                for c in df.columns:
+                    if c != self.time:
+                        df[c] = 0
+                other_df.append(df)
+            dgr = pandas.concat([dgr, *other_df], axis=0)
+            assert dgr.shape[0] > 0, (
+                f"Unexpected output shape={dgr.shape}, unique_time={unique_time}, "
+                f"data.shape={self.data.shape}, "
+                f"other_df shapes={[df.shape for df in other_df]}"
+            )
+        return self.clone(data=dgr.reset_index(drop=False))
+
+    def sbs(
+        self, configs: Dict[str, Dict[str, Any]], column_name: str = "CONF"
+    ) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
+        """
+        Creates a side-by-side for two configurations.
+        Every configuration a dictionary column:value which filters in
+        the rows to keep in order to compute the side by side.
+        Every configuration is given a name (the key in configs),
+        it is added in column column_name.
+
+        :param configs: example
+            ``dict(CFA=dict(exporter="E1", opt="O"), CFB=dict(exporter="E2", opt="O"))``
+        :param column_name: column to add with the name of the configuration
+        :return: data and aggregated date
+        """
+        assert (
+            len(configs) >= 2
+        ), f"A side by side needs at least two configs but configs={configs}"
+        set_keys_time = set(self.keys_time)
+        columns_index = None
+        data_list = []
+        for name_conf, conf in configs.items():
+            if columns_index is None:
+                columns_index = list(conf.keys())
+                assert set(columns_index) <= set_keys_time, (
+                    f"Configuration {conf} includes columns outside the keys "
+                    f"{', '.join(sorted(set_keys_time))}"
+                )
+            else:
+                assert set(columns_index) == set(conf), (
+                    f"Every conf should share the same keys but conf={conf} "
+                    f"is different from {set(columns_index)}"
+                )
+            data = self.data
+            for k, v in conf.items():
+                data = data[data[k] == v]
+            assert data.shape[0] > 0, f"No rows found for conf={conf}"
+            assert (
+                column_name not in data.columns
+            ), f"column_name={column_name!r} is already in {data.columns}"
+            data = data.copy()
+            data[column_name] = name_conf
+            data_list.append(data)
+
+        new_data = pandas.concat(data_list, axis=0)
+        cube = self.clone(new_data, keys=[*self.keys_no_time, column_name])
+        key_index = set(self.keys_time) - {*columns_index, column_name}  # type: ignore[misc]
+        view = CubeViewDef(
+            key_index=set(key_index),  # type: ignore[arg-type]
+            name="sbs",
+            values=cube.values,
+            keep_columns_in_index=[self.time],
+        )
+        view_res = cube.view(view)
+        assert isinstance(view_res, pandas.DataFrame), "not needed but mypy complains"
+
+        # add metrics
+        index_column_name = list(view_res.columns.names).index(column_name)
+        index_metrics = list(view_res.columns.names).index("METRICS")
+
+        def _mkc(m, s):
+            c = ["" for c in view_res.columns.names]
+            c[index_column_name] = s
+            c[index_metrics] = m
+            return tuple(c)
+
+        list_configs = list(configs.items())
+        mean_columns = [
+            c
+            for c in view_res.columns
+            if pandas.api.types.is_numeric_dtype(view_res[c])
+            and not pandas.api.types.is_object_dtype(view_res[c])
+        ]
+        assert mean_columns, f"No numerical columns in {view_res.dtypes}"
+        view_res = view_res[mean_columns].copy()
+        metrics = sorted(set(c[index_metrics] for c in view_res.columns))
+        assert metrics, (
+            f"No numerical metrics detected in "
+            f"view_res.columns.names={view_res.columns.names}, "
+            f"columns={view_res.dtypes}"
+        )
+        sum_columns = []
+        columns_to_add = []
+        for i in range(len(list_configs)):
+            for j in range(i + 1, len(list_configs)):
+                for m in metrics:
+                    iname, ci = list_configs[i]
+                    jname, cj = list_configs[j]
+                    ci = ci.copy()
+                    cj = cj.copy()
+                    ci["METRICS"] = m
+                    cj["METRICS"] = m
+                    ci["CONF"] = iname
+                    cj["CONF"] = jname
+
+                    ci_name = tuple(ci[n] for n in view_res.columns.names)
+                    cj_name = tuple(cj[n] for n in view_res.columns.names)
+                    assert ci_name in view_res.columns or cj_name in view_res.columns, (
+                        f"Unable to find column {ci_name} or {cj_name} "
+                        f"in columns {view_res.columns}, metrics={metrics}"
+                    )
+                    if ci_name not in view_res.columns or cj_name not in view_res.columns:
+                        # One config does not have such metric.
+                        continue
+
+                    si = view_res[ci_name]
+                    sj = view_res[cj_name]
+
+                    sinan = si.isna()
+                    sjnan = sj.isna()
+                    n1 = iname
+                    n2 = jname
+                    nas = pandas.DataFrame(
+                        {
+                            _mkc(m, f"∅{n1}∧∅{n2}"): (sinan & sjnan).astype(int),
+                            _mkc(m, f"∅{n1}∧{n2}"): (sinan & ~sjnan).astype(int),
+                            _mkc(m, f"{n1}∧∅{n2}"): (~sinan & sjnan).astype(int),
+                            _mkc(m, f"{n1}∧{n2}"): (~sinan & ~sjnan).astype(int),
+                            _mkc(m, f"{n1}<{n2}"): (si < sj).astype(int),
+                            _mkc(m, f"{n1}=={n2}"): (si == sj).astype(int),
+                            _mkc(m, f"{n1}>{n2}"): (si > sj).astype(int),
+                        }
+                    )
+                    nas.columns.names = view_res.columns.names
+                    columns_to_add.append(nas)
+                    sum_columns.extend(nas.columns)
+
+        view_res = pandas.concat([view_res, *columns_to_add], axis=1)
+        res = view_res.stack("METRICS", future_stack=True)  # type: ignore[union-attr]
+        res = res.reorder_levels(
+            [res.index.nlevels - 1, *list(range(res.index.nlevels - 1))]
+        ).sort_index()
+
+        # aggregated metrics
+        aggs = {
+            **{k: "mean" for k in mean_columns},  # noqa: C420
+            **{k: "sum" for k in sum_columns},  # noqa: C420
+        }
+        flat = view_res.groupby(self.time).agg(aggs)
+        flat = flat.stack("METRICS", future_stack=True)
+        return res, flat
+

 class CubeLogsPerformance(CubeLogs):
     """
@@ -1456,6 +1527,24 @@ class CubeLogsPerformance(CubeLogs):
             keep_last_date=keep_last_date,
         )

+    def clone(
+        self, data: Optional[pandas.DataFrame] = None, keys: Optional[Sequence[str]] = None
+    ) -> "CubeLogs":
+        """
+        Makes a copy of the dataframe.
+        It copies the processed data not the original one.
+        keys can be changed as well.
+        """
+        cube = self.__class__(
+            data if data is not None else self.data.copy(),
+            time=self.time,
+            keys=keys or self.keys_no_time,
+            values=self.values,
+            recent=False,
+        )
+        cube.load()
+        return cube
+
     def _process_formula(
         self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
     ) -> Callable[[pandas.DataFrame], pandas.Series]:
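
The new CubeLogs.sbs shown above can also be called on its own. Following its docstring, each configuration is a column:value filter over key columns and gets a name that ends up in the CONF column level; the first returned frame is the per-row side-by-side, the second the per-date aggregate. A sketch reusing the docstring's own example values (E1, E2 and O are placeholders, as is the cube itself):

    sbs_raw, sbs_agg = cube.sbs(
        dict(CFA=dict(exporter="E1", opt="O"), CFB=dict(exporter="E2", opt="O"))
    )
    # sbs_raw: values of both configurations side by side, plus comparison counters
    # such as CFA<CFB, CFA==CFB, CFA>CFB and missing-value combinations, stacked by METRICS.
    # sbs_agg: the same columns aggregated per date (mean for metrics, sum for counters).
    print(sbs_agg)
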