onnx-diagnostic 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (44)
  1. onnx_diagnostic/__init__.py +1 -1
  2. onnx_diagnostic/_command_lines_parser.py +108 -77
  3. onnx_diagnostic/doc.py +68 -0
  4. onnx_diagnostic/ext_test_case.py +1 -1
  5. onnx_diagnostic/helpers/cache_helper.py +59 -0
  6. onnx_diagnostic/helpers/config_helper.py +8 -4
  7. onnx_diagnostic/helpers/doc_helper.py +27 -7
  8. onnx_diagnostic/helpers/helper.py +30 -3
  9. onnx_diagnostic/helpers/log_helper.py +585 -0
  10. onnx_diagnostic/helpers/mini_onnx_builder.py +4 -1
  11. onnx_diagnostic/helpers/model_builder_helper.py +57 -73
  12. onnx_diagnostic/helpers/onnx_helper.py +291 -7
  13. onnx_diagnostic/helpers/torch_helper.py +18 -2
  14. onnx_diagnostic/reference/__init__.py +1 -0
  15. onnx_diagnostic/reference/ort_evaluator.py +29 -4
  16. onnx_diagnostic/reference/report_results_comparison.py +95 -0
  17. onnx_diagnostic/reference/torch_evaluator.py +23 -2
  18. onnx_diagnostic/tasks/automatic_speech_recognition.py +3 -0
  19. onnx_diagnostic/tasks/feature_extraction.py +3 -0
  20. onnx_diagnostic/tasks/fill_mask.py +3 -0
  21. onnx_diagnostic/tasks/image_classification.py +7 -1
  22. onnx_diagnostic/tasks/image_text_to_text.py +3 -0
  23. onnx_diagnostic/tasks/mixture_of_expert.py +3 -0
  24. onnx_diagnostic/tasks/object_detection.py +3 -0
  25. onnx_diagnostic/tasks/sentence_similarity.py +3 -0
  26. onnx_diagnostic/tasks/summarization.py +3 -0
  27. onnx_diagnostic/tasks/text2text_generation.py +3 -0
  28. onnx_diagnostic/tasks/text_classification.py +3 -0
  29. onnx_diagnostic/tasks/text_generation.py +90 -43
  30. onnx_diagnostic/tasks/zero_shot_image_classification.py +3 -0
  31. onnx_diagnostic/torch_export_patches/onnx_export_errors.py +78 -25
  32. onnx_diagnostic/torch_export_patches/onnx_export_serialization.py +37 -0
  33. onnx_diagnostic/torch_export_patches/patch_module_helper.py +1 -0
  34. onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +365 -17
  35. onnx_diagnostic/torch_models/hghub/hub_api.py +20 -4
  36. onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py +209 -0
  37. onnx_diagnostic/torch_models/hghub/model_inputs.py +3 -0
  38. onnx_diagnostic/torch_models/untrained/llm_tiny_llm.py +23 -50
  39. onnx_diagnostic/torch_models/{test_helper.py → validate.py} +174 -114
  40. {onnx_diagnostic-0.6.2.dist-info → onnx_diagnostic-0.7.0.dist-info}/METADATA +2 -2
  41. {onnx_diagnostic-0.6.2.dist-info → onnx_diagnostic-0.7.0.dist-info}/RECORD +44 -42
  42. {onnx_diagnostic-0.6.2.dist-info → onnx_diagnostic-0.7.0.dist-info}/WHEEL +0 -0
  43. {onnx_diagnostic-0.6.2.dist-info → onnx_diagnostic-0.7.0.dist-info}/licenses/LICENSE.txt +0 -0
  44. {onnx_diagnostic-0.6.2.dist-info → onnx_diagnostic-0.7.0.dist-info}/top_level.txt +0 -0
@@ -558,7 +558,7 @@ def string_type(
  print(f"[string_type] CACHE1:{type(obj)}")
  return f"MambaCache(conv_states={c}, ssm_states={d})"
 
- if obj.__class__.__name__ in ("DynamicCache", "SlidingWindowCache"):
+ if obj.__class__.__name__ in {"DynamicCache", "SlidingWindowCache", "StaticCache"}:
  kc = string_type(
  obj.key_cache,
  with_shape=with_shape,
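
Judging from the file list above, this hunk belongs to onnx_diagnostic/helpers/helper.py (+30 -3): string_type now renders StaticCache the same way as DynamicCache and SlidingWindowCache. A minimal, hedged sketch of what that looks like; the import path and the DynamicCache.update call are assumptions (the patched code itself only requires key_cache/value_cache attributes), not something stated in this diff:

    # Hedged sketch: module path and cache API are assumed, not taken from this diff.
    import torch
    from transformers.cache_utils import DynamicCache
    from onnx_diagnostic.helpers.helper import string_type  # assumed module path

    cache = DynamicCache()
    cache.update(torch.rand(1, 2, 4, 8), torch.rand(1, 2, 4, 8), 0)  # one layer of key/value states
    # Prints a compact summary of the cache class with the shapes of its key/value tensors.
    print(string_type(cache, with_shape=True))
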
@@ -857,7 +857,7 @@ def flatten_object(x: Any, drop_keys: bool = False) -> Any:
  return flatten_object(list(x.values()), drop_keys=drop_keys)
  return flatten_object(list(x.items()), drop_keys=drop_keys)
 
- if x.__class__.__name__ == "DynamicCache":
+ if x.__class__.__name__ in {"DynamicCache", "StaticCache"}:
  res = flatten_object(x.key_cache) + flatten_object(x.value_cache)
  return tuple(res)
  if x.__class__.__name__ == "EncoderDecoderCache":
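
The same release lets flatten_object unroll a StaticCache into a flat tuple of tensors, exactly as it already did for DynamicCache: key tensors first, then value tensors. A short, hedged illustration under the same assumptions as the sketch above (module path and cache API assumed):

    import torch
    from transformers.cache_utils import DynamicCache
    from onnx_diagnostic.helpers.helper import flatten_object  # assumed module path

    cache = DynamicCache()
    cache.update(torch.rand(1, 2, 4, 8), torch.rand(1, 2, 4, 8), 0)
    flat = flatten_object(cache)  # key tensors followed by value tensors; now also works for StaticCache
    print(len(flat), [t.shape for t in flat])
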
@@ -1424,10 +1424,37 @@ def max_diff(
  f"level={level}"
  )
 
+ if expected.__class__.__name__ == "StaticCache":
+ if got.__class__.__name__ == "StaticCache":
+ if verbose >= 6:
+ print(f"[max_diff] StaticCache: {string_type(expected)} ? {string_type(got)}")
+ return max_diff(
+ [expected.key_cache, expected.value_cache],
+ [got.key_cache, got.value_cache],
+ verbose=verbose,
+ hist=hist,
+ )
+ if isinstance(got, tuple) and len(got) == 2:
+ return max_diff(
+ [expected.key_cache, expected.value_cache],
+ [got[0], got[1]],
+ debug_info=_debug(expected.__class__.__name__),
+ **_dkws,
+ )
+ raise AssertionError(
+ f"StaticCache not fully implemented with classes "
+ f"{expected.__class__.__name__!r} and {got.__class__.__name__!r}, "
+ f"and expected={string_type(expected)}, got={string_type(got)},\n"
+ f"level={level}"
+ )
+
  if expected.__class__.__name__ == "SlidingWindowCache":
  if got.__class__.__name__ == "SlidingWindowCache":
  if verbose >= 6:
- print(f"[max_diff] DynamicCache: {string_type(expected)} ? {string_type(got)}")
+ print(
+ f"[max_diff] SlidingWindowCache: "
+ f"{string_type(expected)} ? {string_type(got)}"
+ )
  return max_diff(
  [expected.key_cache, expected.value_cache],
  [got.key_cache, got.value_cache],
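
With this hunk, max_diff gains a StaticCache branch that compares key/value tensors pairwise, either against another StaticCache or against a plain (key_cache, value_cache) tuple, which is typically what an exported graph returns. A hedged sketch: the stand-in class below only mimics the attributes the new branch reads (max_diff dispatches on the class name, so a real transformers StaticCache is not needed to exercise it), and the import path is an assumption:

    import torch
    from onnx_diagnostic.helpers.helper import max_diff  # assumed module path

    class StaticCache:  # minimal stand-in exposing only what the new branch touches
        def __init__(self, keys, values):
            self.key_cache, self.value_cache = keys, values

    keys = [torch.rand(1, 2, 4, 8)]
    values = [torch.rand(1, 2, 4, 8)]
    expected = StaticCache(keys, values)

    print(max_diff(expected, StaticCache(keys, values)))  # cache vs cache: zero discrepancy
    print(max_diff(expected, (keys, values)))             # cache vs 2-tuple: the other new branch
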
@@ -0,0 +1,585 @@
+ import datetime
+ import glob
+ import os
+ import re
+ import zipfile
+ from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union
+ import numpy as np
+ import pandas
+ from pandas.api.types import is_numeric_dtype
+ from .helper import string_sig
+
+
+ def enumerate_csv_files(
+ data: Union[
+ pandas.DataFrame, List[Union[str, Tuple[str, str]]], str, Tuple[str, str, str, str]
+ ],
+ verbose: int = 0,
+ ) -> Iterator[Union[pandas.DataFrame, str, Tuple[str, str, str, str]]]:
+ """
+ Enumerates files considered for the aggregation.
+ Only csv files are considered.
+ If a zip file is given, the function digs into the zip files and
+ loops over csv candidates.
+
+ :param data: dataframe with the raw data or a file or list of files
+
+ data can contains:
+ * a dataframe
+ * a string for a filename, zip or csv
+ * a list of string
+ * a tuple
+ """
+ if not isinstance(data, list):
+ data = [data]
+ for itn, filename in enumerate(data):
+ if isinstance(filename, pandas.DataFrame):
+ if verbose:
+ print(f"[enumerate_csv_files] data[{itn}] is a dataframe")
+ yield filename
+ continue
+
+ if isinstance(filename, tuple):
+ # A file in a zipfile
+ if verbose:
+ print(f"[enumerate_csv_files] data[{itn}] is {filename!r}")
+ yield filename
+ continue
+
+ if os.path.exists(filename):
+ ext = os.path.splitext(filename)[-1]
+ if ext == ".csv":
+ # We check the first line is ok.
+ if verbose:
+ print(f"[enumerate_csv_files] data[{itn}] is a csv file: {filename!r}]")
+ with open(filename, "r", encoding="utf-8") as f:
+ line = f.readline()
+ if "~help" in line or (",CMD" not in line and ",DATE" not in line):
+ continue
+ dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
+ du = dt.strftime("%Y-%m-%d %H:%M:%S")
+ yield (os.path.split(filename)[-1], du, filename, "")
+ continue
+
+ if ext == ".zip":
+ if verbose:
+ print(f"[enumerate_csv_files] data[{itn}] is a zip file: {filename!r}]")
+ zf = zipfile.ZipFile(filename, "r")
+ for ii, info in enumerate(zf.infolist()):
+ name = info.filename
+ ext = os.path.splitext(name)[-1]
+ if ext != ".csv":
+ continue
+ if verbose:
+ print(
+ f"[enumerate_csv_files] data[{itn}][{ii}] is a csv file: {name!r}]"
+ )
+ with zf.open(name) as zzf:
+ first_line = zzf.readline()
+ if b"," not in first_line:
+ continue
+ yield (
+ os.path.split(name)[-1],
+ "%04d-%02d-%02d %02d:%02d:%02d" % info.date_time,
+ name,
+ filename,
+ )
+ zf.close()
+ continue
+
+ raise AssertionError(f"Unexpected format {filename!r}, cannot read it.")
+
+ # filename is a pattern.
+ found = glob.glob(filename)
+ if verbose and not found:
+ print(f"[enumerate_csv_files] unable to find file in {filename!r}")
+ for ii, f in enumerate(found):
+ if verbose:
+ print(f"[enumerate_csv_files] data[{itn}][{ii}] {f!r} from {filename!r}")
+ yield from enumerate_csv_files(f, verbose=verbose)
+
+
+ def open_dataframe(
+ data: Union[str, Tuple[str, str, str, str], pandas.DataFrame],
+ ) -> pandas.DataFrame:
+ """
+ Opens a filename.
+
+ :param data: a dataframe, a filename, a tuple indicating the file is coming
+ from a zip file
+ :return: a dataframe
+ """
+ if isinstance(data, pandas.DataFrame):
+ return data
+ if isinstance(data, str):
+ df = pandas.read_csv(data)
+ df["RAWFILENAME"] = data
+ return df
+ if isinstance(data, tuple):
+ if not data[-1]:
+ df = pandas.read_csv(data[2])
+ df["RAWFILENAME"] = data[2]
+ return df
+ zf = zipfile.ZipFile(data[-1])
+ with zf.open(data[2]) as f:
+ df = pandas.read_csv(f)
+ df["RAWFILENAME"] = f"{data[-1]}/{data[2]}"
+ zf.close()
+ return df
+
+ raise ValueError(f"Unexpected value for data: {data!r}")
+
+
+ class CubeViewDef:
+ """
+ Defines how to compute a view.
+
+ :param key_index: keys to put in the row index
+ :param values: values to show
+ :param ignore_unique: ignore keys with a unique value
+ :param order: to reorder key in columns index
+ :param key_agg: aggregate according to these columns before
+ creating the view
+ :param agg_args: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`
+ :param agg_kwargs: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`
+ """
+
+ def __init__(
+ self,
+ key_index: Sequence[str],
+ values: Sequence[str],
+ ignore_unique: bool = True,
+ order: Optional[Sequence[str]] = None,
+ key_agg: Optional[Sequence[str]] = None,
+ agg_args: Sequence[Any] = ("sum",),
+ agg_kwargs: Optional[Dict[str, Any]] = None,
+ ):
+ self.key_index = key_index
+ self.values = values
+ self.ignore_unique = ignore_unique
+ self.order = order
+ self.key_agg = key_agg
+ self.agg_args = agg_args
+ self.agg_kwargs = agg_kwargs
+
+ def __repr__(self) -> str:
+ "usual"
+ return string_sig(self) # type: ignore[arg-type]
+
+
+ class CubeLogs:
+ """
+ Processes logs coming from experiments.
+ """
+
+ def __init__(
+ self,
+ data: Any,
+ time: str = "date",
+ keys: Sequence[str] = ("version_.*", "model_.*"),
+ values: Sequence[str] = ("time_.*", "disc_.*"),
+ ignored: Sequence[str] = (),
+ recent: bool = False,
+ formulas: Optional[Dict[str, Callable[[pandas.DataFrame], pandas.Series]]] = None,
+ ):
+ self._data = data
+ self._time = time
+ self._keys = keys
+ self._values = values
+ self._ignored = ignored
+ self.recent = recent
+ self._formulas = formulas
+
+ def load(self, verbose: int = 0):
+ """Loads and preprocesses the data. Returns self."""
+ if isinstance(self._data, pandas.DataFrame):
+ if verbose:
+ print(f"[CubeLogs.load] load from dataframe, shape={self._data.shape}")
+ self.data = self._data
+ elif isinstance(self._data, list) and all(isinstance(r, dict) for r in self._data):
+ if verbose:
+ print(f"[CubeLogs.load] load from list of dicts, n={len(self._data)}")
+ self.data = pandas.DataFrame(self._data)
+ elif isinstance(self._data, list) and all(
+ isinstance(r, pandas.DataFrame) for r in self._data
+ ):
+ if verbose:
+ print(f"[CubeLogs.load] load from list of DataFrame, n={len(self._data)}")
+ self.data = pandas.concat(self._data, axis=0)
+ elif isinstance(self._data, list):
+ cubes = []
+ for item in enumerate_csv_files(self._data, verbose=verbose):
+ df = open_dataframe(item)
+ cube = CubeLogs(
+ df,
+ time=self._time,
+ keys=self._keys,
+ values=self._values,
+ ignored=self._ignored,
+ recent=self.recent,
+ )
+ cube.load()
+ cubes.append(cube.data)
+ self.data = pandas.concat(cubes, axis=0)
+ else:
+ raise NotImplementedError(
+ f"Not implemented with the provided data (type={type(self._data)})"
+ )
+
+ assert all(isinstance(c, str) for c in self.data.columns), (
+ f"The class only supports string as column names "
+ f"but found {[c for c in self.data.columns if not isinstance(c, str)]}"
+ )
+ if verbose:
+ print(f"[CubeLogs.load] loaded with shape={self.data.shape}")
+
+ self._initialize_columns()
+ if verbose:
+ print(f"[CubeLogs.load] time={self.time}")
+ print(f"[CubeLogs.load] keys={self.keys}")
+ print(f"[CubeLogs.load] values={self.values}")
+ print(f"[CubeLogs.load] ignored={self.ignored}")
+ print(f"[CubeLogs.load] ignored_values={self.ignored_values}")
+ print(f"[CubeLogs.load] ignored_keys={self.ignored_keys}")
+ assert not (
+ set(self.keys) & set(self.values)
+ ), f"Columns {set(self.keys) & set(self.values)} cannot be keys and values"
+ assert not (
+ set(self.keys) & set(self.ignored)
+ ), f"Columns {set(self.keys) & set(self.ignored)} cannot be keys and ignored"
+ assert not (
+ set(self.values) & set(self.ignored)
+ ), f"Columns {set(self.keys) & set(self.ignored)} cannot be values and ignored"
+ assert (
+ self.time not in self.keys
+ and self.time not in self.values
+ and self.time not in self.ignored
+ ), f"Column {self.time!r} is also a key, a value or ignored"
+ self._columns = [self.time, *self.keys, *self.values, *self.ignored]
+ self.dropped = [c for c in self.data.columns if c not in set(self.columns)]
+ self.data = self.data[self.columns]
+ if verbose:
+ print(f"[CubeLogs.load] dropped={self.dropped}")
+ print(f"[CubeLogs.load] data.shape={self.data.shape}")
+
+ self._preprocess()
+ if self.recent and verbose:
+ print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")
+
+ # Let's apply the formulas
+ if self._formulas:
+ cols = set(self.data.columns)
+ for k, f in self._formulas.items():
+ if k in cols:
+ if verbose:
+ print(f"[CubeLogs.load] skip formula {k!r}")
+ else:
+ if verbose:
+ print(f"[CubeLogs.load] apply formula {k!r}")
+ self.data[k] = f(self.data)
+ self.values_for_key = {k: set(self.data[k]) for k in self.keys}
+ nans = [
+ c for c in [self.time, *self.keys] if self.data[c].isna().astype(int).sum() > 0
+ ]
+ assert not nans, f"The following keys {nans} have nan values. This is not allowed."
+ if verbose:
+ print(f"[CubeLogs.load] convert column {self.time!r} into date")
+ self.data[self.time] = pandas.to_datetime(self.data[self.time])
+ if verbose:
+ print(f"[CubeLogs.load] done, shape={self.shape}")
+ return self
+
+ @property
+ def shape(self) -> Tuple[int, int]:
+ "Returns the shape."
+ assert hasattr(self, "data"), "Method load was not called"
+ return self.data.shape
+
+ @property
+ def columns(self) -> Sequence[str]:
+ "Returns the columns."
+ assert hasattr(self, "data"), "Method load was not called"
+ return self.data.columns
+
+ def _preprocess(self):
+ last = self.values[0]
+ gr = self.data[[self.time, *self.keys, last]].groupby([self.time, *self.keys]).count()
+ gr = gr[gr[last] > 1]
+ if self.recent:
+ cp = self.data.copy()
+ assert (
+ "__index__" not in cp.columns
+ ), f"'__index__' should not be a column in {cp.columns}"
+ cp["__index__"] = np.arange(cp.shape[0])
+ gr = (
+ cp[[*self.keys, self.time, "__index__"]]
+ .groupby(self.keys, as_index=False)
+ .max()
+ )
+ filtered = pandas.merge(cp, gr, on=[self.time, "__index__", *self.keys])
+ assert filtered.shape[0] <= self.data.shape[0], (
+ f"Keeping the latest row brings more row {filtered.shape} "
+ f"(initial is {self.data.shape})."
+ )
+ self.data = filtered.drop("__index__", axis=1)
+ else:
+ assert gr.shape[0] == 0, f"There are duplicated rows:\n{gr}"
+ gr = self.data[[*self.keys, self.time]].groupby(self.keys).count()
+ gr = gr[gr[self.time] > 1]
+ assert (
+ gr.shape[0] == 0
+ ), f"recent should be true to keep the most recent row:\n{gr}"
+
+ @classmethod
+ def _filter_column(cls, filters, columns, can_be_empty=False):
+ set_cols = set()
+ for f in filters:
+ reg = re.compile(f)
+ cols = [c for c in columns if reg.search(c)]
+ set_cols |= set(cols)
+ assert (
+ can_be_empty or set_cols
+ ), f"Filters {filters} returns an empty set from {columns}"
+ return sorted(set_cols)
+
+ def _initialize_columns(self):
+ self.keys = self._filter_column(self._keys, self.data.columns)
+ self.values = self._filter_column(self._values, self.data.columns)
+ self.ignored = self._filter_column(self._ignored, self.data.columns, True)
+ assert (
+ self._time in self.data.columns
+ ), f"Column {self._time} not found in {self.data.columns}"
+ ignored_keys = set(self.ignored) & set(self.keys)
+ ignored_values = set(self.ignored) & set(self.values)
+ self.keys = [c for c in self.keys if c not in ignored_keys]
+ self.values = [c for c in self.values if c not in ignored_values]
+ self.ignored_keys = sorted(ignored_keys)
+ self.ignored_values = sorted(ignored_values)
+ self.time = self._time
+
+ def __str__(self) -> str:
+ "usual"
+ return str(self.data) if hasattr(self, "data") else str(self._data)
+
+ def view(self, view_def: CubeViewDef) -> pandas.DataFrame:
+ """
+ Returns a dataframe, a pivot view.
+ `key_index` determines the index, the other key columns determines
+ the columns. If `ignore_unique` is True, every columns with a unique value
+ is removed.
+
+ :param view_def: view definition
+ :return: dataframe
+ """
+ key_agg = self._filter_column(view_def.key_agg, self.keys) if view_def.key_agg else []
+ set_key_agg = set(key_agg)
+ assert set_key_agg <= set(
+ self.keys
+ ), f"Non existing keys in key_agg {set_key_agg - set(self.keys)}"
+
+ values = self._filter_column(view_def.values, self.values)
+ assert set(values) <= set(
+ self.values
+ ), f"Non existing columns in values {set(values) - set(self.values)}"
+
+ if key_agg:
+ key_index = [
+ c
+ for c in self._filter_column(view_def.key_index, self.keys)
+ if c not in set_key_agg
+ ]
+ keys_no_agg = [c for c in self.keys if c not in set_key_agg]
+ data = (
+ self.data[[*keys_no_agg, *values]]
+ .groupby(key_index, as_index=False)
+ .agg(*view_def.agg_args, **(view_def.agg_kwargs or {}))
+ )
+ else:
+ key_index = self._filter_column(view_def.key_index, self.keys)
+ data = self.data[[*self.keys, *values]]
+
+ assert set(key_index) <= set(
+ self.keys
+ ), f"Non existing keys in key_index {set(key_index) - set(self.keys)}"
+
+ set_key_columns = {
+ c for c in self.keys if c not in key_index and c not in set(key_agg)
+ }
+ if view_def.ignore_unique:
+ key_index = [k for k in key_index if len(self.values_for_key[k]) > 1]
+ key_columns = [k for k in set_key_columns if len(self.values_for_key[k]) > 1]
+ else:
+ key_columns = sorted(set_key_columns)
+
+ if view_def.order:
+ assert set(view_def.order) <= set_key_columns, (
+ f"Non existing columns from order in key_columns "
+ f"{set(view_def.order) - set_key_columns}"
+ )
+ key_columns = [
+ *view_def.order,
+ *[c for c in key_columns if c not in view_def.order],
+ ]
+ return data.pivot(index=key_index[::-1], columns=key_columns, values=values)
+
+ def describe(self) -> pandas.DataFrame:
+ """Basic description of all variables."""
+ rows = []
+ for name in self.data.columns:
+ values = self.data[name]
+ dtype = values.dtype
+ nonan = values.dropna()
+ obs = dict(
+ name=name,
+ dtype=str(dtype),
+ missing=len(values) - len(nonan),
+ )
+ if len(nonan) > 0:
+ obs.update(
+ dict(
+ min=nonan.min(),
+ max=nonan.max(),
+ count=len(nonan),
+ )
+ )
+ if is_numeric_dtype(nonan):
+ obs.update(
+ dict(
+ mean=nonan.mean(),
+ sum=nonan.sum(),
+ )
+ )
+ else:
+ unique = set(nonan)
+ obs["n_values"] = len(unique)
+ if len(unique) < 20:
+ obs["values"] = ",".join(map(str, sorted(unique)))
+ rows.append(obs)
+ return pandas.DataFrame(rows).set_index("name")
+
+ def to_excel(
+ self,
+ output: str,
+ views: Dict[str, CubeViewDef],
+ main: Optional[str] = "main",
+ raw: Optional[str] = "raw",
+ verbose: int = 0,
+ ):
+ """
+ Creates an excel file with a list of view.
+
+ :param output: output file to create
+ :param views: list of views to append
+ :param main: add a page with statitcs on all variables
+ :param raw: add a page with the raw data
+ :param verbose: verbosity
+ """
+
+ with pandas.ExcelWriter(output, engine="openpyxl") as writer:
+ if main:
+ assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
+ df = self.describe()
+ if verbose:
+ print(f"[CubeLogs.to_helper] add sheet {main!r} with shape {df.shape}")
+ df.to_excel(writer, sheet_name=main, freeze_panes=(1, 1))
+ self._apply_excel_style(main, writer, df)
+ if raw:
+ assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
+ if verbose:
+ print(f"[CubeLogs.to_helper] add sheet {raw!r} with shape {self.shape}")
+ self.data.to_excel(writer, sheet_name=raw, freeze_panes=(1, 1), index=True)
+ self._apply_excel_style(raw, writer, self.data)
+
+ for name, view in views.items():
+ df = self.view(view)
+ if verbose:
+ print(
+ f"[CubeLogs.to_helper] add sheet {name!r} with shape "
+ f"{df.shape}, index={df.index.names}, columns={df.columns.names}"
+ )
+ df.to_excel(
+ writer,
+ sheet_name=name,
+ freeze_panes=(df.index.nlevels, df.columns.nlevels),
+ )
+ self._apply_excel_style(name, writer, df)
+ if verbose:
+ print(f"[CubeLogs.to_helper] done with {len(views)} views")
+
+ def _apply_excel_style(self, name: str, writer: pandas.ExcelWriter, df: pandas.DataFrame):
+ from openpyxl.styles import Alignment
+ from openpyxl.utils import get_column_letter
+
+ # from openpyxl.styles import Font, PatternFill, numbers
+
+ left = Alignment(horizontal="left")
+ right = Alignment(horizontal="right")
+ # center = Alignment(horizontal="center")
+ # bold_font = Font(bold=True)
+ # red = Font(color="FF0000")
+ # yellow = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
+ # redf = PatternFill(start_color="FF0000", end_color="FF0000", fill_type="solid")
+
+ sheet = writer.sheets[name]
+ n_rows = df.shape[0] + df.columns.nlevels + df.index.nlevels
+ n_cols = df.shape[1] + df.index.nlevels
+ co: Dict[int, int] = {}
+ sizes: Dict[int, int] = {}
+ cols = set()
+ for i in range(1, n_rows):
+ for j, cell in enumerate(sheet[i]):
+ if j > n_cols:
+ break
+ cols.add(cell.column)
+ if isinstance(cell.value, float):
+ co[j] = co.get(j, 0) + 1
+ elif isinstance(cell.value, str):
+ sizes[cell.column] = max(sizes.get(cell.column, 0), len(cell.value))
+
+ for k, v in sizes.items():
+ c = get_column_letter(k)
+ sheet.column_dimensions[c].width = max(15, v)
+ for k in cols:
+ if k not in sizes:
+ c = get_column_letter(k)
+ sheet.column_dimensions[c].width = 15
+
+ for i in range(1, n_rows):
+ for j, cell in enumerate(sheet[i]):
+ if j > n_cols:
+ break
+ if isinstance(cell.value, pandas.Timestamp):
+ cell.alignment = right
+ dt = cell.value.to_pydatetime()
+ cell.value = dt
+ cell.number_format = (
+ "YYYY-MM-DD"
+ if (
+ dt.hour == 0
+ and dt.minute == 0
+ and dt.second == 0
+ and dt.microsecond == 0
+ )
+ else "YYYY-MM-DD 00:00:00"
+ )
+ elif isinstance(cell.value, (float, int)):
+ cell.alignment = right
+ x = abs(cell.value)
+ if int(x) == x:
+ cell.number_format = "0"
+ elif x > 5000:
+ cell.number_format = "# ##0"
+ elif x >= 500:
+ cell.number_format = "0.0"
+ elif x >= 50:
+ cell.number_format = "0.00"
+ elif x >= 5:
+ cell.number_format = "0.000"
+ elif x > 0.5:
+ cell.number_format = "0.0000"
+ elif x > 0.005:
+ cell.number_format = "0.00000"
+ else:
+ cell.number_format = "0.000E+00"
+ else:
+ cell.alignment = left
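
The new onnx_diagnostic/helpers/log_helper.py (the +585 line file above) aggregates benchmark logs into a small data cube: CubeLogs.load() picks key and value columns by regular expression, CubeViewDef describes a pivot view, and CubeLogs.to_excel() writes the views to an Excel file. A hedged usage sketch built only from the signatures shown in this hunk; the column names and the output filename are made up for illustration, and openpyxl must be installed for the Excel step:

    from onnx_diagnostic.helpers.log_helper import CubeLogs, CubeViewDef

    # Fictitious benchmark rows; the column names follow the default patterns
    # (keys match "version_.*"/"model_.*", values match "time_.*"/"disc_.*", time column is "date").
    rows = [
        dict(date="2025-05-01", version_onnx="1.17", model_name="tiny-llm", time_export=1.3, disc_max=1e-4),
        dict(date="2025-05-01", version_onnx="1.18", model_name="tiny-llm", time_export=1.1, disc_max=2e-4),
        dict(date="2025-05-01", version_onnx="1.17", model_name="tiny-gpt2", time_export=0.9, disc_max=3e-4),
        dict(date="2025-05-01", version_onnx="1.18", model_name="tiny-gpt2", time_export=0.8, disc_max=1e-4),
    ]
    cube = CubeLogs(rows).load(verbose=1)

    # One model per row, one onnx version per column, export time and discrepancy as values.
    view = CubeViewDef(key_index=["model_name"], values=["time_export", "disc_max"])
    print(cube.view(view))
    cube.to_excel("report.xlsx", views={"summary": view}, verbose=1)  # "report.xlsx" is a made-up name
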
@@ -393,7 +393,8 @@ def create_onnx_model_from_input_tensors(
  Creates a model proto including all the value as initializers.
  They can be restored by executing the model.
  We assume these inputs are not bigger than 2Gb,
- the limit of protobuf.
+ the limit of protobuf. Nothing is implemented yet to get around
+ that limit.
 
  :param inputs: anything
  :param switch_low_high: if None, it is equal to ``switch_low_high=sys.byteorder != "big"``
@@ -532,6 +533,8 @@ def create_input_tensors_from_onnx_model(
  :param engine: runtime to use, onnx, the default value, onnxruntime
  :param sep: separator
  :return: restored data
+
+ See example :ref:`l-plot-intermediate-results` for an example.
  """
  if engine == "ExtendedReferenceEvaluator":
  from ..reference import ExtendedReferenceEvaluator
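
These last two hunks touch onnx_diagnostic/helpers/mini_onnx_builder.py (+4 -1 in the file list): create_onnx_model_from_input_tensors stores arbitrary inputs (under the 2Gb protobuf limit) as initializers of a model proto, and create_input_tensors_from_onnx_model restores them by running that model. A hedged round-trip sketch; only the two function names and the 2Gb caveat come from this diff, and the exact call signatures, in particular whether the restore function accepts a saved file path or a ModelProto, are assumptions:

    import onnx
    import torch
    from onnx_diagnostic.helpers.mini_onnx_builder import (
        create_onnx_model_from_input_tensors,
        create_input_tensors_from_onnx_model,
    )

    inputs = {"input_ids": torch.randint(0, 100, (1, 8)), "attention_mask": torch.ones(1, 8)}
    proto = create_onnx_model_from_input_tensors(inputs)  # everything becomes an initializer (< 2Gb total)
    onnx.save(proto, "dumped_inputs.onnx")                # "dumped_inputs.onnx" is a made-up name

    # Assumed to accept the saved file (or possibly the proto itself); the engine keyword shown
    # in the docstring above defaults to the onnx reference runtime.
    restored = create_input_tensors_from_onnx_model("dumped_inputs.onnx")
    print(type(restored))
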