onnx-diagnostic 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in the public registry. It is provided for informational purposes only.
- onnx_diagnostic/__init__.py +1 -1
- onnx_diagnostic/_command_lines_parser.py +87 -77
- onnx_diagnostic/doc.py +22 -0
- onnx_diagnostic/ext_test_case.py +1 -1
- onnx_diagnostic/helpers/cache_helper.py +59 -0
- onnx_diagnostic/helpers/config_helper.py +8 -4
- onnx_diagnostic/helpers/helper.py +30 -3
- onnx_diagnostic/helpers/log_helper.py +585 -0
- onnx_diagnostic/helpers/mini_onnx_builder.py +4 -1
- onnx_diagnostic/helpers/model_builder_helper.py +54 -73
- onnx_diagnostic/helpers/torch_helper.py +18 -2
- onnx_diagnostic/reference/__init__.py +1 -0
- onnx_diagnostic/reference/ort_evaluator.py +29 -4
- onnx_diagnostic/reference/report_results_comparison.py +95 -0
- onnx_diagnostic/reference/torch_evaluator.py +21 -0
- onnx_diagnostic/tasks/automatic_speech_recognition.py +3 -0
- onnx_diagnostic/tasks/feature_extraction.py +3 -0
- onnx_diagnostic/tasks/fill_mask.py +3 -0
- onnx_diagnostic/tasks/image_classification.py +7 -1
- onnx_diagnostic/tasks/image_text_to_text.py +3 -0
- onnx_diagnostic/tasks/mixture_of_expert.py +3 -0
- onnx_diagnostic/tasks/object_detection.py +3 -0
- onnx_diagnostic/tasks/sentence_similarity.py +3 -0
- onnx_diagnostic/tasks/summarization.py +3 -0
- onnx_diagnostic/tasks/text2text_generation.py +3 -0
- onnx_diagnostic/tasks/text_classification.py +3 -0
- onnx_diagnostic/tasks/text_generation.py +90 -43
- onnx_diagnostic/tasks/zero_shot_image_classification.py +3 -0
- onnx_diagnostic/torch_export_patches/onnx_export_errors.py +78 -25
- onnx_diagnostic/torch_export_patches/onnx_export_serialization.py +37 -0
- onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +365 -17
- onnx_diagnostic/torch_models/hghub/hub_api.py +20 -4
- onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py +209 -0
- onnx_diagnostic/torch_models/hghub/model_inputs.py +3 -0
- onnx_diagnostic/torch_models/untrained/llm_tiny_llm.py +23 -50
- onnx_diagnostic/torch_models/{test_helper.py → validate.py} +158 -103
- {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.0.dist-info}/METADATA +2 -2
- {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.0.dist-info}/RECORD +41 -39
- {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.0.dist-info}/WHEEL +0 -0
- {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.0.dist-info}/licenses/LICENSE.txt +0 -0
- {onnx_diagnostic-0.6.3.dist-info → onnx_diagnostic-0.7.0.dist-info}/top_level.txt +0 -0
onnx_diagnostic/helpers/log_helper.py (new file)

@@ -0,0 +1,585 @@
+import datetime
+import glob
+import os
+import re
+import zipfile
+from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union
+import numpy as np
+import pandas
+from pandas.api.types import is_numeric_dtype
+from .helper import string_sig
+
+
+def enumerate_csv_files(
+    data: Union[
+        pandas.DataFrame, List[Union[str, Tuple[str, str]]], str, Tuple[str, str, str, str]
+    ],
+    verbose: int = 0,
+) -> Iterator[Union[pandas.DataFrame, str, Tuple[str, str, str, str]]]:
+    """
+    Enumerates files considered for the aggregation.
+    Only csv files are considered.
+    If a zip file is given, the function digs into the zip files and
+    loops over csv candidates.
+
+    :param data: dataframe with the raw data or a file or list of files
+
+        data can contains:
+        * a dataframe
+        * a string for a filename, zip or csv
+        * a list of string
+        * a tuple
+    """
+    if not isinstance(data, list):
+        data = [data]
+    for itn, filename in enumerate(data):
+        if isinstance(filename, pandas.DataFrame):
+            if verbose:
+                print(f"[enumerate_csv_files] data[{itn}] is a dataframe")
+            yield filename
+            continue
+
+        if isinstance(filename, tuple):
+            # A file in a zipfile
+            if verbose:
+                print(f"[enumerate_csv_files] data[{itn}] is {filename!r}")
+            yield filename
+            continue
+
+        if os.path.exists(filename):
+            ext = os.path.splitext(filename)[-1]
+            if ext == ".csv":
+                # We check the first line is ok.
+                if verbose:
+                    print(f"[enumerate_csv_files] data[{itn}] is a csv file: {filename!r}]")
+                with open(filename, "r", encoding="utf-8") as f:
+                    line = f.readline()
+                    if "~help" in line or (",CMD" not in line and ",DATE" not in line):
+                        continue
+                dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
+                du = dt.strftime("%Y-%m-%d %H:%M:%S")
+                yield (os.path.split(filename)[-1], du, filename, "")
+                continue
+
+            if ext == ".zip":
+                if verbose:
+                    print(f"[enumerate_csv_files] data[{itn}] is a zip file: {filename!r}]")
+                zf = zipfile.ZipFile(filename, "r")
+                for ii, info in enumerate(zf.infolist()):
+                    name = info.filename
+                    ext = os.path.splitext(name)[-1]
+                    if ext != ".csv":
+                        continue
+                    if verbose:
+                        print(
+                            f"[enumerate_csv_files] data[{itn}][{ii}] is a csv file: {name!r}]"
+                        )
+                    with zf.open(name) as zzf:
+                        first_line = zzf.readline()
+                    if b"," not in first_line:
+                        continue
+                    yield (
+                        os.path.split(name)[-1],
+                        "%04d-%02d-%02d %02d:%02d:%02d" % info.date_time,
+                        name,
+                        filename,
+                    )
+                zf.close()
+                continue
+
+            raise AssertionError(f"Unexpected format {filename!r}, cannot read it.")
+
+        # filename is a pattern.
+        found = glob.glob(filename)
+        if verbose and not found:
+            print(f"[enumerate_csv_files] unable to find file in {filename!r}")
+        for ii, f in enumerate(found):
+            if verbose:
+                print(f"[enumerate_csv_files] data[{itn}][{ii}] {f!r} from {filename!r}")
+            yield from enumerate_csv_files(f, verbose=verbose)
+
+
+def open_dataframe(
+    data: Union[str, Tuple[str, str, str, str], pandas.DataFrame],
+) -> pandas.DataFrame:
+    """
+    Opens a filename.
+
+    :param data: a dataframe, a filename, a tuple indicating the file is coming
+        from a zip file
+    :return: a dataframe
+    """
+    if isinstance(data, pandas.DataFrame):
+        return data
+    if isinstance(data, str):
+        df = pandas.read_csv(data)
+        df["RAWFILENAME"] = data
+        return df
+    if isinstance(data, tuple):
+        if not data[-1]:
+            df = pandas.read_csv(data[2])
+            df["RAWFILENAME"] = data[2]
+            return df
+        zf = zipfile.ZipFile(data[-1])
+        with zf.open(data[2]) as f:
+            df = pandas.read_csv(f)
+            df["RAWFILENAME"] = f"{data[-1]}/{data[2]}"
+        zf.close()
+        return df
+
+    raise ValueError(f"Unexpected value for data: {data!r}")
+
+
+class CubeViewDef:
+    """
+    Defines how to compute a view.
+
+    :param key_index: keys to put in the row index
+    :param values: values to show
+    :param ignore_unique: ignore keys with a unique value
+    :param order: to reorder key in columns index
+    :param key_agg: aggregate according to these columns before
+        creating the view
+    :param agg_args: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`
+    :param agg_kwargs: see :meth:`pandas.core.groupby.DataFrameGroupBy.agg`
+    """
+
+    def __init__(
+        self,
+        key_index: Sequence[str],
+        values: Sequence[str],
+        ignore_unique: bool = True,
+        order: Optional[Sequence[str]] = None,
+        key_agg: Optional[Sequence[str]] = None,
+        agg_args: Sequence[Any] = ("sum",),
+        agg_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        self.key_index = key_index
+        self.values = values
+        self.ignore_unique = ignore_unique
+        self.order = order
+        self.key_agg = key_agg
+        self.agg_args = agg_args
+        self.agg_kwargs = agg_kwargs
+
+    def __repr__(self) -> str:
+        "usual"
+        return string_sig(self)  # type: ignore[arg-type]
+
+
+class CubeLogs:
+    """
+    Processes logs coming from experiments.
+    """
+
+    def __init__(
+        self,
+        data: Any,
+        time: str = "date",
+        keys: Sequence[str] = ("version_.*", "model_.*"),
+        values: Sequence[str] = ("time_.*", "disc_.*"),
+        ignored: Sequence[str] = (),
+        recent: bool = False,
+        formulas: Optional[Dict[str, Callable[[pandas.DataFrame], pandas.Series]]] = None,
+    ):
+        self._data = data
+        self._time = time
+        self._keys = keys
+        self._values = values
+        self._ignored = ignored
+        self.recent = recent
+        self._formulas = formulas
+
+    def load(self, verbose: int = 0):
+        """Loads and preprocesses the data. Returns self."""
+        if isinstance(self._data, pandas.DataFrame):
+            if verbose:
+                print(f"[CubeLogs.load] load from dataframe, shape={self._data.shape}")
+            self.data = self._data
+        elif isinstance(self._data, list) and all(isinstance(r, dict) for r in self._data):
+            if verbose:
+                print(f"[CubeLogs.load] load from list of dicts, n={len(self._data)}")
+            self.data = pandas.DataFrame(self._data)
+        elif isinstance(self._data, list) and all(
+            isinstance(r, pandas.DataFrame) for r in self._data
+        ):
+            if verbose:
+                print(f"[CubeLogs.load] load from list of DataFrame, n={len(self._data)}")
+            self.data = pandas.concat(self._data, axis=0)
+        elif isinstance(self._data, list):
+            cubes = []
+            for item in enumerate_csv_files(self._data, verbose=verbose):
+                df = open_dataframe(item)
+                cube = CubeLogs(
+                    df,
+                    time=self._time,
+                    keys=self._keys,
+                    values=self._values,
+                    ignored=self._ignored,
+                    recent=self.recent,
+                )
+                cube.load()
+                cubes.append(cube.data)
+            self.data = pandas.concat(cubes, axis=0)
+        else:
+            raise NotImplementedError(
+                f"Not implemented with the provided data (type={type(self._data)})"
+            )
+
+        assert all(isinstance(c, str) for c in self.data.columns), (
+            f"The class only supports string as column names "
+            f"but found {[c for c in self.data.columns if not isinstance(c, str)]}"
+        )
+        if verbose:
+            print(f"[CubeLogs.load] loaded with shape={self.data.shape}")
+
+        self._initialize_columns()
+        if verbose:
+            print(f"[CubeLogs.load] time={self.time}")
+            print(f"[CubeLogs.load] keys={self.keys}")
+            print(f"[CubeLogs.load] values={self.values}")
+            print(f"[CubeLogs.load] ignored={self.ignored}")
+            print(f"[CubeLogs.load] ignored_values={self.ignored_values}")
+            print(f"[CubeLogs.load] ignored_keys={self.ignored_keys}")
+        assert not (
+            set(self.keys) & set(self.values)
+        ), f"Columns {set(self.keys) & set(self.values)} cannot be keys and values"
+        assert not (
+            set(self.keys) & set(self.ignored)
+        ), f"Columns {set(self.keys) & set(self.ignored)} cannot be keys and ignored"
+        assert not (
+            set(self.values) & set(self.ignored)
+        ), f"Columns {set(self.keys) & set(self.ignored)} cannot be values and ignored"
+        assert (
+            self.time not in self.keys
+            and self.time not in self.values
+            and self.time not in self.ignored
+        ), f"Column {self.time!r} is also a key, a value or ignored"
+        self._columns = [self.time, *self.keys, *self.values, *self.ignored]
+        self.dropped = [c for c in self.data.columns if c not in set(self.columns)]
+        self.data = self.data[self.columns]
+        if verbose:
+            print(f"[CubeLogs.load] dropped={self.dropped}")
+            print(f"[CubeLogs.load] data.shape={self.data.shape}")
+
+        self._preprocess()
+        if self.recent and verbose:
+            print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")
+
+        # Let's apply the formulas
+        if self._formulas:
+            cols = set(self.data.columns)
+            for k, f in self._formulas.items():
+                if k in cols:
+                    if verbose:
+                        print(f"[CubeLogs.load] skip formula {k!r}")
+                else:
+                    if verbose:
+                        print(f"[CubeLogs.load] apply formula {k!r}")
+                    self.data[k] = f(self.data)
+        self.values_for_key = {k: set(self.data[k]) for k in self.keys}
+        nans = [
+            c for c in [self.time, *self.keys] if self.data[c].isna().astype(int).sum() > 0
+        ]
+        assert not nans, f"The following keys {nans} have nan values. This is not allowed."
+        if verbose:
+            print(f"[CubeLogs.load] convert column {self.time!r} into date")
+        self.data[self.time] = pandas.to_datetime(self.data[self.time])
+        if verbose:
+            print(f"[CubeLogs.load] done, shape={self.shape}")
+        return self
+
+    @property
+    def shape(self) -> Tuple[int, int]:
+        "Returns the shape."
+        assert hasattr(self, "data"), "Method load was not called"
+        return self.data.shape
+
+    @property
+    def columns(self) -> Sequence[str]:
+        "Returns the columns."
+        assert hasattr(self, "data"), "Method load was not called"
+        return self.data.columns
+
+    def _preprocess(self):
+        last = self.values[0]
+        gr = self.data[[self.time, *self.keys, last]].groupby([self.time, *self.keys]).count()
+        gr = gr[gr[last] > 1]
+        if self.recent:
+            cp = self.data.copy()
+            assert (
+                "__index__" not in cp.columns
+            ), f"'__index__' should not be a column in {cp.columns}"
+            cp["__index__"] = np.arange(cp.shape[0])
+            gr = (
+                cp[[*self.keys, self.time, "__index__"]]
+                .groupby(self.keys, as_index=False)
+                .max()
+            )
+            filtered = pandas.merge(cp, gr, on=[self.time, "__index__", *self.keys])
+            assert filtered.shape[0] <= self.data.shape[0], (
+                f"Keeping the latest row brings more row {filtered.shape} "
+                f"(initial is {self.data.shape})."
+            )
+            self.data = filtered.drop("__index__", axis=1)
+        else:
+            assert gr.shape[0] == 0, f"There are duplicated rows:\n{gr}"
+            gr = self.data[[*self.keys, self.time]].groupby(self.keys).count()
+            gr = gr[gr[self.time] > 1]
+            assert (
+                gr.shape[0] == 0
+            ), f"recent should be true to keep the most recent row:\n{gr}"
+
+    @classmethod
+    def _filter_column(cls, filters, columns, can_be_empty=False):
+        set_cols = set()
+        for f in filters:
+            reg = re.compile(f)
+            cols = [c for c in columns if reg.search(c)]
+            set_cols |= set(cols)
+        assert (
+            can_be_empty or set_cols
+        ), f"Filters {filters} returns an empty set from {columns}"
+        return sorted(set_cols)
+
+    def _initialize_columns(self):
+        self.keys = self._filter_column(self._keys, self.data.columns)
+        self.values = self._filter_column(self._values, self.data.columns)
+        self.ignored = self._filter_column(self._ignored, self.data.columns, True)
+        assert (
+            self._time in self.data.columns
+        ), f"Column {self._time} not found in {self.data.columns}"
+        ignored_keys = set(self.ignored) & set(self.keys)
+        ignored_values = set(self.ignored) & set(self.values)
+        self.keys = [c for c in self.keys if c not in ignored_keys]
+        self.values = [c for c in self.values if c not in ignored_values]
+        self.ignored_keys = sorted(ignored_keys)
+        self.ignored_values = sorted(ignored_values)
+        self.time = self._time
+
+    def __str__(self) -> str:
+        "usual"
+        return str(self.data) if hasattr(self, "data") else str(self._data)
+
+    def view(self, view_def: CubeViewDef) -> pandas.DataFrame:
+        """
+        Returns a dataframe, a pivot view.
+        `key_index` determines the index, the other key columns determines
+        the columns. If `ignore_unique` is True, every columns with a unique value
+        is removed.
+
+        :param view_def: view definition
+        :return: dataframe
+        """
+        key_agg = self._filter_column(view_def.key_agg, self.keys) if view_def.key_agg else []
+        set_key_agg = set(key_agg)
+        assert set_key_agg <= set(
+            self.keys
+        ), f"Non existing keys in key_agg {set_key_agg - set(self.keys)}"
+
+        values = self._filter_column(view_def.values, self.values)
+        assert set(values) <= set(
+            self.values
+        ), f"Non existing columns in values {set(values) - set(self.values)}"
+
+        if key_agg:
+            key_index = [
+                c
+                for c in self._filter_column(view_def.key_index, self.keys)
+                if c not in set_key_agg
+            ]
+            keys_no_agg = [c for c in self.keys if c not in set_key_agg]
+            data = (
+                self.data[[*keys_no_agg, *values]]
+                .groupby(key_index, as_index=False)
+                .agg(*view_def.agg_args, **(view_def.agg_kwargs or {}))
+            )
+        else:
+            key_index = self._filter_column(view_def.key_index, self.keys)
+            data = self.data[[*self.keys, *values]]
+
+        assert set(key_index) <= set(
+            self.keys
+        ), f"Non existing keys in key_index {set(key_index) - set(self.keys)}"
+
+        set_key_columns = {
+            c for c in self.keys if c not in key_index and c not in set(key_agg)
+        }
+        if view_def.ignore_unique:
+            key_index = [k for k in key_index if len(self.values_for_key[k]) > 1]
+            key_columns = [k for k in set_key_columns if len(self.values_for_key[k]) > 1]
+        else:
+            key_columns = sorted(set_key_columns)
+
+        if view_def.order:
+            assert set(view_def.order) <= set_key_columns, (
+                f"Non existing columns from order in key_columns "
+                f"{set(view_def.order) - set_key_columns}"
+            )
+            key_columns = [
+                *view_def.order,
+                *[c for c in key_columns if c not in view_def.order],
+            ]
+        return data.pivot(index=key_index[::-1], columns=key_columns, values=values)
+
+    def describe(self) -> pandas.DataFrame:
+        """Basic description of all variables."""
+        rows = []
+        for name in self.data.columns:
+            values = self.data[name]
+            dtype = values.dtype
+            nonan = values.dropna()
+            obs = dict(
+                name=name,
+                dtype=str(dtype),
+                missing=len(values) - len(nonan),
+            )
+            if len(nonan) > 0:
+                obs.update(
+                    dict(
+                        min=nonan.min(),
+                        max=nonan.max(),
+                        count=len(nonan),
+                    )
+                )
+                if is_numeric_dtype(nonan):
+                    obs.update(
+                        dict(
+                            mean=nonan.mean(),
+                            sum=nonan.sum(),
+                        )
+                    )
+                else:
+                    unique = set(nonan)
+                    obs["n_values"] = len(unique)
+                    if len(unique) < 20:
+                        obs["values"] = ",".join(map(str, sorted(unique)))
+            rows.append(obs)
+        return pandas.DataFrame(rows).set_index("name")
+
+    def to_excel(
+        self,
+        output: str,
+        views: Dict[str, CubeViewDef],
+        main: Optional[str] = "main",
+        raw: Optional[str] = "raw",
+        verbose: int = 0,
+    ):
+        """
+        Creates an excel file with a list of view.
+
+        :param output: output file to create
+        :param views: list of views to append
+        :param main: add a page with statitcs on all variables
+        :param raw: add a page with the raw data
+        :param verbose: verbosity
+        """
+
+        with pandas.ExcelWriter(output, engine="openpyxl") as writer:
+            if main:
+                assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
+                df = self.describe()
+                if verbose:
+                    print(f"[CubeLogs.to_helper] add sheet {main!r} with shape {df.shape}")
+                df.to_excel(writer, sheet_name=main, freeze_panes=(1, 1))
+                self._apply_excel_style(main, writer, df)
+            if raw:
+                assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
+                if verbose:
+                    print(f"[CubeLogs.to_helper] add sheet {raw!r} with shape {self.shape}")
+                self.data.to_excel(writer, sheet_name=raw, freeze_panes=(1, 1), index=True)
+                self._apply_excel_style(raw, writer, self.data)
+
+            for name, view in views.items():
+                df = self.view(view)
+                if verbose:
+                    print(
+                        f"[CubeLogs.to_helper] add sheet {name!r} with shape "
+                        f"{df.shape}, index={df.index.names}, columns={df.columns.names}"
+                    )
+                df.to_excel(
+                    writer,
+                    sheet_name=name,
+                    freeze_panes=(df.index.nlevels, df.columns.nlevels),
+                )
+                self._apply_excel_style(name, writer, df)
+            if verbose:
+                print(f"[CubeLogs.to_helper] done with {len(views)} views")
+
+    def _apply_excel_style(self, name: str, writer: pandas.ExcelWriter, df: pandas.DataFrame):
+        from openpyxl.styles import Alignment
+        from openpyxl.utils import get_column_letter
+
+        # from openpyxl.styles import Font, PatternFill, numbers
+
+        left = Alignment(horizontal="left")
+        right = Alignment(horizontal="right")
+        # center = Alignment(horizontal="center")
+        # bold_font = Font(bold=True)
+        # red = Font(color="FF0000")
+        # yellow = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
+        # redf = PatternFill(start_color="FF0000", end_color="FF0000", fill_type="solid")
+
+        sheet = writer.sheets[name]
+        n_rows = df.shape[0] + df.columns.nlevels + df.index.nlevels
+        n_cols = df.shape[1] + df.index.nlevels
+        co: Dict[int, int] = {}
+        sizes: Dict[int, int] = {}
+        cols = set()
+        for i in range(1, n_rows):
+            for j, cell in enumerate(sheet[i]):
+                if j > n_cols:
+                    break
+                cols.add(cell.column)
+                if isinstance(cell.value, float):
+                    co[j] = co.get(j, 0) + 1
+                elif isinstance(cell.value, str):
+                    sizes[cell.column] = max(sizes.get(cell.column, 0), len(cell.value))
+
+        for k, v in sizes.items():
+            c = get_column_letter(k)
+            sheet.column_dimensions[c].width = max(15, v)
+        for k in cols:
+            if k not in sizes:
+                c = get_column_letter(k)
+                sheet.column_dimensions[c].width = 15
+
+        for i in range(1, n_rows):
+            for j, cell in enumerate(sheet[i]):
+                if j > n_cols:
+                    break
+                if isinstance(cell.value, pandas.Timestamp):
+                    cell.alignment = right
+                    dt = cell.value.to_pydatetime()
+                    cell.value = dt
+                    cell.number_format = (
+                        "YYYY-MM-DD"
+                        if (
+                            dt.hour == 0
+                            and dt.minute == 0
+                            and dt.second == 0
+                            and dt.microsecond == 0
+                        )
+                        else "YYYY-MM-DD 00:00:00"
+                    )
+                elif isinstance(cell.value, (float, int)):
+                    cell.alignment = right
+                    x = abs(cell.value)
+                    if int(x) == x:
+                        cell.number_format = "0"
+                    elif x > 5000:
+                        cell.number_format = "# ##0"
+                    elif x >= 500:
+                        cell.number_format = "0.0"
+                    elif x >= 50:
+                        cell.number_format = "0.00"
+                    elif x >= 5:
+                        cell.number_format = "0.000"
+                    elif x > 0.5:
+                        cell.number_format = "0.0000"
+                    elif x > 0.005:
+                        cell.number_format = "0.00000"
+                    else:
+                        cell.number_format = "0.000E+00"
+                else:
+                    cell.alignment = left
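The new `CubeLogs`/`CubeViewDef` helpers above aggregate benchmark logs and pivot them into Excel sheets. The following is a minimal usage sketch, not part of the package: the column names are invented to match the default regexes `keys=("version_.*", "model_.*")`, `values=("time_.*", "disc_.*")` and the default time column `date`; writing the workbook requires openpyxl.

```python
import pandas
from onnx_diagnostic.helpers.log_helper import CubeLogs, CubeViewDef

# Hypothetical benchmark logs; the column names are made up to match the
# default key/value regexes of CubeLogs (version_.*, model_.*, time_.*, disc_.*).
df = pandas.DataFrame(
    [
        dict(date="2025-05-01", version_torch="2.7", model_name="llama", time_export=1.5, disc_max=1e-4),
        dict(date="2025-05-01", version_torch="2.7", model_name="phi", time_export=2.1, disc_max=3e-4),
        dict(date="2025-05-02", version_torch="2.8", model_name="llama", time_export=1.4, disc_max=2e-4),
        dict(date="2025-05-02", version_torch="2.8", model_name="phi", time_export=2.0, disc_max=2e-4),
    ]
)

cube = CubeLogs(df).load(verbose=1)  # load() selects keys/values by regex and checks duplicates
view = CubeViewDef(key_index=["model_name"], values=["time_.*"])
print(cube.view(view))  # pivot: models as rows, torch versions as columns
cube.to_excel("logs.xlsx", {"time": view}, verbose=1)  # one sheet per view, plus "main" and "raw"
```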
onnx_diagnostic/helpers/mini_onnx_builder.py

@@ -393,7 +393,8 @@ def create_onnx_model_from_input_tensors(
     Creates a model proto including all the value as initializers.
     They can be restored by executing the model.
     We assume these inputs are not bigger than 2Gb,
-    the limit of protobuf.
+    the limit of protobuf. Nothing is implemented yet to get around
+    that limit.
 
     :param inputs: anything
     :param switch_low_high: if None, it is equal to ``switch_low_high=sys.byteorder != "big"``
@@ -532,6 +533,8 @@ def create_input_tensors_from_onnx_model(
     :param engine: runtime to use, onnx, the default value, onnxruntime
     :param sep: separator
     :return: restored data
+
+    See example :ref:`l-plot-intermediate-results` for an example.
     """
     if engine == "ExtendedReferenceEvaluator":
        from ..reference import ExtendedReferenceEvaluator