onnx-diagnostic 0.7.4__py3-none-any.whl → 0.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onnx_diagnostic/__init__.py +1 -1
- onnx_diagnostic/_command_lines_parser.py +66 -8
- onnx_diagnostic/helpers/_log_helper.py +461 -0
- onnx_diagnostic/helpers/log_helper.py +404 -315
- onnx_diagnostic/tasks/feature_extraction.py +86 -5
- onnx_diagnostic/tasks/text2text_generation.py +2 -2
- onnx_diagnostic/torch_export_patches/onnx_export_errors.py +7 -1
- onnx_diagnostic/torch_export_patches/patches/patch_transformers.py +92 -3
- onnx_diagnostic/torch_models/hghub/model_inputs.py +5 -0
- onnx_diagnostic/torch_models/validate.py +26 -3
- {onnx_diagnostic-0.7.4.dist-info → onnx_diagnostic-0.7.5.dist-info}/METADATA +1 -1
- {onnx_diagnostic-0.7.4.dist-info → onnx_diagnostic-0.7.5.dist-info}/RECORD +15 -14
- {onnx_diagnostic-0.7.4.dist-info → onnx_diagnostic-0.7.5.dist-info}/WHEEL +0 -0
- {onnx_diagnostic-0.7.4.dist-info → onnx_diagnostic-0.7.5.dist-info}/licenses/LICENSE.txt +0 -0
- {onnx_diagnostic-0.7.4.dist-info → onnx_diagnostic-0.7.5.dist-info}/top_level.txt +0 -0
onnx_diagnostic/helpers/log_helper.py

@@ -1,208 +1,23 @@
-import datetime
 import enum
-import glob
 import io
-import os
 import pprint
 import re
 import warnings
-import zipfile
-from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 import numpy as np
 import pandas
 from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
 from .helper import string_sig
-
-
-
+from ._log_helper import (
+    BUCKET_SCALES,
+    breaking_last_point,
+    apply_excel_style,
+    align_dataframe_with,
+    open_dataframe,
+    enumerate_csv_files,
 )


-BUCKET_SCALES = BUCKET_SCALES_VALUES / 100 + 1
-
-
-def filter_data(
-    df: pandas.DataFrame,
-    filter_in: Optional[str] = None,
-    filter_out: Optional[str] = None,
-    verbose: int = 0,
-) -> pandas.DataFrame:
-    """
-    Argument `filter` follows the syntax
-    ``<column1>:<fmt1>//<column2>:<fmt2>``.
-
-    The format is the following:
-
-    * a value or a set of values separated by ``;``
-    """
-    if not filter_in and not filter_out:
-        return df
-
-    def _f(fmt):
-        cond = {}
-        if isinstance(fmt, str):
-            cols = fmt.split("//")
-            for c in cols:
-                assert ":" in c, f"Unexpected value {c!r} in fmt={fmt!r}"
-                spl = c.split(":")
-                assert len(spl) == 2, f"Unexpected value {c!r} in fmt={fmt!r}"
-                name, fil = spl
-                cond[name] = set(fil.split(";"))
-        return cond
-
-    if filter_in:
-        cond = _f(filter_in)
-        assert isinstance(cond, dict), f"Unexpected type {type(cond)} for fmt={filter_in!r}"
-        for k, v in cond.items():
-            if k not in df.columns:
-                continue
-            if verbose:
-                print(
-                    f"[_filter_data] filter in column {k!r}, "
-                    f"values {v!r} among {set(df[k].astype(str))}"
-                )
-            df = df[df[k].astype(str).isin(v)]
-
-    if filter_out:
-        cond = _f(filter_out)
-        assert isinstance(cond, dict), f"Unexpected type {type(cond)} for fmt={filter_out!r}"
-        for k, v in cond.items():
-            if k not in df.columns:
-                continue
-            if verbose:
-                print(
-                    f"[_filter_data] filter out column {k!r}, "
-                    f"values {v!r} among {set(df[k].astype(str))}"
-                )
-            df = df[~df[k].astype(str).isin(v)]
-    return df
-
-
-def enumerate_csv_files(
-    data: Union[
-        pandas.DataFrame, List[Union[str, Tuple[str, str]]], str, Tuple[str, str, str, str]
-    ],
-    verbose: int = 0,
-    filtering: Optional[Callable[[str], bool]] = None,
-) -> Iterator[Union[pandas.DataFrame, str, Tuple[str, str, str, str]]]:
-    """
-    Enumerates files considered for the aggregation.
-    Only csv files are considered.
-    If a zip file is given, the function digs into the zip files and
-    loops over csv candidates.
-
-    :param data: dataframe with the raw data or a file or list of files
-    :param vrbose: verbosity
-    :param filtering: function to filter in or out files in zip files,
-        must return true to keep the file, false to skip it.
-    :return: a generator yielding tuples with the filename, date, full path and zip file
-
-    data can contains:
-    * a dataframe
-    * a string for a filename, zip or csv
-    * a list of string
-    * a tuple
-    """
-    if not isinstance(data, list):
-        data = [data]
-    for itn, filename in enumerate(data):
-        if isinstance(filename, pandas.DataFrame):
-            if verbose:
-                print(f"[enumerate_csv_files] data[{itn}] is a dataframe")
-            yield filename
-            continue
-
-        if isinstance(filename, tuple):
-            # A file in a zipfile
-            if verbose:
-                print(f"[enumerate_csv_files] data[{itn}] is {filename!r}")
-            yield filename
-            continue
-
-        if os.path.exists(filename):
-            ext = os.path.splitext(filename)[-1]
-            if ext == ".csv":
-                # We check the first line is ok.
-                if verbose:
-                    print(f"[enumerate_csv_files] data[{itn}] is a csv file: {filename!r}]")
-                dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
-                du = dt.strftime("%Y-%m-%d %H:%M:%S")
-                yield (os.path.split(filename)[-1], du, filename, "")
-                continue
-
-            if ext == ".zip":
-                if verbose:
-                    print(f"[enumerate_csv_files] data[{itn}] is a zip file: {filename!r}]")
-                zf = zipfile.ZipFile(filename, "r")
-                for ii, info in enumerate(zf.infolist()):
-                    name = info.filename
-                    if filtering is None:
-                        ext = os.path.splitext(name)[-1]
-                        if ext != ".csv":
-                            continue
-                    elif not filtering(name):
-                        continue
-                    if verbose:
-                        print(
-                            f"[enumerate_csv_files] data[{itn}][{ii}] is a csv file: {name!r}]"
-                        )
-                    with zf.open(name) as zzf:
-                        first_line = zzf.readline()
-                    if b"," not in first_line:
-                        continue
-                    yield (
-                        os.path.split(name)[-1],
-                        "%04d-%02d-%02d %02d:%02d:%02d" % info.date_time,
-                        name,
-                        filename,
-                    )
-                zf.close()
-                continue
-
-            raise AssertionError(f"Unexpected format {filename!r}, cannot read it.")
-
-        # filename is a pattern.
-        found = glob.glob(filename)
-        if verbose and not found:
-            print(f"[enumerate_csv_files] unable to find file in {filename!r}")
-        for ii, f in enumerate(found):
-            if verbose:
-                print(f"[enumerate_csv_files] data[{itn}][{ii}] {f!r} from {filename!r}")
-            yield from enumerate_csv_files(f, verbose=verbose, filtering=filtering)
-
-
-def open_dataframe(
-    data: Union[str, Tuple[str, str, str, str], pandas.DataFrame],
-) -> pandas.DataFrame:
-    """
-    Opens a filename defined by function
-    :func:`onnx_diagnostic.helpers.log_helper.enumerate_csv_files`.
-
-    :param data: a dataframe, a filename, a tuple indicating the file is coming
-        from a zip file
-    :return: a dataframe
-    """
-    if isinstance(data, pandas.DataFrame):
-        return data
-    if isinstance(data, str):
-        df = pandas.read_csv(data)
-        df["RAWFILENAME"] = data
-        return df
-    if isinstance(data, tuple):
-        if not data[-1]:
-            df = pandas.read_csv(data[2])
-            df["RAWFILENAME"] = data[2]
-            return df
-        zf = zipfile.ZipFile(data[-1])
-        with zf.open(data[2]) as f:
-            df = pandas.read_csv(f)
-            df["RAWFILENAME"] = f"{data[-1]}/{data[2]}"
-        zf.close()
-        return df
-
-    raise ValueError(f"Unexpected value for data: {data!r}")
-
-
 class CubeViewDef:
     """
     Defines how to compute a view.
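The module-level helpers removed above are not dropped: 0.7.5 adds onnx_diagnostic/helpers/_log_helper.py (+461 lines in the file list) and log_helper re-imports the ones it still exposes, so names such as enumerate_csv_files and open_dataframe keep resolving through log_helper. A minimal sketch of what that means for callers, assuming the re-exports behave as before; the glob pattern is made up for illustration:

    # Sketch: these names still resolve through log_helper even though the
    # implementation moved to _log_helper.
    from onnx_diagnostic.helpers.log_helper import enumerate_csv_files, open_dataframe

    # enumerate_csv_files yields (filename, date, full path, zip file) tuples for
    # csv/zip inputs; open_dataframe turns each tuple into a DataFrame and adds a
    # RAWFILENAME column.
    for item in enumerate_csv_files("logs/*.csv", verbose=1):
        df = open_dataframe(item)
        print(df.shape, df["RAWFILENAME"].iloc[0])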
@@ -226,9 +41,46 @@ class CubeViewDef:
     :param name: name of the view, used mostly to debug
     :param plots: adds plot to the Excel sheet
     :param no_index: remove the index (but keeps the columns)
+
+    Some examples of views. First example is an aggregated view
+    for many metrics.
+
+    .. code-block:: python
+
+        cube = CubeLogs(...)
+
+        CubeViewDef(
+            key_index=cube._filter_column(fs, cube.keys_time),
+            values=cube._filter_column(
+                ["TIME_ITER", "speedup", "time_latency.*", "onnx_n_nodes"],
+                cube.values,
+            ),
+            ignore_unique=True,
+            key_agg=["model_name", "task", "model_task", "suite"],
+            agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
+            agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
+            name="agg-all",
+            plots=True,
+        )
+
+    Next one focuses on a couple of metrics.
+
+    .. code-block:: python
+
+        cube = CubeLogs(...)
+
+        CubeViewDef(
+            key_index=cube._filter_column(fs, cube.keys_time),
+            values=cube._filter_column(["speedup"], cube.values),
+            ignore_unique=True,
+            keep_columns_in_index=["suite"],
+            name="speedup",
+        )
     """

     class HighLightKind(enum.IntEnum):
+        "Codes to highlight values."
+
         NONE = 0
         RED = 1
         GREEN = 2
@@ -276,118 +128,6 @@ class CubeViewDef:
         return string_sig(self)  # type: ignore[arg-type]


-def apply_excel_style(
-    filename_or_writer: Any,
-    f_highlights: Optional[Dict[str, Callable[[Any], CubeViewDef.HighLightKind]]] = None,
-):
-    """
-    Applies styles on all sheets in a file unless the sheet is too big.
-
-    :param filename_or_writer: filename, modified inplace
-    :param f_highlight: color function to apply, one per sheet
-    """
-    from openpyxl import load_workbook
-    from openpyxl.styles import Alignment
-    from openpyxl.utils import get_column_letter
-    from openpyxl.styles import Font  # , PatternFill, numbers
-
-    if isinstance(filename_or_writer, str):
-        workbook = load_workbook(filename_or_writer)
-        save = True
-    else:
-        workbook = filename_or_writer.book
-        save = False
-
-    left = Alignment(horizontal="left")
-    left_shrink = Alignment(horizontal="left", shrink_to_fit=True)
-    right = Alignment(horizontal="right")
-    font_colors = {
-        CubeViewDef.HighLightKind.GREEN: Font(color="00AA00"),
-        CubeViewDef.HighLightKind.RED: Font(color="FF0000"),
-    }
-
-    for name in workbook.sheetnames:
-        f_highlight = f_highlights.get(name, None) if f_highlights else None
-        sheet = workbook[name]
-        n_rows = sheet.max_row
-        n_cols = sheet.max_column
-        if n_rows * n_cols > 2**18:
-            # Too big.
-            continue
-        co: Dict[int, int] = {}
-        sizes: Dict[int, int] = {}
-        cols = set()
-        for i in range(1, n_rows + 1):
-            for j, cell in enumerate(sheet[i]):
-                if j > n_cols:
-                    break
-                cols.add(cell.column)
-                if isinstance(cell.value, float):
-                    co[j] = co.get(j, 0) + 1
-                elif isinstance(cell.value, str):
-                    sizes[cell.column] = max(sizes.get(cell.column, 0), len(cell.value))
-
-        for k, v in sizes.items():
-            c = get_column_letter(k)
-            sheet.column_dimensions[c].width = min(max(8, v), 30)
-        for k in cols:
-            if k not in sizes:
-                c = get_column_letter(k)
-                sheet.column_dimensions[c].width = 15
-
-        for i in range(1, n_rows + 1):
-            for j, cell in enumerate(sheet[i]):
-                if j > n_cols:
-                    break
-                if isinstance(cell.value, pandas.Timestamp):
-                    cell.alignment = right
-                    dt = cell.value.to_pydatetime()
-                    cell.value = dt
-                    cell.number_format = (
-                        "YYYY-MM-DD"
-                        if (
-                            dt.hour == 0
-                            and dt.minute == 0
-                            and dt.second == 0
-                            and dt.microsecond == 0
-                        )
-                        else "YYYY-MM-DD 00:00:00"
-                    )
-                elif isinstance(cell.value, (float, int)):
-                    cell.alignment = right
-                    x = abs(cell.value)
-                    if int(x) == x:
-                        cell.number_format = "0"
-                    elif x > 5000:
-                        cell.number_format = "# ##0"
-                    elif x >= 500:
-                        cell.number_format = "0.0"
-                    elif x >= 50:
-                        cell.number_format = "0.00"
-                    elif x >= 5:
-                        cell.number_format = "0.000"
-                    elif x > 0.5:
-                        cell.number_format = "0.0000"
-                    elif x > 0.005:
-                        cell.number_format = "0.00000"
-                    else:
-                        cell.number_format = "0.000E+00"
-                    if f_highlight:
-                        h = f_highlight(cell.value)
-                        if h in font_colors:
-                            cell.font = font_colors[h]
-                elif isinstance(cell.value, str) and len(cell.value) > 70:
-                    cell.alignment = left_shrink
-                else:
-                    cell.alignment = left
-                    if f_highlight:
-                        h = f_highlight(cell.value)
-                        if h in font_colors:
-                            cell.font = font_colors[h]
-    if save:
-        workbook.save(filename_or_writer)
-
-
 class CubePlot:
     """
     Creates a plot.
@@ -397,6 +137,26 @@ class CubePlot:
     :param split: draw a graph per line in the dataframe
     :param timeseries: this assumes the time is one level of the columns,
         this argument indices the level name
+
+    It defines a graph. Usually *bar* or *barh* is used to
+    compare experiments for every metric, a subplot by metric.
+
+    .. code-block:: python
+
+        CubePlot(df, kind="barh", orientation="row", split=True)
+
+    *line* is usually used to plot timeseries showing the
+    evolution of metrics over time.
+
+    .. code-block:: python
+
+        CubePlot(
+            df,
+            kind="line",
+            orientation="row",
+            split=True,
+            timeseries="time",
+        )
     """

     KINDS = {"bar", "barh", "line"}
@@ -607,6 +367,35 @@ class CubePlot:
 class CubeLogs:
     """
     Processes logs coming from experiments.
+    A cube is basically a database with certain columns
+    playing specific roles.
+
+    * time: only one column, it is not mandatory but it is recommended
+      to have one
+    * keys: they are somehow coordinates, they cannot be aggregated,
+      they are not numbers, more like categories, `(time, *keys)`
+      identifies an element of the database in an unique way,
+      there cannot be more than one row sharing the same key and time
+      values
+    * values: they are not necessary numerical, but if they are,
+      they can be aggregated
+
+    Every other columns is ignored. More columns can be added
+    by using formulas.
+
+    :param data: the raw data
+    :param time: the time column
+    :param keys: the keys, can include regular expressions
+    :param values: the values, can include regular expressions
+    :param ignored: ignores some column, acts as negative regular
+        expressions for the other two
+    :param recent: if more than one rows share the same keys,
+        the cube only keeps the most recent one
+    :param formulas: columns to add, defined with formulas
+    :param fill_missing: a dictionary, defines values replacing missing one
+        for some columns
+    :param keep_last_date: overwrites all the times with the most recent
+        one, it makes things easier for timeseries
     """

     def __init__(
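The new class docstring spells out the data model: one time column, key columns acting as coordinates, and value columns that can be aggregated. A minimal construction sketch following that description; the column names and values are invented for illustration:

    import pandas
    from onnx_diagnostic.helpers.log_helper import CubeLogs

    # Two runs of the same (exporter, model_name) key on two dates: the pair
    # (time, *keys) identifies each row uniquely, "speedup" is an aggregatable value.
    df = pandas.DataFrame(
        {
            "DATE": ["2025-06-01", "2025-06-02"],
            "exporter": ["E1", "E1"],
            "model_name": ["m0", "m0"],
            "speedup": [1.1, 1.3],
        }
    )
    cube = CubeLogs(df, time="DATE", keys=["exporter", "model_name"], values=["speedup"])
    cube.load()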
@@ -636,6 +425,22 @@ class CubeLogs:
         self.fill_missing = fill_missing
         self.keep_last_date = keep_last_date

+    def clone(
+        self, data: Optional[pandas.DataFrame] = None, keys: Optional[Sequence[str]] = None
+    ) -> "CubeLogs":
+        """
+        Makes a copy of the dataframe.
+        It copies the processed data not the original one.
+        """
+        cube = self.__class__(
+            data if data is not None else self.data.copy(),
+            time=self.time,
+            keys=keys or self.keys_no_time,
+            values=self.values,
+        )
+        cube.load()
+        return cube
+
     def post_load_process_piece(
         self, df: pandas.DataFrame, unique: bool = False
     ) -> pandas.DataFrame:
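clone is the building block the new cube_time and sbs features rely on: it rebuilds a cube from the already-processed self.data, optionally with a different key set, and calls load() again. A hedged sketch, reusing the cube from the previous example; the filter is illustrative:

    # Work on a copy restricted to one exporter; the original cube is left untouched.
    subset = cube.clone(data=cube.data[cube.data["exporter"] == "E1"])
    print(subset.data.shape)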
@@ -741,17 +546,13 @@ class CubeLogs:
             print(f"[CubeLogs.load] dropped={self.dropped}")
             print(f"[CubeLogs.load] data.shape={self.data.shape}")

-        shape = self.data.shape
         if verbose:
             print(f"[CubeLogs.load] removed columns, shape={self.data.shape}")
         self._preprocess()
         if verbose:
             print(f"[CubeLogs.load] preprocess, shape={self.data.shape}")
-
-
-        ), f"The preprocessing reduced shape {shape} to {self.data.shape}."
-        if self.recent and verbose:
-            print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")
+        if self.recent:
+            print(f"[CubeLogs.load] keep most recent data.shape={self.data.shape}")

         # Let's apply the formulas
         if self._formulas:
@@ -883,6 +684,18 @@ class CubeLogs:
         "usual"
         return str(self.data) if hasattr(self, "data") else str(self._data)

+    def make_view_def(self, name: str) -> Optional[CubeViewDef]:
+        """
+        Returns a view definition.
+
+        :param name: name of a value
+        :return: a CubeViewDef or None if name does not make sense
+        """
+        assert name in self.values, f"{name!r} is not one of the values {self.values}"
+        keys = sorted(self.keys_no_time)
+        index = len(keys) // 2 + (len(keys) % 2)
+        return CubeViewDef(key_index=keys[:index], values=[name], name=name)
+
     def view(
         self,
         view_def: Union[str, CubeViewDef],
@@ -900,6 +713,12 @@ class CubeLogs:
         :param verbose: verbosity level
         :return: dataframe
         """
+        if isinstance(view_def, str):
+            # We automatically create a view for a metric
+            view_def_ = self.make_view_def(view_def)
+            assert view_def_ is not None, f"Unable to create a view from {view_def!r}"
+            view_def = view_def_
+
         assert isinstance(
             view_def, CubeViewDef
         ), f"view_def should be a CubeViewDef, got {type(view_def)}: {view_def!r} instead"
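Together, make_view_def and the new str branch in view mean a pivoted view can now be requested with just a metric name: half of the non-time keys go to the index, the rest to the columns. A sketch, assuming "speedup" is one of the cube's value columns:

    # Equivalent calls once the cube is loaded: view() builds the CubeViewDef itself.
    piv = cube.view("speedup", verbose=1)

    vdef = cube.make_view_def("speedup")   # CubeViewDef(key_index=..., values=["speedup"])
    piv = cube.view(vdef)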
@@ -1113,6 +932,17 @@ class CubeLogs:
         else:
             piv.sort_index(inplace=True, axis=1)

+        # final step, force columns with numerical values to be float
+        for c in list(piv.columns):
+            s = piv[c]
+            if not pandas.api.types.is_object_dtype(s):
+                continue
+            try:
+                sf = s.astype(float)
+            except (ValueError, TypeError):
+                continue
+            piv[c] = sf
+
         if verbose:
             print(f"[CubeLogs.view] levels {piv.index.names}, {piv.columns.names}")
             print(f"[CubeLogs.view] -- done view {view_def.name!r}")
@@ -1155,7 +985,9 @@ class CubeLogs:
         for c in set(key_index) | set(key_columns):
             s = new_data[c]
             if s.isna().max():
-                if pandas.api.types.is_numeric_dtype(s):
+                if pandas.api.types.is_numeric_dtype(
+                    s
+                ) and not pandas.api.types.is_object_dtype(s):
                     min_v = s.dropna().min()
                     assert (
                         min_v >= 0
@@ -1192,7 +1024,7 @@ class CubeLogs:
             )
             if len(nonan) > 0:
                 obs.update(dict(count=len(nonan)))
-                if is_numeric_dtype(nonan):
+                if is_numeric_dtype(nonan) and not pandas.api.types.is_object_dtype(nonan):
                     obs.update(
                         dict(
                             min=nonan.min(),
@@ -1228,9 +1060,11 @@ class CubeLogs:
         raw: Optional[str] = "raw",
         verbose: int = 0,
         csv: Optional[Sequence[str]] = None,
+        time_mask: bool = False,
+        sbs: Optional[Dict[str, Dict[str, Any]]] = None,
     ):
         """
-        Creates an excel file with a list of
+        Creates an excel file with a list of views.

         :param output: output file to create
         :param views: sequence or dictionary of views to append
@@ -1238,9 +1072,17 @@ class CubeLogs:
         :param raw: add a page with the raw data
         :param csv: views to dump as csv files (same name as outputs + view naw)
        :param verbose: verbosity
+        :param time_mask: color the background of the cells if one
+            of the value for the last date is unexpected,
+            assuming they should remain stale
+        :param sbs: configurations to compare side-by-side, this adds two tabs,
+            one gathering raw data about the two configurations, the other one
+            is aggregated by metrics
         """
         if verbose:
             print(f"[CubeLogs.to_excel] create Excel file {output}, shape={self.shape}")
+        time_mask &= len(self.data[self.time].unique()) > 2
+        cube_time = self.cube_time(fill_other_dates=True) if time_mask else None
         views = {k: k for k in views} if not isinstance(views, dict) else views
         f_highlights = {}
         plots = []
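Both new arguments are opt-in and only change the report when they apply: time_mask silently turns itself off unless the cube holds more than two distinct dates, and sbs adds two extra tabs. A hedged usage sketch; the output path, view name and configuration filters are illustrative and mirror the docstring example:

    cube.to_excel(
        "report.xlsx",
        views=["speedup"],                       # str views rely on the new make_view_def path
        time_mask=True,                          # highlight suspicious values for the last date
        sbs={                                    # side-by-side of two configurations
            "CFA": {"exporter": "E1", "opt": "O"},
            "CFB": {"exporter": "E2", "opt": "O"},
        },
    )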
@@ -1252,10 +1094,25 @@ class CubeLogs:
             print(f"[CubeLogs.to_excel] add sheet {main!r} with shape {df.shape}")
         df.to_excel(writer, sheet_name=main, freeze_panes=(1, 1))

+        time_mask_view: Dict[str, pandas.DataFrame] = {}
         for name, view in views.items():
             if view is None:
                 continue
             df, tview = self.view(view, return_view_def=True, verbose=max(verbose - 1, 0))
+            if cube_time is not None:
+                cube_mask = cube_time.view(view)
+                aligned = align_dataframe_with(cube_mask, df)
+                if aligned is not None:
+                    assert aligned.shape == df.shape, (
+                        f"Shape mismatch between the view {df.shape} and the mask "
+                        f"{time_mask_view[name].shape}"
+                    )
+                    time_mask_view[name] = aligned
+                    if verbose:
+                        print(
+                            f"[CubeLogs.to_excel] compute mask for view {name!r} "
+                            f"with shape {aligned.shape}"
+                        )
             if tview is None:
                 continue
             memory = df.memory_usage(deep=True).sum()
@@ -1335,6 +1192,36 @@ class CubeLogs:
                 writer, sheet_name="raw", freeze_panes=(1, 1), index=True
             )

+        if sbs:
+            if verbose:
+                for k, v in sbs.items():
+                    print(f"[CubeLogs.to_excel] sbs {k}: {v}")
+            name = "∧".join(sbs)
+            sbs_raw, sbs_agg = self.sbs(sbs)
+            if verbose:
+                print(f"[CubeLogs.to_excel] add sheet {name!r} with shape {sbs_raw.shape}")
+                print(
+                    f"[CubeLogs.to_excel] add sheet '{name}-AGG' "
+                    f"with shape {sbs_agg.shape}"
+                )
+            sbs_raw = sbs_raw.reset_index(drop=False)
+            sbs_raw.to_excel(
+                writer,
+                sheet_name=name,
+                freeze_panes=(
+                    sbs_raw.columns.nlevels + sbs_raw.index.nlevels,
+                    sbs_raw.index.nlevels,
+                ),
+            )
+            sbs_agg.to_excel(
+                writer,
+                sheet_name=f"{name}-AGG",
+                freeze_panes=(
+                    sbs_agg.columns.nlevels + sbs_agg.index.nlevels,
+                    sbs_agg.index.nlevels,
+                ),
+            )
+
         if plots:
             from openpyxl.drawing.image import Image

@@ -1366,10 +1253,194 @@ class CubeLogs:

         if verbose:
             print(f"[CubeLogs.to_excel] applies style to {output!r}")
-        apply_excel_style(
+        apply_excel_style(
+            writer, f_highlights, time_mask_view=time_mask_view, verbose=verbose  # type: ignore[arg-type]
+        )
         if verbose:
             print(f"[CubeLogs.to_excel] done with {len(views)} views")

+    def cube_time(self, fill_other_dates: bool = False, threshold: float = 1.2) -> "CubeLogs":
+        """
+        Aggregates the data over time to detect changes on the last value.
+        If *fill_other_dates* is True, all dates are kept, but values
+        are filled with 0.
+        *threshold* determines the bandwidth within the values are expected,
+        should be a factor of the standard deviation.
+        """
+        unique_time = self.data[self.time].unique()
+        assert len(unique_time) > 2, f"Not enough dates to proceed: unique_time={unique_time}"
+        gr = self.data[[*self.keys_no_time, *self.values]].groupby(
+            self.keys_no_time, dropna=False
+        )
+        dgr = gr.agg(
+            lambda series, th=threshold: int(breaking_last_point(series, threshold=th)[0])
+        )
+        tm = unique_time.max()
+        assert dgr.shape[0] > 0, (
+            f"Unexpected output shape={dgr.shape}, unique_time={unique_time}, "
+            f"data.shape={self.data.shape}"
+        )
+        dgr[self.time] = tm
+        if fill_other_dates:
+            other_df = []
+            other_dates = [t for t in unique_time if t != tm]
+            for t in other_dates:
+                df = dgr.copy()
+                df[self.time] = t
+                for c in df.columns:
+                    if c != self.time:
+                        df[c] = 0
+                other_df.append(df)
+            dgr = pandas.concat([dgr, *other_df], axis=0)
+            assert dgr.shape[0] > 0, (
+                f"Unexpected output shape={dgr.shape}, unique_time={unique_time}, "
+                f"data.shape={self.data.shape}, "
+                f"other_df shapes={[df.shape for df in other_df]}"
+            )
+        return self.clone(data=dgr.reset_index(drop=False))
+
+    def sbs(
+        self, configs: Dict[str, Dict[str, Any]], column_name: str = "CONF"
+    ) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
+        """
+        Creates a side-by-side for two configurations.
+        Every configuration a dictionary column:value which filters in
+        the rows to keep in order to compute the side by side.
+        Every configuration is given a name (the key in configs),
+        it is added in column column_name.
+
+        :param configs: example
+            ``dict(CFA=dict(exporter="E1", opt="O"), CFB=dict(exporter="E2", opt="O"))``
+        :param column_name: column to add with the name of the configuration
+        :return: data and aggregated date
+        """
+        assert (
+            len(configs) >= 2
+        ), f"A side by side needs at least two configs but configs={configs}"
+        set_keys_time = set(self.keys_time)
+        columns_index = None
+        data_list = []
+        for name_conf, conf in configs.items():
+            if columns_index is None:
+                columns_index = list(conf.keys())
+                assert set(columns_index) <= set_keys_time, (
+                    f"Configuration {conf} includes columns outside the keys "
+                    f"{', '.join(sorted(set_keys_time))}"
+                )
+            else:
+                assert set(columns_index) == set(conf), (
+                    f"Every conf should share the same keys but conf={conf} "
+                    f"is different from {set(columns_index)}"
+                )
+            data = self.data
+            for k, v in conf.items():
+                data = data[data[k] == v]
+            assert data.shape[0] > 0, f"No rows found for conf={conf}"
+            assert (
+                column_name not in data.columns
+            ), f"column_name={column_name!r} is already in {data.columns}"
+            data = data.copy()
+            data[column_name] = name_conf
+            data_list.append(data)
+
+        new_data = pandas.concat(data_list, axis=0)
+        cube = self.clone(new_data, keys=[*self.keys_no_time, column_name])
+        key_index = set(self.keys_time) - {*columns_index, column_name}  # type: ignore[misc]
+        view = CubeViewDef(
+            key_index=set(key_index),  # type: ignore[arg-type]
+            name="sbs",
+            values=cube.values,
+            keep_columns_in_index=[self.time],
+        )
+        view_res = cube.view(view)
+        assert isinstance(view_res, pandas.DataFrame), "not needed but mypy complains"
+
+        # add metrics
+        index_column_name = list(view_res.columns.names).index(column_name)
+        index_metrics = list(view_res.columns.names).index("METRICS")
+
+        def _mkc(m, s):
+            c = ["" for c in view_res.columns.names]
+            c[index_column_name] = s
+            c[index_metrics] = m
+            return tuple(c)
+
+        list_configs = list(configs.items())
+        mean_columns = [
+            c
+            for c in view_res.columns
+            if pandas.api.types.is_numeric_dtype(view_res[c])
+            and not pandas.api.types.is_object_dtype(view_res[c])
+        ]
+        assert mean_columns, f"No numerical columns in {view_res.dtypes}"
+        view_res = view_res[mean_columns].copy()
+        metrics = sorted(set(c[index_metrics] for c in view_res.columns))
+        assert metrics, (
+            f"No numerical metrics detected in "
+            f"view_res.columns.names={view_res.columns.names}, "
+            f"columns={view_res.dtypes}"
+        )
+        sum_columns = []
+        columns_to_add = []
+        for i in range(len(list_configs)):
+            for j in range(i + 1, len(list_configs)):
+                for m in metrics:
+                    iname, ci = list_configs[i]
+                    jname, cj = list_configs[j]
+                    ci = ci.copy()
+                    cj = cj.copy()
+                    ci["METRICS"] = m
+                    cj["METRICS"] = m
+                    ci["CONF"] = iname
+                    cj["CONF"] = jname
+
+                    ci_name = tuple(ci[n] for n in view_res.columns.names)
+                    cj_name = tuple(cj[n] for n in view_res.columns.names)
+                    assert ci_name in view_res.columns or cj_name in view_res.columns, (
+                        f"Unable to find column {ci_name} or {cj_name} "
+                        f"in columns {view_res.columns}, metrics={metrics}"
+                    )
+                    if ci_name not in view_res.columns or cj_name not in view_res.columns:
+                        # One config does not have such metric.
+                        continue
+
+                    si = view_res[ci_name]
+                    sj = view_res[cj_name]
+
+                    sinan = si.isna()
+                    sjnan = sj.isna()
+                    n1 = iname
+                    n2 = jname
+                    nas = pandas.DataFrame(
+                        {
+                            _mkc(m, f"∅{n1}∧∅{n2}"): (sinan & sjnan).astype(int),
+                            _mkc(m, f"∅{n1}∧{n2}"): (sinan & ~sjnan).astype(int),
+                            _mkc(m, f"{n1}∧∅{n2}"): (~sinan & sjnan).astype(int),
+                            _mkc(m, f"{n1}∧{n2}"): (~sinan & ~sjnan).astype(int),
+                            _mkc(m, f"{n1}<{n2}"): (si < sj).astype(int),
+                            _mkc(m, f"{n1}=={n2}"): (si == sj).astype(int),
+                            _mkc(m, f"{n1}>{n2}"): (si > sj).astype(int),
+                        }
+                    )
+                    nas.columns.names = view_res.columns.names
+                    columns_to_add.append(nas)
+                    sum_columns.extend(nas.columns)
+
+        view_res = pandas.concat([view_res, *columns_to_add], axis=1)
+        res = view_res.stack("METRICS", future_stack=True)  # type: ignore[union-attr]
+        res = res.reorder_levels(
+            [res.index.nlevels - 1, *list(range(res.index.nlevels - 1))]
+        ).sort_index()
+
+        # aggregated metrics
+        aggs = {
+            **{k: "mean" for k in mean_columns},  # noqa: C420
+            **{k: "sum" for k in sum_columns},  # noqa: C420
+        }
+        flat = view_res.groupby(self.time).agg(aggs)
+        flat = flat.stack("METRICS", future_stack=True)
+        return res, flat
+

 class CubeLogsPerformance(CubeLogs):
     """
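sbs can also be called on its own; it returns the per-row side-by-side and the per-date aggregate that to_excel writes into the two extra tabs. A sketch reusing the configuration example from the docstring above; the column names remain illustrative:

    raw, agg = cube.sbs(
        {"CFA": {"exporter": "E1", "opt": "O"}, "CFB": {"exporter": "E2", "opt": "O"}}
    )
    # raw: one block per metric with the CFA/CFB values plus the ∅/∧/</==/> indicator columns
    # agg: grouped by date, means of the metric columns and sums of the indicator columns
    print(raw.shape, agg.shape)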
@@ -1456,6 +1527,24 @@ class CubeLogsPerformance(CubeLogs):
             keep_last_date=keep_last_date,
         )

+    def clone(
+        self, data: Optional[pandas.DataFrame] = None, keys: Optional[Sequence[str]] = None
+    ) -> "CubeLogs":
+        """
+        Makes a copy of the dataframe.
+        It copies the processed data not the original one.
+        keys can be changed as well.
+        """
+        cube = self.__class__(
+            data if data is not None else self.data.copy(),
+            time=self.time,
+            keys=keys or self.keys_no_time,
+            values=self.values,
+            recent=False,
+        )
+        cube.load()
+        return cube
+
     def _process_formula(
         self, formula: Union[str, Callable[[pandas.DataFrame], pandas.Series]]
     ) -> Callable[[pandas.DataFrame], pandas.Series]: