data-manipulation-utilities 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.7.dist-info}/METADATA +177 -8
- {data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.7.dist-info}/RECORD +30 -18
- {data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.7.dist-info}/WHEEL +1 -1
- dmu/generic/hashing.py +44 -0
- dmu/generic/utilities.py +14 -1
- dmu/generic/version_management.py +3 -5
- dmu/ml/cv_diagnostics.py +221 -0
- dmu/ml/train_mva.py +124 -31
- dmu/pdataframe/utilities.py +36 -3
- dmu/plotting/fwhm.py +64 -0
- dmu/plotting/plotter.py +2 -0
- dmu/plotting/plotter_1d.py +87 -6
- dmu/stats/fitter.py +1 -1
- dmu/stats/model_factory.py +189 -25
- dmu/stats/zfit_models.py +68 -0
- dmu/stats/zfit_plotter.py +29 -21
- dmu/testing/utilities.py +31 -4
- dmu_data/ml/tests/diagnostics_from_file.yaml +13 -0
- dmu_data/ml/tests/diagnostics_from_model.yaml +10 -0
- dmu_data/ml/tests/diagnostics_multiple_methods.yaml +10 -0
- dmu_data/ml/tests/diagnostics_overlay.yaml +33 -0
- dmu_data/ml/tests/train_mva.yaml +15 -9
- dmu_data/ml/tests/train_mva_with_diagnostics.yaml +82 -0
- dmu_data/plotting/tests/plug_fwhm.yaml +24 -0
- dmu_data/plotting/tests/plug_stats.yaml +19 -0
- dmu_data/plotting/tests/simple.yaml +4 -3
- dmu_data/plotting/tests/styling.yaml +11 -0
- {data_manipulation_utilities-0.2.6.data → data_manipulation_utilities-0.2.7.data}/scripts/publish +0 -0
- {data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.7.dist-info}/entry_points.txt +0 -0
- {data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.7.dist-info}/top_level.txt +0 -0
dmu/ml/train_mva.py
CHANGED
@@ -3,8 +3,10 @@ Module with TrainMva class
|
|
3
3
|
'''
|
4
4
|
# pylint: disable = too-many-locals, no-name-in-module
|
5
5
|
# pylint: disable = too-many-arguments, too-many-positional-arguments
|
6
|
+
# pylint: disable = too-many-instance-attributes
|
6
7
|
|
7
8
|
import os
|
9
|
+
import copy
|
8
10
|
|
9
11
|
import joblib
|
10
12
|
import pandas as pnd
|
@@ -20,12 +22,13 @@ import dmu.ml.utilities as ut
|
|
20
22
|
import dmu.pdataframe.utilities as put
|
21
23
|
import dmu.plotting.utilities as plu
|
22
24
|
|
25
|
+
from dmu.ml.cv_diagnostics import CVDiagnostics
|
23
26
|
from dmu.ml.cv_classifier import CVClassifier as cls
|
24
27
|
from dmu.plotting.plotter_1d import Plotter1D as Plotter
|
25
28
|
from dmu.plotting.matrix import MatrixPlotter
|
26
29
|
from dmu.logging.log_store import LogStore
|
27
30
|
|
28
|
-
|
31
|
+
NPA = numpy.ndarray
|
29
32
|
log = LogStore.add_logger('dmu:ml:train_mva')
|
30
33
|
# ---------------------------------------------
|
31
34
|
class TrainMva:
|
@@ -42,30 +45,62 @@ class TrainMva:
|
|
42
45
|
self._cfg = cfg
|
43
46
|
self._l_ft_name = self._cfg['training']['features']
|
44
47
|
|
45
|
-
|
46
|
-
|
48
|
+
self._rdf_sig_org = sig
|
49
|
+
self._rdf_bkg_org = bkg
|
50
|
+
|
51
|
+
rdf_bkg = self._preprocess_rdf(bkg)
|
52
|
+
rdf_sig = self._preprocess_rdf(sig)
|
53
|
+
|
54
|
+
df_ft_sig, l_lab_sig = self._get_sample_inputs(rdf = rdf_sig, label = 1)
|
55
|
+
df_ft_bkg, l_lab_bkg = self._get_sample_inputs(rdf = rdf_bkg, label = 0)
|
47
56
|
|
48
57
|
self._df_ft = pnd.concat([df_ft_sig, df_ft_bkg], axis=0)
|
49
58
|
self._l_lab = numpy.array(l_lab_sig + l_lab_bkg)
|
50
59
|
|
51
|
-
self._rdf_bkg = self._get_rdf(rdf =
|
52
|
-
self._rdf_sig = self._get_rdf(rdf =
|
60
|
+
self._rdf_bkg = self._get_rdf(rdf = rdf_bkg, df_feat=df_ft_bkg)
|
61
|
+
self._rdf_sig = self._get_rdf(rdf = rdf_sig, df_feat=df_ft_sig)
|
53
62
|
# ---------------------------------------------
|
54
|
-
def
|
63
|
+
def _get_extra_columns(self, rdf : RDataFrame, df : pnd.DataFrame) -> list[str]:
    '''
    Picks, among the variables declared in the `plotting:features:plots` section
    of the configuration, those that are columns of the ROOT dataframe but are
    not columns of the pandas dataframe

    Parameters
    -------------
    rdf : ROOT dataframe whose column names are checked
    df  : Pandas dataframe whose columns are excluded

    Returns
    -------------
    List of names, in the order in which they appear in the configuration
    '''
    cfg_plots = self._cfg['plotting']['features']['plots']
    l_in_rdf  = [ column.c_str() for column in rdf.GetColumnNames() ]

    return [ var for var in cfg_plots if var in l_in_rdf and var not in df.columns ]
|
79
|
+
# ---------------------------------------------
|
80
|
+
def _get_rdf(self, rdf : RDataFrame, df_feat : pnd.DataFrame) -> RDataFrame:
    '''
    Takes original ROOT dataframe and pre-processed features dataframe
    Adds missing branches to latter and returns expanded ROOT dataframe
    Needed to make plots

    Parameters
    -------------
    rdf     : ROOT dataframe from which the extra (non-feature) columns are read
    df_feat : Pandas dataframe with the features

    Returns
    -------------
    ROOT dataframe made from the features and the extra columns

    Raises
    -------------
    ValueError : If more than 20 extra columns were found
    '''
    l_extr_col = self._get_extra_columns(rdf, df_feat)
    if len(l_extr_col) > 20:
        for name in l_extr_col:
            log.debug(name)

        raise ValueError('Found more than 20 extra columns')

    if not l_extr_col:
        # NOTE(review): ROOT's AsNumpy seems to interpret an empty list of columns
        # as "all the columns", which would read the full dataset and duplicate
        # the features. Nothing has to be added here, so the call is skipped.
        log.debug('No extra-nonfeature columns to add')
        return RDF.FromPandas(df_feat)

    d_data  = rdf.AsNumpy(l_extr_col)
    log.debug(f'Adding extra-nonfeature columns: {l_extr_col}')
    df_extr = pnd.DataFrame(d_data)

    nmain   = len(df_feat.columns)
    nextr   = len(df_extr.columns)

    log.debug(f'Main  DF size: {nmain}')
    log.debug(f'Extra DF size: {nextr}')

    # NOTE(review): concat along axis=1 aligns rows by index, this assumes
    # that df_feat kept the default index of the ROOT dataframe -- confirm
    df_all  = pnd.concat([df_feat, df_extr], axis=1)

    return RDF.FromPandas(df_all)
|
71
106
|
# ---------------------------------------------
|
@@ -90,6 +125,19 @@ class TrainMva:
|
|
90
125
|
|
91
126
|
return df
|
92
127
|
# ---------------------------------------------
|
128
|
+
def _preprocess_rdf(self, rdf : RDataFrame) -> RDataFrame:
    '''
    Defines in the dataframe the columns listed in the `dataset:define`
    section of the configuration, if that section exists

    Parameters
    -------------
    rdf : ROOT dataframe

    Returns
    -------------
    Dataframe with the definitions applied, or the input dataframe
    if no `define` section was found
    '''
    cfg_dataset = self._cfg['dataset']

    if 'define' in cfg_dataset:
        log.debug('Definitions found')
        for column, definition in cfg_dataset['define'].items():
            log.debug(f'{column:<20}{definition}')
            rdf = rdf.Define(column, definition)
    else:
        log.debug('No definitions found')

    return rdf
|
140
|
+
# ---------------------------------------------
|
93
141
|
def _get_sample_inputs(self, rdf : RDataFrame, label : int) -> tuple[pnd.DataFrame, list[int]]:
|
94
142
|
d_ft = rdf.AsNumpy(self._l_ft_name)
|
95
143
|
df = pnd.DataFrame(d_ft)
|
@@ -99,7 +147,7 @@ class TrainMva:
|
|
99
147
|
|
100
148
|
return df, l_lab
|
101
149
|
# ---------------------------------------------
|
102
|
-
def _get_model(self, arr_index :
|
150
|
+
def _get_model(self, arr_index : NPA) -> cls:
|
103
151
|
model = cls(cfg = self._cfg)
|
104
152
|
df_ft = self._df_ft.iloc[arr_index]
|
105
153
|
l_lab = self._l_lab[arr_index]
|
@@ -111,10 +159,14 @@ class TrainMva:
|
|
111
159
|
|
112
160
|
return model
|
113
161
|
# ---------------------------------------------
|
114
|
-
def _get_models(self):
|
162
|
+
def _get_models(self, load_trained : bool):
|
115
163
|
'''
|
116
164
|
Will create models, train them and return them
|
117
165
|
'''
|
166
|
+
if load_trained:
|
167
|
+
log.warning('Not retraining, but loading trained models')
|
168
|
+
return self._load_trained_models()
|
169
|
+
|
118
170
|
nfold = self._cfg['training']['nfold']
|
119
171
|
rdmst = self._cfg['training']['rdm_stat']
|
120
172
|
|
@@ -141,6 +193,22 @@ class TrainMva:
|
|
141
193
|
|
142
194
|
return l_model
|
143
195
|
# ---------------------------------------------
|
196
|
+
def _load_trained_models(self) -> list[cls]:
    '''
    Loads from disk one model per fold

    The path of each fold is the path in `saving:path` with `.pkl`
    replaced by `_XXX.pkl`, where XXX is the zero padded fold index.
    The number of folds is taken from `training:nfold`

    Returns
    -------------
    List with the loaded models, ordered by fold

    Raises
    -------------
    FileNotFoundError : If the file of any fold is missing
    '''
    path_all = self._cfg['saving']['path']
    nfold    = self._cfg['training']['nfold']
    l_path   = [ path_all.replace('.pkl', f'_{ifold:03}.pkl') for ifold in range(nfold) ]

    l_model  = []
    for fold_path in l_path:
        if not os.path.isfile(fold_path):
            raise FileNotFoundError(f'Missing trained model: {fold_path}')

        log.debug(f'Loading model from: {fold_path}')
        l_model.append(joblib.load(fold_path))

    return l_model
|
211
|
+
# ---------------------------------------------
|
144
212
|
def _labels_from_varnames(self, l_var_name : list[str]) -> list[str]:
|
145
213
|
try:
|
146
214
|
d_plot = self._cfg['plotting']['features']['plots']
|
@@ -180,7 +248,7 @@ class TrainMva:
|
|
180
248
|
d_form = {'Variable' : '{}', 'Importance' : '{:.1f}'}
|
181
249
|
put.df_to_tex(df, table_path, d_format = d_form)
|
182
250
|
# ---------------------------------------------
|
183
|
-
def _get_scores(self, model : cls, arr_index :
|
251
|
+
def _get_scores(self, model : cls, arr_index : NPA, on_training_ok : bool) -> tuple[NPA, NPA, NPA, NPA]:
|
184
252
|
'''
|
185
253
|
Returns a tuple of four arrays
|
186
254
|
|
@@ -203,7 +271,7 @@ class TrainMva:
|
|
203
271
|
|
204
272
|
return arr_sig, arr_bkg, arr_all, arr_lab
|
205
273
|
# ---------------------------------------------
|
206
|
-
def _split_scores(self, arr_prob :
|
274
|
+
def _split_scores(self, arr_prob : NPA, arr_label : NPA) -> tuple[NPA, NPA]:
|
207
275
|
'''
|
208
276
|
Will split the testing scores (predictions) based on the training scores
|
209
277
|
|
@@ -259,7 +327,7 @@ class TrainMva:
|
|
259
327
|
|
260
328
|
return cfg
|
261
329
|
# ---------------------------------------------
|
262
|
-
def _plot_correlation(self, arr_index :
|
330
|
+
def _plot_correlation(self, arr_index : NPA, ifold : int) -> None:
|
263
331
|
df_ft = self._df_ft.iloc[arr_index]
|
264
332
|
cfg = self._get_correlation_cfg(df_ft, ifold)
|
265
333
|
cov = df_ft.corr()
|
@@ -276,7 +344,7 @@ class TrainMva:
|
|
276
344
|
plt.savefig(f'{val_dir}/covariance.png')
|
277
345
|
plt.close()
|
278
346
|
# ---------------------------------------------
|
279
|
-
def _get_nentries(self, arr_val :
|
347
|
+
def _get_nentries(self, arr_val : NPA) -> str:
|
280
348
|
size = len(arr_val)
|
281
349
|
size = size / 1000.
|
282
350
|
|
@@ -311,10 +379,10 @@ class TrainMva:
|
|
311
379
|
plt.close()
|
312
380
|
# ---------------------------------------------
|
313
381
|
def _plot_roc(self,
|
314
|
-
l_lab_ts :
|
315
|
-
l_prb_ts :
|
316
|
-
l_lab_tr :
|
317
|
-
l_prb_tr :
|
382
|
+
l_lab_ts : NPA,
|
383
|
+
l_prb_ts : NPA,
|
384
|
+
l_lab_tr : NPA,
|
385
|
+
l_prb_tr : NPA,
|
318
386
|
ifold : int):
|
319
387
|
'''
|
320
388
|
Takes the labels and the probabilities and plots ROC
|
@@ -359,10 +427,10 @@ class TrainMva:
|
|
359
427
|
plt.close()
|
360
428
|
# ---------------------------------------------
|
361
429
|
def _plot_probabilities(self,
|
362
|
-
arr_seff:
|
363
|
-
arr_brej:
|
364
|
-
arr_sprb:
|
365
|
-
arr_labl:
|
430
|
+
arr_seff: NPA,
|
431
|
+
arr_brej: NPA,
|
432
|
+
arr_sprb: NPA,
|
433
|
+
arr_labl: NPA) -> None:
|
366
434
|
|
367
435
|
roc_cfg = self._cfg['plotting']['roc']
|
368
436
|
if 'annotate' not in roc_cfg:
|
@@ -447,11 +515,32 @@ class TrainMva:
|
|
447
515
|
os.makedirs(val_dir, exist_ok=True)
|
448
516
|
put.df_to_tex(df, f'{val_dir}/hyperparameters.tex')
|
449
517
|
# ---------------------------------------------
|
450
|
-
def
|
518
|
+
def _run_diagnostics(self, models : list[cls], rdf : RDataFrame, name : str) -> None:
    '''
    Runs CVDiagnostics with a copy of the `diagnostics` section of the configuration,
    does nothing, apart from a warning, if that section is missing

    In the copy, `name` is appended as a subdirectory to `output` and, if the overlay
    section exists, to its `saving:plt_dir`. The original configuration is not modified

    Parameters
    -------------
    models : Models passed to CVDiagnostics
    rdf    : ROOT dataframe passed to CVDiagnostics
    name   : Name of subdirectory where outputs for this dataframe go
    '''
    if 'diagnostics' not in self._cfg:
        log.warning('Diagnostics section not found, not running diagnostics')
        return

    cfg_copy = copy.deepcopy(self._cfg['diagnostics'])
    out_dir  = cfg_copy['output']
    cfg_copy['output'] = f'{out_dir}/{name}'

    cfg_target = cfg_copy['correlations']['target']
    if 'overlay' in cfg_target:
        cfg_saving = cfg_target['overlay']['saving']
        plt_dir    = cfg_saving['plt_dir']
        # A directory explicitly set to None is left untouched
        if plt_dir is not None:
            cfg_saving['plt_dir'] = f'{plt_dir}/{name}'

    diagnostics = CVDiagnostics(models=models, rdf=rdf, cfg=cfg_copy)
    diagnostics.run()
|
537
|
+
# ---------------------------------------------
|
538
|
+
def run(self, skip_fit : bool = False, load_trained : bool = False) -> None:
|
451
539
|
'''
|
452
540
|
Will do the training
|
453
541
|
|
454
542
|
skip_fit: By default false, if True, it will only do the plots of features and save tables
|
543
|
+
load_trained: If true, it will load the models instead of training, by default false
|
455
544
|
'''
|
456
545
|
self._save_settings_to_tex()
|
457
546
|
self._plot_features()
|
@@ -459,7 +548,11 @@ class TrainMva:
|
|
459
548
|
if skip_fit:
|
460
549
|
return
|
461
550
|
|
462
|
-
l_mod = self._get_models()
|
463
|
-
|
464
|
-
|
551
|
+
l_mod = self._get_models(load_trained = load_trained)
|
552
|
+
if not load_trained:
|
553
|
+
for ifold, mod in enumerate(l_mod):
|
554
|
+
self._save_model(mod, ifold)
|
555
|
+
|
556
|
+
self._run_diagnostics(models = l_mod, rdf = self._rdf_sig_org, name='Signal' )
|
557
|
+
self._run_diagnostics(models = l_mod, rdf = self._rdf_bkg_org, name='Background')
|
465
558
|
# ---------------------------------------------
|
dmu/pdataframe/utilities.py
CHANGED
@@ -2,20 +2,28 @@
|
|
2
2
|
Module containing utilities for pandas dataframes
|
3
3
|
'''
|
4
4
|
import os
|
5
|
+
import yaml
|
5
6
|
import pandas as pnd
|
6
7
|
|
7
8
|
from dmu.logging.log_store import LogStore
|
8
9
|
|
9
10
|
log=LogStore.add_logger('dmu:pdataframe:utilities')
|
10
|
-
|
11
11
|
# -------------------------------------
|
12
|
-
def df_to_tex(df
|
12
|
+
def df_to_tex(df : pnd.DataFrame,
|
13
|
+
path : str,
|
14
|
+
hide_index : bool = True,
|
15
|
+
d_format : dict[str,str]= None,
|
16
|
+
**kwargs : str ) -> None:
|
13
17
|
'''
|
14
18
|
Saves pandas dataframe to latex
|
15
19
|
|
16
20
|
Parameters
|
17
21
|
-------------
|
22
|
+
df : Dataframe with data
|
23
|
+
path (str) : Path to latex file
|
24
|
+
hide_index : If true (default), index of dataframe won't appear in table
|
18
25
|
d_format (dict) : Dictionary specifying the formattinng of the table, e.g. `{'col1': '{}', 'col2': '{:.3f}', 'col3' : '{:.3f}'}`
|
26
|
+
kwargs : Arguments needed in `to_latex`
|
19
27
|
'''
|
20
28
|
|
21
29
|
if path is not None:
|
@@ -30,7 +38,32 @@ def df_to_tex(df : pnd.DataFrame, path : str, hide_index : bool = True, d_format
|
|
30
38
|
st=st.format(formatter=d_format)
|
31
39
|
|
32
40
|
log.info(f'Saving to: {path}')
|
33
|
-
buf = st.to_latex(buf=path,
|
41
|
+
buf = st.to_latex(buf=path, hrules=True, **kwargs)
|
34
42
|
|
35
43
|
return buf
|
36
44
|
# -------------------------------------
|
45
|
+
def to_yaml(df : pnd.DataFrame, path : str) -> None:
    '''
    Takes a dataframe and the path to a yaml file
    Makes the directory path if not found and saves data in YAML file

    Parameters
    -------------
    df   : Dataframe with data, it is saved as the dictionary returned by `to_dict`
    path : Path to YAML file, if it has no directory component, the file
           is made in the current directory
    '''
    dir_path = os.path.dirname(path)
    # dirname is an empty string for a bare file name and makedirs('') raises
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    data = df.to_dict()

    with open(path, 'w', encoding='utf-8') as ofile:
        yaml.safe_dump(data, ofile)
|
57
|
+
# -------------------------------------
|
58
|
+
def from_yaml(path : str) -> pnd.DataFrame:
    '''
    Reads the YAML file in `path` and returns
    a pandas dataframe made with its contents
    '''
    with open(path, encoding='utf-8') as stream:
        content = yaml.safe_load(stream)

    return pnd.DataFrame(content)
|
69
|
+
# -------------------------------------
|
dmu/plotting/fwhm.py
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
'''
|
2
|
+
Module with FWHM plugin class
|
3
|
+
'''
|
4
|
+
import zfit
|
5
|
+
import numpy
|
6
|
+
import matplotlib.pyplot as plt
|
7
|
+
|
8
|
+
from dmu.logging.log_store import LogStore
|
9
|
+
|
10
|
+
log = LogStore.add_logger('dmu:plotting:fwhm')
|
11
|
+
# --------------------------------------------
|
12
|
+
class FWHM:
    '''
    Class meant to be used to calculate Full Width at Half Maximum
    as a Plotter1d plugin

    The width is measured on a KDE built from the values and weights,
    evaluated at 200 points in the configured range and scaled such
    that its maximum is `maxy`
    '''
    # -------------------------
    def __init__(self, cfg : dict, val : numpy.ndarray, wgt : numpy.ndarray, maxy : float):
        '''
        Parameters
        -------------
        cfg  : Dictionary with the keys `obs`, a [min, max] pair with the range,
               and `plot`, if true the KDE and the width are drawn
        val  : Array with values used to build the KDE
        wgt  : Array with weights used to build the KDE
        maxy : Height to which the maximum of the KDE is scaled
        '''
        self._cfg     = cfg
        self._arr_val = val
        self._arr_wgt = wgt
        self._maxy    = maxy
    # -------------------------
    def _normalize_yval(self, arr_pdf_val : numpy.ndarray) -> numpy.ndarray:
        '''
        Returns a float copy of the input, scaled such that its maximum is `maxy`

        The input is not modified. If the maximum is not positive, the scaling
        is not defined and the copy is returned unscaled
        '''
        # Work on a float copy: in place scaling fails for integer arrays
        # and would change the array of the caller
        arr_scaled  = numpy.array(arr_pdf_val, dtype=float)
        max_pdf_val = numpy.max(arr_scaled)
        if not max_pdf_val > 0:
            return arr_scaled

        return arr_scaled * (self._maxy / max_pdf_val)
    # -------------------------
    def _get_fwhm(self, arr_x : numpy.ndarray, arr_y : numpy.ndarray) -> float:
        '''
        Returns distance between first and last points where `arr_y` is above
        half of its maximum, NaN if no such point exists

        If `plot` is true in the configuration, a dotted line is drawn at half maximum
        '''
        maxy      = numpy.max(arr_y)
        arr_index = numpy.where(arr_y > maxy/2.)[0]
        if arr_index.size == 0:
            # Happens when the maximum is not positive or is NaN
            log.warning('No point found above half maximum, cannot calculate FWHM')
            return float('nan')

        x1 = arr_x[arr_index[ 0]]
        x2 = arr_x[arr_index[-1]]

        if self._cfg['plot']:
            plt.plot([x1, x2], [maxy/2, maxy/2], linestyle=':', linewidth=1, color='k')

        return x2 - x1
    # -------------------------
    def run(self) -> float:
        '''
        Runs plugin and return FWHM
        '''
        [minx, maxx] = self._cfg['obs']

        log.info('Running FWHM pluggin')
        obs = zfit.Space('mass', limits=(minx, maxx))
        pdf = zfit.pdf.KDE1DimExact(obs=obs, data=self._arr_val, weights=self._arr_wgt)

        xval = numpy.linspace(minx, maxx, 200)
        yval = pdf.pdf(xval)
        yval = self._normalize_yval(yval)

        if self._cfg['plot']:
            plt.plot(xval, yval, linestyle='-', linewidth=2, color='gray')

        fwhm = self._get_fwhm(xval, yval)

        return fwhm
|
64
|
+
# --------------------------------------------
|
dmu/plotting/plotter.py
CHANGED
@@ -29,6 +29,8 @@ class Plotter:
|
|
29
29
|
self._d_cfg = cfg
|
30
30
|
self._d_rdf : dict[str, RDataFrame] = { name : self._preprocess_rdf(rdf) for name, rdf in d_rdf.items()}
|
31
31
|
self._d_wgt : Union[dict[str, Union[numpy.ndarray, None]], None]
|
32
|
+
|
33
|
+
self._title : str = ''
|
32
34
|
#-------------------------------------
|
33
35
|
def _check_quantile(self, qnt : float):
|
34
36
|
'''
|
dmu/plotting/plotter_1d.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
'''
|
2
2
|
Module containing plotter class
|
3
3
|
'''
|
4
|
-
|
4
|
+
import copy
|
5
5
|
from hist import Hist
|
6
6
|
|
7
7
|
import numpy
|
@@ -9,6 +9,7 @@ import matplotlib.pyplot as plt
|
|
9
9
|
|
10
10
|
from dmu.logging.log_store import LogStore
|
11
11
|
from dmu.plotting.plotter import Plotter
|
12
|
+
from dmu.plotting.fwhm import FWHM
|
12
13
|
|
13
14
|
log = LogStore.add_logger('dmu:plotting:Plotter1D')
|
14
15
|
# --------------------------------------------
|
@@ -55,6 +56,72 @@ class Plotter1D(Plotter):
|
|
55
56
|
|
56
57
|
return minx, maxx, bins
|
57
58
|
#-------------------------------------
|
59
|
+
def _run_plugins(self,
                 arr_val : numpy.ndarray,
                 arr_wgt : numpy.ndarray,
                 hst,
                 name    : str,
                 varname : str) -> None:
    '''
    Runs the plugins in the `plugin` section of the configuration that are
    configured for the variable `varname`. Each plugin is checked independently,
    i.e. a variable missing in the `fwhm` section can still use the `stats` plugin

    Parameters
    -------------
    arr_val : Array with values of the variable
    arr_wgt : Array with weights
    hst     : Histogram filled with values and weights, used by FWHM plugin
    name    : Name of the sample, forwarded to the plugins
    varname : Name of the variable, used to pick the plugin configuration
    '''
    if 'plugin' not in self._d_cfg:
        log.debug('No plugins found')
        return

    d_plugin = self._d_cfg['plugin']

    if 'fwhm' in d_plugin:
        if varname in d_plugin['fwhm']:
            log.debug(f'FWHM plugin found for variable {varname}')
            cfg = d_plugin['fwhm'][varname]
            self._run_fwhm(arr_val = arr_val, arr_wgt=arr_wgt, hst=hst, name=name, cfg = cfg)
        else:
            # Do not return here, the other plugins still have to be checked
            log.debug(f'No FWHM plugin found for variable {varname}')

    if 'stats' in d_plugin:
        if varname in d_plugin['stats']:
            log.debug(f'stats plugin found for variable {varname}')
            cfg = d_plugin['stats'][varname]
            self._run_stats(arr_val = arr_val, arr_wgt=arr_wgt, name=name, cfg = cfg)
        else:
            log.debug(f'No stats plugin found for variable {varname}')
|
86
|
+
#-------------------------------------
|
87
|
+
def _run_stats(self, arr_val : numpy.ndarray, arr_wgt : numpy.ndarray, name : str, cfg : dict[str,str]) -> None:
    '''
    Appends to the title a line with the statistics requested in the configuration

    Parameters
    -------------
    arr_val : Array with values of the variable
    arr_wgt : Array with weights
    name    : Name of the sample, it prefixes the line added to the title
    cfg     : Dictionary mapping the statistic, `sum`, `mean` or `rms`, to the
              format string used to print it, e.g. `{'mean' : 'Mean={:.2f}'}`
    '''
    l_entry = []
    if 'sum' in cfg:
        sumv = numpy.sum(arr_wgt)
        l_entry.append(cfg['sum'].format(sumv))

    if 'mean' in cfg or 'rms' in cfg:
        # The weighted mean is needed by both statistics, calculate it once
        mean = numpy.average(arr_val, weights=arr_wgt)

    if 'mean' in cfg:
        l_entry.append(cfg['mean'].format(mean))

    if 'rms' in cfg:
        rms = numpy.sqrt(numpy.average((arr_val - mean) ** 2, weights=arr_wgt))
        l_entry.append(cfg['rms'].format(rms))

    # Every entry is followed by a separator, including the last one
    this_title   = ''.join(f'{entry}; ' for entry in l_entry)
    self._title += f'\n{name}: {this_title}'
|
106
|
+
#-------------------------------------
|
107
|
+
def _run_fwhm(self, arr_val : numpy.ndarray, arr_wgt : numpy.ndarray, hst, name : str, cfg : dict) -> None:
    '''
    Runs the FWHM plugin and appends a line with the result to the title

    Parameters
    -------------
    arr_val : Array with values of the variable
    arr_wgt : Array with weights
    hst     : Histogram, its highest bin content is the height passed to the plugin
    name    : Name of the sample, it prefixes the line added to the title
    cfg     : Plugin configuration, `format` is the format string for the width.
              If `add_std` is true, the weighted standard deviation is also added,
              using the same format string with `FWHM` replaced by `STD`
    '''
    plugin = FWHM(cfg=cfg, val=arr_val, wgt=arr_wgt, maxy=numpy.max(hst.values()))
    width  = plugin.run()

    fmt    = cfg['format']
    l_text = [fmt.format(width)]

    if cfg.get('add_std'):
        mean     = numpy.average(arr_val, weights=arr_wgt)
        variance = numpy.average((arr_val - mean) ** 2, weights=arr_wgt)
        l_text.append(fmt.replace('FWHM', 'STD').format(numpy.sqrt(variance)))

    self._title += f'\n{name}: ' + '; '.join(l_text)
|
124
|
+
#-------------------------------------
|
58
125
|
def _plot_var(self, var : str) -> float:
|
59
126
|
'''
|
60
127
|
Will plot a variable from a dictionary of dataframes
|
@@ -70,6 +137,7 @@ class Plotter1D(Plotter):
|
|
70
137
|
|
71
138
|
d_data = {}
|
72
139
|
for name, rdf in self._d_rdf.items():
|
140
|
+
log.debug(f'Plotting: {var}/{name}')
|
73
141
|
d_data[name] = rdf.AsNumpy([var])[var]
|
74
142
|
|
75
143
|
minx, maxx, bins = self._get_binning(var, d_data)
|
@@ -82,7 +150,18 @@ class Plotter1D(Plotter):
|
|
82
150
|
arr_wgt = self._normalize_weights(arr_wgt, var)
|
83
151
|
hst = Hist.new.Reg(bins=bins, start=minx, stop=maxx, name='x').Weight()
|
84
152
|
hst.fill(x=arr_val, weight=arr_wgt)
|
85
|
-
|
153
|
+
self._run_plugins(arr_val, arr_wgt, hst, name, var)
|
154
|
+
|
155
|
+
if 'styling' in self._d_cfg['plots'][var]:
|
156
|
+
style = self._d_cfg['plots'][var]['styling']
|
157
|
+
style = copy.deepcopy(style)
|
158
|
+
else:
|
159
|
+
style = {'label' : label, 'histtype' : 'errorbar', 'marker' : '.', 'linestyle' : 'none'}
|
160
|
+
|
161
|
+
if 'label' not in style:
|
162
|
+
style['label'] = label
|
163
|
+
|
164
|
+
hst.plot(**style)
|
86
165
|
l_bc_all += hst.values().tolist()
|
87
166
|
|
88
167
|
max_y = max(l_bc_all)
|
@@ -131,9 +210,12 @@ class Plotter1D(Plotter):
|
|
131
210
|
if yscale == 'linear':
|
132
211
|
plt.ylim(bottom=0)
|
133
212
|
|
134
|
-
title =
|
213
|
+
title = self._title
|
135
214
|
if 'title' in d_cfg:
|
136
|
-
|
215
|
+
this_title = d_cfg['title']
|
216
|
+
title += f'\n {this_title}'
|
217
|
+
|
218
|
+
title = title.lstrip('\n')
|
137
219
|
|
138
220
|
plt.ylim(top=1.2 * max_y)
|
139
221
|
plt.legend()
|
@@ -160,8 +242,7 @@ class Plotter1D(Plotter):
|
|
160
242
|
|
161
243
|
fig_size = self._get_fig_size()
|
162
244
|
for var in self._d_cfg['plots']:
|
163
|
-
|
164
|
-
|
245
|
+
self._title = ''
|
165
246
|
plt.figure(var, figsize=fig_size)
|
166
247
|
max_y = self._plot_var(var)
|
167
248
|
self._style_plot(var, max_y)
|
dmu/stats/fitter.py
CHANGED
@@ -231,7 +231,7 @@ class Fitter:
|
|
231
231
|
continue
|
232
232
|
|
233
233
|
const = zfit.constraint.GaussianConstraint(params=par, observation=float(par_mu), uncertainty=float(par_sg))
|
234
|
-
log.info(f'{"":<4}{par_name:<
|
234
|
+
log.info(f'{"":<4}{par_name:<45}{par_mu:<15.3e}{par_sg:<15.3e}')
|
235
235
|
l_const.append(const)
|
236
236
|
|
237
237
|
return l_const
|