data-manipulation-utilities 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/METADATA +179 -10
- {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/RECORD +31 -19
- {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/WHEEL +1 -1
- dmu/generic/hashing.py +44 -0
- dmu/generic/utilities.py +14 -1
- dmu/generic/version_management.py +3 -5
- dmu/ml/cv_diagnostics.py +221 -0
- dmu/ml/train_mva.py +143 -46
- dmu/pdataframe/utilities.py +36 -3
- dmu/plotting/fwhm.py +64 -0
- dmu/plotting/plotter.py +2 -0
- dmu/plotting/plotter_1d.py +87 -6
- dmu/stats/fitter.py +1 -1
- dmu/stats/minimizers.py +40 -11
- dmu/stats/model_factory.py +248 -44
- dmu/stats/zfit_models.py +68 -0
- dmu/stats/zfit_plotter.py +29 -21
- dmu/testing/utilities.py +31 -4
- dmu_data/ml/tests/diagnostics_from_file.yaml +13 -0
- dmu_data/ml/tests/diagnostics_from_model.yaml +10 -0
- dmu_data/ml/tests/diagnostics_multiple_methods.yaml +10 -0
- dmu_data/ml/tests/diagnostics_overlay.yaml +33 -0
- dmu_data/ml/tests/train_mva.yaml +19 -10
- dmu_data/ml/tests/train_mva_with_diagnostics.yaml +82 -0
- dmu_data/plotting/tests/plug_fwhm.yaml +24 -0
- dmu_data/plotting/tests/plug_stats.yaml +19 -0
- dmu_data/plotting/tests/simple.yaml +4 -3
- dmu_data/plotting/tests/styling.yaml +11 -0
- {data_manipulation_utilities-0.2.5.data → data_manipulation_utilities-0.2.7.data}/scripts/publish +0 -0
- {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/entry_points.txt +0 -0
- {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/top_level.txt +0 -0
dmu/ml/train_mva.py
CHANGED
@@ -1,10 +1,12 @@
 '''
 Module with TrainMva class
 '''
-# pylint: disable = too-many-locals
+# pylint: disable = too-many-locals, no-name-in-module
 # pylint: disable = too-many-arguments, too-many-positional-arguments
+# pylint: disable = too-many-instance-attributes
 
 import os
+import copy
 
 import joblib
 import pandas as pnd
@@ -14,18 +16,19 @@ import matplotlib.pyplot as plt
 from sklearn.metrics import roc_curve, auc
 from sklearn.model_selection import StratifiedKFold
 
-from ROOT import RDataFrame
+from ROOT import RDataFrame, RDF
 
 import dmu.ml.utilities as ut
 import dmu.pdataframe.utilities as put
 import dmu.plotting.utilities as plu
 
+from dmu.ml.cv_diagnostics import CVDiagnostics
 from dmu.ml.cv_classifier import CVClassifier as cls
 from dmu.plotting.plotter_1d import Plotter1D as Plotter
 from dmu.plotting.matrix import MatrixPlotter
 from dmu.logging.log_store import LogStore
 
-
+NPA = numpy.ndarray
 log = LogStore.add_logger('dmu:ml:train_mva')
 # ---------------------------------------------
 class TrainMva:
@@ -33,40 +36,73 @@ class TrainMva:
     Interface to scikit learn used to train classifier
     '''
     # ---------------------------------------------
-    def __init__(self, bkg
+    def __init__(self, bkg : RDataFrame, sig : RDataFrame, cfg : dict):
         '''
         bkg (ROOT dataframe): Holds real data
         sig (ROOT dataframe): Holds simulation
         cfg (dict) : Dictionary storing configuration for training
         '''
-
-
+        self._cfg = cfg
+        self._l_ft_name = self._cfg['training']['features']
 
-
-
+        self._rdf_sig_org = sig
+        self._rdf_bkg_org = bkg
 
-
-
+        rdf_bkg = self._preprocess_rdf(bkg)
+        rdf_sig = self._preprocess_rdf(sig)
 
-        self.
-        self.
-        self._cfg = cfg
+        df_ft_sig, l_lab_sig = self._get_sample_inputs(rdf = rdf_sig, label = 1)
+        df_ft_bkg, l_lab_bkg = self._get_sample_inputs(rdf = rdf_bkg, label = 0)
 
-        self.
+        self._df_ft = pnd.concat([df_ft_sig, df_ft_bkg], axis=0)
+        self._l_lab = numpy.array(l_lab_sig + l_lab_bkg)
 
-        self.
+        self._rdf_bkg = self._get_rdf(rdf = rdf_bkg, df_feat=df_ft_bkg)
+        self._rdf_sig = self._get_rdf(rdf = rdf_sig, df_feat=df_ft_sig)
     # ---------------------------------------------
-    def
-
-
+    def _get_extra_columns(self, rdf : RDataFrame, df : pnd.DataFrame) -> list[str]:
+        d_plot = self._cfg['plotting']['features']['plots']
+        l_expr = list(d_plot)
+        l_rdf = [ name.c_str() for name in rdf.GetColumnNames() ]
+
+        l_extr = []
+        for expr in l_expr:
+            if expr not in l_rdf:
+                continue
 
-
-
+            if expr in df.columns:
+                continue
 
-
-        arr_lab = numpy.concatenate([arr_lab_sig, arr_lab_bkg])
+            l_extr.append(expr)
 
-        return
+        return l_extr
+    # ---------------------------------------------
+    def _get_rdf(self, rdf : RDataFrame, df_feat : pnd.DataFrame) -> RDataFrame:
+        '''
+        Takes original ROOT dataframe and pre-processed features dataframe
+        Adds missing branches to latter and returns expanded ROOT dataframe
+        Need to make plots
+        '''
+
+        l_extr_col = self._get_extra_columns(rdf, df_feat)
+        if len(l_extr_col) > 20:
+            for name in l_extr_col:
+                log.debug(name)
+            raise ValueError('Found more than 20 extra columns')
+
+        d_data = rdf.AsNumpy(l_extr_col)
+        log.debug(f'Adding extra-nonfeature columns: {l_extr_col}')
+        df_extr = pnd.DataFrame(d_data)
+
+        nmain = len(df_feat.columns)
+        nextr = len(df_extr.columns)
+
+        log.debug(f'Main DF size: {nmain}')
+        log.debug(f'Extra DF size: {nextr}')
+
+        df_all = pnd.concat([df_feat, df_extr], axis=1)
+
+        return RDF.FromPandas(df_all)
     # ---------------------------------------------
     def _pre_process_nans(self, df : pnd.DataFrame) -> pnd.DataFrame:
         if 'dataset' not in self._cfg:
@@ -77,25 +113,41 @@ class TrainMva:
             return df
 
        d_name_val = self._cfg['dataset']['nan']
-        log.info(
+        log.info(70 * '-')
         log.info('Doing NaN replacements')
-        log.info(
+        log.info(70 * '-')
         for var, val in d_name_val.items():
-
+            nna = df[var].isna().sum()
+
+            log.info(f'{var:<20}{"--->":20}{val:<20.3f}{nna}')
             df[var] = df[var].fillna(val)
+        log.info(70 * '-')
 
         return df
     # ---------------------------------------------
-    def
+    def _preprocess_rdf(self, rdf : RDataFrame) -> RDataFrame:
+        if 'define' not in self._cfg['dataset']:
+            log.debug('No definitions found')
+            return rdf
+
+        log.debug('Definitions found')
+        d_def = self._cfg['dataset']['define']
+        for name, expr in d_def.items():
+            log.debug(f'{name:<20}{expr}')
+            rdf = rdf.Define(name, expr)
+
+        return rdf
+    # ---------------------------------------------
+    def _get_sample_inputs(self, rdf : RDataFrame, label : int) -> tuple[pnd.DataFrame, list[int]]:
         d_ft = rdf.AsNumpy(self._l_ft_name)
         df = pnd.DataFrame(d_ft)
         df = self._pre_process_nans(df)
         df = ut.cleanup(df)
         l_lab= len(df) * [label]
 
-        return df,
+        return df, l_lab
    # ---------------------------------------------
-    def _get_model(self, arr_index :
+    def _get_model(self, arr_index : NPA) -> cls:
        model = cls(cfg = self._cfg)
        df_ft = self._df_ft.iloc[arr_index]
        l_lab = self._l_lab[arr_index]
@@ -107,10 +159,14 @@ class TrainMva:
 
         return model
     # ---------------------------------------------
-    def _get_models(self):
+    def _get_models(self, load_trained : bool):
         '''
         Will create models, train them and return them
         '''
+        if load_trained:
+            log.warning('Not retraining, but loading trained models')
+            return self._load_trained_models()
+
         nfold = self._cfg['training']['nfold']
         rdmst = self._cfg['training']['rdm_stat']
 
@@ -137,6 +193,22 @@ class TrainMva:
 
         return l_model
     # ---------------------------------------------
+    def _load_trained_models(self) -> list[cls]:
+        model_path = self._cfg['saving']['path']
+        nfold = self._cfg['training']['nfold']
+        l_model = []
+        for ifold in range(nfold):
+            fold_path = model_path.replace('.pkl', f'_{ifold:03}.pkl')
+
+            if not os.path.isfile(fold_path):
+                raise FileNotFoundError(f'Missing trained model: {fold_path}')
+
+            log.debug(f'Loading model from: {fold_path}')
+            model = joblib.load(fold_path)
+            l_model.append(model)
+
+        return l_model
+    # ---------------------------------------------
     def _labels_from_varnames(self, l_var_name : list[str]) -> list[str]:
         try:
             d_plot = self._cfg['plotting']['features']['plots']
@@ -176,7 +248,7 @@ class TrainMva:
         d_form = {'Variable' : '{}', 'Importance' : '{:.1f}'}
         put.df_to_tex(df, table_path, d_format = d_form)
     # ---------------------------------------------
-    def _get_scores(self, model : cls, arr_index :
+    def _get_scores(self, model : cls, arr_index : NPA, on_training_ok : bool) -> tuple[NPA, NPA, NPA, NPA]:
         '''
         Returns a tuple of four arrays
 
@@ -199,7 +271,7 @@ class TrainMva:
 
         return arr_sig, arr_bkg, arr_all, arr_lab
     # ---------------------------------------------
-    def _split_scores(self, arr_prob :
+    def _split_scores(self, arr_prob : NPA, arr_label : NPA) -> tuple[NPA, NPA]:
         '''
         Will split the testing scores (predictions) based on the training scores
 
@@ -255,7 +327,7 @@ class TrainMva:
 
         return cfg
     # ---------------------------------------------
-    def _plot_correlation(self, arr_index :
+    def _plot_correlation(self, arr_index : NPA, ifold : int) -> None:
         df_ft = self._df_ft.iloc[arr_index]
         cfg = self._get_correlation_cfg(df_ft, ifold)
         cov = df_ft.corr()
@@ -272,7 +344,7 @@ class TrainMva:
         plt.savefig(f'{val_dir}/covariance.png')
         plt.close()
     # ---------------------------------------------
-    def _get_nentries(self, arr_val :
+    def _get_nentries(self, arr_val : NPA) -> str:
         size = len(arr_val)
         size = size / 1000.
 
@@ -307,10 +379,10 @@ class TrainMva:
         plt.close()
     # ---------------------------------------------
     def _plot_roc(self,
-                  l_lab_ts :
-                  l_prb_ts :
-                  l_lab_tr :
-                  l_prb_tr :
+                  l_lab_ts : NPA,
+                  l_prb_ts : NPA,
+                  l_lab_tr : NPA,
+                  l_prb_tr : NPA,
                   ifold : int):
         '''
         Takes the labels and the probabilities and plots ROC
@@ -355,10 +427,10 @@ class TrainMva:
         plt.close()
     # ---------------------------------------------
     def _plot_probabilities(self,
-                            arr_seff:
-                            arr_brej:
-                            arr_sprb:
-                            arr_labl:
+                            arr_seff: NPA,
+                            arr_brej: NPA,
+                            arr_sprb: NPA,
+                            arr_labl: NPA) -> None:
 
         roc_cfg = self._cfg['plotting']['roc']
         if 'annotate' not in roc_cfg:
@@ -443,11 +515,32 @@ class TrainMva:
         os.makedirs(val_dir, exist_ok=True)
         put.df_to_tex(df, f'{val_dir}/hyperparameters.tex')
     # ---------------------------------------------
-    def
+    def _run_diagnostics(self, models : list[cls], rdf : RDataFrame, name : str) -> None:
+        if 'diagnostics' not in self._cfg:
+            log.warning('Diagnostics section not found, not running diagnostics')
+            return
+
+        cfg_diag = self._cfg['diagnostics']
+        out_dir = cfg_diag['output']
+        plt_dir = None
+
+        if 'overlay' in cfg_diag['correlations']['target']:
+            plt_dir = cfg_diag['correlations']['target']['overlay']['saving']['plt_dir']
+
+        cfg_diag = copy.deepcopy(cfg_diag)
+        cfg_diag['output'] = f'{out_dir}/{name}'
+        if plt_dir is not None:
+            cfg_diag['correlations']['target']['overlay']['saving']['plt_dir'] = f'{plt_dir}/{name}'
+
+        cvd = CVDiagnostics(models=models, rdf=rdf, cfg=cfg_diag)
+        cvd.run()
+    # ---------------------------------------------
+    def run(self, skip_fit : bool = False, load_trained : bool = False) -> None:
         '''
         Will do the training
 
         skip_fit: By default false, if True, it will only do the plots of features and save tables
+        load_trained: If true, it will load the models instead of training, by default false
         '''
         self._save_settings_to_tex()
         self._plot_features()
@@ -455,7 +548,11 @@ class TrainMva:
         if skip_fit:
             return
 
-        l_mod = self._get_models()
-
-
+        l_mod = self._get_models(load_trained = load_trained)
+        if not load_trained:
+            for ifold, mod in enumerate(l_mod):
+                self._save_model(mod, ifold)
+
+        self._run_diagnostics(models = l_mod, rdf = self._rdf_sig_org, name='Signal' )
+        self._run_diagnostics(models = l_mod, rdf = self._rdf_bkg_org, name='Background')
     # ---------------------------------------------
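
A minimal usage sketch for the training flow touched by this diff. This is not taken from the package documentation: it only exercises the TrainMva entry points visible above. The input files and tree name are placeholders, and the configuration is assumed to follow the schema of dmu_data/ml/tests/train_mva_with_diagnostics.yaml shipped with the wheel.

    # Hedged sketch; rdf_sig/rdf_bkg are ROOT RDataFrame objects and cfg is assumed
    # to follow dmu_data/ml/tests/train_mva_with_diagnostics.yaml
    import yaml
    from ROOT import RDataFrame
    from dmu.ml.train_mva import TrainMva

    rdf_sig = RDataFrame('tree', 'signal.root')      # hypothetical input files
    rdf_bkg = RDataFrame('tree', 'background.root')

    with open('train_mva_with_diagnostics.yaml', encoding='utf-8') as ifile:
        cfg = yaml.safe_load(ifile)

    trainer = TrainMva(bkg=rdf_bkg, sig=rdf_sig, cfg=cfg)

    # First pass: train and save one model per fold; saving.path gets a
    # _000.pkl, _001.pkl, ... suffix per fold
    trainer.run(skip_fit=False, load_trained=False)

    # Later pass: reload the saved per-fold models instead of retraining and
    # rerun the CVDiagnostics step on the original signal and background samples
    trainer.run(load_trained=True)
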
dmu/pdataframe/utilities.py
CHANGED
@@ -2,20 +2,28 @@
 Module containing utilities for pandas dataframes
 '''
 import os
+import yaml
 import pandas as pnd
 
 from dmu.logging.log_store import LogStore
 
 log=LogStore.add_logger('dmu:pdataframe:utilities')
-
 # -------------------------------------
-def df_to_tex(df
+def df_to_tex(df : pnd.DataFrame,
+              path : str,
+              hide_index : bool = True,
+              d_format : dict[str,str]= None,
+              **kwargs : str ) -> None:
     '''
     Saves pandas dataframe to latex
 
     Parameters
     -------------
+    df : Dataframe with data
+    path (str) : Path to latex file
+    hide_index : If true (default), index of dataframe won't appear in table
     d_format (dict) : Dictionary specifying the formattinng of the table, e.g. `{'col1': '{}', 'col2': '{:.3f}', 'col3' : '{:.3f}'}`
+    kwargs : Arguments needed in `to_latex`
     '''
 
     if path is not None:
@@ -30,7 +38,32 @@ def df_to_tex(df : pnd.DataFrame, path : str, hide_index : bool = True, d_format
     st=st.format(formatter=d_format)
 
     log.info(f'Saving to: {path}')
-    buf = st.to_latex(buf=path,
+    buf = st.to_latex(buf=path, hrules=True, **kwargs)
 
     return buf
 # -------------------------------------
+def to_yaml(df : pnd.DataFrame, path : str):
+    '''
+    Takes a dataframe and the path to a yaml file
+    Makes the directory path if not found and saves data in YAML file
+    '''
+    dir_path = os.path.dirname(path)
+    os.makedirs(dir_path, exist_ok=True)
+
+    data = df.to_dict()
+
+    with open(path, 'w', encoding='utf-8') as ofile:
+        yaml.safe_dump(data, ofile)
+# -------------------------------------
+def from_yaml(path : str) -> pnd.DataFrame:
+    '''
+    Takes path to a yaml file
+    Makes dataframe from it and returns it
+    '''
+    with open(path, encoding='utf-8') as ifile:
+        data = yaml.safe_load(ifile)
+
+    df = pnd.DataFrame(data)
+
+    return df
+# -------------------------------------
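
A short round-trip sketch for the to_yaml/from_yaml helpers added above; the output path is a placeholder.

    # Minimal round-trip sketch for the new YAML helpers
    import pandas as pnd
    import dmu.pdataframe.utilities as put

    df_out = pnd.DataFrame({'name' : ['a', 'b'], 'value' : [1.0, 2.0]})
    put.to_yaml(df_out, '/tmp/dmu_example/table.yaml')   # creates /tmp/dmu_example if needed
    df_in = put.from_yaml('/tmp/dmu_example/table.yaml')

    print(df_in)
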
dmu/plotting/fwhm.py
ADDED
@@ -0,0 +1,64 @@
+'''
+Module with FWHM plugin class
+'''
+import zfit
+import numpy
+import matplotlib.pyplot as plt
+
+from dmu.logging.log_store import LogStore
+
+log = LogStore.add_logger('dmu:plotting:fwhm')
+# --------------------------------------------
+class FWHM:
+    '''
+    Class meant to be used to calculate Full Width at Half Maximum
+    as a Plotter1d plugin
+    '''
+    # -------------------------
+    def __init__(self, cfg : dict, val : numpy.ndarray, wgt : numpy.ndarray, maxy : float):
+        self._cfg = cfg
+        self._arr_val = val
+        self._arr_wgt = wgt
+        self._maxy = maxy
+    # -------------------------
+    def _normalize_yval(self, arr_pdf_val : numpy.ndarray) -> None:
+        max_pdf_val = numpy.max(arr_pdf_val)
+        arr_pdf_val*= self._maxy / max_pdf_val
+
+        return arr_pdf_val
+    # -------------------------
+    def _get_fwhm(self, arr_x : numpy.ndarray, arr_y : numpy.ndarray) -> float:
+        maxy = numpy.max(arr_y)
+        arry = numpy.where(arr_y > maxy/2.)[0]
+        imax = arry[ 0]
+        imin = arry[-1]
+
+        x1 = arr_x[imax]
+        x2 = arr_x[imin]
+
+        if self._cfg['plot']:
+            plt.plot([x1, x2], [maxy/2, maxy/2], linestyle=':', linewidth=1, color='k')
+
+        return x2 - x1
+    # -------------------------
+    def run(self) -> float:
+        '''
+        Runs plugin and return FWHM
+        '''
+        [minx, maxx] = self._cfg['obs']
+
+        log.info('Running FWHM pluggin')
+        obs = zfit.Space('mass', limits=(minx, maxx))
+        pdf= zfit.pdf.KDE1DimExact(obs=obs, data=self._arr_val, weights=self._arr_wgt)
+
+        xval = numpy.linspace(minx, maxx, 200)
+        yval = pdf.pdf(xval)
+        yval = self._normalize_yval(yval)
+
+        if self._cfg['plot']:
+            plt.plot(xval, yval, linestyle='-', linewidth=2, color='gray')
+
+        fwhm = self._get_fwhm(xval, yval)
+
+        return fwhm
+# --------------------------------------------
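
A sketch of calling the new plugin directly; inside the package it is normally driven by Plotter1D through the `plugin -> fwhm` section of the plotting config (see dmu_data/plotting/tests/plug_fwhm.yaml in the file list). The data and config values below are illustrative only.

    # Hedged direct-call sketch of the FWHM plugin added in this diff
    import numpy
    from dmu.plotting.fwhm import FWHM

    arr_val = numpy.random.normal(loc=5.0, scale=0.5, size=5_000)
    arr_wgt = numpy.ones_like(arr_val)

    cfg = {'obs' : [3.0, 7.0], 'plot' : False}   # keys read by FWHM.run above
    obj = FWHM(cfg=cfg, val=arr_val, wgt=arr_wgt, maxy=1.0)
    fwhm = obj.run()   # roughly 2.355 * sigma ~ 1.18 for this Gaussian toy sample
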
dmu/plotting/plotter.py
CHANGED
@@ -29,6 +29,8 @@ class Plotter:
         self._d_cfg = cfg
         self._d_rdf : dict[str, RDataFrame] = { name : self._preprocess_rdf(rdf) for name, rdf in d_rdf.items()}
         self._d_wgt : Union[dict[str, Union[numpy.ndarray, None]], None]
+
+        self._title : str = ''
     #-------------------------------------
     def _check_quantile(self, qnt : float):
         '''
dmu/plotting/plotter_1d.py
CHANGED
@@ -1,7 +1,7 @@
 '''
 Module containing plotter class
 '''
-
+import copy
 from hist import Hist
 
 import numpy
@@ -9,6 +9,7 @@ import matplotlib.pyplot as plt
 
 from dmu.logging.log_store import LogStore
 from dmu.plotting.plotter import Plotter
+from dmu.plotting.fwhm import FWHM
 
 log = LogStore.add_logger('dmu:plotting:Plotter1D')
 # --------------------------------------------
@@ -55,6 +56,72 @@ class Plotter1D(Plotter):
 
         return minx, maxx, bins
     #-------------------------------------
+    def _run_plugins(self,
+                     arr_val : numpy.ndarray,
+                     arr_wgt : numpy.ndarray,
+                     hst,
+                     name : str,
+                     varname : str) -> None:
+        if 'plugin' not in self._d_cfg:
+            log.debug('No plugins found')
+            return
+
+        if 'fwhm' in self._d_cfg['plugin']:
+            if varname not in self._d_cfg['plugin']['fwhm']:
+                log.debug(f'No FWHM plugin found for variable {varname}')
+                return
+
+            log.debug(f'FWHM plugin found for variable {varname}')
+            cfg = self._d_cfg['plugin']['fwhm'][varname]
+            self._run_fwhm(arr_val = arr_val, arr_wgt=arr_wgt, hst=hst, name=name, cfg = cfg)
+
+        if 'stats' in self._d_cfg['plugin']:
+            if varname not in self._d_cfg['plugin']['stats']:
+                log.debug(f'No stats plugin found for variable {varname}')
+                return
+
+            log.debug(f'stats plugin found for variable {varname}')
+            cfg = self._d_cfg['plugin']['stats'][varname]
+            self._run_stats(arr_val = arr_val, arr_wgt=arr_wgt, name=name, cfg = cfg)
+    #-------------------------------------
+    def _run_stats(self, arr_val : numpy.ndarray, arr_wgt : numpy.ndarray, name : str, cfg : dict[str:str]) -> None:
+        this_title = ''
+        if 'sum' in cfg:
+            form = cfg['sum']
+            sumv = numpy.sum(arr_wgt)
+            this_title += form.format(sumv) + '; '
+
+        if 'mean' in cfg:
+            form = cfg['mean']
+            mean = numpy.average(arr_val, weights=arr_wgt)
+            this_title += form.format(mean) + '; '
+
+        if 'rms' in cfg:
+            form = cfg['rms']
+            mean = numpy.average(arr_val, weights=arr_wgt)
+            rms = numpy.sqrt(numpy.average((arr_val - mean) ** 2, weights=arr_wgt))
+            this_title += form.format(rms ) + '; '
+
+        self._title+= f'\n{name}: {this_title}'
+    #-------------------------------------
+    def _run_fwhm(self, arr_val : numpy.ndarray, arr_wgt : numpy.ndarray, hst, name : str, cfg : dict) -> None:
+        arr_bin_cnt = hst.values()
+        maxy = numpy.max(arr_bin_cnt)
+        obj = FWHM(cfg=cfg, val=arr_val, wgt=arr_wgt, maxy=maxy)
+        fwhm = obj.run()
+
+        form = cfg['format']
+        this_title = form.format(fwhm)
+
+        if 'add_std' in cfg and cfg['add_std']:
+            mu = numpy.average(arr_val , weights=arr_wgt)
+            avg = numpy.average((arr_val - mu) ** 2, weights=arr_wgt)
+            std = numpy.sqrt(avg)
+            form = form.replace('FWHM', 'STD')
+            this_title+= '; ' + form.format(std)
+
+        self._title+= f'\n{name}: {this_title}'
+    #-------------------------------------
     def _plot_var(self, var : str) -> float:
         '''
         Will plot a variable from a dictionary of dataframes
@@ -70,6 +137,7 @@ class Plotter1D(Plotter):
 
         d_data = {}
         for name, rdf in self._d_rdf.items():
+            log.debug(f'Plotting: {var}/{name}')
             d_data[name] = rdf.AsNumpy([var])[var]
 
         minx, maxx, bins = self._get_binning(var, d_data)
@@ -82,7 +150,18 @@ class Plotter1D(Plotter):
             arr_wgt = self._normalize_weights(arr_wgt, var)
             hst = Hist.new.Reg(bins=bins, start=minx, stop=maxx, name='x').Weight()
             hst.fill(x=arr_val, weight=arr_wgt)
-
+            self._run_plugins(arr_val, arr_wgt, hst, name, var)
+
+            if 'styling' in self._d_cfg['plots'][var]:
+                style = self._d_cfg['plots'][var]['styling']
+                style = copy.deepcopy(style)
+            else:
+                style = {'label' : label, 'histtype' : 'errorbar', 'marker' : '.', 'linestyle' : 'none'}
+
+            if 'label' not in style:
+                style['label'] = label
+
+            hst.plot(**style)
             l_bc_all += hst.values().tolist()
 
         max_y = max(l_bc_all)
@@ -131,9 +210,12 @@ class Plotter1D(Plotter):
         if yscale == 'linear':
             plt.ylim(bottom=0)
 
-        title =
+        title = self._title
         if 'title' in d_cfg:
-
+            this_title = d_cfg['title']
+            title += f'\n {this_title}'
+
+        title = title.lstrip('\n')
 
         plt.ylim(top=1.2 * max_y)
         plt.legend()
@@ -160,8 +242,7 @@ class Plotter1D(Plotter):
 
         fig_size = self._get_fig_size()
         for var in self._d_cfg['plots']:
-
-
+            self._title = ''
             plt.figure(var, figsize=fig_size)
             max_y = self._plot_var(var)
             self._style_plot(var, max_y)
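
A schematic, in Python dict form, of the configuration blocks consumed by the new Plotter1D code above. The variable name 'mass' and all values are placeholders; the key names come from this diff and from the test configs listed at the top (plug_fwhm.yaml, plug_stats.yaml, styling.yaml). Other per-variable plot settings are omitted.

    # Hedged schematic of the 'styling' and 'plugin' sections read by _plot_var and _run_plugins
    cfg_plugin_section = {
        'plots' : {
            'mass' : {
                # forwarded to hst.plot(**style); 'label' is filled in if missing
                'styling' : {'histtype' : 'step', 'color' : 'red'},
            },
        },
        'plugin' : {
            'fwhm' : {
                'mass' : {
                    'plot'    : True,            # draw the KDE and the half-maximum line
                    'obs'     : [4500, 6000],    # [minx, maxx] for the zfit observable
                    'format'  : 'FWHM={:.0f}',   # formatted into the plot title
                    'add_std' : True,            # also quote the weighted standard deviation
                },
            },
            'stats' : {
                'mass' : {
                    'sum'  : 'Entries={:.0f}',
                    'mean' : 'Mean={:.3f}',
                    'rms'  : 'RMS={:.3f}',
                },
            },
        },
    }
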
dmu/stats/fitter.py
CHANGED
@@ -231,7 +231,7 @@ class Fitter:
                 continue
 
             const = zfit.constraint.GaussianConstraint(params=par, observation=float(par_mu), uncertainty=float(par_sg))
-            log.info(f'{"":<4}{par_name:<
+            log.info(f'{"":<4}{par_name:<45}{par_mu:<15.3e}{par_sg:<15.3e}')
             l_const.append(const)
 
         return l_const