data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/METADATA +641 -44
- data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD +93 -0
- {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/WHEEL +1 -1
- {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/entry_points.txt +1 -0
- dmu/__init__.py +0 -0
- dmu/generic/hashing.py +34 -8
- dmu/generic/utilities.py +164 -11
- dmu/logging/log_store.py +34 -2
- dmu/logging/messages.py +96 -0
- dmu/ml/cv_classifier.py +3 -3
- dmu/ml/cv_diagnostics.py +3 -0
- dmu/ml/cv_performance.py +58 -0
- dmu/ml/cv_predict.py +149 -46
- dmu/ml/train_mva.py +482 -100
- dmu/ml/utilities.py +29 -10
- dmu/pdataframe/utilities.py +28 -3
- dmu/plotting/fwhm.py +2 -2
- dmu/plotting/matrix.py +1 -1
- dmu/plotting/plotter.py +23 -3
- dmu/plotting/plotter_1d.py +96 -32
- dmu/plotting/plotter_2d.py +5 -0
- dmu/rdataframe/utilities.py +54 -3
- dmu/rfile/ddfgetter.py +102 -0
- dmu/stats/fit_stats.py +129 -0
- dmu/stats/fitter.py +55 -22
- dmu/stats/gof_calculator.py +7 -0
- dmu/stats/model_factory.py +153 -62
- dmu/stats/parameters.py +100 -0
- dmu/stats/utilities.py +443 -12
- dmu/stats/wdata.py +187 -0
- dmu/stats/zfit.py +17 -0
- dmu/stats/zfit_plotter.py +147 -36
- dmu/testing/utilities.py +102 -24
- dmu/workflow/__init__.py +0 -0
- dmu/workflow/cache.py +266 -0
- dmu_data/ml/tests/train_mva.yaml +9 -7
- dmu_data/ml/tests/train_mva_def.yaml +75 -0
- dmu_data/ml/tests/train_mva_with_diagnostics.yaml +10 -5
- dmu_data/ml/tests/train_mva_with_preffix.yaml +58 -0
- dmu_data/plotting/tests/2d.yaml +5 -5
- dmu_data/plotting/tests/line.yaml +15 -0
- dmu_data/plotting/tests/styling.yaml +8 -1
- dmu_data/rfile/friends.yaml +13 -0
- dmu_data/stats/fitter/test_simple.yaml +28 -0
- dmu_data/stats/kde_optimizer/control.json +1 -0
- dmu_data/stats/kde_optimizer/signal.json +1 -0
- dmu_data/stats/parameters/data.yaml +178 -0
- dmu_data/tests/config.json +6 -0
- dmu_data/tests/config.yaml +4 -0
- dmu_data/tests/pdf_to_tex.txt +34 -0
- dmu_scripts/kerberos/check_expiration +21 -0
- dmu_scripts/kerberos/convert_certificate +22 -0
- dmu_scripts/ml/compare_classifiers.py +85 -0
- data_manipulation_utilities-0.2.7.dist-info/RECORD +0 -69
- {data_manipulation_utilities-0.2.7.data → data_manipulation_utilities-0.2.8.dev714.data}/scripts/publish +0 -0
- {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/top_level.txt +0 -0
dmu/ml/train_mva.py
CHANGED
@@ -4,17 +4,27 @@ Module with TrainMva class
|
|
4
4
|
# pylint: disable = too-many-locals, no-name-in-module
|
5
5
|
# pylint: disable = too-many-arguments, too-many-positional-arguments
|
6
6
|
# pylint: disable = too-many-instance-attributes
|
7
|
+
# pylint: disable = too-many-arguments, too-many-positional-arguments
|
7
8
|
|
8
9
|
import os
|
9
10
|
import copy
|
11
|
+
import json
|
12
|
+
import math
|
13
|
+
|
14
|
+
from contextlib import contextmanager
|
15
|
+
from typing import Optional, Union
|
16
|
+
from functools import partial
|
10
17
|
|
18
|
+
import tqdm
|
11
19
|
import joblib
|
20
|
+
import optuna
|
12
21
|
import pandas as pnd
|
13
22
|
import numpy
|
14
23
|
import matplotlib.pyplot as plt
|
15
24
|
|
16
25
|
from sklearn.metrics import roc_curve, auc
|
17
|
-
from sklearn.model_selection import StratifiedKFold
|
26
|
+
from sklearn.model_selection import StratifiedKFold, cross_val_score
|
27
|
+
from sklearn.ensemble import GradientBoostingClassifier
|
18
28
|
|
19
29
|
from ROOT import RDataFrame, RDF
|
20
30
|
|
@@ -31,10 +41,20 @@ from dmu.logging.log_store import LogStore
|
|
31
41
|
NPA = numpy.ndarray
|
32
42
|
log = LogStore.add_logger('dmu:ml:train_mva')
|
33
43
|
# ---------------------------------------------
|
44
|
+
class NoFeatureInfo(Exception):
|
45
|
+
'''
|
46
|
+
Used when information about a feature is missing in the config file
|
47
|
+
'''
|
48
|
+
def __init__(self, message : str):
|
49
|
+
super().__init__(message)
|
50
|
+
# ---------------------------------------------
|
34
51
|
class TrainMva:
|
35
52
|
'''
|
36
53
|
Interface to scikit learn used to train classifier
|
37
54
|
'''
|
55
|
+
# TODO:
|
56
|
+
# - Hyperparameter optimization methods should go into their own class
|
57
|
+
# - Data preprocessing methods might need their own class
|
38
58
|
# ---------------------------------------------
|
39
59
|
def __init__(self, bkg : RDataFrame, sig : RDataFrame, cfg : dict):
|
40
60
|
'''
|
@@ -43,13 +63,15 @@ class TrainMva:
|
|
43
63
|
cfg (dict) : Dictionary storing configuration for training
|
44
64
|
'''
|
45
65
|
self._cfg = cfg
|
66
|
+
self._auc = math.nan # This is where the Area Under the ROC curve for the full sample will be saved
|
46
67
|
self._l_ft_name = self._cfg['training']['features']
|
68
|
+
self._pbar : Optional[tqdm.tqdm]
|
47
69
|
|
48
70
|
self._rdf_sig_org = sig
|
49
|
-
self._rdf_bkg_org = bkg
|
71
|
+
self._rdf_bkg_org = bkg
|
50
72
|
|
51
|
-
rdf_bkg = self._preprocess_rdf(bkg)
|
52
|
-
rdf_sig = self._preprocess_rdf(sig)
|
73
|
+
rdf_bkg = self._preprocess_rdf(rdf=bkg, kind='bkg')
|
74
|
+
rdf_sig = self._preprocess_rdf(rdf=sig, kind='sig')
|
53
75
|
|
54
76
|
df_ft_sig, l_lab_sig = self._get_sample_inputs(rdf = rdf_sig, label = 1)
|
55
77
|
df_ft_bkg, l_lab_bkg = self._get_sample_inputs(rdf = rdf_bkg, label = 0)
|
@@ -59,6 +81,11 @@ class TrainMva:
|
|
59
81
|
|
60
82
|
self._rdf_bkg = self._get_rdf(rdf = rdf_bkg, df_feat=df_ft_bkg)
|
61
83
|
self._rdf_sig = self._get_rdf(rdf = rdf_sig, df_feat=df_ft_sig)
|
84
|
+
|
85
|
+
self._rdm_state = 42 # Random state for training classifier
|
86
|
+
self._nworkers = 1 # Used to set number of workers for ANY process. Can be overriden with `use` context manager
|
87
|
+
|
88
|
+
optuna.logging.set_verbosity(optuna.logging.WARNING)
|
62
89
|
# ---------------------------------------------
|
63
90
|
def _get_extra_columns(self, rdf : RDataFrame, df : pnd.DataFrame) -> list[str]:
|
64
91
|
d_plot = self._cfg['plotting']['features']['plots']
|
@@ -124,17 +151,51 @@ class TrainMva:
|
|
124
151
|
log.info(70 * '-')
|
125
152
|
|
126
153
|
return df
|
154
|
+
#---------------------------------
|
155
|
+
def _add_sample_columns(
|
156
|
+
self,
|
157
|
+
rdf : RDataFrame,
|
158
|
+
kind : str) -> RDataFrame:
|
159
|
+
'''
|
160
|
+
This will apply sample specific column definitions
|
161
|
+
to the dataframe
|
162
|
+
'''
|
163
|
+
try:
|
164
|
+
d_def = self._cfg['dataset']['samples'][kind]['definitions']
|
165
|
+
except KeyError:
|
166
|
+
log.debug(f'Not found sample definitions for {kind}')
|
167
|
+
return rdf
|
168
|
+
|
169
|
+
log.info(60 * '-')
|
170
|
+
log.info(f'Found sample definitions for {kind}')
|
171
|
+
log.info(60 * '-')
|
172
|
+
for name, expr in d_def.items():
|
173
|
+
log.info(f'{name:<30}{"-->":<10}{expr:<20}')
|
174
|
+
rdf = rdf.Define(name, expr)
|
175
|
+
log.info(60 * '-')
|
176
|
+
|
177
|
+
return rdf
|
127
178
|
# ---------------------------------------------
|
128
|
-
def _preprocess_rdf(self, rdf : RDataFrame) -> RDataFrame:
|
179
|
+
def _preprocess_rdf(self, rdf : RDataFrame, kind : str) -> RDataFrame:
|
180
|
+
rdf = self._add_sample_columns(rdf, kind)
|
181
|
+
|
129
182
|
if 'define' not in self._cfg['dataset']:
|
130
183
|
log.debug('No definitions found')
|
131
184
|
return rdf
|
132
185
|
|
133
|
-
log.debug('Definitions found')
|
186
|
+
log.debug(f'Definitions found for {kind}')
|
134
187
|
d_def = self._cfg['dataset']['define']
|
135
188
|
for name, expr in d_def.items():
|
136
189
|
log.debug(f'{name:<20}{expr}')
|
137
|
-
|
190
|
+
try:
|
191
|
+
rdf = rdf.Define(name, expr)
|
192
|
+
except TypeError as exc:
|
193
|
+
l_col = [ name.c_str() for name in rdf.GetColumnNames() ]
|
194
|
+
branch_list = 'found_branches.txt'
|
195
|
+
with open(branch_list, 'w', encoding='utf-8') as ifile:
|
196
|
+
json.dump(l_col, ifile, indent=2)
|
197
|
+
|
198
|
+
raise TypeError(f'Branches found were dumped to {branch_list}') from exc
|
138
199
|
|
139
200
|
return rdf
|
140
201
|
# ---------------------------------------------
|
@@ -159,7 +220,7 @@ class TrainMva:
|
|
159
220
|
|
160
221
|
return model
|
161
222
|
# ---------------------------------------------
|
162
|
-
def _get_models(self, load_trained : bool):
|
223
|
+
def _get_models(self, load_trained : bool) -> list[cls]:
|
163
224
|
'''
|
164
225
|
Will create models, train them and return them
|
165
226
|
'''
|
@@ -174,6 +235,11 @@ class TrainMva:
|
|
174
235
|
|
175
236
|
l_model=[]
|
176
237
|
ifold=0
|
238
|
+
|
239
|
+
l_arr_lab_ts = []
|
240
|
+
l_arr_all_ts = []
|
241
|
+
l_arr_sig_ts = []
|
242
|
+
l_arr_bkg_ts = []
|
177
243
|
for arr_itr, arr_its in kfold.split(self._df_ft, self._l_lab):
|
178
244
|
log.debug(20 * '-')
|
179
245
|
log.info(f'Training fold: {ifold}')
|
@@ -181,20 +247,103 @@ class TrainMva:
|
|
181
247
|
model = self._get_model(arr_itr)
|
182
248
|
l_model.append(model)
|
183
249
|
|
184
|
-
|
185
|
-
|
250
|
+
arr_sig_tr, arr_bkg_tr, arr_all_tr, arr_lab_tr = self._get_scores(model, arr_itr, on_training_ok= True)
|
251
|
+
arr_sig_ts, arr_bkg_ts, arr_all_ts, arr_lab_ts = self._get_scores(model, arr_its, on_training_ok=False)
|
186
252
|
|
187
253
|
self._save_feature_importance(model, ifold)
|
188
|
-
self.
|
189
|
-
self._plot_scores(
|
190
|
-
|
254
|
+
self._plot_correlations(arr_itr, ifold)
|
255
|
+
self._plot_scores(
|
256
|
+
ifold = ifold,
|
257
|
+
sig_trn=arr_sig_tr,
|
258
|
+
sig_tst=arr_sig_ts,
|
259
|
+
bkg_trn=arr_bkg_tr,
|
260
|
+
bkg_tst=arr_bkg_ts)
|
261
|
+
|
262
|
+
xval_ts, yval_ts, _ = TrainMva.plot_roc(arr_lab_ts, arr_all_ts, kind='Test' , ifold=ifold)
|
263
|
+
xval_tr, yval_tr, _ = TrainMva.plot_roc(arr_lab_tr, arr_all_tr, kind='Train', ifold=ifold)
|
264
|
+
self._plot_probabilities(xval_tr, yval_tr, arr_all_tr, arr_lab_tr)
|
265
|
+
self._save_roc_plot(ifold=ifold)
|
266
|
+
|
267
|
+
self._save_roc_json(xval=xval_ts, yval=yval_ts, kind='Test' , ifold=ifold)
|
268
|
+
self._save_roc_json(xval=xval_tr, yval=yval_tr, kind='Train', ifold=ifold)
|
191
269
|
|
192
270
|
ifold+=1
|
193
271
|
|
272
|
+
l_arr_lab_ts.append(arr_lab_ts)
|
273
|
+
l_arr_all_ts.append(arr_all_ts)
|
274
|
+
l_arr_sig_ts.append(arr_sig_ts)
|
275
|
+
l_arr_bkg_ts.append(arr_bkg_ts)
|
276
|
+
|
277
|
+
arr_lab_ts = numpy.concatenate(l_arr_lab_ts)
|
278
|
+
arr_all_ts = numpy.concatenate(l_arr_all_ts)
|
279
|
+
arr_sig_ts = numpy.concatenate(l_arr_sig_ts)
|
280
|
+
arr_bkg_ts = numpy.concatenate(l_arr_bkg_ts)
|
281
|
+
|
282
|
+
xval, yval, self._auc = TrainMva.plot_roc(
|
283
|
+
arr_lab_ts,
|
284
|
+
arr_all_ts,
|
285
|
+
kind ='Test',
|
286
|
+
ifold=-1)
|
287
|
+
self._plot_probabilities(xval, yval, arr_all_ts, arr_lab_ts)
|
288
|
+
self._save_roc_plot(ifold=-1)
|
289
|
+
|
290
|
+
self._plot_scores(ifold=-1, sig_tst=arr_sig_ts, bkg_tst=arr_bkg_ts)
|
291
|
+
self._save_roc_json(xval=xval, yval=yval, kind='Full', ifold=-1)
|
292
|
+
|
194
293
|
return l_model
|
195
294
|
# ---------------------------------------------
|
295
|
+
def _save_roc_json(
|
296
|
+
self,
|
297
|
+
ifold : int,
|
298
|
+
kind : str,
|
299
|
+
xval : NPA,
|
300
|
+
yval : NPA) -> None:
|
301
|
+
ifold = 'all' if ifold == -1 else ifold # -1 represents all the testing datasets combined
|
302
|
+
val_dir = self._cfg['saving']['output']
|
303
|
+
|
304
|
+
name = kind.lower()
|
305
|
+
val_dir = f'{val_dir}/fold_{ifold:03}'
|
306
|
+
os.makedirs(val_dir, exist_ok=True)
|
307
|
+
jsn_path = f'{val_dir}/roc_{name}.json'
|
308
|
+
|
309
|
+
df = pnd.DataFrame({'x' : xval, 'y' : yval})
|
310
|
+
df.to_json(jsn_path, indent=2)
|
311
|
+
# ---------------------------------------------
|
312
|
+
def _save_roc_plot(self, ifold : int) -> None:
|
313
|
+
min_x = 0
|
314
|
+
min_y = 0
|
315
|
+
ifold = 'all' if ifold == -1 else ifold
|
316
|
+
|
317
|
+
if 'min' in self._cfg['plotting']['roc']:
|
318
|
+
[min_x, min_y] = self._cfg['plotting']['roc']['min']
|
319
|
+
|
320
|
+
max_x = 1
|
321
|
+
max_y = 1
|
322
|
+
if 'max' in self._cfg['plotting']['roc']:
|
323
|
+
[max_x, max_y] = self._cfg['plotting']['roc']['max']
|
324
|
+
|
325
|
+
val_dir = self._cfg['saving']['output']
|
326
|
+
|
327
|
+
if ifold == 'all':
|
328
|
+
plt_dir = f'{val_dir}/fold_all'
|
329
|
+
else:
|
330
|
+
plt_dir = f'{val_dir}/fold_{ifold:03}'
|
331
|
+
|
332
|
+
os.makedirs(plt_dir, exist_ok=True)
|
333
|
+
|
334
|
+
plt.xlabel('Signal efficiency')
|
335
|
+
plt.ylabel('Background rejection')
|
336
|
+
plt.title(f'Fold: {ifold}')
|
337
|
+
plt.xlim(min_x, max_x)
|
338
|
+
plt.ylim(min_y, max_y)
|
339
|
+
plt.grid()
|
340
|
+
plt.legend()
|
341
|
+
plt.savefig(f'{plt_dir}/roc.png')
|
342
|
+
plt.close()
|
343
|
+
# ---------------------------------------------
|
196
344
|
def _load_trained_models(self) -> list[cls]:
|
197
|
-
|
345
|
+
out_dir = self._cfg['saving']['output']
|
346
|
+
model_path = f'{out_dir}/model.pkl'
|
198
347
|
nfold = self._cfg['training']['nfold']
|
199
348
|
l_model = []
|
200
349
|
for ifold in range(nfold):
|
@@ -212,18 +361,18 @@ class TrainMva:
|
|
212
361
|
def _labels_from_varnames(self, l_var_name : list[str]) -> list[str]:
|
213
362
|
try:
|
214
363
|
d_plot = self._cfg['plotting']['features']['plots']
|
215
|
-
except
|
216
|
-
|
217
|
-
return l_var_name
|
364
|
+
except KeyError as exc:
|
365
|
+
raise KeyError('Cannot find plotting/features/plots section in config, using dataframe names') from exc
|
218
366
|
|
219
367
|
l_label = []
|
220
368
|
for var_name in l_var_name:
|
221
369
|
if var_name not in d_plot:
|
222
|
-
|
223
|
-
l_label.append(var_name)
|
224
|
-
continue
|
370
|
+
raise NoFeatureInfo(f'No plot found for feature {var_name}, cannot extract label')
|
225
371
|
|
226
372
|
d_setting = d_plot[var_name]
|
373
|
+
if 'labels' not in d_setting:
|
374
|
+
raise NoFeatureInfo(f'No no labels present for plot of feature {var_name}, cannot extract label')
|
375
|
+
|
227
376
|
[xlab, _ ]= d_setting['labels']
|
228
377
|
|
229
378
|
l_label.append(xlab)
|
@@ -237,7 +386,7 @@ class TrainMva:
|
|
237
386
|
d_data['Variable' ] = self._labels_from_varnames(l_var_name)
|
238
387
|
d_data['Importance'] = 100 * model.feature_importances_
|
239
388
|
|
240
|
-
val_dir = self._cfg['
|
389
|
+
val_dir = self._cfg['saving']['output']
|
241
390
|
val_dir = f'{val_dir}/fold_{ifold:03}'
|
242
391
|
os.makedirs(val_dir, exist_ok=True)
|
243
392
|
|
@@ -290,7 +439,9 @@ class TrainMva:
|
|
290
439
|
'''
|
291
440
|
Saves a model, associated to a specific fold
|
292
441
|
'''
|
293
|
-
|
442
|
+
out_dir = self._cfg['saving']['output']
|
443
|
+
model_path = f'{out_dir}/model.pkl'
|
444
|
+
|
294
445
|
if os.path.isfile(model_path):
|
295
446
|
log.info(f'Model found in {model_path}, not saving')
|
296
447
|
return
|
@@ -327,21 +478,40 @@ class TrainMva:
|
|
327
478
|
|
328
479
|
return cfg
|
329
480
|
# ---------------------------------------------
|
330
|
-
def
|
481
|
+
def _plot_correlations(self, arr_index : NPA, ifold : int) -> None:
|
482
|
+
log.debug('Plotting correlations')
|
483
|
+
|
331
484
|
df_ft = self._df_ft.iloc[arr_index]
|
485
|
+
l_lab = self._l_lab[arr_index]
|
486
|
+
|
487
|
+
arr_sig_idx, = numpy.where(l_lab == 1)
|
488
|
+
arr_bkg_idx, = numpy.where(l_lab == 0)
|
489
|
+
|
490
|
+
df_ft_sig = df_ft.iloc[arr_sig_idx]
|
491
|
+
df_ft_bkg = df_ft.iloc[arr_bkg_idx]
|
492
|
+
|
493
|
+
self._plot_correlation(df_ft=df_ft_sig, ifold=ifold, name='signal' )
|
494
|
+
self._plot_correlation(df_ft=df_ft_bkg, ifold=ifold, name='background')
|
495
|
+
# ---------------------------------------------
|
496
|
+
def _plot_correlation(
|
497
|
+
self,
|
498
|
+
df_ft : pnd.DataFrame,
|
499
|
+
ifold : int,
|
500
|
+
name : str) -> None:
|
501
|
+
|
502
|
+
log.debug(f'Plotting correlation for {name}/{ifold} fold')
|
503
|
+
|
332
504
|
cfg = self._get_correlation_cfg(df_ft, ifold)
|
333
505
|
cov = df_ft.corr()
|
334
506
|
mat = cov.to_numpy()
|
335
507
|
|
336
|
-
|
337
|
-
|
338
|
-
val_dir = self._cfg['plotting']['val_dir']
|
508
|
+
val_dir = self._cfg['saving']['output']
|
339
509
|
val_dir = f'{val_dir}/fold_{ifold:03}'
|
340
510
|
os.makedirs(val_dir, exist_ok=True)
|
341
511
|
|
342
512
|
obj = MatrixPlotter(mat=mat, cfg=cfg)
|
343
513
|
obj.plot()
|
344
|
-
plt.savefig(f'{val_dir}/
|
514
|
+
plt.savefig(f'{val_dir}/correlation_{name}.png')
|
345
515
|
plt.close()
|
346
516
|
# ---------------------------------------------
|
347
517
|
def _get_nentries(self, arr_val : NPA) -> str:
|
@@ -350,26 +520,29 @@ class TrainMva:
|
|
350
520
|
|
351
521
|
return f'{size:.2f}K'
|
352
522
|
# ---------------------------------------------
|
353
|
-
def _plot_scores(
|
354
|
-
|
523
|
+
def _plot_scores(
|
524
|
+
self,
|
525
|
+
ifold : int,
|
526
|
+
sig_tst : NPA,
|
527
|
+
bkg_tst : NPA,
|
528
|
+
sig_trn : NPA = None,
|
529
|
+
bkg_trn : NPA = None) -> None:
|
355
530
|
'''
|
356
531
|
Will plot an array of scores, associated to a given fold
|
357
532
|
'''
|
533
|
+
ifold = 'all' if ifold == -1 else ifold
|
358
534
|
log.debug(f'Plotting scores for {ifold} fold')
|
359
535
|
|
360
|
-
|
361
|
-
log.warning('Scores path not passed, not plotting scores')
|
362
|
-
return
|
363
|
-
|
364
|
-
val_dir = self._cfg['plotting']['val_dir']
|
536
|
+
val_dir = self._cfg['saving']['output']
|
365
537
|
val_dir = f'{val_dir}/fold_{ifold:03}'
|
366
538
|
os.makedirs(val_dir, exist_ok=True)
|
367
539
|
|
368
|
-
plt.hist(
|
369
|
-
plt.hist(
|
540
|
+
plt.hist(sig_tst, histtype='step', bins=50, range=(0,1), color='b', density=True, label='Signal Test: ' + self._get_nentries(sig_tst))
|
541
|
+
plt.hist(bkg_tst, histtype='step', bins=50, range=(0,1), color='r', density=True, label='Background Test: ' + self._get_nentries(bkg_tst))
|
370
542
|
|
371
|
-
|
372
|
-
|
543
|
+
if sig_trn is not None and bkg_trn is not None:
|
544
|
+
plt.hist(sig_trn, alpha = 0.3, bins=50, range=(0,1), color='b', density=True, label='Signal Train: ' + self._get_nentries(sig_trn))
|
545
|
+
plt.hist(bkg_trn, alpha = 0.3, bins=50, range=(0,1), color='r', density=True, label='Background Train: '+ self._get_nentries(bkg_trn))
|
373
546
|
|
374
547
|
plt.legend()
|
375
548
|
plt.title(f'Fold: {ifold}')
|
@@ -378,59 +551,12 @@ class TrainMva:
|
|
378
551
|
plt.savefig(f'{val_dir}/scores.png')
|
379
552
|
plt.close()
|
380
553
|
# ---------------------------------------------
|
381
|
-
def
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
'''
|
388
|
-
Takes the labels and the probabilities and plots ROC
|
389
|
-
curve for given fold
|
390
|
-
'''
|
391
|
-
log.debug(f'Plotting ROC curve for {ifold} fold')
|
392
|
-
|
393
|
-
val_dir = self._cfg['plotting']['val_dir']
|
394
|
-
val_dir = f'{val_dir}/fold_{ifold:03}'
|
395
|
-
os.makedirs(val_dir, exist_ok=True)
|
396
|
-
|
397
|
-
xval_ts, yval_ts, _ = roc_curve(l_lab_ts, l_prb_ts)
|
398
|
-
xval_ts = 1 - xval_ts
|
399
|
-
area_ts = auc(xval_ts, yval_ts)
|
400
|
-
|
401
|
-
xval_tr, yval_tr, _ = roc_curve(l_lab_tr, l_prb_tr)
|
402
|
-
xval_tr = 1 - xval_tr
|
403
|
-
area_tr = auc(xval_tr, yval_tr)
|
404
|
-
|
405
|
-
min_x = 0
|
406
|
-
min_y = 0
|
407
|
-
if 'min' in self._cfg['plotting']['roc']:
|
408
|
-
[min_x, min_y] = self._cfg['plotting']['roc']['min']
|
409
|
-
|
410
|
-
max_x = 1
|
411
|
-
max_y = 1
|
412
|
-
if 'max' in self._cfg['plotting']['roc']:
|
413
|
-
[max_x, max_y] = self._cfg['plotting']['roc']['max']
|
414
|
-
|
415
|
-
plt.plot(xval_ts, yval_ts, color='b', label=f'Test: {area_ts:.3f}')
|
416
|
-
plt.plot(xval_tr, yval_tr, color='r', label=f'Train: {area_tr:.3f}')
|
417
|
-
self._plot_probabilities(xval_ts, yval_ts, l_prb_ts, l_lab_ts)
|
418
|
-
|
419
|
-
plt.xlabel('Signal efficiency')
|
420
|
-
plt.ylabel('Background rejection')
|
421
|
-
plt.title(f'Fold: {ifold}')
|
422
|
-
plt.xlim(min_x, max_x)
|
423
|
-
plt.ylim(min_y, max_y)
|
424
|
-
plt.grid()
|
425
|
-
plt.legend()
|
426
|
-
plt.savefig(f'{val_dir}/roc.png')
|
427
|
-
plt.close()
|
428
|
-
# ---------------------------------------------
|
429
|
-
def _plot_probabilities(self,
|
430
|
-
arr_seff: NPA,
|
431
|
-
arr_brej: NPA,
|
432
|
-
arr_sprb: NPA,
|
433
|
-
arr_labl: NPA) -> None:
|
554
|
+
def _plot_probabilities(
|
555
|
+
self,
|
556
|
+
arr_seff: NPA,
|
557
|
+
arr_brej: NPA,
|
558
|
+
arr_sprb: NPA,
|
559
|
+
arr_labl: NPA) -> None:
|
434
560
|
|
435
561
|
roc_cfg = self._cfg['plotting']['roc']
|
436
562
|
if 'annotate' not in roc_cfg:
|
@@ -475,7 +601,10 @@ class TrainMva:
|
|
475
601
|
'''
|
476
602
|
Will plot the features, based on the settings in the config
|
477
603
|
'''
|
478
|
-
|
604
|
+
out_dir = self._cfg['saving']['output']
|
605
|
+
d_cfg = self._cfg['plotting']['features']
|
606
|
+
d_cfg['saving'] = {'plt_dir' : f'{out_dir}/features'}
|
607
|
+
|
479
608
|
ptr = Plotter(d_rdf = {'Signal' : self._rdf_sig, 'Background' : self._rdf_bkg}, cfg=d_cfg)
|
480
609
|
ptr.run()
|
481
610
|
# ---------------------------------------------
|
@@ -498,7 +627,7 @@ class TrainMva:
|
|
498
627
|
|
499
628
|
d_tex = {'Variable' : l_lab, 'Replacement' : l_val}
|
500
629
|
df = pnd.DataFrame(d_tex)
|
501
|
-
val_dir = self._cfg['
|
630
|
+
val_dir = self._cfg['saving']['output']
|
502
631
|
os.makedirs(val_dir, exist_ok=True)
|
503
632
|
put.df_to_tex(df, f'{val_dir}/nan_replacement.tex')
|
504
633
|
# ---------------------------------------------
|
@@ -506,16 +635,23 @@ class TrainMva:
|
|
506
635
|
if 'hyper' not in self._cfg['training']:
|
507
636
|
raise ValueError('Cannot find hyper parameters in configuration')
|
508
637
|
|
638
|
+
def format_value(val : Union[int,float]) -> str:
|
639
|
+
if isinstance(val, float):
|
640
|
+
return f'\\verb|{val:.3f}|'
|
641
|
+
|
642
|
+
return f'\\verb|{val}|'
|
643
|
+
|
509
644
|
d_hyper = self._cfg['training']['hyper']
|
510
|
-
d_form = { f'\\verb|{key}|' :
|
645
|
+
d_form = { f'\\verb|{key}|' : format_value(val) for key, val in d_hyper.items() }
|
511
646
|
d_latex = { 'Hyperparameter' : list(d_form.keys()), 'Value' : list(d_form.values())}
|
512
647
|
|
513
648
|
df = pnd.DataFrame(d_latex)
|
514
|
-
val_dir = self._cfg['
|
649
|
+
val_dir = self._cfg['saving']['output']
|
515
650
|
os.makedirs(val_dir, exist_ok=True)
|
516
651
|
put.df_to_tex(df, f'{val_dir}/hyperparameters.tex')
|
517
652
|
# ---------------------------------------------
|
518
653
|
def _run_diagnostics(self, models : list[cls], rdf : RDataFrame, name : str) -> None:
|
654
|
+
log.info(f'Running diagnostics for sample {name}')
|
519
655
|
if 'diagnostics' not in self._cfg:
|
520
656
|
log.warning('Diagnostics section not found, not running diagnostics')
|
521
657
|
return
|
@@ -535,24 +671,270 @@ class TrainMva:
|
|
535
671
|
cvd = CVDiagnostics(models=models, rdf=rdf, cfg=cfg_diag)
|
536
672
|
cvd.run()
|
537
673
|
# ---------------------------------------------
|
538
|
-
|
674
|
+
#
|
675
|
+
# Hyperparameter optimization
|
676
|
+
# ---------------------------------------------
|
677
|
+
def _objective(self, trial, kfold : StratifiedKFold) -> float:
|
678
|
+
ft = self._df_ft
|
679
|
+
lab= self._l_lab
|
680
|
+
|
681
|
+
if not issubclass(cls, GradientBoostingClassifier):
|
682
|
+
raise NotImplementedError('Hyperparameter optimization only implemented for GradientBoostingClassifier')
|
683
|
+
|
684
|
+
nft = len(ft.columns)
|
685
|
+
|
686
|
+
var_learn_rate = trial.suggest_float('learning_rate' , 1e-3, 1e-1, log=True)
|
687
|
+
var_max_depth = trial.suggest_int('max_depth' , 2, 15)
|
688
|
+
var_max_features= trial.suggest_int('max_features' , 2, nft)
|
689
|
+
var_min_split = trial.suggest_int('min_samples_split', 2, 10)
|
690
|
+
var_min_samples = trial.suggest_int('min_samples_leaf' , 2, 30)
|
691
|
+
var_nestimators = trial.suggest_int('n_estimators' , 50, 400)
|
692
|
+
|
693
|
+
classifier = GradientBoostingClassifier(
|
694
|
+
learning_rate = var_learn_rate,
|
695
|
+
max_depth = var_max_depth,
|
696
|
+
max_features = var_max_features,
|
697
|
+
min_samples_split = var_min_split,
|
698
|
+
min_samples_leaf = var_min_samples,
|
699
|
+
n_estimators = var_nestimators,
|
700
|
+
random_state = self._rdm_state)
|
701
|
+
|
702
|
+
score = cross_val_score(
|
703
|
+
classifier,
|
704
|
+
ft,
|
705
|
+
lab,
|
706
|
+
n_jobs=1, # More than this will reach RLIMIT_NPROC in cluster
|
707
|
+
cv=kfold)
|
708
|
+
|
709
|
+
accuracy = score.mean()
|
710
|
+
|
711
|
+
return accuracy
|
712
|
+
# ---------------------------------------------
|
713
|
+
def _optimize_hyperparameters(self, ntrial : int):
|
714
|
+
log.info('Running hyperparameter optimization')
|
715
|
+
|
716
|
+
self._pbar = tqdm.tqdm(total=ntrial, desc='Optimizing')
|
717
|
+
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=self._rdm_state)
|
718
|
+
objective = partial(self._objective, kfold=kfold)
|
719
|
+
|
720
|
+
study = optuna.create_study(
|
721
|
+
direction='maximize',
|
722
|
+
pruner = optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=5),)
|
723
|
+
|
724
|
+
study.optimize(
|
725
|
+
objective,
|
726
|
+
callbacks = [self._update_progress],
|
727
|
+
n_jobs = self._nworkers,
|
728
|
+
n_trials = ntrial)
|
729
|
+
|
730
|
+
self._print_hyper_opt(study=study)
|
731
|
+
self._plot_hyper_opt(study=study)
|
732
|
+
|
733
|
+
log.info('Overriding hyperparameters with optimized values')
|
734
|
+
|
735
|
+
self._cfg['training']['hyper'] = study.best_params
|
736
|
+
# ---------------------------------------------
|
737
|
+
def _plot_hyper_opt(self, study) -> None:
|
738
|
+
out_dir = self._cfg['saving']['output']
|
739
|
+
opt_dir = f'{out_dir}/optimization'
|
740
|
+
os.makedirs(opt_dir, exist_ok=True)
|
741
|
+
|
742
|
+
trials_df = study.trials_dataframe()
|
743
|
+
|
744
|
+
plt.plot(trials_df['number'], trials_df['value'])
|
745
|
+
plt.xlabel('Trial')
|
746
|
+
plt.ylabel('Accuracy')
|
747
|
+
plt.title('Optimization History')
|
748
|
+
plt.grid(True)
|
749
|
+
plt.savefig(f'{opt_dir}/history.png')
|
750
|
+
plt.close()
|
751
|
+
|
752
|
+
plt.hist(trials_df['value'], bins=20, alpha=0.7)
|
753
|
+
plt.xlabel('Accuracy')
|
754
|
+
plt.ylabel('Frequency')
|
755
|
+
plt.title('Distribution of Trial Results')
|
756
|
+
plt.savefig(f'{opt_dir}/accuracy.png')
|
757
|
+
plt.close()
|
758
|
+
# ---------------------------------------------
|
759
|
+
def _update_progress(self, study, _trial):
|
760
|
+
self._pbar.set_postfix({'Best': f'{study.best_value:.4f}' if study.best_value else 'N/A'})
|
761
|
+
self._pbar.update(1)
|
762
|
+
# ---------------------------------------------
|
763
|
+
def _print_hyper_opt(self, study) -> None:
|
764
|
+
log.info(40 * '-')
|
765
|
+
log.info('Optimized hyperparameters:')
|
766
|
+
log.info(40 * '-')
|
767
|
+
for name, value in study.best_params.items():
|
768
|
+
if isinstance(value, float):
|
769
|
+
log.info(f'{name:<20}{value:.3f}')
|
770
|
+
else:
|
771
|
+
log.info(f'{name:<20}{value}')
|
772
|
+
# ---------------------------------------------
|
773
|
+
# ---------------------------------------------
|
774
|
+
def _auc_from_json(self, ifold : int, kind : str) -> float:
|
775
|
+
val_dir = self._cfg['saving']['output']
|
776
|
+
path = f'{val_dir}/fold_{ifold:03}/roc_{kind}.json'
|
777
|
+
df = pnd.read_json(path)
|
778
|
+
|
779
|
+
return auc(df['x'], df['y'])
|
780
|
+
# ---------------------------------------------
|
781
|
+
def _check_overtraining(self) -> None:
|
782
|
+
nfold = self._cfg['training']['nfold']
|
783
|
+
|
784
|
+
df = pnd.DataFrame(columns=['fold'])
|
785
|
+
df['fold' ]= numpy.linspace(0, nfold - 1, nfold, dtype=int)
|
786
|
+
df['test' ]= df['fold'].apply(self._auc_from_json, args=('test' ,))
|
787
|
+
df['train']= df['fold'].apply(self._auc_from_json, args=('train',))
|
788
|
+
|
789
|
+
ax=None
|
790
|
+
ax=df.plot('fold', 'test' , color='blue', label='Testing sample' , ax=ax)
|
791
|
+
ax=df.plot('fold', 'train', color='red' , label='Training sample', ax=ax)
|
792
|
+
ax.set_ylim(bottom=0.75, top=1.00)
|
793
|
+
ax.set_ylabel('AUC')
|
794
|
+
ax.set_xlabel('Fold')
|
795
|
+
|
796
|
+
plt.grid()
|
797
|
+
|
798
|
+
val_dir = self._cfg['saving']['output']
|
799
|
+
path = f'{val_dir}/fold_all/auc_vs_fold.png'
|
800
|
+
plt.savefig(path)
|
801
|
+
plt.close()
|
802
|
+
# ---------------------------------------------
|
803
|
+
def run(
|
804
|
+
self,
|
805
|
+
skip_fit : bool = False,
|
806
|
+
opt_ntrial : int = 0,
|
807
|
+
load_trained : bool = False) -> float:
|
539
808
|
'''
|
540
809
|
Will do the training
|
541
810
|
|
542
|
-
skip_fit: By default false, if True, it will only do the plots of features and save tables
|
543
|
-
|
811
|
+
skip_fit : By default false, if True, it will only do the plots of features and save tables
|
812
|
+
opt_ntrial : Number of optimization tries for hyperparameter optimization, by default zero, i.e. no optimization will run
|
813
|
+
load_trained: If true, it will load the models instead of training, by default false.
|
814
|
+
|
815
|
+
Returns
|
816
|
+
----------------
|
817
|
+
Area under the ROC curve from evaluating the classifiers
|
818
|
+
on samples that were not used in their training. Uses the full sample
|
544
819
|
'''
|
545
|
-
self._save_settings_to_tex()
|
546
820
|
self._plot_features()
|
547
821
|
|
548
822
|
if skip_fit:
|
549
|
-
return
|
823
|
+
return self._auc
|
824
|
+
|
825
|
+
if opt_ntrial > 0:
|
826
|
+
self._optimize_hyperparameters(ntrial=opt_ntrial)
|
550
827
|
|
828
|
+
self._save_settings_to_tex()
|
551
829
|
l_mod = self._get_models(load_trained = load_trained)
|
552
830
|
if not load_trained:
|
553
831
|
for ifold, mod in enumerate(l_mod):
|
554
832
|
self._save_model(mod, ifold)
|
555
833
|
|
834
|
+
self._check_overtraining()
|
556
835
|
self._run_diagnostics(models = l_mod, rdf = self._rdf_sig_org, name='Signal' )
|
557
836
|
self._run_diagnostics(models = l_mod, rdf = self._rdf_bkg_org, name='Background')
|
837
|
+
|
838
|
+
return self._auc
|
839
|
+
# ---------------------------------------------
|
840
|
+
@contextmanager
|
841
|
+
def use(self, nworkers : int) -> None:
|
842
|
+
'''
|
843
|
+
Context manager used to run with a specific configuration
|
844
|
+
|
845
|
+
nworkers: Use this number of workers for ANY process that can be parallelized.
|
846
|
+
'''
|
847
|
+
old = self._nworkers
|
848
|
+
|
849
|
+
log.info(f'Using {nworkers} workers to run training')
|
850
|
+
|
851
|
+
self._nworkers = nworkers
|
852
|
+
try:
|
853
|
+
yield
|
854
|
+
finally:
|
855
|
+
self._nworkers = old
|
856
|
+
# ---------------------------------------------
|
857
|
+
@staticmethod
|
858
|
+
def plot_roc_from_prob(
|
859
|
+
arr_sig_prb : NPA,
|
860
|
+
arr_bkg_prb : NPA,
|
861
|
+
kind : str,
|
862
|
+
ifold : int,
|
863
|
+
color : str = None) -> tuple[NPA,NPA, float]:
|
864
|
+
'''
|
865
|
+
Takes arrays of signal and background probabilities
|
866
|
+
and plots ROC curve
|
867
|
+
|
868
|
+
Parameters
|
869
|
+
--------------------
|
870
|
+
arr_bkg/sig_prb : Array with background/signal probabilities
|
871
|
+
kind : String used to label the plot
|
872
|
+
ifold : If no fold makes sense (i.e. this is the full sample), use ifold=-1
|
873
|
+
kind : Used to label the plot
|
874
|
+
color : String with color of curve
|
875
|
+
|
876
|
+
Returns
|
877
|
+
--------------------
|
878
|
+
Tuple with 3 elements:
|
879
|
+
|
880
|
+
- Array of x coordinates of ROC curve
|
881
|
+
- Array of y coordinates of ROC curve
|
882
|
+
- Area under the curve
|
883
|
+
'''
|
884
|
+
arr_sig_lab = numpy.ones_like( arr_sig_prb)
|
885
|
+
arr_bkg_lab = numpy.zeros_like(arr_bkg_prb)
|
886
|
+
|
887
|
+
arr_prb = numpy.concatenate([arr_sig_prb, arr_bkg_prb])
|
888
|
+
arr_lab = numpy.concatenate([arr_sig_lab, arr_bkg_lab])
|
889
|
+
|
890
|
+
res = TrainMva.plot_roc(
|
891
|
+
l_lab=arr_lab,
|
892
|
+
l_prb=arr_prb,
|
893
|
+
color=color,
|
894
|
+
kind =kind,
|
895
|
+
ifold=ifold)
|
896
|
+
|
897
|
+
return res
|
898
|
+
# ---------------------------------------------
|
899
|
+
@staticmethod
|
900
|
+
def plot_roc(
|
901
|
+
l_lab : NPA,
|
902
|
+
l_prb : NPA,
|
903
|
+
kind : str,
|
904
|
+
ifold : int,
|
905
|
+
color : str = None) -> tuple[NPA, NPA, float]:
|
906
|
+
'''
|
907
|
+
Takes the labels and the probabilities and plots ROC
|
908
|
+
curve for given fold
|
909
|
+
|
910
|
+
Parameters
|
911
|
+
--------------------
|
912
|
+
ifold : If no fold makes sense (i.e. this is the full sample), use ifold=-1
|
913
|
+
kind : Used to label the plot
|
914
|
+
|
915
|
+
Returns
|
916
|
+
--------------------
|
917
|
+
Tuple with 3 elements:
|
918
|
+
|
919
|
+
- Array of x coordinates of ROC curve
|
920
|
+
- Array of y coordinates of ROC curve
|
921
|
+
- Area under the curve
|
922
|
+
'''
|
923
|
+
log.debug(f'Plotting ROC curve for {ifold} fold')
|
924
|
+
|
925
|
+
xval, yval, _ = roc_curve(l_lab, l_prb)
|
926
|
+
xval = 1 - xval
|
927
|
+
area = auc(xval, yval)
|
928
|
+
|
929
|
+
if color is None:
|
930
|
+
color='red' if kind == 'Train' else 'blue'
|
931
|
+
|
932
|
+
if ifold == -1:
|
933
|
+
label=f'Test sample: {area:.3f}'
|
934
|
+
else:
|
935
|
+
label=f'{kind}: {area:.3f}'
|
936
|
+
|
937
|
+
plt.plot(xval, yval, color=color, label=label)
|
938
|
+
|
939
|
+
return xval, yval, area
|
558
940
|
# ---------------------------------------------
|