data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/METADATA +641 -44
  2. data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD +93 -0
  3. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/WHEEL +1 -1
  4. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/entry_points.txt +1 -0
  5. dmu/__init__.py +0 -0
  6. dmu/generic/hashing.py +34 -8
  7. dmu/generic/utilities.py +164 -11
  8. dmu/logging/log_store.py +34 -2
  9. dmu/logging/messages.py +96 -0
  10. dmu/ml/cv_classifier.py +3 -3
  11. dmu/ml/cv_diagnostics.py +3 -0
  12. dmu/ml/cv_performance.py +58 -0
  13. dmu/ml/cv_predict.py +149 -46
  14. dmu/ml/train_mva.py +482 -100
  15. dmu/ml/utilities.py +29 -10
  16. dmu/pdataframe/utilities.py +28 -3
  17. dmu/plotting/fwhm.py +2 -2
  18. dmu/plotting/matrix.py +1 -1
  19. dmu/plotting/plotter.py +23 -3
  20. dmu/plotting/plotter_1d.py +96 -32
  21. dmu/plotting/plotter_2d.py +5 -0
  22. dmu/rdataframe/utilities.py +54 -3
  23. dmu/rfile/ddfgetter.py +102 -0
  24. dmu/stats/fit_stats.py +129 -0
  25. dmu/stats/fitter.py +55 -22
  26. dmu/stats/gof_calculator.py +7 -0
  27. dmu/stats/model_factory.py +153 -62
  28. dmu/stats/parameters.py +100 -0
  29. dmu/stats/utilities.py +443 -12
  30. dmu/stats/wdata.py +187 -0
  31. dmu/stats/zfit.py +17 -0
  32. dmu/stats/zfit_plotter.py +147 -36
  33. dmu/testing/utilities.py +102 -24
  34. dmu/workflow/__init__.py +0 -0
  35. dmu/workflow/cache.py +266 -0
  36. dmu_data/ml/tests/train_mva.yaml +9 -7
  37. dmu_data/ml/tests/train_mva_def.yaml +75 -0
  38. dmu_data/ml/tests/train_mva_with_diagnostics.yaml +10 -5
  39. dmu_data/ml/tests/train_mva_with_preffix.yaml +58 -0
  40. dmu_data/plotting/tests/2d.yaml +5 -5
  41. dmu_data/plotting/tests/line.yaml +15 -0
  42. dmu_data/plotting/tests/styling.yaml +8 -1
  43. dmu_data/rfile/friends.yaml +13 -0
  44. dmu_data/stats/fitter/test_simple.yaml +28 -0
  45. dmu_data/stats/kde_optimizer/control.json +1 -0
  46. dmu_data/stats/kde_optimizer/signal.json +1 -0
  47. dmu_data/stats/parameters/data.yaml +178 -0
  48. dmu_data/tests/config.json +6 -0
  49. dmu_data/tests/config.yaml +4 -0
  50. dmu_data/tests/pdf_to_tex.txt +34 -0
  51. dmu_scripts/kerberos/check_expiration +21 -0
  52. dmu_scripts/kerberos/convert_certificate +22 -0
  53. dmu_scripts/ml/compare_classifiers.py +85 -0
  54. data_manipulation_utilities-0.2.7.dist-info/RECORD +0 -69
  55. {data_manipulation_utilities-0.2.7.data → data_manipulation_utilities-0.2.8.dev714.data}/scripts/publish +0 -0
  56. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/top_level.txt +0 -0
dmu/ml/train_mva.py CHANGED
@@ -4,17 +4,27 @@ Module with TrainMva class
4
4
  # pylint: disable = too-many-locals, no-name-in-module
5
5
  # pylint: disable = too-many-arguments, too-many-positional-arguments
6
6
  # pylint: disable = too-many-instance-attributes
7
+ # pylint: disable = too-many-arguments, too-many-positional-arguments
7
8
 
8
9
  import os
9
10
  import copy
11
+ import json
12
+ import math
13
+
14
+ from contextlib import contextmanager
15
+ from typing import Optional, Union
16
+ from functools import partial
10
17
 
18
+ import tqdm
11
19
  import joblib
20
+ import optuna
12
21
  import pandas as pnd
13
22
  import numpy
14
23
  import matplotlib.pyplot as plt
15
24
 
16
25
  from sklearn.metrics import roc_curve, auc
17
- from sklearn.model_selection import StratifiedKFold
26
+ from sklearn.model_selection import StratifiedKFold, cross_val_score
27
+ from sklearn.ensemble import GradientBoostingClassifier
18
28
 
19
29
  from ROOT import RDataFrame, RDF
20
30
 
@@ -31,10 +41,20 @@ from dmu.logging.log_store import LogStore
31
41
  NPA = numpy.ndarray
32
42
  log = LogStore.add_logger('dmu:ml:train_mva')
33
43
  # ---------------------------------------------
44
+ class NoFeatureInfo(Exception):
45
+ '''
46
+ Used when information about a feature is missing in the config file
47
+ '''
48
+ def __init__(self, message : str):
49
+ super().__init__(message)
50
+ # ---------------------------------------------
34
51
  class TrainMva:
35
52
  '''
36
53
  Interface to scikit learn used to train classifier
37
54
  '''
55
+ # TODO:
56
+ # - Hyperparameter optimization methods should go into their own class
57
+ # - Data preprocessing methods might need their own class
38
58
  # ---------------------------------------------
39
59
  def __init__(self, bkg : RDataFrame, sig : RDataFrame, cfg : dict):
40
60
  '''
@@ -43,13 +63,15 @@ class TrainMva:
43
63
  cfg (dict) : Dictionary storing configuration for training
44
64
  '''
45
65
  self._cfg = cfg
66
+ self._auc = math.nan # This is where the Area Under the ROC curve for the full sample will be saved
46
67
  self._l_ft_name = self._cfg['training']['features']
68
+ self._pbar : Optional[tqdm.tqdm]
47
69
 
48
70
  self._rdf_sig_org = sig
49
- self._rdf_bkg_org = bkg
71
+ self._rdf_bkg_org = bkg
50
72
 
51
- rdf_bkg = self._preprocess_rdf(bkg)
52
- rdf_sig = self._preprocess_rdf(sig)
73
+ rdf_bkg = self._preprocess_rdf(rdf=bkg, kind='bkg')
74
+ rdf_sig = self._preprocess_rdf(rdf=sig, kind='sig')
53
75
 
54
76
  df_ft_sig, l_lab_sig = self._get_sample_inputs(rdf = rdf_sig, label = 1)
55
77
  df_ft_bkg, l_lab_bkg = self._get_sample_inputs(rdf = rdf_bkg, label = 0)
@@ -59,6 +81,11 @@ class TrainMva:
59
81
 
60
82
  self._rdf_bkg = self._get_rdf(rdf = rdf_bkg, df_feat=df_ft_bkg)
61
83
  self._rdf_sig = self._get_rdf(rdf = rdf_sig, df_feat=df_ft_sig)
84
+
85
+ self._rdm_state = 42 # Random state for training classifier
86
+ self._nworkers = 1 # Used to set number of workers for ANY process. Can be overridden with `use` context manager
87
+
88
+ optuna.logging.set_verbosity(optuna.logging.WARNING)
62
89
  # ---------------------------------------------
63
90
  def _get_extra_columns(self, rdf : RDataFrame, df : pnd.DataFrame) -> list[str]:
64
91
  d_plot = self._cfg['plotting']['features']['plots']
@@ -124,17 +151,51 @@ class TrainMva:
124
151
  log.info(70 * '-')
125
152
 
126
153
  return df
154
+ #---------------------------------
155
+ def _add_sample_columns(
156
+ self,
157
+ rdf : RDataFrame,
158
+ kind : str) -> RDataFrame:
159
+ '''
160
+ This will apply sample specific column definitions
161
+ to the dataframe
162
+ '''
163
+ try:
164
+ d_def = self._cfg['dataset']['samples'][kind]['definitions']
165
+ except KeyError:
166
+ log.debug(f'Not found sample definitions for {kind}')
167
+ return rdf
168
+
169
+ log.info(60 * '-')
170
+ log.info(f'Found sample definitions for {kind}')
171
+ log.info(60 * '-')
172
+ for name, expr in d_def.items():
173
+ log.info(f'{name:<30}{"-->":<10}{expr:<20}')
174
+ rdf = rdf.Define(name, expr)
175
+ log.info(60 * '-')
176
+
177
+ return rdf
127
178
  # ---------------------------------------------
128
- def _preprocess_rdf(self, rdf : RDataFrame) -> RDataFrame:
179
+ def _preprocess_rdf(self, rdf : RDataFrame, kind : str) -> RDataFrame:
180
+ rdf = self._add_sample_columns(rdf, kind)
181
+
129
182
  if 'define' not in self._cfg['dataset']:
130
183
  log.debug('No definitions found')
131
184
  return rdf
132
185
 
133
- log.debug('Definitions found')
186
+ log.debug(f'Definitions found for {kind}')
134
187
  d_def = self._cfg['dataset']['define']
135
188
  for name, expr in d_def.items():
136
189
  log.debug(f'{name:<20}{expr}')
137
- rdf = rdf.Define(name, expr)
190
+ try:
191
+ rdf = rdf.Define(name, expr)
192
+ except TypeError as exc:
193
+ l_col = [ name.c_str() for name in rdf.GetColumnNames() ]
194
+ branch_list = 'found_branches.txt'
195
+ with open(branch_list, 'w', encoding='utf-8') as ifile:
196
+ json.dump(l_col, ifile, indent=2)
197
+
198
+ raise TypeError(f'Branches found were dumped to {branch_list}') from exc
138
199
 
139
200
  return rdf
140
201
  # ---------------------------------------------
@@ -159,7 +220,7 @@ class TrainMva:
159
220
 
160
221
  return model
161
222
  # ---------------------------------------------
162
- def _get_models(self, load_trained : bool):
223
+ def _get_models(self, load_trained : bool) -> list[cls]:
163
224
  '''
164
225
  Will create models, train them and return them
165
226
  '''
@@ -174,6 +235,11 @@ class TrainMva:
174
235
 
175
236
  l_model=[]
176
237
  ifold=0
238
+
239
+ l_arr_lab_ts = []
240
+ l_arr_all_ts = []
241
+ l_arr_sig_ts = []
242
+ l_arr_bkg_ts = []
177
243
  for arr_itr, arr_its in kfold.split(self._df_ft, self._l_lab):
178
244
  log.debug(20 * '-')
179
245
  log.info(f'Training fold: {ifold}')
@@ -181,20 +247,103 @@ class TrainMva:
181
247
  model = self._get_model(arr_itr)
182
248
  l_model.append(model)
183
249
 
184
- arr_sig_sig_tr, arr_sig_bkg_tr, arr_sig_all_tr, arr_lab_tr = self._get_scores(model, arr_itr, on_training_ok= True)
185
- arr_sig_sig_ts, arr_sig_bkg_ts, arr_sig_all_ts, arr_lab_ts = self._get_scores(model, arr_its, on_training_ok=False)
250
+ arr_sig_tr, arr_bkg_tr, arr_all_tr, arr_lab_tr = self._get_scores(model, arr_itr, on_training_ok= True)
251
+ arr_sig_ts, arr_bkg_ts, arr_all_ts, arr_lab_ts = self._get_scores(model, arr_its, on_training_ok=False)
186
252
 
187
253
  self._save_feature_importance(model, ifold)
188
- self._plot_correlation(arr_itr, ifold)
189
- self._plot_scores(arr_sig_sig_tr, arr_sig_sig_ts, arr_sig_bkg_tr, arr_sig_bkg_ts, ifold)
190
- self._plot_roc(arr_lab_ts, arr_sig_all_ts, arr_lab_tr, arr_sig_all_tr, ifold)
254
+ self._plot_correlations(arr_itr, ifold)
255
+ self._plot_scores(
256
+ ifold = ifold,
257
+ sig_trn=arr_sig_tr,
258
+ sig_tst=arr_sig_ts,
259
+ bkg_trn=arr_bkg_tr,
260
+ bkg_tst=arr_bkg_ts)
261
+
262
+ xval_ts, yval_ts, _ = TrainMva.plot_roc(arr_lab_ts, arr_all_ts, kind='Test' , ifold=ifold)
263
+ xval_tr, yval_tr, _ = TrainMva.plot_roc(arr_lab_tr, arr_all_tr, kind='Train', ifold=ifold)
264
+ self._plot_probabilities(xval_tr, yval_tr, arr_all_tr, arr_lab_tr)
265
+ self._save_roc_plot(ifold=ifold)
266
+
267
+ self._save_roc_json(xval=xval_ts, yval=yval_ts, kind='Test' , ifold=ifold)
268
+ self._save_roc_json(xval=xval_tr, yval=yval_tr, kind='Train', ifold=ifold)
191
269
 
192
270
  ifold+=1
193
271
 
272
+ l_arr_lab_ts.append(arr_lab_ts)
273
+ l_arr_all_ts.append(arr_all_ts)
274
+ l_arr_sig_ts.append(arr_sig_ts)
275
+ l_arr_bkg_ts.append(arr_bkg_ts)
276
+
277
+ arr_lab_ts = numpy.concatenate(l_arr_lab_ts)
278
+ arr_all_ts = numpy.concatenate(l_arr_all_ts)
279
+ arr_sig_ts = numpy.concatenate(l_arr_sig_ts)
280
+ arr_bkg_ts = numpy.concatenate(l_arr_bkg_ts)
281
+
282
+ xval, yval, self._auc = TrainMva.plot_roc(
283
+ arr_lab_ts,
284
+ arr_all_ts,
285
+ kind ='Test',
286
+ ifold=-1)
287
+ self._plot_probabilities(xval, yval, arr_all_ts, arr_lab_ts)
288
+ self._save_roc_plot(ifold=-1)
289
+
290
+ self._plot_scores(ifold=-1, sig_tst=arr_sig_ts, bkg_tst=arr_bkg_ts)
291
+ self._save_roc_json(xval=xval, yval=yval, kind='Full', ifold=-1)
292
+
194
293
  return l_model
195
294
  # ---------------------------------------------
295
+ def _save_roc_json(
296
+ self,
297
+ ifold : int,
298
+ kind : str,
299
+ xval : NPA,
300
+ yval : NPA) -> None:
301
+ ifold = 'all' if ifold == -1 else ifold # -1 represents all the testing datasets combined
302
+ val_dir = self._cfg['saving']['output']
303
+
304
+ name = kind.lower()
305
+ val_dir = f'{val_dir}/fold_{ifold:03}'
306
+ os.makedirs(val_dir, exist_ok=True)
307
+ jsn_path = f'{val_dir}/roc_{name}.json'
308
+
309
+ df = pnd.DataFrame({'x' : xval, 'y' : yval})
310
+ df.to_json(jsn_path, indent=2)
311
+ # ---------------------------------------------
312
+ def _save_roc_plot(self, ifold : int) -> None:
313
+ min_x = 0
314
+ min_y = 0
315
+ ifold = 'all' if ifold == -1 else ifold
316
+
317
+ if 'min' in self._cfg['plotting']['roc']:
318
+ [min_x, min_y] = self._cfg['plotting']['roc']['min']
319
+
320
+ max_x = 1
321
+ max_y = 1
322
+ if 'max' in self._cfg['plotting']['roc']:
323
+ [max_x, max_y] = self._cfg['plotting']['roc']['max']
324
+
325
+ val_dir = self._cfg['saving']['output']
326
+
327
+ if ifold == 'all':
328
+ plt_dir = f'{val_dir}/fold_all'
329
+ else:
330
+ plt_dir = f'{val_dir}/fold_{ifold:03}'
331
+
332
+ os.makedirs(plt_dir, exist_ok=True)
333
+
334
+ plt.xlabel('Signal efficiency')
335
+ plt.ylabel('Background rejection')
336
+ plt.title(f'Fold: {ifold}')
337
+ plt.xlim(min_x, max_x)
338
+ plt.ylim(min_y, max_y)
339
+ plt.grid()
340
+ plt.legend()
341
+ plt.savefig(f'{plt_dir}/roc.png')
342
+ plt.close()
343
+ # ---------------------------------------------
196
344
  def _load_trained_models(self) -> list[cls]:
197
- model_path = self._cfg['saving']['path']
345
+ out_dir = self._cfg['saving']['output']
346
+ model_path = f'{out_dir}/model.pkl'
198
347
  nfold = self._cfg['training']['nfold']
199
348
  l_model = []
200
349
  for ifold in range(nfold):
@@ -212,18 +361,18 @@ class TrainMva:
212
361
  def _labels_from_varnames(self, l_var_name : list[str]) -> list[str]:
213
362
  try:
214
363
  d_plot = self._cfg['plotting']['features']['plots']
215
- except ValueError:
216
- log.warning('Cannot find plotting/features/plots section in config, using dataframe names')
217
- return l_var_name
364
+ except KeyError as exc:
365
+ raise KeyError('Cannot find plotting/features/plots section in config, using dataframe names') from exc
218
366
 
219
367
  l_label = []
220
368
  for var_name in l_var_name:
221
369
  if var_name not in d_plot:
222
- log.warning(f'No plot found for: {var_name}')
223
- l_label.append(var_name)
224
- continue
370
+ raise NoFeatureInfo(f'No plot found for feature {var_name}, cannot extract label')
225
371
 
226
372
  d_setting = d_plot[var_name]
373
+ if 'labels' not in d_setting:
374
+ raise NoFeatureInfo(f'No no labels present for plot of feature {var_name}, cannot extract label')
375
+
227
376
  [xlab, _ ]= d_setting['labels']
228
377
 
229
378
  l_label.append(xlab)
@@ -237,7 +386,7 @@ class TrainMva:
237
386
  d_data['Variable' ] = self._labels_from_varnames(l_var_name)
238
387
  d_data['Importance'] = 100 * model.feature_importances_
239
388
 
240
- val_dir = self._cfg['plotting']['val_dir']
389
+ val_dir = self._cfg['saving']['output']
241
390
  val_dir = f'{val_dir}/fold_{ifold:03}'
242
391
  os.makedirs(val_dir, exist_ok=True)
243
392
 
@@ -290,7 +439,9 @@ class TrainMva:
290
439
  '''
291
440
  Saves a model, associated to a specific fold
292
441
  '''
293
- model_path = self._cfg['saving']['path']
442
+ out_dir = self._cfg['saving']['output']
443
+ model_path = f'{out_dir}/model.pkl'
444
+
294
445
  if os.path.isfile(model_path):
295
446
  log.info(f'Model found in {model_path}, not saving')
296
447
  return
@@ -327,21 +478,40 @@ class TrainMva:
327
478
 
328
479
  return cfg
329
480
  # ---------------------------------------------
330
- def _plot_correlation(self, arr_index : NPA, ifold : int) -> None:
481
+ def _plot_correlations(self, arr_index : NPA, ifold : int) -> None:
482
+ log.debug('Plotting correlations')
483
+
331
484
  df_ft = self._df_ft.iloc[arr_index]
485
+ l_lab = self._l_lab[arr_index]
486
+
487
+ arr_sig_idx, = numpy.where(l_lab == 1)
488
+ arr_bkg_idx, = numpy.where(l_lab == 0)
489
+
490
+ df_ft_sig = df_ft.iloc[arr_sig_idx]
491
+ df_ft_bkg = df_ft.iloc[arr_bkg_idx]
492
+
493
+ self._plot_correlation(df_ft=df_ft_sig, ifold=ifold, name='signal' )
494
+ self._plot_correlation(df_ft=df_ft_bkg, ifold=ifold, name='background')
495
+ # ---------------------------------------------
496
+ def _plot_correlation(
497
+ self,
498
+ df_ft : pnd.DataFrame,
499
+ ifold : int,
500
+ name : str) -> None:
501
+
502
+ log.debug(f'Plotting correlation for {name}/{ifold} fold')
503
+
332
504
  cfg = self._get_correlation_cfg(df_ft, ifold)
333
505
  cov = df_ft.corr()
334
506
  mat = cov.to_numpy()
335
507
 
336
- log.debug(f'Plotting correlation for {ifold} fold')
337
-
338
- val_dir = self._cfg['plotting']['val_dir']
508
+ val_dir = self._cfg['saving']['output']
339
509
  val_dir = f'{val_dir}/fold_{ifold:03}'
340
510
  os.makedirs(val_dir, exist_ok=True)
341
511
 
342
512
  obj = MatrixPlotter(mat=mat, cfg=cfg)
343
513
  obj.plot()
344
- plt.savefig(f'{val_dir}/covariance.png')
514
+ plt.savefig(f'{val_dir}/correlation_{name}.png')
345
515
  plt.close()
346
516
  # ---------------------------------------------
347
517
  def _get_nentries(self, arr_val : NPA) -> str:
@@ -350,26 +520,29 @@ class TrainMva:
350
520
 
351
521
  return f'{size:.2f}K'
352
522
  # ---------------------------------------------
353
- def _plot_scores(self, arr_sig_trn, arr_sig_tst, arr_bkg_trn, arr_bkg_tst, ifold):
354
- # pylint: disable = too-many-arguments, too-many-positional-arguments
523
+ def _plot_scores(
524
+ self,
525
+ ifold : int,
526
+ sig_tst : NPA,
527
+ bkg_tst : NPA,
528
+ sig_trn : NPA = None,
529
+ bkg_trn : NPA = None) -> None:
355
530
  '''
356
531
  Will plot an array of scores, associated to a given fold
357
532
  '''
533
+ ifold = 'all' if ifold == -1 else ifold
358
534
  log.debug(f'Plotting scores for {ifold} fold')
359
535
 
360
- if 'val_dir' not in self._cfg['plotting']:
361
- log.warning('Scores path not passed, not plotting scores')
362
- return
363
-
364
- val_dir = self._cfg['plotting']['val_dir']
536
+ val_dir = self._cfg['saving']['output']
365
537
  val_dir = f'{val_dir}/fold_{ifold:03}'
366
538
  os.makedirs(val_dir, exist_ok=True)
367
539
 
368
- plt.hist(arr_sig_trn, alpha = 0.3, bins=50, range=(0,1), color='b', density=True, label='Signal Train: ' + self._get_nentries(arr_sig_trn))
369
- plt.hist(arr_sig_tst, histtype='step', bins=50, range=(0,1), color='b', density=True, label='Signal Test: ' + self._get_nentries(arr_sig_tst))
540
+ plt.hist(sig_tst, histtype='step', bins=50, range=(0,1), color='b', density=True, label='Signal Test: ' + self._get_nentries(sig_tst))
541
+ plt.hist(bkg_tst, histtype='step', bins=50, range=(0,1), color='r', density=True, label='Background Test: ' + self._get_nentries(bkg_tst))
370
542
 
371
- plt.hist(arr_bkg_trn, alpha = 0.3, bins=50, range=(0,1), color='r', density=True, label='Background Train: '+ self._get_nentries(arr_bkg_trn))
372
- plt.hist(arr_bkg_tst, histtype='step', bins=50, range=(0,1), color='r', density=True, label='Background Test: ' + self._get_nentries(arr_bkg_tst))
543
+ if sig_trn is not None and bkg_trn is not None:
544
+ plt.hist(sig_trn, alpha = 0.3, bins=50, range=(0,1), color='b', density=True, label='Signal Train: ' + self._get_nentries(sig_trn))
545
+ plt.hist(bkg_trn, alpha = 0.3, bins=50, range=(0,1), color='r', density=True, label='Background Train: '+ self._get_nentries(bkg_trn))
373
546
 
374
547
  plt.legend()
375
548
  plt.title(f'Fold: {ifold}')
@@ -378,59 +551,12 @@ class TrainMva:
378
551
  plt.savefig(f'{val_dir}/scores.png')
379
552
  plt.close()
380
553
  # ---------------------------------------------
381
- def _plot_roc(self,
382
- l_lab_ts : NPA,
383
- l_prb_ts : NPA,
384
- l_lab_tr : NPA,
385
- l_prb_tr : NPA,
386
- ifold : int):
387
- '''
388
- Takes the labels and the probabilities and plots ROC
389
- curve for given fold
390
- '''
391
- log.debug(f'Plotting ROC curve for {ifold} fold')
392
-
393
- val_dir = self._cfg['plotting']['val_dir']
394
- val_dir = f'{val_dir}/fold_{ifold:03}'
395
- os.makedirs(val_dir, exist_ok=True)
396
-
397
- xval_ts, yval_ts, _ = roc_curve(l_lab_ts, l_prb_ts)
398
- xval_ts = 1 - xval_ts
399
- area_ts = auc(xval_ts, yval_ts)
400
-
401
- xval_tr, yval_tr, _ = roc_curve(l_lab_tr, l_prb_tr)
402
- xval_tr = 1 - xval_tr
403
- area_tr = auc(xval_tr, yval_tr)
404
-
405
- min_x = 0
406
- min_y = 0
407
- if 'min' in self._cfg['plotting']['roc']:
408
- [min_x, min_y] = self._cfg['plotting']['roc']['min']
409
-
410
- max_x = 1
411
- max_y = 1
412
- if 'max' in self._cfg['plotting']['roc']:
413
- [max_x, max_y] = self._cfg['plotting']['roc']['max']
414
-
415
- plt.plot(xval_ts, yval_ts, color='b', label=f'Test: {area_ts:.3f}')
416
- plt.plot(xval_tr, yval_tr, color='r', label=f'Train: {area_tr:.3f}')
417
- self._plot_probabilities(xval_ts, yval_ts, l_prb_ts, l_lab_ts)
418
-
419
- plt.xlabel('Signal efficiency')
420
- plt.ylabel('Background rejection')
421
- plt.title(f'Fold: {ifold}')
422
- plt.xlim(min_x, max_x)
423
- plt.ylim(min_y, max_y)
424
- plt.grid()
425
- plt.legend()
426
- plt.savefig(f'{val_dir}/roc.png')
427
- plt.close()
428
- # ---------------------------------------------
429
- def _plot_probabilities(self,
430
- arr_seff: NPA,
431
- arr_brej: NPA,
432
- arr_sprb: NPA,
433
- arr_labl: NPA) -> None:
554
+ def _plot_probabilities(
555
+ self,
556
+ arr_seff: NPA,
557
+ arr_brej: NPA,
558
+ arr_sprb: NPA,
559
+ arr_labl: NPA) -> None:
434
560
 
435
561
  roc_cfg = self._cfg['plotting']['roc']
436
562
  if 'annotate' not in roc_cfg:
@@ -475,7 +601,10 @@ class TrainMva:
475
601
  '''
476
602
  Will plot the features, based on the settings in the config
477
603
  '''
478
- d_cfg = self._cfg['plotting']['features']
604
+ out_dir = self._cfg['saving']['output']
605
+ d_cfg = self._cfg['plotting']['features']
606
+ d_cfg['saving'] = {'plt_dir' : f'{out_dir}/features'}
607
+
479
608
  ptr = Plotter(d_rdf = {'Signal' : self._rdf_sig, 'Background' : self._rdf_bkg}, cfg=d_cfg)
480
609
  ptr.run()
481
610
  # ---------------------------------------------
@@ -498,7 +627,7 @@ class TrainMva:
498
627
 
499
628
  d_tex = {'Variable' : l_lab, 'Replacement' : l_val}
500
629
  df = pnd.DataFrame(d_tex)
501
- val_dir = self._cfg['plotting']['val_dir']
630
+ val_dir = self._cfg['saving']['output']
502
631
  os.makedirs(val_dir, exist_ok=True)
503
632
  put.df_to_tex(df, f'{val_dir}/nan_replacement.tex')
504
633
  # ---------------------------------------------
@@ -506,16 +635,23 @@ class TrainMva:
506
635
  if 'hyper' not in self._cfg['training']:
507
636
  raise ValueError('Cannot find hyper parameters in configuration')
508
637
 
638
+ def format_value(val : Union[int,float]) -> str:
639
+ if isinstance(val, float):
640
+ return f'\\verb|{val:.3f}|'
641
+
642
+ return f'\\verb|{val}|'
643
+
509
644
  d_hyper = self._cfg['training']['hyper']
510
- d_form = { f'\\verb|{key}|' : f'\\verb|{val}|' for key, val in d_hyper.items() }
645
+ d_form = { f'\\verb|{key}|' : format_value(val) for key, val in d_hyper.items() }
511
646
  d_latex = { 'Hyperparameter' : list(d_form.keys()), 'Value' : list(d_form.values())}
512
647
 
513
648
  df = pnd.DataFrame(d_latex)
514
- val_dir = self._cfg['plotting']['val_dir']
649
+ val_dir = self._cfg['saving']['output']
515
650
  os.makedirs(val_dir, exist_ok=True)
516
651
  put.df_to_tex(df, f'{val_dir}/hyperparameters.tex')
517
652
  # ---------------------------------------------
518
653
  def _run_diagnostics(self, models : list[cls], rdf : RDataFrame, name : str) -> None:
654
+ log.info(f'Running diagnostics for sample {name}')
519
655
  if 'diagnostics' not in self._cfg:
520
656
  log.warning('Diagnostics section not found, not running diagnostics')
521
657
  return
@@ -535,24 +671,270 @@ class TrainMva:
535
671
  cvd = CVDiagnostics(models=models, rdf=rdf, cfg=cfg_diag)
536
672
  cvd.run()
537
673
  # ---------------------------------------------
538
- def run(self, skip_fit : bool = False, load_trained : bool = False) -> None:
674
+ #
675
+ # Hyperparameter optimization
676
+ # ---------------------------------------------
677
+ def _objective(self, trial, kfold : StratifiedKFold) -> float:
678
+ ft = self._df_ft
679
+ lab= self._l_lab
680
+
681
+ if not issubclass(cls, GradientBoostingClassifier):
682
+ raise NotImplementedError('Hyperparameter optimization only implemented for GradientBoostingClassifier')
683
+
684
+ nft = len(ft.columns)
685
+
686
+ var_learn_rate = trial.suggest_float('learning_rate' , 1e-3, 1e-1, log=True)
687
+ var_max_depth = trial.suggest_int('max_depth' , 2, 15)
688
+ var_max_features= trial.suggest_int('max_features' , 2, nft)
689
+ var_min_split = trial.suggest_int('min_samples_split', 2, 10)
690
+ var_min_samples = trial.suggest_int('min_samples_leaf' , 2, 30)
691
+ var_nestimators = trial.suggest_int('n_estimators' , 50, 400)
692
+
693
+ classifier = GradientBoostingClassifier(
694
+ learning_rate = var_learn_rate,
695
+ max_depth = var_max_depth,
696
+ max_features = var_max_features,
697
+ min_samples_split = var_min_split,
698
+ min_samples_leaf = var_min_samples,
699
+ n_estimators = var_nestimators,
700
+ random_state = self._rdm_state)
701
+
702
+ score = cross_val_score(
703
+ classifier,
704
+ ft,
705
+ lab,
706
+ n_jobs=1, # More than this will reach RLIMIT_NPROC in cluster
707
+ cv=kfold)
708
+
709
+ accuracy = score.mean()
710
+
711
+ return accuracy
712
+ # ---------------------------------------------
713
+ def _optimize_hyperparameters(self, ntrial : int):
714
+ log.info('Running hyperparameter optimization')
715
+
716
+ self._pbar = tqdm.tqdm(total=ntrial, desc='Optimizing')
717
+ kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=self._rdm_state)
718
+ objective = partial(self._objective, kfold=kfold)
719
+
720
+ study = optuna.create_study(
721
+ direction='maximize',
722
+ pruner = optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=5),)
723
+
724
+ study.optimize(
725
+ objective,
726
+ callbacks = [self._update_progress],
727
+ n_jobs = self._nworkers,
728
+ n_trials = ntrial)
729
+
730
+ self._print_hyper_opt(study=study)
731
+ self._plot_hyper_opt(study=study)
732
+
733
+ log.info('Overriding hyperparameters with optimized values')
734
+
735
+ self._cfg['training']['hyper'] = study.best_params
736
+ # ---------------------------------------------
737
+ def _plot_hyper_opt(self, study) -> None:
738
+ out_dir = self._cfg['saving']['output']
739
+ opt_dir = f'{out_dir}/optimization'
740
+ os.makedirs(opt_dir, exist_ok=True)
741
+
742
+ trials_df = study.trials_dataframe()
743
+
744
+ plt.plot(trials_df['number'], trials_df['value'])
745
+ plt.xlabel('Trial')
746
+ plt.ylabel('Accuracy')
747
+ plt.title('Optimization History')
748
+ plt.grid(True)
749
+ plt.savefig(f'{opt_dir}/history.png')
750
+ plt.close()
751
+
752
+ plt.hist(trials_df['value'], bins=20, alpha=0.7)
753
+ plt.xlabel('Accuracy')
754
+ plt.ylabel('Frequency')
755
+ plt.title('Distribution of Trial Results')
756
+ plt.savefig(f'{opt_dir}/accuracy.png')
757
+ plt.close()
758
+ # ---------------------------------------------
759
+ def _update_progress(self, study, _trial):
760
+ self._pbar.set_postfix({'Best': f'{study.best_value:.4f}' if study.best_value else 'N/A'})
761
+ self._pbar.update(1)
762
+ # ---------------------------------------------
763
+ def _print_hyper_opt(self, study) -> None:
764
+ log.info(40 * '-')
765
+ log.info('Optimized hyperparameters:')
766
+ log.info(40 * '-')
767
+ for name, value in study.best_params.items():
768
+ if isinstance(value, float):
769
+ log.info(f'{name:<20}{value:.3f}')
770
+ else:
771
+ log.info(f'{name:<20}{value}')
772
+ # ---------------------------------------------
773
+ # ---------------------------------------------
774
+ def _auc_from_json(self, ifold : int, kind : str) -> float:
775
+ val_dir = self._cfg['saving']['output']
776
+ path = f'{val_dir}/fold_{ifold:03}/roc_{kind}.json'
777
+ df = pnd.read_json(path)
778
+
779
+ return auc(df['x'], df['y'])
780
+ # ---------------------------------------------
781
+ def _check_overtraining(self) -> None:
782
+ nfold = self._cfg['training']['nfold']
783
+
784
+ df = pnd.DataFrame(columns=['fold'])
785
+ df['fold' ]= numpy.linspace(0, nfold - 1, nfold, dtype=int)
786
+ df['test' ]= df['fold'].apply(self._auc_from_json, args=('test' ,))
787
+ df['train']= df['fold'].apply(self._auc_from_json, args=('train',))
788
+
789
+ ax=None
790
+ ax=df.plot('fold', 'test' , color='blue', label='Testing sample' , ax=ax)
791
+ ax=df.plot('fold', 'train', color='red' , label='Training sample', ax=ax)
792
+ ax.set_ylim(bottom=0.75, top=1.00)
793
+ ax.set_ylabel('AUC')
794
+ ax.set_xlabel('Fold')
795
+
796
+ plt.grid()
797
+
798
+ val_dir = self._cfg['saving']['output']
799
+ path = f'{val_dir}/fold_all/auc_vs_fold.png'
800
+ plt.savefig(path)
801
+ plt.close()
802
+ # ---------------------------------------------
803
+ def run(
804
+ self,
805
+ skip_fit : bool = False,
806
+ opt_ntrial : int = 0,
807
+ load_trained : bool = False) -> float:
539
808
  '''
540
809
  Will do the training
541
810
 
542
- skip_fit: By default false, if True, it will only do the plots of features and save tables
543
- load_trained: If true, it will load the models instead of training, by default false
811
+ skip_fit : By default false, if True, it will only do the plots of features and save tables
812
+ opt_ntrial : Number of optimization tries for hyperparameter optimization, by default zero, i.e. no optimization will run
813
+ load_trained: If true, it will load the models instead of training, by default false.
814
+
815
+ Returns
816
+ ----------------
817
+ Area under the ROC curve from evaluating the classifiers
818
+ on samples that were not used in their training. Uses the full sample
544
819
  '''
545
- self._save_settings_to_tex()
546
820
  self._plot_features()
547
821
 
548
822
  if skip_fit:
549
- return
823
+ return self._auc
824
+
825
+ if opt_ntrial > 0:
826
+ self._optimize_hyperparameters(ntrial=opt_ntrial)
550
827
 
828
+ self._save_settings_to_tex()
551
829
  l_mod = self._get_models(load_trained = load_trained)
552
830
  if not load_trained:
553
831
  for ifold, mod in enumerate(l_mod):
554
832
  self._save_model(mod, ifold)
555
833
 
834
+ self._check_overtraining()
556
835
  self._run_diagnostics(models = l_mod, rdf = self._rdf_sig_org, name='Signal' )
557
836
  self._run_diagnostics(models = l_mod, rdf = self._rdf_bkg_org, name='Background')
837
+
838
+ return self._auc
839
+ # ---------------------------------------------
840
+ @contextmanager
841
+ def use(self, nworkers : int) -> None:
842
+ '''
843
+ Context manager used to run with a specific configuration
844
+
845
+ nworkers: Use this number of workers for ANY process that can be parallelized.
846
+ '''
847
+ old = self._nworkers
848
+
849
+ log.info(f'Using {nworkers} workers to run training')
850
+
851
+ self._nworkers = nworkers
852
+ try:
853
+ yield
854
+ finally:
855
+ self._nworkers = old
856
+ # ---------------------------------------------
857
+ @staticmethod
858
+ def plot_roc_from_prob(
859
+ arr_sig_prb : NPA,
860
+ arr_bkg_prb : NPA,
861
+ kind : str,
862
+ ifold : int,
863
+ color : str = None) -> tuple[NPA,NPA, float]:
864
+ '''
865
+ Takes arrays of signal and background probabilities
866
+ and plots ROC curve
867
+
868
+ Parameters
869
+ --------------------
870
+ arr_bkg/sig_prb : Array with background/signal probabilities
871
+ kind : String used to label the plot
872
+ ifold : If no fold makes sense (i.e. this is the full sample), use ifold=-1
873
+ kind : Used to label the plot
874
+ color : String with color of curve
875
+
876
+ Returns
877
+ --------------------
878
+ Tuple with 3 elements:
879
+
880
+ - Array of x coordinates of ROC curve
881
+ - Array of y coordinates of ROC curve
882
+ - Area under the curve
883
+ '''
884
+ arr_sig_lab = numpy.ones_like( arr_sig_prb)
885
+ arr_bkg_lab = numpy.zeros_like(arr_bkg_prb)
886
+
887
+ arr_prb = numpy.concatenate([arr_sig_prb, arr_bkg_prb])
888
+ arr_lab = numpy.concatenate([arr_sig_lab, arr_bkg_lab])
889
+
890
+ res = TrainMva.plot_roc(
891
+ l_lab=arr_lab,
892
+ l_prb=arr_prb,
893
+ color=color,
894
+ kind =kind,
895
+ ifold=ifold)
896
+
897
+ return res
898
+ # ---------------------------------------------
899
+ @staticmethod
900
+ def plot_roc(
901
+ l_lab : NPA,
902
+ l_prb : NPA,
903
+ kind : str,
904
+ ifold : int,
905
+ color : str = None) -> tuple[NPA, NPA, float]:
906
+ '''
907
+ Takes the labels and the probabilities and plots ROC
908
+ curve for given fold
909
+
910
+ Parameters
911
+ --------------------
912
+ ifold : If no fold makes sense (i.e. this is the full sample), use ifold=-1
913
+ kind : Used to label the plot
914
+
915
+ Returns
916
+ --------------------
917
+ Tuple with 3 elements:
918
+
919
+ - Array of x coordinates of ROC curve
920
+ - Array of y coordinates of ROC curve
921
+ - Area under the curve
922
+ '''
923
+ log.debug(f'Plotting ROC curve for {ifold} fold')
924
+
925
+ xval, yval, _ = roc_curve(l_lab, l_prb)
926
+ xval = 1 - xval
927
+ area = auc(xval, yval)
928
+
929
+ if color is None:
930
+ color='red' if kind == 'Train' else 'blue'
931
+
932
+ if ifold == -1:
933
+ label=f'Test sample: {area:.3f}'
934
+ else:
935
+ label=f'{kind}: {area:.3f}'
936
+
937
+ plt.plot(xval, yval, color=color, label=label)
938
+
939
+ return xval, yval, area
558
940
  # ---------------------------------------------