PyPI - data-manipulation-utilities - Versions diffs - 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl - Mend

data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/METADATA +641 -44
data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD +93 -0
{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/WHEEL +1 -1
{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/entry_points.txt +1 -0
dmu/__init__.py +0 -0
dmu/generic/hashing.py +34 -8
dmu/generic/utilities.py +164 -11
dmu/logging/log_store.py +34 -2
dmu/logging/messages.py +96 -0
dmu/ml/cv_classifier.py +3 -3
dmu/ml/cv_diagnostics.py +3 -0
dmu/ml/cv_performance.py +58 -0
dmu/ml/cv_predict.py +149 -46
dmu/ml/train_mva.py +482 -100
dmu/ml/utilities.py +29 -10
dmu/pdataframe/utilities.py +28 -3
dmu/plotting/fwhm.py +2 -2
dmu/plotting/matrix.py +1 -1
dmu/plotting/plotter.py +23 -3
dmu/plotting/plotter_1d.py +96 -32
dmu/plotting/plotter_2d.py +5 -0
dmu/rdataframe/utilities.py +54 -3
dmu/rfile/ddfgetter.py +102 -0
dmu/stats/fit_stats.py +129 -0
dmu/stats/fitter.py +55 -22
dmu/stats/gof_calculator.py +7 -0
dmu/stats/model_factory.py +153 -62
dmu/stats/parameters.py +100 -0
dmu/stats/utilities.py +443 -12
dmu/stats/wdata.py +187 -0
dmu/stats/zfit.py +17 -0
dmu/stats/zfit_plotter.py +147 -36
dmu/testing/utilities.py +102 -24
dmu/workflow/__init__.py +0 -0
dmu/workflow/cache.py +266 -0
dmu_data/ml/tests/train_mva.yaml +9 -7
dmu_data/ml/tests/train_mva_def.yaml +75 -0
dmu_data/ml/tests/train_mva_with_diagnostics.yaml +10 -5
dmu_data/ml/tests/train_mva_with_preffix.yaml +58 -0
dmu_data/plotting/tests/2d.yaml +5 -5
dmu_data/plotting/tests/line.yaml +15 -0
dmu_data/plotting/tests/styling.yaml +8 -1
dmu_data/rfile/friends.yaml +13 -0
dmu_data/stats/fitter/test_simple.yaml +28 -0
dmu_data/stats/kde_optimizer/control.json +1 -0
dmu_data/stats/kde_optimizer/signal.json +1 -0
dmu_data/stats/parameters/data.yaml +178 -0
dmu_data/tests/config.json +6 -0
dmu_data/tests/config.yaml +4 -0
dmu_data/tests/pdf_to_tex.txt +34 -0
dmu_scripts/kerberos/check_expiration +21 -0
dmu_scripts/kerberos/convert_certificate +22 -0
dmu_scripts/ml/compare_classifiers.py +85 -0
data_manipulation_utilities-0.2.7.dist-info/RECORD +0 -69
{data_manipulation_utilities-0.2.7.data → data_manipulation_utilities-0.2.8.dev714.data}/scripts/publish +0 -0
{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/top_level.txt +0 -0

dmu/ml/train_mva.py CHANGED Viewed

@@ -4,17 +4,27 @@ Module with TrainMva class
 # pylint: disable = too-many-locals, no-name-in-module
 # pylint: disable = too-many-arguments, too-many-positional-arguments
 # pylint: disable = too-many-instance-attributes
+# pylint: disable = too-many-arguments, too-many-positional-arguments
 import os
 import copy
+import json
+import math
+from contextlib import contextmanager
+from typing     import Optional, Union
+from functools  import partial
+import tqdm
 import joblib
+import optuna
 import pandas as pnd
 import numpy
 import matplotlib.pyplot as plt
 from sklearn.metrics         import roc_curve, auc
-from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import StratifiedKFold, cross_val_score
+from sklearn.ensemble        import GradientBoostingClassifier
 from ROOT import RDataFrame, RDF
@@ -31,10 +41,20 @@ from dmu.logging.log_store   import LogStore
 NPA = numpy.ndarray
 log = LogStore.add_logger('dmu:ml:train_mva')
 # ---------------------------------------------
+class NoFeatureInfo(Exception):
+    '''
+    Raised when information about a feature is missing in the config file,
+    e.g. when no plot or no axis labels are configured for a training feature
+    '''
+    def __init__(self, message : str):
+        # Only forwards the message to Exception, it becomes the text of the error
+        super().__init__(message)
+# ---------------------------------------------
 class TrainMva:
     '''
     Interface to scikit learn used to train classifier
     '''
+    # TODO:
+    # - Hyperparameter optimization methods should go into their own class
+    # - Data preprocessing methods might need their own class
     # ---------------------------------------------
     def __init__(self, bkg : RDataFrame, sig : RDataFrame, cfg : dict):
         '''
@@ -43,13 +63,15 @@ class TrainMva:
         cfg (dict)          : Dictionary storing configuration for training
         '''
         self._cfg       = cfg
+        self._auc       = math.nan # This is where the Area Under the ROC curve for the full sample will be saved
         self._l_ft_name = self._cfg['training']['features']
+        self._pbar      : Optional[tqdm.tqdm]
         self._rdf_sig_org = sig
-        self._rdf_bkg_org = bkg
+        self._rdf_bkg_org = bkg
-        rdf_bkg = self._preprocess_rdf(bkg)
-        rdf_sig = self._preprocess_rdf(sig)
+        rdf_bkg = self._preprocess_rdf(rdf=bkg, kind='bkg')
+        rdf_sig = self._preprocess_rdf(rdf=sig, kind='sig')
         df_ft_sig, l_lab_sig = self._get_sample_inputs(rdf = rdf_sig, label = 1)
         df_ft_bkg, l_lab_bkg = self._get_sample_inputs(rdf = rdf_bkg, label = 0)
@@ -59,6 +81,11 @@ class TrainMva:
         self._rdf_bkg = self._get_rdf(rdf = rdf_bkg, df_feat=df_ft_bkg)
         self._rdf_sig = self._get_rdf(rdf = rdf_sig, df_feat=df_ft_sig)
+        self._rdm_state = 42 # Random state for training classifier
+        self._nworkers  =  1 # Used to set number of workers for ANY process. Can be overriden with `use` context manager
+        optuna.logging.set_verbosity(optuna.logging.WARNING)
     # ---------------------------------------------
     def _get_extra_columns(self, rdf : RDataFrame, df : pnd.DataFrame) -> list[str]:
         d_plot = self._cfg['plotting']['features']['plots']
@@ -124,17 +151,51 @@ class TrainMva:
         log.info(70 * '-')
         return df
+    #---------------------------------
+    def _add_sample_columns(
+            self,
+            rdf  : RDataFrame,
+            kind : str) -> RDataFrame:
+        '''
+        This will apply sample specific column definitions
+        to the dataframe
+        '''
+        try:
+            d_def = self._cfg['dataset']['samples'][kind]['definitions']
+        except KeyError:
+            log.debug(f'Not found sample definitions for {kind}')
+            return rdf
+        log.info(60 * '-')
+        log.info(f'Found sample definitions for {kind}')
+        log.info(60 * '-')
+        for name, expr in d_def.items():
+            log.info(f'{name:<30}{"-->":<10}{expr:<20}')
+            rdf = rdf.Define(name, expr)
+        log.info(60 * '-')
+        return rdf
     # ---------------------------------------------
-    def _preprocess_rdf(self, rdf : RDataFrame) -> RDataFrame:
+    def _preprocess_rdf(self, rdf : RDataFrame, kind : str) -> RDataFrame:
+        rdf = self._add_sample_columns(rdf, kind)
         if 'define' not in self._cfg['dataset']:
             log.debug('No definitions found')
             return rdf
-        log.debug('Definitions found')
+        log.debug(f'Definitions found for {kind}')
         d_def = self._cfg['dataset']['define']
         for name, expr in d_def.items():
             log.debug(f'{name:<20}{expr}')
-            rdf = rdf.Define(name, expr)
+            try:
+                rdf = rdf.Define(name, expr)
+            except TypeError as exc:
+                l_col = [ name.c_str() for name in rdf.GetColumnNames() ]
+                branch_list = 'found_branches.txt'
+                with open(branch_list, 'w', encoding='utf-8') as ifile:
+                    json.dump(l_col, ifile, indent=2)
+                raise TypeError(f'Branches found were dumped to {branch_list}') from exc
         return rdf
     # ---------------------------------------------
@@ -159,7 +220,7 @@ class TrainMva:
         return model
     # ---------------------------------------------
-    def _get_models(self, load_trained : bool):
+    def _get_models(self, load_trained : bool) -> list[cls]:
         '''
         Will create models, train them and return them
         '''
@@ -174,6 +235,11 @@ class TrainMva:
         l_model=[]
         ifold=0
+        l_arr_lab_ts = []
+        l_arr_all_ts = []
+        l_arr_sig_ts = []
+        l_arr_bkg_ts = []
         for arr_itr, arr_its in kfold.split(self._df_ft, self._l_lab):
             log.debug(20 * '-')
             log.info(f'Training fold: {ifold}')
@@ -181,20 +247,103 @@ class TrainMva:
             model = self._get_model(arr_itr)
             l_model.append(model)
-            arr_sig_sig_tr, arr_sig_bkg_tr, arr_sig_all_tr, arr_lab_tr = self._get_scores(model, arr_itr, on_training_ok= True)
-            arr_sig_sig_ts, arr_sig_bkg_ts, arr_sig_all_ts, arr_lab_ts = self._get_scores(model, arr_its, on_training_ok=False)
+            arr_sig_tr, arr_bkg_tr, arr_all_tr, arr_lab_tr = self._get_scores(model, arr_itr, on_training_ok= True)
+            arr_sig_ts, arr_bkg_ts, arr_all_ts, arr_lab_ts = self._get_scores(model, arr_its, on_training_ok=False)
             self._save_feature_importance(model, ifold)
-            self._plot_correlation(arr_itr, ifold)
-            self._plot_scores(arr_sig_sig_tr, arr_sig_sig_ts, arr_sig_bkg_tr, arr_sig_bkg_ts, ifold)
-            self._plot_roc(arr_lab_ts, arr_sig_all_ts, arr_lab_tr, arr_sig_all_tr, ifold)
+            self._plot_correlations(arr_itr, ifold)
+            self._plot_scores(
+                    ifold  =     ifold,
+                    sig_trn=arr_sig_tr,
+                    sig_tst=arr_sig_ts,
+                    bkg_trn=arr_bkg_tr,
+                    bkg_tst=arr_bkg_ts)
+            xval_ts, yval_ts, _ = TrainMva.plot_roc(arr_lab_ts, arr_all_ts, kind='Test' , ifold=ifold)
+            xval_tr, yval_tr, _ = TrainMva.plot_roc(arr_lab_tr, arr_all_tr, kind='Train', ifold=ifold)
+            self._plot_probabilities(xval_tr, yval_tr, arr_all_tr, arr_lab_tr)
+            self._save_roc_plot(ifold=ifold)
+            self._save_roc_json(xval=xval_ts, yval=yval_ts, kind='Test' , ifold=ifold)
+            self._save_roc_json(xval=xval_tr, yval=yval_tr, kind='Train', ifold=ifold)
             ifold+=1
+            l_arr_lab_ts.append(arr_lab_ts)
+            l_arr_all_ts.append(arr_all_ts)
+            l_arr_sig_ts.append(arr_sig_ts)
+            l_arr_bkg_ts.append(arr_bkg_ts)
+        arr_lab_ts = numpy.concatenate(l_arr_lab_ts)
+        arr_all_ts = numpy.concatenate(l_arr_all_ts)
+        arr_sig_ts = numpy.concatenate(l_arr_sig_ts)
+        arr_bkg_ts = numpy.concatenate(l_arr_bkg_ts)
+        xval, yval, self._auc = TrainMva.plot_roc(
+                arr_lab_ts,
+                arr_all_ts,
+                kind ='Test',
+                ifold=-1)
+        self._plot_probabilities(xval, yval, arr_all_ts, arr_lab_ts)
+        self._save_roc_plot(ifold=-1)
+        self._plot_scores(ifold=-1, sig_tst=arr_sig_ts, bkg_tst=arr_bkg_ts)
+        self._save_roc_json(xval=xval, yval=yval, kind='Full', ifold=-1)
         return l_model
     # ---------------------------------------------
+    def _save_roc_json(
+            self,
+            ifold : int,
+            kind  : str,
+            xval  : NPA,
+            yval  : NPA) -> None:
+        ifold    = 'all' if ifold == -1 else ifold # -1 represents all the testing datasets combined
+        val_dir  = self._cfg['saving']['output']
+        name     = kind.lower()
+        val_dir  = f'{val_dir}/fold_{ifold:03}'
+        os.makedirs(val_dir, exist_ok=True)
+        jsn_path = f'{val_dir}/roc_{name}.json'
+        df       = pnd.DataFrame({'x' : xval, 'y' : yval})
+        df.to_json(jsn_path, indent=2)
+    # ---------------------------------------------
+    def _save_roc_plot(self, ifold : int) -> None:
+        min_x = 0
+        min_y = 0
+        ifold = 'all' if ifold == -1 else ifold
+        if 'min' in self._cfg['plotting']['roc']:
+            [min_x, min_y] = self._cfg['plotting']['roc']['min']
+        max_x = 1
+        max_y = 1
+        if 'max' in self._cfg['plotting']['roc']:
+            [max_x, max_y] = self._cfg['plotting']['roc']['max']
+        val_dir  = self._cfg['saving']['output']
+        if ifold == 'all':
+            plt_dir  = f'{val_dir}/fold_all'
+        else:
+            plt_dir  = f'{val_dir}/fold_{ifold:03}'
+        os.makedirs(plt_dir, exist_ok=True)
+        plt.xlabel('Signal efficiency')
+        plt.ylabel('Background rejection')
+        plt.title(f'Fold: {ifold}')
+        plt.xlim(min_x, max_x)
+        plt.ylim(min_y, max_y)
+        plt.grid()
+        plt.legend()
+        plt.savefig(f'{plt_dir}/roc.png')
+        plt.close()
+    # ---------------------------------------------
     def _load_trained_models(self) -> list[cls]:
-        model_path = self._cfg['saving']['path']
+        out_dir    = self._cfg['saving']['output']
+        model_path = f'{out_dir}/model.pkl'
         nfold      = self._cfg['training']['nfold']
         l_model    = []
         for ifold in range(nfold):
@@ -212,18 +361,18 @@ class TrainMva:
     def _labels_from_varnames(self, l_var_name : list[str]) -> list[str]:
         try:
             d_plot = self._cfg['plotting']['features']['plots']
-        except ValueError:
-            log.warning('Cannot find plotting/features/plots section in config, using dataframe names')
-            return l_var_name
+        except KeyError as exc:
+            raise KeyError('Cannot find plotting/features/plots section in config, using dataframe names') from exc
         l_label = []
         for var_name in l_var_name:
             if var_name not in d_plot:
-                log.warning(f'No plot found for: {var_name}')
-                l_label.append(var_name)
-                continue
+                raise NoFeatureInfo(f'No plot found for feature {var_name}, cannot extract label')
             d_setting = d_plot[var_name]
+            if 'labels' not in d_setting:
+                raise NoFeatureInfo(f'No no labels present for plot of feature {var_name}, cannot extract label')
             [xlab, _ ]= d_setting['labels']
             l_label.append(xlab)
@@ -237,7 +386,7 @@ class TrainMva:
         d_data['Variable'  ] = self._labels_from_varnames(l_var_name)
         d_data['Importance'] = 100 * model.feature_importances_
-        val_dir  = self._cfg['plotting']['val_dir']
+        val_dir  = self._cfg['saving']['output']
         val_dir  = f'{val_dir}/fold_{ifold:03}'
         os.makedirs(val_dir, exist_ok=True)
@@ -290,7 +439,9 @@ class TrainMva:
         '''
         Saves a model, associated to a specific fold
         '''
-        model_path = self._cfg['saving']['path']
+        out_dir    = self._cfg['saving']['output']
+        model_path = f'{out_dir}/model.pkl'
         if os.path.isfile(model_path):
             log.info(f'Model found in {model_path}, not saving')
             return
@@ -327,21 +478,40 @@ class TrainMva:
         return cfg
     # ---------------------------------------------
-    def _plot_correlation(self, arr_index : NPA, ifold : int) -> None:
+    def _plot_correlations(self, arr_index : NPA, ifold : int) -> None:
+        log.debug('Plotting correlations')
         df_ft = self._df_ft.iloc[arr_index]
+        l_lab = self._l_lab[arr_index]
+        arr_sig_idx, = numpy.where(l_lab == 1)
+        arr_bkg_idx, = numpy.where(l_lab == 0)
+        df_ft_sig = df_ft.iloc[arr_sig_idx]
+        df_ft_bkg = df_ft.iloc[arr_bkg_idx]
+        self._plot_correlation(df_ft=df_ft_sig, ifold=ifold, name='signal'    )
+        self._plot_correlation(df_ft=df_ft_bkg, ifold=ifold, name='background')
+    # ---------------------------------------------
+    def _plot_correlation(
+            self,
+            df_ft : pnd.DataFrame,
+            ifold : int,
+            name  : str) -> None:
+        log.debug(f'Plotting correlation for {name}/{ifold} fold')
         cfg = self._get_correlation_cfg(df_ft, ifold)
         cov = df_ft.corr()
         mat = cov.to_numpy()
-        log.debug(f'Plotting correlation for {ifold} fold')
-        val_dir  = self._cfg['plotting']['val_dir']
+        val_dir  = self._cfg['saving']['output']
         val_dir  = f'{val_dir}/fold_{ifold:03}'
         os.makedirs(val_dir, exist_ok=True)
         obj = MatrixPlotter(mat=mat, cfg=cfg)
         obj.plot()
-        plt.savefig(f'{val_dir}/covariance.png')
+        plt.savefig(f'{val_dir}/correlation_{name}.png')
         plt.close()
     # ---------------------------------------------
     def _get_nentries(self, arr_val : NPA) -> str:
@@ -350,26 +520,29 @@ class TrainMva:
         return f'{size:.2f}K'
     # ---------------------------------------------
-    def _plot_scores(self, arr_sig_trn, arr_sig_tst, arr_bkg_trn, arr_bkg_tst, ifold):
-        # pylint: disable = too-many-arguments, too-many-positional-arguments
+    def _plot_scores(
+            self,
+            ifold   : int,
+            sig_tst : NPA,
+            bkg_tst : NPA,
+            sig_trn : NPA = None,
+            bkg_trn : NPA = None) -> None:
         '''
         Will plot an array of scores, associated to a given fold
         '''
+        ifold = 'all' if ifold == -1 else ifold
         log.debug(f'Plotting scores for {ifold} fold')
-        if 'val_dir' not in self._cfg['plotting']:
-            log.warning('Scores path not passed, not plotting scores')
-            return
-        val_dir  = self._cfg['plotting']['val_dir']
+        val_dir  = self._cfg['saving']['output']
         val_dir  = f'{val_dir}/fold_{ifold:03}'
         os.makedirs(val_dir, exist_ok=True)
-        plt.hist(arr_sig_trn, alpha   =   0.3, bins=50, range=(0,1), color='b', density=True, label='Signal Train: '    + self._get_nentries(arr_sig_trn))
-        plt.hist(arr_sig_tst, histtype='step', bins=50, range=(0,1), color='b', density=True, label='Signal Test: '     + self._get_nentries(arr_sig_tst))
+        plt.hist(sig_tst, histtype='step', bins=50, range=(0,1), color='b', density=True, label='Signal Test: '     + self._get_nentries(sig_tst))
+        plt.hist(bkg_tst, histtype='step', bins=50, range=(0,1), color='r', density=True, label='Background Test: ' + self._get_nentries(bkg_tst))
-        plt.hist(arr_bkg_trn, alpha   =   0.3, bins=50, range=(0,1), color='r', density=True, label='Background Train: '+ self._get_nentries(arr_bkg_trn))
-        plt.hist(arr_bkg_tst, histtype='step', bins=50, range=(0,1), color='r', density=True, label='Background Test: ' + self._get_nentries(arr_bkg_tst))
+        if sig_trn is not None and bkg_trn is not None:
+            plt.hist(sig_trn, alpha = 0.3, bins=50, range=(0,1), color='b', density=True, label='Signal Train: '    + self._get_nentries(sig_trn))
+            plt.hist(bkg_trn, alpha = 0.3, bins=50, range=(0,1), color='r', density=True, label='Background Train: '+ self._get_nentries(bkg_trn))
         plt.legend()
         plt.title(f'Fold: {ifold}')
@@ -378,59 +551,12 @@ class TrainMva:
         plt.savefig(f'{val_dir}/scores.png')
         plt.close()
     # ---------------------------------------------
-    def _plot_roc(self,
-                  l_lab_ts : NPA,
-                  l_prb_ts : NPA,
-                  l_lab_tr : NPA,
-                  l_prb_tr : NPA,
-                  ifold    : int):
-        '''
-        Takes the labels and the probabilities and plots ROC
-        curve for given fold
-        '''
-        log.debug(f'Plotting ROC curve for {ifold} fold')
-        val_dir  = self._cfg['plotting']['val_dir']
-        val_dir  = f'{val_dir}/fold_{ifold:03}'
-        os.makedirs(val_dir, exist_ok=True)
-        xval_ts, yval_ts, _ = roc_curve(l_lab_ts, l_prb_ts)
-        xval_ts             = 1 - xval_ts
-        area_ts             = auc(xval_ts, yval_ts)
-        xval_tr, yval_tr, _ = roc_curve(l_lab_tr, l_prb_tr)
-        xval_tr             = 1 - xval_tr
-        area_tr             = auc(xval_tr, yval_tr)
-        min_x = 0
-        min_y = 0
-        if 'min' in self._cfg['plotting']['roc']:
-            [min_x, min_y] = self._cfg['plotting']['roc']['min']
-        max_x = 1
-        max_y = 1
-        if 'max' in self._cfg['plotting']['roc']:
-            [max_x, max_y] = self._cfg['plotting']['roc']['max']
-        plt.plot(xval_ts, yval_ts, color='b', label=f'Test: {area_ts:.3f}')
-        plt.plot(xval_tr, yval_tr, color='r', label=f'Train: {area_tr:.3f}')
-        self._plot_probabilities(xval_ts, yval_ts, l_prb_ts, l_lab_ts)
-        plt.xlabel('Signal efficiency')
-        plt.ylabel('Background rejection')
-        plt.title(f'Fold: {ifold}')
-        plt.xlim(min_x, max_x)
-        plt.ylim(min_y, max_y)
-        plt.grid()
-        plt.legend()
-        plt.savefig(f'{val_dir}/roc.png')
-        plt.close()
-    # ---------------------------------------------
-    def _plot_probabilities(self,
-                            arr_seff: NPA,
-                            arr_brej: NPA,
-                            arr_sprb: NPA,
-                            arr_labl: NPA) -> None:
+    def _plot_probabilities(
+            self,
+            arr_seff: NPA,
+            arr_brej: NPA,
+            arr_sprb: NPA,
+            arr_labl: NPA) -> None:
         roc_cfg = self._cfg['plotting']['roc']
         if 'annotate' not in roc_cfg:
@@ -475,7 +601,10 @@ class TrainMva:
         '''
         Will plot the features, based on the settings in the config
         '''
-        d_cfg = self._cfg['plotting']['features']
+        out_dir         = self._cfg['saving']['output']
+        d_cfg           = self._cfg['plotting']['features']
+        d_cfg['saving'] = {'plt_dir' : f'{out_dir}/features'}
         ptr   = Plotter(d_rdf = {'Signal' : self._rdf_sig, 'Background' : self._rdf_bkg}, cfg=d_cfg)
         ptr.run()
     # ---------------------------------------------
@@ -498,7 +627,7 @@ class TrainMva:
         d_tex = {'Variable' : l_lab, 'Replacement' : l_val}
         df    = pnd.DataFrame(d_tex)
-        val_dir  = self._cfg['plotting']['val_dir']
+        val_dir  = self._cfg['saving']['output']
         os.makedirs(val_dir, exist_ok=True)
         put.df_to_tex(df, f'{val_dir}/nan_replacement.tex')
     # ---------------------------------------------
@@ -506,16 +635,23 @@ class TrainMva:
         if 'hyper' not in self._cfg['training']:
             raise ValueError('Cannot find hyper parameters in configuration')
+        def format_value(val : Union[int,float]) -> str:
+            if isinstance(val, float):
+                return f'\\verb|{val:.3f}|'
+            return f'\\verb|{val}|'
         d_hyper = self._cfg['training']['hyper']
-        d_form  = { f'\\verb|{key}|' : f'\\verb|{val}|' for key, val in d_hyper.items() }
+        d_form  = { f'\\verb|{key}|' : format_value(val) for key, val in d_hyper.items() }
         d_latex = { 'Hyperparameter' : list(d_form.keys()), 'Value' : list(d_form.values())}
         df = pnd.DataFrame(d_latex)
-        val_dir  = self._cfg['plotting']['val_dir']
+        val_dir  = self._cfg['saving']['output']
         os.makedirs(val_dir, exist_ok=True)
         put.df_to_tex(df, f'{val_dir}/hyperparameters.tex')
     # ---------------------------------------------
     def _run_diagnostics(self, models : list[cls], rdf : RDataFrame, name : str) -> None:
+        log.info(f'Running diagnostics for sample {name}')
         if 'diagnostics' not in self._cfg:
             log.warning('Diagnostics section not found, not running diagnostics')
             return
@@ -535,24 +671,270 @@ class TrainMva:
         cvd = CVDiagnostics(models=models, rdf=rdf, cfg=cfg_diag)
         cvd.run()
     # ---------------------------------------------
-    def run(self, skip_fit : bool = False, load_trained : bool = False) -> None:
+    #
+    # Hyperparameter optimization
+    # ---------------------------------------------
+    def _objective(self, trial, kfold : StratifiedKFold) -> float:
+        ft = self._df_ft
+        lab= self._l_lab
+        if not issubclass(cls, GradientBoostingClassifier):
+            raise NotImplementedError('Hyperparameter optimization only implemented for GradientBoostingClassifier')
+        nft = len(ft.columns)
+        var_learn_rate  = trial.suggest_float('learning_rate'  , 1e-3, 1e-1, log=True)
+        var_max_depth   = trial.suggest_int('max_depth'        ,    2,   15)
+        var_max_features= trial.suggest_int('max_features'     ,    2,  nft)
+        var_min_split   = trial.suggest_int('min_samples_split',    2,   10)
+        var_min_samples = trial.suggest_int('min_samples_leaf' ,    2,   30)
+        var_nestimators = trial.suggest_int('n_estimators'     ,   50,  400)
+        classifier = GradientBoostingClassifier(
+            learning_rate     = var_learn_rate,
+            max_depth         = var_max_depth,
+            max_features      = var_max_features,
+            min_samples_split = var_min_split,
+            min_samples_leaf  = var_min_samples,
+            n_estimators      = var_nestimators,
+            random_state      = self._rdm_state)
+        score = cross_val_score(
+                classifier,
+                ft,
+                lab,
+                n_jobs=1, # More than this will reach RLIMIT_NPROC in cluster
+                cv=kfold)
+        accuracy = score.mean()
+        return accuracy
+    # ---------------------------------------------
+    def _optimize_hyperparameters(self, ntrial : int):
+        log.info('Running hyperparameter optimization')
+        self._pbar = tqdm.tqdm(total=ntrial, desc='Optimizing')
+        kfold      = StratifiedKFold(n_splits=5, shuffle=True, random_state=self._rdm_state)
+        objective  = partial(self._objective, kfold=kfold)
+        study = optuna.create_study(
+                direction='maximize',
+                pruner   = optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=5),)
+        study.optimize(
+                objective,
+                callbacks = [self._update_progress],
+                n_jobs    = self._nworkers,
+                n_trials  = ntrial)
+        self._print_hyper_opt(study=study)
+        self._plot_hyper_opt(study=study)
+        log.info('Overriding hyperparameters with optimized values')
+        self._cfg['training']['hyper'] = study.best_params
+    # ---------------------------------------------
+    def _plot_hyper_opt(self, study) -> None:
+        out_dir = self._cfg['saving']['output']
+        opt_dir = f'{out_dir}/optimization'
+        os.makedirs(opt_dir, exist_ok=True)
+        trials_df = study.trials_dataframe()
+        plt.plot(trials_df['number'], trials_df['value'])
+        plt.xlabel('Trial')
+        plt.ylabel('Accuracy')
+        plt.title('Optimization History')
+        plt.grid(True)
+        plt.savefig(f'{opt_dir}/history.png')
+        plt.close()
+        plt.hist(trials_df['value'], bins=20, alpha=0.7)
+        plt.xlabel('Accuracy')
+        plt.ylabel('Frequency')
+        plt.title('Distribution of Trial Results')
+        plt.savefig(f'{opt_dir}/accuracy.png')
+        plt.close()
+    # ---------------------------------------------
+    def _update_progress(self, study, _trial):
+        self._pbar.set_postfix({'Best': f'{study.best_value:.4f}' if study.best_value else 'N/A'})
+        self._pbar.update(1)
+    # ---------------------------------------------
+    def _print_hyper_opt(self, study) -> None:
+        log.info(40 * '-')
+        log.info('Optimized hyperparameters:')
+        log.info(40 * '-')
+        for name, value in study.best_params.items():
+            if isinstance(value, float):
+                log.info(f'{name:<20}{value:.3f}')
+            else:
+                log.info(f'{name:<20}{value}')
+    # ---------------------------------------------
+    # ---------------------------------------------
+    def _auc_from_json(self, ifold : int, kind : str) -> float:
+        val_dir = self._cfg['saving']['output']
+        path    = f'{val_dir}/fold_{ifold:03}/roc_{kind}.json'
+        df      = pnd.read_json(path)
+        return auc(df['x'], df['y'])
+    # ---------------------------------------------
+    def _check_overtraining(self) -> None:
+        nfold      = self._cfg['training']['nfold']
+        df         = pnd.DataFrame(columns=['fold'])
+        df['fold' ]= numpy.linspace(0, nfold - 1, nfold, dtype=int)
+        df['test' ]= df['fold'].apply(self._auc_from_json, args=('test' ,))
+        df['train']= df['fold'].apply(self._auc_from_json, args=('train',))
+        ax=None
+        ax=df.plot('fold', 'test' , color='blue', label='Testing sample' , ax=ax)
+        ax=df.plot('fold', 'train', color='red' , label='Training sample', ax=ax)
+        ax.set_ylim(bottom=0.75, top=1.00)
+        ax.set_ylabel('AUC')
+        ax.set_xlabel('Fold')
+        plt.grid()
+        val_dir = self._cfg['saving']['output']
+        path    = f'{val_dir}/fold_all/auc_vs_fold.png'
+        plt.savefig(path)
+        plt.close()
+    # ---------------------------------------------
+    def run(
+            self,
+            skip_fit     : bool = False,
+            opt_ntrial   : int  =     0,
+            load_trained : bool = False) -> float:
         '''
         Will do the training
-        skip_fit: By default false, if True, it will only do the plots of features and save tables
-        load_trained: If true, it will load the models instead of training, by default false
+        skip_fit    : By default false, if True, it will only do the plots of features and save tables
+        opt_ntrial  : Number of optimization tries for hyperparameter optimization, by default zero, i.e. no optimization will run
+        load_trained: If true, it will load the models instead of training, by default false.
+        Returns
+        ----------------
+        Area under the ROC curve from evaluating the classifiers
+        on samples that were not used in their training. Uses the full sample
         '''
-        self._save_settings_to_tex()
         self._plot_features()
         if skip_fit:
-            return
+            return self._auc
+        if opt_ntrial > 0:
+            self._optimize_hyperparameters(ntrial=opt_ntrial)
+        self._save_settings_to_tex()
         l_mod = self._get_models(load_trained = load_trained)
         if not load_trained:
             for ifold, mod in enumerate(l_mod):
                 self._save_model(mod, ifold)
+        self._check_overtraining()
         self._run_diagnostics(models = l_mod, rdf = self._rdf_sig_org, name='Signal'    )
         self._run_diagnostics(models = l_mod, rdf = self._rdf_bkg_org, name='Background')
+        return self._auc
+    # ---------------------------------------------
+    @contextmanager
+    def use(self, nworkers : int) -> None:
+        '''
+        Context manager used to run with a specific configuration
+        Parameters
+        --------------------
+        nworkers: Use this number of workers for ANY process that can be parallelized.
+        Example
+        --------------------
+        with trainer.use(nworkers=4):
+            trainer.run(opt_ntrial=20)
+        '''
+        # Remember the current value to put it back when the context is left
+        old = self._nworkers
+        log.info(f'Using {nworkers} workers to run training')
+        self._nworkers = nworkers
+        try:
+            yield
+        finally:
+            # Runs also when the body of the with statement raises
+            self._nworkers = old
+    # ---------------------------------------------
+    @staticmethod
+    def plot_roc_from_prob(
+            arr_sig_prb : NPA,
+            arr_bkg_prb : NPA,
+            kind        : str,
+            ifold       : int,
+            color       : str = None) -> tuple[NPA,NPA, float]:
+        '''
+        Takes arrays of signal and background probabilities
+        and plots ROC curve
+        Parameters
+        --------------------
+        arr_bkg/sig_prb : Array with background/signal probabilities
+        kind            : String used to label the plot
+        ifold           : If no fold makes sense (i.e. this is the full sample), use ifold=-1
+        color           : String with color of curve, if None plot_roc picks red for kind Train and blue otherwise
+        Returns
+        --------------------
+        Tuple with 3 elements:
+        - Array of x coordinates of ROC curve
+        - Array of y coordinates of ROC curve
+        - Area under the curve
+        '''
+        # Signal entries are labeled with one and background entries with zero
+        arr_sig_lab = numpy.ones_like( arr_sig_prb)
+        arr_bkg_lab = numpy.zeros_like(arr_bkg_prb)
+        # Probabilities and labels are concatenated in the same order, signal first
+        arr_prb     = numpy.concatenate([arr_sig_prb, arr_bkg_prb])
+        arr_lab     = numpy.concatenate([arr_sig_lab, arr_bkg_lab])
+        res = TrainMva.plot_roc(
+                l_lab=arr_lab,
+                l_prb=arr_prb,
+                color=color,
+                kind =kind,
+                ifold=ifold)
+        return res
+    # ---------------------------------------------
+    @staticmethod
+    def plot_roc(
+            l_lab : NPA,
+            l_prb : NPA,
+            kind  : str,
+            ifold : int,
+            color : str = None) -> tuple[NPA, NPA, float]:
+        '''
+        Takes the labels and the probabilities and plots ROC
+        curve for given fold. The curve is drawn on the current
+        figure, which is neither saved nor closed here
+        Parameters
+        --------------------
+        l_lab : Array with the true labels of the entries
+        l_prb : Array with the probabilities assigned to the entries
+        ifold : If no fold makes sense (i.e. this is the full sample), use ifold=-1
+        kind  : Used to label the plot
+        color : Color of the curve, if None red is used for kind Train and blue otherwise
+        Returns
+        --------------------
+        Tuple with 3 elements:
+        - Array of x coordinates of ROC curve
+        - Array of y coordinates of ROC curve
+        - Area under the curve
+        '''
+        log.debug(f'Plotting ROC curve for {ifold} fold')
+        # NOTE(review): roc_curve returns (fpr, tpr, thresholds), so xval below is
+        # 1 - FPR and yval is the TPR, while _save_roc_plot labels the x axis as
+        # signal efficiency and the y axis as background rejection. Confirm the intended convention
+        xval, yval, _ = roc_curve(l_lab, l_prb)
+        xval          = 1 - xval
+        area          = auc(xval, yval)
+        if color is None:
+            color='red' if kind == 'Train' else 'blue'
+        # For the full sample the legend entry does not depend on kind
+        if ifold == -1:
+            label=f'Test sample: {area:.3f}'
+        else:
+            label=f'{kind}: {area:.3f}'
+        plt.plot(xval, yval, color=color, label=label)
+        return xval, yval, area
 # ---------------------------------------------

data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl