PyPI - data-manipulation-utilities - Versions diffs - 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl - Mend

data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/METADATA +641 -44
data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD +93 -0
{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/WHEEL +1 -1
{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/entry_points.txt +1 -0
dmu/__init__.py +0 -0
dmu/generic/hashing.py +34 -8
dmu/generic/utilities.py +164 -11
dmu/logging/log_store.py +34 -2
dmu/logging/messages.py +96 -0
dmu/ml/cv_classifier.py +3 -3
dmu/ml/cv_diagnostics.py +3 -0
dmu/ml/cv_performance.py +58 -0
dmu/ml/cv_predict.py +149 -46
dmu/ml/train_mva.py +482 -100
dmu/ml/utilities.py +29 -10
dmu/pdataframe/utilities.py +28 -3
dmu/plotting/fwhm.py +2 -2
dmu/plotting/matrix.py +1 -1
dmu/plotting/plotter.py +23 -3
dmu/plotting/plotter_1d.py +96 -32
dmu/plotting/plotter_2d.py +5 -0
dmu/rdataframe/utilities.py +54 -3
dmu/rfile/ddfgetter.py +102 -0
dmu/stats/fit_stats.py +129 -0
dmu/stats/fitter.py +55 -22
dmu/stats/gof_calculator.py +7 -0
dmu/stats/model_factory.py +153 -62
dmu/stats/parameters.py +100 -0
dmu/stats/utilities.py +443 -12
dmu/stats/wdata.py +187 -0
dmu/stats/zfit.py +17 -0
dmu/stats/zfit_plotter.py +147 -36
dmu/testing/utilities.py +102 -24
dmu/workflow/__init__.py +0 -0
dmu/workflow/cache.py +266 -0
dmu_data/ml/tests/train_mva.yaml +9 -7
dmu_data/ml/tests/train_mva_def.yaml +75 -0
dmu_data/ml/tests/train_mva_with_diagnostics.yaml +10 -5
dmu_data/ml/tests/train_mva_with_preffix.yaml +58 -0
dmu_data/plotting/tests/2d.yaml +5 -5
dmu_data/plotting/tests/line.yaml +15 -0
dmu_data/plotting/tests/styling.yaml +8 -1
dmu_data/rfile/friends.yaml +13 -0
dmu_data/stats/fitter/test_simple.yaml +28 -0
dmu_data/stats/kde_optimizer/control.json +1 -0
dmu_data/stats/kde_optimizer/signal.json +1 -0
dmu_data/stats/parameters/data.yaml +178 -0
dmu_data/tests/config.json +6 -0
dmu_data/tests/config.yaml +4 -0
dmu_data/tests/pdf_to_tex.txt +34 -0
dmu_scripts/kerberos/check_expiration +21 -0
dmu_scripts/kerberos/convert_certificate +22 -0
dmu_scripts/ml/compare_classifiers.py +85 -0
data_manipulation_utilities-0.2.7.dist-info/RECORD +0 -69
{data_manipulation_utilities-0.2.7.data → data_manipulation_utilities-0.2.8.dev714.data}/scripts/publish +0 -0
{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/top_level.txt +0 -0

dmu/ml/cv_predict.py CHANGED Viewed

@@ -1,8 +1,6 @@
 '''
 Module holding CVPredict class
 '''
-from typing import Optional
 import pandas as pnd
 import numpy
 import tqdm
@@ -21,41 +19,107 @@ class CVPredict:
     Class used to get classification probabilities from ROOT
     dataframe and a set of models. The models were trained with CVClassifier
     '''
-    def __init__(self, models : Optional[list] = None, rdf : Optional[RDataFrame] = None):
+    def __init__(
+            self,
+            rdf    : RDataFrame,
+            models : list[CVClassifier]):
         '''
         Will take a list of CVClassifier models and a ROOT dataframe
-        '''
-        if models is None:
-            raise ValueError('No list of models passed')
-        if rdf is None:
-            raise ValueError('No ROOT dataframe passed')
+        rdf   : ROOT dataframe where features will be extracted
+        models: List of models, one per fold
+        '''
         self._l_model   = models
         self._rdf       = rdf
+        self._nrows     : int
+        self._l_column  : list[str]
         self._d_nan_rep : dict[str,str]
-        self._arr_patch : numpy.ndarray
+        # Value of score used when no score has been assigned
+        self._dummy_score = -1.0
+        # name of column in ROOT dataframe where 1s will prevent prediction
+        self._skip_index_column = 'skip_mva_prediction'
+        # name of attribute of features dataframe where array of indices to skip are stored
+        self._index_skip  = 'skip_mva_prediction'
     # --------------------------------------------
     def _initialize(self):
+        self._rdf       = self._remove_periods(self._rdf)
         self._rdf       = self._define_columns(self._rdf)
         self._d_nan_rep = self._get_nan_replacements()
+        self._l_column  = [ name.c_str() for name in self._rdf.GetColumnNames() ]
+        self._nrows     = self._rdf.Count().GetValue()
+    # ----------------------------------
+    def _remove_periods(self, rdf : RDataFrame) -> RDataFrame:
+        '''
+        This will redefine all columns associated to friend trees as:
+        friend_preffix.branch_name -> friend_preffix_branch_name
+        '''
+        l_col = [ col.c_str() for col in rdf.GetColumnNames() ]
+        l_col = [ col         for col in l_col if '.' in col  ]
+        if len(l_col) == 0:
+            return rdf
+        log.debug(60 * '-')
+        log.debug('Renaming dotted columns')
+        log.debug(60 * '-')
+        for col in l_col:
+            new = col.replace('.', '_')
+            log.debug(f'{col:<50}{"->":10}{new:<20}')
+            rdf = rdf.Define(new, col)
+        return rdf
+    # --------------------------------------------
+    def _get_definitions(self) -> dict[str,str]:
+        '''
+        This method will search in the configuration the definitions used
+        on the dataframe before the dataframe was used to train the model.
+        '''
+        cfg   = self._l_model[0].cfg
+        d_def = {}
+        if 'define' in cfg['dataset']:
+            d_def_gen = cfg['dataset']['define'] # get generic definitions
+            d_def.update(d_def_gen)
+        sig_name = 'sig'
+        try:
+            # Get sample specific definitions. This will be taken from the signal section
+            # because predicted scores should come from features defined as for the signal.
+            d_def_sam = cfg['dataset']['samples'][sig_name]['definitions']
+        except KeyError:
+            log.debug(f'No sample specific definitions were found in: {sig_name}')
+            return d_def
+        log.info('Adding sample dependent definitions')
+        d_def.update(d_def_sam)
+        return d_def
     # --------------------------------------------
     def _define_columns(self, rdf : RDataFrame) -> RDataFrame:
-        cfg = self._l_model[0].cfg
-        if 'define' not in cfg['dataset']:
-            log.debug('No define section found in config, will not define extra columns')
+        d_def = self._get_definitions()
+        if len(d_def) == 0:
+            log.info('No definitions found')
             return self._rdf
-        d_def = cfg['dataset']['define']
+        dexc = None
         log.debug(60 * '-')
         log.info('Defining columns in RDF before evaluating classifier')
         log.debug(60 * '-')
         for name, expr in d_def.items():
+            expr = expr.replace('.', '_')
             log.debug(f'{name:<20}{"<---":20}{expr:<100}')
-            rdf = rdf.Define(name, expr)
+            try:
+                rdf = rdf.Define(name, expr)
+            except TypeError as exc:
+                log.error(f'Cannot define {name}={expr}')
+                dexc = exc
+        if dexc is not None:
+            raise TypeError('Could not define at least one column') from dexc
         return rdf
     # --------------------------------------------
@@ -68,21 +132,25 @@ class CVPredict:
         return cfg['dataset']['nan']
     # --------------------------------------------
-    def _replace_nans(self, df : pnd.DataFrame) -> pnd.DataFrame:
+    def _replace_nans(self, df_ft : pnd.DataFrame) -> pnd.DataFrame:
+        '''
+        Function replaces NaNs in user specified columns with user specified values
+        These NaNs are expected
+        '''
         if len(self._d_nan_rep) == 0:
             log.debug('Not doing any NaN replacement')
-            return df
+            return df_ft
         log.info(60 * '-')
         log.info('Doing NaN replacements')
         log.info(60 * '-')
         for var, val in self._d_nan_rep.items():
             log.info(f'{var:<20}{"--->":20}{val:<20.3f}')
-            df[var] = df[var].fillna(val)
+            df_ft[var] = df_ft[var].fillna(val)
-        return df
+        return df_ft
     # --------------------------------------------
-    def _get_df(self):
+    def _get_df(self) -> pnd.DataFrame:
         '''
         Will make ROOT rdf into dataframe and return it
         '''
@@ -90,11 +158,11 @@ class CVPredict:
         l_ft  = model.features
         d_data= self._rdf.AsNumpy(l_ft)
         df_ft = pnd.DataFrame(d_data)
-        df_ft = self._replace_nans(df_ft)
-        df_ft = ut.patch_and_tag(df_ft)
-        if 'patched_indices' in df_ft.attrs:
-            self._arr_patch = df_ft.attrs['patched_indices']
+        df_ft = self._replace_nans(df_ft=df_ft)
+        df_ft = self._tag_skipped(df_ft=df_ft)
+        df_ft = ut.tag_nans(
+            df      = df_ft,
+            indexes = self._index_skip)
         nfeat = len(l_ft)
         log.info(f'Found {nfeat} features')
@@ -103,6 +171,24 @@ class CVPredict:
         return df_ft
     # --------------------------------------------
+    def _tag_skipped(self, df_ft : pnd.DataFrame) -> pnd.DataFrame:
+        '''
+        Will tag rows to be skipped: indices of rows where the column named by _skip_index_column (currently "skip_mva_prediction") has values of 1 are stored in the attrs of the features dataframe
+        '''
+        if self._skip_index_column not in self._l_column:
+            log.debug(f'Not dropping any rows through: {self._skip_index_column}')
+            return df_ft
+        log.info(f'Dropping rows through: {self._skip_index_column}')
+        arr_drop                = self._rdf.AsNumpy([self._skip_index_column])[self._skip_index_column]
+        if self._index_skip in df_ft.attrs:
+            raise ValueError(f'Feature dataframe already contains attribute: {self._index_skip}')
+        df_ft.attrs[self._index_skip] = numpy.where(arr_drop == 1)[0]
+        return df_ft
+    # --------------------------------------------
     def _non_overlapping_hashes(self, model, df_ft):
         '''
         Will return True if hashes of model and data do not overlap
@@ -147,8 +233,8 @@ class CVPredict:
         '''
         Evaluate the dataset for one of the folds, by taking the model and the full dataset
         '''
-        s_dat_hash = set(df_ft.index)
-        s_mod_hash = model.hashes
+        s_dat_hash : set[str] = set(df_ft.index)
+        s_mod_hash : set[str] = model.hashes
         s_dif_hash = s_dat_hash - s_mod_hash
@@ -164,19 +250,29 @@ class CVPredict:
         d_prob = dict(zip(l_hash, l_prob))
         nfeat  = len(df_ft_group)
         nprob  = len(l_prob)
-        log.debug(f'{nfeat:<10}{"->":10}{nprob:<10}')
+        if nfeat != nprob:
+            raise ValueError(f'Number of features and probabilities do not agree: {nfeat} != {nprob}')
         return d_prob
     # --------------------------------------------
-    def _patch_probabilities(self, arr_prb : numpy.ndarray) -> numpy.ndarray:
-        if not hasattr(self, '_arr_patch'):
-            return arr_prb
+    def _predict_signal_probabilities(
+            self,
+            model : CVClassifier,
+            df_ft : pnd.DataFrame) -> numpy.ndarray:
+        '''
+        Takes model and features dataframe, returns array of signal probabilities
+        '''
+        if self._non_overlapping_hashes(model, df_ft):
+            log.debug('No intersecting hashes found between model and data')
+            arr_prb = model.predict_proba(df_ft)
+        else:
+            log.info('Intersecting hashes found between model and data')
+            arr_prb = self._predict_with_overlap(df_ft)
-        nentries = len(self._arr_patch)
-        log.warning(f'Patching {nentries} probabilities with -1')
-        arr_prb[self._arr_patch] = -1
+        arr_sig_prb = arr_prb.T[1]
-        return arr_prb
+        return arr_sig_prb
     # --------------------------------------------
     def predict(self) -> numpy.ndarray:
         '''
@@ -187,15 +283,22 @@ class CVPredict:
         df_ft = self._get_df()
         model = self._l_model[0]
-        if self._non_overlapping_hashes(model, df_ft):
-            log.debug('No intersecting hashes found between model and data')
-            arr_prb = model.predict_proba(df_ft)
-        else:
-            log.info('Intersecting hashes found between model and data')
-            arr_prb = self._predict_with_overlap(df_ft)
+        arr_keep = None
+        arr_skip = None
+        if self._index_skip in df_ft.attrs:
+            arr_skip = df_ft.attrs[self._index_skip]
+            df_ft    = df_ft.drop(arr_skip)
+            arr_keep = df_ft.index.to_numpy()
+        arr_sig_prb  = self._predict_signal_probabilities(
+                model = model,
+                df_ft = df_ft)
+        if arr_skip is None:
+            return arr_sig_prb
-        arr_prb = self._patch_probabilities(arr_prb)
-        arr_prb = arr_prb.T[1]
+        arr_all_sig_prb           = numpy.full(self._nrows, self._dummy_score)
+        arr_all_sig_prb[arr_keep] = arr_sig_prb
-        return arr_prb
+        return arr_all_sig_prb
 # ---------------------------------------

data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl