PyPI - data-manipulation-utilities - Versions diffs - 0.0.1__py3-none-any.whl - Mend

data-manipulation-utilities 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

data_manipulation_utilities-0.0.1.dist-info/METADATA +713 -0
data_manipulation_utilities-0.0.1.dist-info/RECORD +45 -0
data_manipulation_utilities-0.0.1.dist-info/WHEEL +5 -0
data_manipulation_utilities-0.0.1.dist-info/entry_points.txt +6 -0
data_manipulation_utilities-0.0.1.dist-info/top_level.txt +3 -0
dmu/arrays/utilities.py +55 -0
dmu/dataframe/dataframe.py +36 -0
dmu/generic/utilities.py +69 -0
dmu/logging/log_store.py +129 -0
dmu/ml/cv_classifier.py +122 -0
dmu/ml/cv_predict.py +152 -0
dmu/ml/train_mva.py +257 -0
dmu/ml/utilities.py +132 -0
dmu/plotting/plotter.py +227 -0
dmu/plotting/plotter_1d.py +113 -0
dmu/plotting/plotter_2d.py +87 -0
dmu/rdataframe/atr_mgr.py +79 -0
dmu/rdataframe/utilities.py +72 -0
dmu/rfile/rfprinter.py +91 -0
dmu/rfile/utilities.py +34 -0
dmu/stats/fitter.py +515 -0
dmu/stats/function.py +314 -0
dmu/stats/utilities.py +134 -0
dmu/testing/utilities.py +119 -0
dmu/text/transformer.py +182 -0
dmu_data/__init__.py +0 -0
dmu_data/ml/tests/train_mva.yaml +37 -0
dmu_data/plotting/tests/2d.yaml +14 -0
dmu_data/plotting/tests/fig_size.yaml +13 -0
dmu_data/plotting/tests/high_stat.yaml +22 -0
dmu_data/plotting/tests/name.yaml +14 -0
dmu_data/plotting/tests/no_bounds.yaml +12 -0
dmu_data/plotting/tests/simple.yaml +8 -0
dmu_data/plotting/tests/title.yaml +14 -0
dmu_data/plotting/tests/weights.yaml +13 -0
dmu_data/text/transform.toml +4 -0
dmu_data/text/transform.txt +6 -0
dmu_data/text/transform_set.toml +8 -0
dmu_data/text/transform_set.txt +6 -0
dmu_data/text/transform_trf.txt +12 -0
dmu_scripts/physics/check_truth.py +121 -0
dmu_scripts/rfile/compare_root_files.py +299 -0
dmu_scripts/rfile/print_trees.py +35 -0
dmu_scripts/ssh/coned.py +168 -0
dmu_scripts/text/transform_text.py +46 -0

dmu/ml/train_mva.py ADDED Viewed

@@ -0,0 +1,257 @@
+'''
+Module with TrainMva class
+'''
+import os
+from typing import Union
+import joblib
+import pandas as pnd
+import numpy
+import matplotlib.pyplot as plt
+from sklearn.metrics         import roc_curve, auc
+from sklearn.model_selection import StratifiedKFold
+from ROOT import RDataFrame
+import dmu.ml.utilities    as ut
+from dmu.ml.cv_classifier    import CVClassifier as cls
+from dmu.plotting.plotter_1d import Plotter1D    as Plotter
+from dmu.logging.log_store   import LogStore
+# Module-level logger, registered with LogStore
+# NOTE(review): the name uses the 'data_checks' prefix, while the other loggers of this package are named 'dmu:...' - confirm this is intended
+log = LogStore.add_logger('data_checks:train_mva')
+# ---------------------------------------------
+class TrainMva:
+    '''
+    Interface to scikit learn used to train classifier
+    '''
+    # ---------------------------------------------
+    def __init__(self, bkg=None, sig=None, cfg=None):
+        '''
+        bkg (ROOT dataframe): Holds real data
+        sig (ROOT dataframe): Holds simulation
+        cfg (dict)          : Dictionary storing configuration for training
+        '''
+        if bkg is None:
+            raise ValueError('Background dataframe is not a ROOT dataframe')
+        if sig is None:
+            raise ValueError('Signal dataframe is not a ROOT dataframe')
+        if not isinstance(cfg, dict):
+            raise ValueError('Config dictionary is not a dictionary')
+        self._rdf_bkg = bkg
+        self._rdf_sig = sig
+        self._cfg     = cfg if cfg is not None else {}
+        self._l_model   : cls
+        self._l_ft_name = self._cfg['training']['features']
+        self._df_ft, self._l_lab = self._get_inputs()
+    # ---------------------------------------------
+    def _get_inputs(self) -> tuple[pnd.DataFrame, numpy.ndarray]:
+        log.info('Getting signal')
+        df_sig, arr_lab_sig = self._get_sample_inputs(self._rdf_sig, label = 1)
+        log.info('Getting background')
+        df_bkg, arr_lab_bkg = self._get_sample_inputs(self._rdf_bkg, label = 0)
+        df      = pnd.concat([df_sig, df_bkg], axis=0)
+        arr_lab = numpy.concatenate([arr_lab_sig, arr_lab_bkg])
+        return df, arr_lab
+    # ---------------------------------------------
+    def _get_sample_inputs(self, rdf : RDataFrame, label : int) -> tuple[pnd.DataFrame, numpy.ndarray]:
+        d_ft = rdf.AsNumpy(self._l_ft_name)
+        df   = pnd.DataFrame(d_ft)
+        df   = ut.cleanup(df)
+        l_lab= len(df) * [label]
+        return df, numpy.array(l_lab)
+    # ---------------------------------------------
+    def _get_model(self, arr_index : numpy.ndarray) -> cls:
+        model = cls(cfg = self._cfg)
+        df_ft = self._df_ft.iloc[arr_index]
+        l_lab = self._l_lab[arr_index]
+        log.debug(f'Training feature shape: {df_ft.shape}')
+        log.debug(f'Training label size: {len(l_lab)}')
+        model.fit(df_ft, l_lab)
+        return model
+    # ---------------------------------------------
+    def _get_models(self):
+        # pylint: disable = too-many-locals
+        '''
+        Will create models, train them and return them
+        '''
+        nfold = self._cfg['training']['nfold']
+        rdmst = self._cfg['training']['rdm_stat']
+        kfold = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=rdmst)
+        l_model=[]
+        ifold=0
+        for arr_itr, arr_its in kfold.split(self._df_ft, self._l_lab):
+            log.debug(20 * '-')
+            log.info(f'Training fold: {ifold}')
+            log.debug(20 * '-')
+            model = self._get_model(arr_itr)
+            l_model.append(model)
+            arr_sig_sig_tr, arr_sig_bkg_tr, arr_sig_all_tr, arr_lab_tr = self._get_scores(model, arr_itr, on_training_ok= True)
+            arr_sig_sig_ts, arr_sig_bkg_ts, arr_sig_all_ts, arr_lab_ts = self._get_scores(model, arr_its, on_training_ok=False)
+            self._plot_scores(arr_sig_sig_tr, arr_sig_sig_ts, arr_sig_bkg_tr, arr_sig_bkg_ts, ifold)
+            self._plot_roc(arr_lab_ts, arr_sig_all_ts, arr_lab_tr, arr_sig_all_tr, ifold)
+            ifold+=1
+        return l_model
+    # ---------------------------------------------
+    def _get_scores(self, model : cls, arr_index : numpy.ndarray, on_training_ok : bool) -> tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]:
+        '''
+        Returns a tuple of four arrays
+        arr_sig : Signal probabilities for signal
+        arr_bkg : Signal probabilities for background
+        arr_all : Signal probabilities for both
+        arr_lab : Labels for both
+        '''
+        nentries = len(arr_index)
+        log.debug(f'Getting {nentries} signal probabilities')
+        df_ft    = self._df_ft.iloc[arr_index]
+        arr_prob = model.predict_proba(df_ft, on_training_ok=on_training_ok)
+        arr_lab  = self._l_lab[arr_index]
+        l_all    = [ sig_prob for [_, sig_prob] in arr_prob ]
+        arr_all  = numpy.array(l_all)
+        arr_sig, arr_bkg= self._split_scores(arr_prob=arr_prob, arr_label=arr_lab)
+        return arr_sig, arr_bkg, arr_all, arr_lab
+    # ---------------------------------------------
+    def _split_scores(self, arr_prob : numpy.ndarray, arr_label : numpy.ndarray) -> tuple[numpy.ndarray, numpy.ndarray]:
+        '''
+        Will split the testing scores (predictions) based on the training scores
+        tst is a list of lists as [p_bkg, p_sig]
+        '''
+        l_sig = [ prb[1] for prb, lab in zip(arr_prob, arr_label) if lab == 1]
+        l_bkg = [ prb[1] for prb, lab in zip(arr_prob, arr_label) if lab == 0]
+        arr_sig = numpy.array(l_sig)
+        arr_bkg = numpy.array(l_bkg)
+        return arr_sig, arr_bkg
+    # ---------------------------------------------
+    def _save_model(self, model, ifold):
+        '''
+        Saves a model, associated to a specific fold
+        '''
+        model_path = self._cfg['saving']['path']
+        if os.path.isfile(model_path):
+            log.info(f'Model found in {model_path}, not saving')
+            return
+        dir_name = os.path.dirname(model_path)
+        os.makedirs(dir_name, exist_ok=True)
+        model_path = model_path.replace('.pkl', f'_{ifold:03}.pkl')
+        log.info(f'Saving model to: {model_path}')
+        joblib.dump(model, model_path)
+    # ---------------------------------------------
+    def _plot_scores(self, arr_sig_trn, arr_sig_tst, arr_bkg_trn, arr_bkg_tst, ifold):
+        # pylint: disable = too-many-arguments, too-many-positional-arguments
+        '''
+        Will plot an array of scores, associated to a given fold
+        '''
+        log.debug(f'Plotting scores for {ifold} fold')
+        if 'val_dir' not in self._cfg['plotting']:
+            log.warning('Scores path not passed, not plotting scores')
+            return
+        val_dir  = self._cfg['plotting']['val_dir']
+        val_dir  = f'{val_dir}/fold_{ifold:03}'
+        os.makedirs(val_dir, exist_ok=True)
+        plt.hist(arr_sig_trn, alpha   =   0.3, bins=50, range=(0,1), color='b', density=True, label='Signal Train')
+        plt.hist(arr_sig_tst, histtype='step', bins=50, range=(0,1), color='b', density=True, label='Signal Test')
+        plt.hist(arr_bkg_trn, alpha   =   0.3, bins=50, range=(0,1), color='r', density=True, label='Background Train')
+        plt.hist(arr_bkg_tst, histtype='step', bins=50, range=(0,1), color='r', density=True, label='Background Test')
+        plt.legend()
+        plt.title(f'Fold: {ifold}')
+        plt.xlabel('Signal probability')
+        plt.ylabel('Normalized')
+        plt.savefig(f'{val_dir}/scores.png')
+        plt.close()
+    # ---------------------------------------------
+    def _plot_roc(self,
+                  l_lab_ts : numpy.ndarray,
+                  l_prb_ts : numpy.ndarray,
+                  l_lab_tr : numpy.ndarray,
+                  l_prb_tr : numpy.ndarray,
+                  ifold    : int):
+        '''
+        Takes the labels and the probabilities and plots ROC
+        curve for given fold
+        '''
+        # pylint: disable = too-many-arguments, too-many-positional-arguments
+        log.debug(f'Plotting ROC curve for {ifold} fold')
+        val_dir  = self._cfg['plotting']['val_dir']
+        val_dir  = f'{val_dir}/fold_{ifold:03}'
+        os.makedirs(val_dir, exist_ok=True)
+        xval_ts, yval_ts, _ = roc_curve(l_lab_ts, l_prb_ts)
+        xval_ts             = 1 - xval_ts
+        area_ts             = auc(xval_ts, yval_ts)
+        xval_tr, yval_tr, _ = roc_curve(l_lab_tr, l_prb_tr)
+        xval_tr             = 1 - xval_tr
+        area_tr             = auc(xval_tr, yval_tr)
+        min_x = 0
+        min_y = 0
+        if 'min' in self._cfg['plotting']['roc']:
+            [min_x, min_y] = self._cfg['plotting']['roc']['min']
+        plt.plot(xval_ts, yval_ts, color='b', label=f'Test: {area_ts:.3f}')
+        plt.plot(xval_tr, yval_tr, color='r', label=f'Train: {area_tr:.3f}')
+        plt.xlabel('Signal efficiency')
+        plt.ylabel('Background efficiency')
+        plt.title(f'Fold: {ifold}')
+        plt.xlim(min_x, 1)
+        plt.ylim(min_y, 1)
+        plt.legend()
+        plt.savefig(f'{val_dir}/roc.png')
+        plt.close()
+    # ---------------------------------------------
+    def _plot_features(self):
+        '''
+        Will plot the features, based on the settings in the config
+        '''
+        d_cfg = self._cfg['plotting']['features']
+        ptr   = Plotter(d_rdf = {'Signal' : self._rdf_sig, 'Background' : self._rdf_bkg}, cfg=d_cfg)
+        ptr.run()
+    # ---------------------------------------------
+    def run(self):
+        '''
+        Will do the training
+        '''
+        self._plot_features()
+        l_mod = self._get_models()
+        for ifold, mod in enumerate(l_mod):
+            self._save_model(mod, ifold)
+# ---------------------------------------------

dmu/ml/utilities.py ADDED Viewed

@@ -0,0 +1,132 @@
+'''
+Module containing utility functions for ML tools
+'''
+import hashlib
+from typing import Union
+import numpy
+import pandas as pnd
+from dmu.logging.log_store import LogStore
+# Module-level logger, registered with LogStore under the name 'dmu:ml:utilities'
+log = LogStore.add_logger('dmu:ml:utilities')
+# ---------------------------------------------
+# Patch dataframe with features
+# ---------------------------------------------
+def patch_and_tag(df : pnd.DataFrame, value : float = 0) -> pnd.DataFrame:
+    '''
+    Takes panda dataframe, replaces NaNs with value introduced, by default 0
+    Returns array of indices where the replacement happened
+    '''
+    l_nan = df.index[df.isna().any(axis=1)].tolist()
+    nnan  = len(l_nan)
+    if nnan == 0:
+        log.debug('No NaNs found')
+        return df
+    log.warning(f'Found {nnan} NaNs, patching them with {value}')
+    df_pa = df.fillna(value)
+    df_pa.attrs['patched_indices'] = numpy.array(l_nan)
+    return df_pa
+# ---------------------------------------------
+# Cleanup of dataframe with features
+# ---------------------------------------------
+def cleanup(df : pnd.DataFrame) -> pnd.DataFrame:
+    '''
+    Takes pandas dataframe with features for classification
+    Removes repeated entries and entries with nans
+    Returns dataframe
+    '''
+    df = _remove_repeated(df)
+    df = _remove_nans(df)
+    return df
+# ---------------------------------------------
+def _remove_nans(df : pnd.DataFrame) -> pnd.DataFrame:
+    if not df.isna().any().any():
+        log.debug('No NaNs found in dataframe')
+        return df
+    ninit = len(df)
+    df    = df.dropna()
+    nfinl = len(df)
+    log.warning(f'NaNs found, cleaning dataset: {ninit} -> {nfinl}')
+    return df
+# ---------------------------------------------
+def _remove_repeated(df : pnd.DataFrame) -> pnd.DataFrame:
+    l_hash = get_hashes(df, rvalue='list')
+    s_hash = set(l_hash)
+    ninit = len(l_hash)
+    nfinl = len(s_hash)
+    if ninit == nfinl:
+        log.debug('No cleaning needed for dataframe')
+        return df
+    log.warning(f'Repeated entries found, cleaning up: {ninit} -> {nfinl}')
+    df['hash_index'] = l_hash
+    df               = df.set_index('hash_index', drop=True)
+    df_clean         = df[~df.index.duplicated(keep='first')]
+    if not isinstance(df_clean, pnd.DataFrame):
+        raise ValueError('Cleaning did not return pandas dataframe')
+    return df_clean
+# ----------------------------------
+# ---------------------------------------------
+def get_hashes(df_ft : pnd.DataFrame, rvalue : str ='set') -> Union[set, list]:
+    '''
+    Will return hashes for each row in the feature dataframe
+    rvalue (str): Return value, can be a set or a list
+    '''
+    if   rvalue == 'set':
+        res = { hash_from_row(row) for _, row in df_ft.iterrows() }
+    elif rvalue == 'list':
+        res = [ hash_from_row(row) for _, row in df_ft.iterrows() ]
+    else:
+        log.error(f'Invalid return value: {rvalue}')
+        raise ValueError
+    return res
+# ----------------------------------
+def hash_from_row(row):
+    '''
+    Will return a hash from a pandas dataframe row
+    corresponding to an event
+    '''
+    l_val   = [ str(val) for val in row ]
+    row_str = ','.join(l_val)
+    row_str = row_str.encode('utf-8')
+    hsh = hashlib.sha256()
+    hsh.update(row_str)
+    hsh_val = hsh.hexdigest()
+    return hsh_val
+# ----------------------------------
+def index_with_hashes(df):
+    '''
+    Will:
+    - take dataframe with features
+    - calculate hashes and add them as the index column
+    - drop old index column
+    '''
+    l_hash = get_hashes(df, rvalue='list')
+    ind_hsh= pnd.Index(l_hash)
+    df = df.set_index(ind_hsh, drop=True)
+    return df
+# ----------------------------------

dmu/plotting/plotter.py ADDED Viewed

@@ -0,0 +1,227 @@
+'''
+Module containing plotter class
+'''
+import os
+import math
+from typing import Union
+import numpy
+import matplotlib.pyplot as plt
+from ROOT                  import RDataFrame
+from dmu.logging.log_store import LogStore
+# Module-level logger, registered with LogStore under the name 'dmu:plotting:Plotter'
+log = LogStore.add_logger('dmu:plotting:Plotter')
+# --------------------------------------------
+class Plotter:
+    '''
+    Base class of Plotter1D and Plotter2D
+    '''
+    #-------------------------------------
+    def __init__(self, d_rdf=None, cfg=None):
+        if not isinstance(  cfg, dict):
+            raise ValueError('Config dictionary not passed')
+        if not isinstance(d_rdf, dict):
+            raise ValueError('Dataframe dictionary not passed')
+        self._d_cfg = cfg
+        self._d_rdf : dict[str, RDataFrame]    = { name : self._preprocess_rdf(rdf) for name, rdf in d_rdf.items()}
+        self._d_wgt : Union[dict[str, Union[numpy.ndarray, None]], None]
+    #-------------------------------------
+    def _check_quantile(self, qnt : float):
+        '''
+        Will check validity of quantile
+        '''
+        if 0.5 < qnt <= 1.0:
+            return
+        raise ValueError(f'Invalid quantile: {qnt:.3e}, value needs to be in (0.5, 1.0] interval')
+    #-------------------------------------
+    def _find_bounds(self, d_data : dict, qnt : float = 0.98):
+        '''
+        Will take dictionary between kinds of data and numpy array
+        Will return tuple with bounds, where 95% of the data is found
+        '''
+        self._check_quantile(qnt)
+        l_max = []
+        l_min = []
+        for arr_val in d_data.values():
+            minv = numpy.quantile(arr_val, 1 - qnt)
+            maxv = numpy.quantile(arr_val,     qnt)
+            l_max.append(maxv)
+            l_min.append(minv)
+        minx = min(l_min)
+        maxx = max(l_max)
+        if minx >= maxx:
+            raise ValueError(f'Could not calculate bounds correctly: [{minx:.3e}, {maxx:.3e}]')
+        return minx, maxx
+    #-------------------------------------
+    def _preprocess_rdf(self, rdf):
+        '''
+        rdf (RDataFrame): ROOT dataframe
+        returns preprocessed dataframe
+        '''
+        rdf = self._define_vars(rdf)
+        if 'selection' in self._d_cfg:
+            rdf = self._apply_selection(rdf)
+            rdf = self._max_ran_entries(rdf)
+        return rdf
+    #-------------------------------------
+    def _define_vars(self, rdf):
+        '''
+        Will define extra columns in dataframe and return updated dataframe
+        '''
+        if 'definitions' not in self._d_cfg:
+            log.debug('No definitions section found, returning same RDF')
+            return rdf
+        d_def = self._d_cfg['definitions']
+        log.info('Defining extra variables')
+        for name, expr in d_def.items():
+            log.debug(f'{name:<30}{expr:<150}')
+            rdf = rdf.Define(name, expr)
+        return rdf
+    #-------------------------------------
+    def _apply_selection(self, rdf):
+        '''
+        Will take dataframe, apply selection and return dataframe
+        '''
+        if 'cuts' not in self._d_cfg['selection']:
+            log.debug('Cuts not found in selection section, not applying any cuts')
+            return rdf
+        d_cut = self._d_cfg['selection']['cuts']
+        log.info('Applying cuts')
+        for name, cut in d_cut.items():
+            log.debug(f'{name:<50}{cut:<150}')
+            rdf = rdf.Filter(cut, name)
+        return rdf
+    #-------------------------------------
+    def _max_ran_entries(self, rdf):
+        '''
+        Will take dataframe and randomly drop events
+        '''
+        if 'max_ran_entries' not in self._d_cfg['selection']:
+            log.debug('Cuts not found in selection section, not applying any cuts')
+            return rdf
+        tot_entries = rdf.Count().GetValue()
+        max_entries = self._d_cfg['selection']['max_ran_entries']
+        if tot_entries < max_entries:
+            log.debug(f'Not dropping dandom entries: {tot_entries} < {max_entries}')
+            return rdf
+        prescale = math.floor(tot_entries / max_entries)
+        if prescale < 2:
+            log.debug(f'Not dropping random entries, prescale is below 2: {tot_entries}/{max_entries}')
+            return rdf
+        rdf = rdf.Filter(f'rdfentry_ % {prescale} == 0', 'max_ran_entries')
+        fnl_entries = rdf.Count().GetValue()
+        log.info(f'Dropped entries randomly: {tot_entries} -> {fnl_entries}')
+        return rdf
+    # --------------------------------------------
+    def _print_weights(self, arr_wgt : Union[numpy.ndarray, None], var : str, sample : str) -> None:
+        if arr_wgt is None:
+            log.debug(f'Not using weights for {sample}:{var}')
+            return
+        num_wgt = len(arr_wgt)
+        sum_wgt = numpy.sum(arr_wgt)
+        log.debug(f'Using weights [{num_wgt},{sum_wgt:.0f}] for {var}')
+    # --------------------------------------------
+    def _get_fig_size(self):
+        '''
+        Will read size list from config dictionary if found
+        other wise will return None
+        '''
+        if 'general' not in self._d_cfg:
+            return None
+        if 'size' not in self._d_cfg['general']:
+            return None
+        fig_size = self._d_cfg['general']['size']
+        return fig_size
+    #-------------------------------------
+    def _get_weights(self, var) -> Union[dict[str, Union[numpy.ndarray, None]], None]:
+        d_cfg = self._d_cfg['plots'][var]
+        if 'weights' not in d_cfg:
+            return None
+        if hasattr(self, '_d_wgt'):
+            return self._d_wgt
+        wgt_name = d_cfg['weights']
+        d_weight = {sam_name : self._read_weights(wgt_name, rdf) for sam_name, rdf in self._d_rdf.items()}
+        self._d_wgt = d_weight
+        return d_weight
+    # --------------------------------------------
+    def _read_weights(self, name : str, rdf : RDataFrame) -> Union[numpy.ndarray, None]:
+        v_col = rdf.GetColumnNames()
+        l_col = [ col.c_str() for col in v_col ]
+        if name not in l_col:
+            log.debug(f'Weight {name} not found')
+            return None
+        arr_wgt = rdf.AsNumpy([name])[name]
+        return arr_wgt
+    #-------------------------------------
+    def _get_plot_name(self, var : str) -> str:
+        if 'plots_2d' in self._d_cfg:
+            #For 2D plots the name will always be specified in the config
+            return var
+        if 'name' not in self._d_cfg['plots'][var]:
+            # For 1D plots the name can be taken from variable name itself or specified
+            return var
+        return self._d_cfg['plots'][var]['name']
+    #-------------------------------------
+    def _save_plot(self, var):
+        '''
+        Will save to PNG:
+        var (str) : Name of variable, needed for plot name
+        '''
+        plt.legend()
+        plt_dir = self._d_cfg['saving']['plt_dir']
+        os.makedirs(plt_dir, exist_ok=True)
+        name = self._get_plot_name(var)
+        plot_path = f'{plt_dir}/{name}.png'
+        log.info(f'Saving to: {plot_path}')
+        plt.tight_layout()
+        plt.savefig(plot_path)
+        plt.close(var)
+# --------------------------------------------