PyPI - data-manipulation-utilities - Versions diffs - 0.2.5__tar.gz → 0.2.6__tar.gz - Mend

data-manipulation-utilities 0.2.5.tar.gz → 0.2.6.tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (61) hide show

{data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: data_manipulation_utilities
-Version: 0.2.5
+Version: 0.2.6
 Description-Content-Type: text/markdown
 Requires-Dist: logzero
 Requires-Dist: PyYAML
@@ -427,7 +427,7 @@ rdf_bkg = _get_rdf(kind='bkg')
 cfg     = _get_config()
 obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
-obj.run()
+obj.run(skip_fit=False) # by default it will be false, if true, it will only make plots of features
 ```
 where the settings for the training go in a config dictionary, which when written to YAML looks like:
@@ -549,7 +549,7 @@ When evaluating the model with real data, problems might occur, we deal with the
     ```python
     model.cfg
     ```
-    - For whatever entries that are still NaN, they will be _patched_  with zeros and evaluated. However, before returning, the probabilities will be
+    - For whatever features that are still NaN, they will be _patched_  with zeros when evaluated. However, the returned probabilities will be
 saved as -1. I.e. entries with NaNs will have probabilities of -1.
 # Pandas dataframes

{data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.6}/README.md RENAMED Viewed

@@ -407,7 +407,7 @@ rdf_bkg = _get_rdf(kind='bkg')
 cfg     = _get_config()
 obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
-obj.run()
+obj.run(skip_fit=False) # by default it will be false, if true, it will only make plots of features
 ```
 where the settings for the training go in a config dictionary, which when written to YAML looks like:
@@ -529,7 +529,7 @@ When evaluating the model with real data, problems might occur, we deal with the
     ```python
     model.cfg
     ```
-    - For whatever entries that are still NaN, they will be _patched_  with zeros and evaluated. However, before returning, the probabilities will be
+    - For whatever features that are still NaN, they will be _patched_  with zeros when evaluated. However, the returned probabilities will be
 saved as -1. I.e. entries with NaNs will have probabilities of -1.
 # Pandas dataframes

{data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.6}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name        = 'data_manipulation_utilities'
-version     = '0.2.5'
+version     = '0.2.6'
 readme      = 'README.md'
 dependencies= [
 'logzero',

{data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.6}/src/data_manipulation_utilities.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: data_manipulation_utilities
-Version: 0.2.5
+Version: 0.2.6
 Description-Content-Type: text/markdown
 Requires-Dist: logzero
 Requires-Dist: PyYAML
@@ -427,7 +427,7 @@ rdf_bkg = _get_rdf(kind='bkg')
 cfg     = _get_config()
 obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
-obj.run()
+obj.run(skip_fit=False) # by default it will be false, if true, it will only make plots of features
 ```
 where the settings for the training go in a config dictionary, which when written to YAML looks like:
@@ -549,7 +549,7 @@ When evaluating the model with real data, problems might occur, we deal with the
     ```python
     model.cfg
     ```
-    - For whatever entries that are still NaN, they will be _patched_  with zeros and evaluated. However, before returning, the probabilities will be
+    - For whatever features that are still NaN, they will be _patched_  with zeros when evaluated. However, the returned probabilities will be
 saved as -1. I.e. entries with NaNs will have probabilities of -1.
 # Pandas dataframes

{data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.6}/src/dmu/ml/train_mva.py RENAMED Viewed

@@ -1,7 +1,7 @@
 '''
 Module with TrainMva class
 '''
-# pylint: disable = too-many-locals
+# pylint: disable = too-many-locals, no-name-in-module
 # pylint: disable = too-many-arguments, too-many-positional-arguments
 import os
@@ -14,7 +14,7 @@ import matplotlib.pyplot as plt
 from sklearn.metrics         import roc_curve, auc
 from sklearn.model_selection import StratifiedKFold
-from ROOT import RDataFrame
+from ROOT import RDataFrame, RDF
 import dmu.ml.utilities         as ut
 import dmu.pdataframe.utilities as put
@@ -33,40 +33,41 @@ class TrainMva:
     Interface to scikit learn used to train classifier
     '''
     # ---------------------------------------------
-    def __init__(self, bkg=None, sig=None, cfg=None):
+    def __init__(self, bkg : RDataFrame, sig : RDataFrame, cfg : dict):
         '''
         bkg (ROOT dataframe): Holds real data
         sig (ROOT dataframe): Holds simulation
         cfg (dict)          : Dictionary storing configuration for training
         '''
-        if bkg is None:
-            raise ValueError('Background dataframe is not a ROOT dataframe')
-        if sig is None:
-            raise ValueError('Signal dataframe is not a ROOT dataframe')
-        if not isinstance(cfg, dict):
-            raise ValueError('Config dictionary is not a dictionary')
+        self._cfg       = cfg
+        self._l_ft_name = self._cfg['training']['features']
-        self._rdf_bkg = bkg
-        self._rdf_sig = sig
-        self._cfg     = cfg
+        df_ft_sig, l_lab_sig = self._get_sample_inputs(rdf = sig, label = 1)
+        df_ft_bkg, l_lab_bkg = self._get_sample_inputs(rdf = bkg, label = 0)
-        self._l_ft_name = self._cfg['training']['features']
+        self._df_ft = pnd.concat([df_ft_sig, df_ft_bkg], axis=0)
+        self._l_lab = numpy.array(l_lab_sig + l_lab_bkg)
-        self._df_ft, self._l_lab = self._get_inputs()
+        self._rdf_bkg = self._get_rdf(rdf = bkg, df=df_ft_bkg)
+        self._rdf_sig = self._get_rdf(rdf = sig, df=df_ft_sig)
     # ---------------------------------------------
-    def _get_inputs(self) -> tuple[pnd.DataFrame, npa]:
-        log.info('Getting signal')
-        df_sig, arr_lab_sig = self._get_sample_inputs(self._rdf_sig, label = 1)
+    def _get_rdf(self, rdf : RDataFrame, df : pnd.DataFrame) -> RDataFrame:
+        '''
+        Takes original ROOT dataframe and pre-processed features dataframe
+        Adds missing branches to latter and returns expanded ROOT dataframe
+        '''
-        log.info('Getting background')
-        df_bkg, arr_lab_bkg = self._get_sample_inputs(self._rdf_bkg, label = 0)
+        l_pnd_col = df.columns.tolist()
+        l_rdf_col = [ name.c_str() for name in rdf.GetColumnNames() ]
+        l_mis_col = [ col for col in l_rdf_col if col not in l_pnd_col ]
-        df      = pnd.concat([df_sig, df_bkg], axis=0)
-        arr_lab = numpy.concatenate([arr_lab_sig, arr_lab_bkg])
+        log.debug(f'Adding extra-nonfeature columns: {l_mis_col}')
-        return df, arr_lab
+        d_data = rdf.AsNumpy(l_mis_col)
+        df_ext = pnd.DataFrame(d_data)
+        df_all = pnd.concat([df, df_ext], axis=1)
+        return RDF.FromPandas(df_all)
     # ---------------------------------------------
     def _pre_process_nans(self, df : pnd.DataFrame) -> pnd.DataFrame:
         if 'dataset' not in self._cfg:
@@ -77,23 +78,26 @@ class TrainMva:
             return df
         d_name_val = self._cfg['dataset']['nan']
-        log.info(60 * '-')
+        log.info(70 * '-')
         log.info('Doing NaN replacements')
-        log.info(60 * '-')
+        log.info(70 * '-')
         for var, val in d_name_val.items():
-            log.info(f'{var:<20}{"--->":20}{val:<20.3f}')
+            nna = df[var].isna().sum()
+            log.info(f'{var:<20}{"--->":20}{val:<20.3f}{nna}')
             df[var] = df[var].fillna(val)
+        log.info(70 * '-')
         return df
     # ---------------------------------------------
-    def _get_sample_inputs(self, rdf : RDataFrame, label : int) -> tuple[pnd.DataFrame, npa]:
+    def _get_sample_inputs(self, rdf : RDataFrame, label : int) -> tuple[pnd.DataFrame, list[int]]:
         d_ft = rdf.AsNumpy(self._l_ft_name)
         df   = pnd.DataFrame(d_ft)
         df   = self._pre_process_nans(df)
         df   = ut.cleanup(df)
         l_lab= len(df) * [label]
-        return df, numpy.array(l_lab)
+        return df, l_lab
     # ---------------------------------------------
     def _get_model(self, arr_index : npa) -> cls:
         model = cls(cfg = self._cfg)

{data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.6}/src/dmu/stats/minimizers.py RENAMED Viewed

@@ -1,12 +1,16 @@
 '''
 Module containing derived classes from ZFit minimizer
 '''
+from typing import Union
 import numpy
 import zfit
+import matplotlib.pyplot as plt
 from zfit.result                   import FitResult
 from zfit.core.basepdf             import BasePDF           as zpdf
 from zfit.minimizers.baseminimizer import FailMinimizeNaN
+from dmu.stats.utilities           import print_pdf
 from dmu.stats.gof_calculator      import GofCalculator
 from dmu.logging.log_store         import LogStore
@@ -29,6 +33,7 @@ class AnealingMinimizer(zfit.minimize.Minuit):
         self._chi2ndof = chi2ndof
         self._check_thresholds()
+        self._l_bad_fit_res : list[FitResult] = []
         super().__init__()
     # ------------------------
@@ -66,19 +71,24 @@ class AnealingMinimizer(zfit.minimize.Minuit):
         return is_good
     # ------------------------
     def _is_good_fit(self, res : FitResult) -> bool:
+        good_fit = True
         if not res.valid:
-            log.warning('Skipping invalid fit')
-            return False
+            log.debug('Skipping invalid fit')
+            good_fit = False
         if res.status != 0:
-            log.warning('Skipping fit with bad status')
-            return False
+            log.debug('Skipping fit with bad status')
+            good_fit = False
         if not res.converged:
-            log.warning('Skipping non-converging fit')
-            return False
+            log.debug('Skipping non-converging fit')
+            good_fit = False
-        return True
+        if not good_fit:
+            self._l_bad_fit_res.append(res)
+        return good_fit
     # ------------------------
     def _get_gof(self, nll) -> tuple[float, float]:
         log.debug('Checking GOF')
@@ -108,10 +118,11 @@ class AnealingMinimizer(zfit.minimize.Minuit):
             par.set_value(fval)
             log.debug(f'{par.name:<20}{ival:<15.3f}{"->":<10}{fval:<15.3f}{"in":<5}{par.lower:<15.3e}{par.upper:<15.3e}')
     # ------------------------
-    def _pick_best_fit(self, d_chi2_res : dict) -> FitResult:
+    def _pick_best_fit(self, d_chi2_res : dict) -> Union[FitResult,None]:
         nres = len(d_chi2_res)
         if nres == 0:
-            raise ValueError('No fits found')
+            log.error('No fits found')
+            return None
         l_chi2_res= list(d_chi2_res.items())
         l_chi2_res.sort()
@@ -149,6 +160,15 @@ class AnealingMinimizer(zfit.minimize.Minuit):
         return l_model[0]
     # ------------------------
+    def _print_failed_fit_diagnostics(self, nll) -> None:
+        for res in self._l_bad_fit_res:
+            print(res)
+        arr_mass = nll.data[0].numpy()
+        plt.hist(arr_mass, bins=60)
+        plt.show()
+    # ------------------------
     def minimize(self, nll, **kwargs) -> FitResult:
         '''
         Will run minimization and return FitResult object
@@ -156,18 +176,20 @@ class AnealingMinimizer(zfit.minimize.Minuit):
         d_chi2_res : dict[float,FitResult] = {}
         for i_try in range(self._ntries):
-            log.info(f'try {i_try:02}/{self._ntries:02}')
             try:
                 res = super().minimize(nll, **kwargs)
             except (FailMinimizeNaN, ValueError, RuntimeError) as exc:
-                log.warning(exc)
+                log.error(f'{i_try:02}/{self._ntries:02}{"Failed":>20}')
+                log.debug(exc)
                 self._randomize_parameters(nll)
                 continue
             if not self._is_good_fit(res):
+                log.warning(f'{i_try:02}/{self._ntries:02}{"Bad fit":>20}')
                 continue
             chi2, pvl = self._get_gof(nll)
+            log.info(f'{i_try:02}/{self._ntries:02}{chi2:>20.3f}')
             d_chi2_res[chi2] = res
             if self._is_good_gof(chi2, pvl):
@@ -176,6 +198,13 @@ class AnealingMinimizer(zfit.minimize.Minuit):
             self._randomize_parameters(nll)
         res = self._pick_best_fit(d_chi2_res)
+        if res is None:
+            self._print_failed_fit_diagnostics(nll)
+            pdf = nll.model[0]
+            print_pdf(pdf)
+            raise ValueError('Fit failed')
         pdf = self._pdf_from_nll(nll)
         self._set_pdf_pars(res, pdf)

{data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.6}/src/dmu/stats/model_factory.py RENAMED Viewed

@@ -37,7 +37,16 @@ class MethodRegistry:
         '''
         Will return method in charge of building PDF, for an input nickname
         '''
-        return cls._d_method.get(nickname, None)
+        method = cls._d_method.get(nickname, None)
+        if method is not None:
+            return method
+        log.warning('Available PDFs:')
+        for value in cls._d_method:
+            log.info(f'    {value}')
+        return method
 #-----------------------------------------
 class ModelFactory:
     '''
@@ -48,39 +57,56 @@ class ModelFactory:
     l_pdf = ['dscb', 'gauss']
     l_shr = ['mu']
-    mod   = ModelFactory(obs = obs, l_pdf = l_pdf, l_shared=l_shr)
+    mod   = ModelFactory(preffix = 'signal', obs = obs, l_pdf = l_pdf, l_shared=l_shr)
     pdf   = mod.get_pdf()
     ```
     where one can specify which parameters can be shared among the PDFs
     '''
     #-----------------------------------------
-    def __init__(self, obs : zobs, l_pdf : list[str], l_shared : list[str]):
+    def __init__(self,
+                 preffix  : str,
+                 obs      : zobs,
+                 l_pdf    : list[str],
+                 l_shared : list[str],
+                 l_float  : list[str]):
         '''
+        preffix:  used to identify PDF, will be used to name every parameter
         obs:      zfit obserbable
         l_pdf:    List of PDF nicknames which are registered below
         l_shared: List of parameter names that are shared
+        l_float:  List of parameter names to allow to float
         '''
+        self._preffix         = preffix
         self._l_pdf           = l_pdf
         self._l_shr           = l_shared
-        self._l_can_be_shared = ['mu', 'sg']
+        self._l_flt           = l_float
         self._obs             = obs
         self._d_par : dict[str,zpar] = {}
     #-----------------------------------------
-    def _fltname_from_name(self, name : str) -> str:
-        if name in ['mu', 'sg']:
-            return f'{name}_flt'
+    def _split_name(self, name : str) -> tuple[str,str]:
+        l_part = name.split('_')
+        pname  = l_part[0]
+        xname  = '_'.join(l_part[1:])
-        return name
+        return pname, xname
     #-----------------------------------------
-    def _get_name(self, name : str, suffix : str) -> str:
-        for can_be_shared in self._l_can_be_shared:
-            if name.startswith(f'{can_be_shared}_') and can_be_shared in self._l_shr:
-                return self._fltname_from_name(can_be_shared)
+    def _get_parameter_name(self, name : str, suffix : str) -> str:
+        pname, xname = self._split_name(name)
+        log.debug(f'Using physical name: {pname}')
-        return self._fltname_from_name(f'{name}{suffix}')
+        if pname in self._l_shr:
+            name = f'{pname}_{self._preffix}'
+        else:
+            name = f'{pname}_{xname}_{self._preffix}{suffix}'
+        if pname in self._l_flt:
+            return f'{name}_flt'
+        return name
     #-----------------------------------------
     def _get_parameter(self,
                        name   : str,
@@ -88,7 +114,10 @@ class ModelFactory:
                        val    : float,
                        low    : float,
                        high   : float) -> zpar:
-        name = self._get_name(name, suffix)
+        name = self._get_parameter_name(name, suffix)
+        log.debug(f'Assigning name: {name}')
         if name in self._d_par:
             return self._d_par[name]
@@ -100,15 +129,15 @@ class ModelFactory:
     #-----------------------------------------
     @MethodRegistry.register('exp')
     def _get_exponential(self, suffix : str = '') -> zpdf:
-        c   = self._get_parameter('c_exp', suffix, -0.005, -0.05, 0.00)
-        pdf = zfit.pdf.Exponential(c, self._obs)
+        c   = self._get_parameter('c_exp', suffix, -0.005, -0.20, 0.00)
+        pdf = zfit.pdf.Exponential(c, self._obs, name=f'exp{suffix}')
         return pdf
     #-----------------------------------------
     @MethodRegistry.register('pol1')
     def _get_pol1(self, suffix : str = '') -> zpdf:
         a   = self._get_parameter('a_pol1', suffix, -0.005, -0.95, 0.00)
-        pdf = zfit.pdf.Chebyshev(obs=self._obs, coeffs=[a])
+        pdf = zfit.pdf.Chebyshev(obs=self._obs, coeffs=[a], name=f'pol1{suffix}')
         return pdf
     #-----------------------------------------
@@ -116,51 +145,62 @@ class ModelFactory:
     def _get_pol2(self, suffix : str = '') -> zpdf:
         a   = self._get_parameter('a_pol2', suffix, -0.005, -0.95, 0.00)
         b   = self._get_parameter('b_pol2', suffix,  0.000, -0.95, 0.95)
-        pdf = zfit.pdf.Chebyshev(obs=self._obs, coeffs=[a, b])
+        pdf = zfit.pdf.Chebyshev(obs=self._obs, coeffs=[a, b], name=f'pol2{suffix}')
         return pdf
     #-----------------------------------------
     @MethodRegistry.register('cbr')
     def _get_cbr(self, suffix : str = '') -> zpdf:
-        mu  = self._get_parameter('mu_cbr', suffix, 5300, 5250, 5350)
+        mu  = self._get_parameter('mu_cbr', suffix, 5300, 5100, 5350)
         sg  = self._get_parameter('sg_cbr', suffix,   10,    2,  300)
-        ar  = self._get_parameter('ac_cbr', suffix,   -2,  -4.,  -1.)
-        nr  = self._get_parameter('nc_cbr', suffix,    1,  0.5,  5.0)
+        ar  = self._get_parameter('ac_cbr', suffix,   -2, -14., -0.1)
+        nr  = self._get_parameter('nc_cbr', suffix,    1,  0.5,  150)
+        pdf = zfit.pdf.CrystalBall(mu, sg, ar, nr, self._obs, name=f'cbr{suffix}')
+        return pdf
+    #-----------------------------------------
+    @MethodRegistry.register('suj')
+    def _get_suj(self, suffix : str = '') -> zpdf:
+        mu  = self._get_parameter('mu_suj', suffix, 5300, 4000, 6000)
+        sg  = self._get_parameter('sg_suj', suffix,   10,    2, 5000)
+        gm  = self._get_parameter('gm_suj', suffix,    1,  -10,   10)
+        dl  = self._get_parameter('dl_suj', suffix,    1,  0.1,   10)
-        pdf = zfit.pdf.CrystalBall(mu, sg, ar, nr, self._obs)
+        pdf = zfit.pdf.JohnsonSU(mu, sg, gm, dl, self._obs, name=f'suj{suffix}')
         return pdf
     #-----------------------------------------
     @MethodRegistry.register('cbl')
     def _get_cbl(self, suffix : str = '') -> zpdf:
-        mu  = self._get_parameter('mu_cbl', suffix, 5300, 5250, 5350)
+        mu  = self._get_parameter('mu_cbl', suffix, 5300, 5100, 5350)
         sg  = self._get_parameter('sg_cbl', suffix,   10,    2,  300)
-        al  = self._get_parameter('ac_cbl', suffix,    2,   1.,  14.)
-        nl  = self._get_parameter('nc_cbl', suffix,    1,  0.5,  15.)
+        al  = self._get_parameter('ac_cbl', suffix,    2,  0.1,  14.)
+        nl  = self._get_parameter('nc_cbl', suffix,    1,  0.5,  150)
-        pdf = zfit.pdf.CrystalBall(mu, sg, al, nl, self._obs)
+        pdf = zfit.pdf.CrystalBall(mu, sg, al, nl, self._obs, name=f'cbl{suffix}')
         return pdf
     #-----------------------------------------
     @MethodRegistry.register('gauss')
     def _get_gauss(self, suffix : str = '') -> zpdf:
-        mu  = self._get_parameter('mu_gauss', suffix, 5300, 5250, 5350)
+        mu  = self._get_parameter('mu_gauss', suffix, 5300, 5100, 5350)
         sg  = self._get_parameter('sg_gauss', suffix,   10,    2,  300)
-        pdf = zfit.pdf.Gauss(mu, sg, self._obs)
+        pdf = zfit.pdf.Gauss(mu, sg, self._obs, name=f'gauss{suffix}')
         return pdf
     #-----------------------------------------
     @MethodRegistry.register('dscb')
     def _get_dscb(self, suffix : str = '') -> zpdf:
-        mu  = self._get_parameter('mu_dscb', suffix, 5300, 5250, 5400)
-        sg  = self._get_parameter('sg_dscb', suffix,   10,    2,   30)
+        mu  = self._get_parameter('mu_dscb', suffix, 4000, 4000, 5400)
+        sg  = self._get_parameter('sg_dscb', suffix,   10,    2,  500)
         ar  = self._get_parameter('ar_dscb', suffix,    1,    0,    5)
         al  = self._get_parameter('al_dscb', suffix,    1,    0,    5)
-        nr  = self._get_parameter('nr_dscb', suffix,    2,    1,   15)
-        nl  = self._get_parameter('nl_dscb', suffix,    2,    0,   15)
+        nr  = self._get_parameter('nr_dscb', suffix,    2,    1,  150)
+        nl  = self._get_parameter('nl_dscb', suffix,    2,    0,  150)
-        pdf = zfit.pdf.DoubleCB(mu, sg, al, nl, ar, nr, self._obs)
+        pdf = zfit.pdf.DoubleCB(mu, sg, al, nl, ar, nr, self._obs, name=f'dscb{suffix}')
         return pdf
     #-----------------------------------------
@@ -196,7 +236,7 @@ class ModelFactory:
         l_frc= [ zfit.param.Parameter(f'frc_{ifrc + 1}', 0.5, 0, 1) for ifrc in range(nfrc - 1) ]
-        pdf = zfit.pdf.SumPDF(l_pdf, fracs=l_frc)
+        pdf = zfit.pdf.SumPDF(l_pdf, name=self._preffix, fracs=l_frc)
         return pdf
     #-----------------------------------------

{data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.6}/src/dmu_data/ml/tests/train_mva.yaml RENAMED Viewed

@@ -1,7 +1,7 @@
 dataset:
   nan :
-    x : 1
-    y : 2
+    x : -3
+    y : -3
 training :
     nfold    : 3
     features : [x, y, z]
@@ -34,6 +34,10 @@ plotting:
         saving:
             plt_dir : '/tmp/dmu/ml/tests/train_mva/features'
         plots:
+          w :
+            binning : [-4, 4, 100]
+            yscale  : 'linear'
+            labels  : ['w', '']
           x :
             binning : [-4, 4, 100]
             yscale  : 'linear'
@@ -46,4 +50,3 @@ plotting:
             binning : [-4, 4, 100]
             yscale  : 'linear'
             labels  : ['z', '']