PyPI - data-manipulation-utilities - Versions diffs - 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl - Mend

data-manipulation-utilities 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

{data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.7.dist-info}/METADATA +177 -8
{data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.7.dist-info}/RECORD +30 -18
{data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.7.dist-info}/WHEEL +1 -1
dmu/generic/hashing.py +44 -0
dmu/generic/utilities.py +14 -1
dmu/generic/version_management.py +3 -5
dmu/ml/cv_diagnostics.py +221 -0
dmu/ml/train_mva.py +124 -31
dmu/pdataframe/utilities.py +36 -3
dmu/plotting/fwhm.py +64 -0
dmu/plotting/plotter.py +2 -0
dmu/plotting/plotter_1d.py +87 -6
dmu/stats/fitter.py +1 -1
dmu/stats/model_factory.py +189 -25
dmu/stats/zfit_models.py +68 -0
dmu/stats/zfit_plotter.py +29 -21
dmu/testing/utilities.py +31 -4
dmu_data/ml/tests/diagnostics_from_file.yaml +13 -0
dmu_data/ml/tests/diagnostics_from_model.yaml +10 -0
dmu_data/ml/tests/diagnostics_multiple_methods.yaml +10 -0
dmu_data/ml/tests/diagnostics_overlay.yaml +33 -0
dmu_data/ml/tests/train_mva.yaml +15 -9
dmu_data/ml/tests/train_mva_with_diagnostics.yaml +82 -0
dmu_data/plotting/tests/plug_fwhm.yaml +24 -0
dmu_data/plotting/tests/plug_stats.yaml +19 -0
dmu_data/plotting/tests/simple.yaml +4 -3
dmu_data/plotting/tests/styling.yaml +11 -0
{data_manipulation_utilities-0.2.6.data → data_manipulation_utilities-0.2.7.data}/scripts/publish +0 -0
{data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.7.dist-info}/entry_points.txt +0 -0
{data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.7.dist-info}/top_level.txt +0 -0

dmu/stats/model_factory.py CHANGED Viewed

@@ -6,9 +6,12 @@ Module storing ZModel class
 from typing import Callable, Union
 import zfit
 from zfit.core.interfaces   import ZfitSpace as zobs
 from zfit.core.basepdf      import BasePDF   as zpdf
 from zfit.core.parameter    import Parameter as zpar
+from dmu.stats.zfit_models  import HypExp
+from dmu.stats.zfit_models  import ModExp
 from dmu.logging.log_store  import LogStore
 log=LogStore.add_logger('dmu:stats:model_factory')
@@ -47,6 +50,13 @@ class MethodRegistry:
             log.info(f'    {value}')
         return method
+    @classmethod
+    def get_pdf_names(cls) -> list[str]:
+        '''
+        Lists the nicknames under which PDF builders were registered
+
+        Returns a new list with the entries obtained by iterating over `_d_method`
+        '''
+        l_name = [ name for name in cls._d_method ]
+
+        return l_name
 #-----------------------------------------
 class ModelFactory:
     '''
@@ -57,11 +67,17 @@ class ModelFactory:
     l_pdf = ['dscb', 'gauss']
     l_shr = ['mu']
-    mod   = ModelFactory(preffix = 'signal', obs = obs, l_pdf = l_pdf, l_shared=l_shr)
+    l_flt = ['mu', 'sg']
+    d_rep = {'mu' : 'scale', 'sg' : 'reso'}
+    mod   = ModelFactory(preffix = 'signal', obs = obs, l_pdf = l_pdf, l_shared = l_shr, d_rep = d_rep)
     pdf   = mod.get_pdf()
     ```
-    where one can specify which parameters can be shared among the PDFs
+    where one can specify which parameters
+    - Can be shared among the PDFs
+    - Are meant to float if this fit is done to MC, in order to fix parameters in data.
+    - Are scales or resolutions that need reparametrizations
     '''
     #-----------------------------------------
     def __init__(self,
@@ -69,22 +85,45 @@ class ModelFactory:
                  obs      : zobs,
                  l_pdf    : list[str],
                  l_shared : list[str],
-                 l_float  : list[str]):
+                 l_float  : list[str],
+                 d_fix    : dict[str:float] = None,
+                 d_rep    : dict[str:str]   = None):
         '''
         preffix:  used to identify PDF, will be used to name every parameter
         obs:      zfit obserbable
         l_pdf:    List of PDF nicknames which are registered below
         l_shared: List of parameter names that are shared
         l_float:  List of parameter names to allow to float
+        d_fix:    Dictionary with keys as the beginning of the name of a parameter and value as the number
+                  to which it has to be fixed. If not one and only one parameter is found, ValueError is raised
+        d_rep:    Dictionary with keys as variables that will be reparametrized
         '''
         self._preffix         = preffix
         self._l_pdf           = l_pdf
         self._l_shr           = l_shared
         self._l_flt           = l_float
+        self._d_fix           = d_fix
+        self._d_rep           = d_rep
         self._obs             = obs
         self._d_par : dict[str,zpar] = {}
+        self._check_reparametrization()
+    #-----------------------------------------
+    def _check_reparametrization(self) -> None:
+        '''
+        Validates the reparametrization dictionary, nothing is checked if it was not provided
+
+        Raises ValueError if:
+        - Any parameter is requested both as floating and as reparametrized
+        - Any kind of reparametrization differs from `scale` and `reso`
+        '''
+        if self._d_rep is None:
+            return
+
+        s_common = set(self._d_rep).intersection(self._l_flt)
+        if s_common:
+            raise ValueError('Non empty intersection between floating and reparametrization parameters')
+
+        s_kind = set(self._d_rep.values())
+        if s_kind - {'scale', 'reso'}:
+            raise ValueError(f'Only scales and resolution reparametrizations allowed, found: {s_kind}')
     #-----------------------------------------
     def _split_name(self, name : str) -> tuple[str,str]:
         l_part = name.split('_')
@@ -108,30 +147,86 @@ class ModelFactory:
         return name
     #-----------------------------------------
-    def _get_parameter(self,
-                       name   : str,
-                       suffix : str,
-                       val    : float,
-                       low    : float,
-                       high   : float) -> zpar:
+    def _get_parameter(
+            self,
+            name   : str,
+            suffix : str,
+            val    : float,
+            low    : float,
+            high   : float) -> zpar:
+        '''
+        Provides the parameter associated to `name` and `suffix`, it is created only
+        the first time it is requested, afterwards the stored instance is returned
+
+        name  : Nickname of the parameter, e.g. mu_gauss
+        suffix: Forwarded to `_get_parameter_name` to build the full name
+        val   : Initial value, used only when the parameter has to be created
+        low   : Lower bound, used only when the parameter has to be created
+        high  : Upper bound, used only when the parameter has to be created
+        '''
+        par_name = self._get_parameter_name(name, suffix)
+        log.debug(f'Assigning name: {par_name}')
+
+        if par_name in self._d_par:
+            return self._d_par[par_name]
+
+        if not self._is_reparametrized(name):
+            par = zfit.param.Parameter(par_name, val, low, high)
+        else:
+            # The first element of the split full name is the key used for the lookup in d_rep
+            init_name, _ = self._split_name(par_name)
+            par = self._get_reparametrization(par_name, init_name, val, low, high)
+
+        self._d_par[par_name] = par
+
+        return par
+    #-----------------------------------------
+    def _is_reparametrized(self, name : str) -> bool:
+        '''
+        Tells whether the parameter has to be reparametrized
+
+        name: Name of the parameter, it is split with `_split_name` and the first
+              element is looked up among the keys of the reparametrization dictionary
+
+        Returns False if no reparametrization dictionary was specified
+        '''
+        if self._d_rep is None:
+            return False
-        name = self._get_parameter_name(name, suffix)
-        log.debug(f'Assigning name: {name}')
+        root_name, _ = self._split_name(name)
-        if name in self._d_par:
-            return self._d_par[name]
+        is_rep = root_name in self._d_rep
-        par  = zfit.param.Parameter(name, val, low, high)
+        log.debug(f'Reparametrizing {name}: {is_rep}')
-        self._d_par[name] = par
+        return is_rep
+    #-----------------------------------------
+    def _get_reparametrization(self, par_name : str, init_name : str, value : float, low : float, high : float) -> zpar:
+        '''
+        Wraps a non floating parameter into a composed one, driven by a floating resolution or scale
+
+        par_name : Full name, used to name the constant, the floating and the composed parameters
+        init_name: Key used to look up the kind of reparametrization in the `d_rep` dictionary
+        value    : Value of the parameter that is kept constant
+        low      : Lower bound of the parameter that is kept constant
+        high     : Upper bound of the parameter that is kept constant
+
+        Returns composed parameter, constant * reso for `reso` and constant + scale for `scale`.
+        Raises ValueError for any other kind
+        '''
+        log.debug(f'Reparametrizing {par_name}')
+
+        par_fixed          = zfit.Parameter(par_name, value, low, high)
+        par_fixed.floating = False
+
+        kind = self._d_rep[init_name]
+        if kind not in ('reso', 'scale'):
+            raise ValueError(f'Invalid kind: {kind}')
+
+        cmp_name = f'{par_name}_cmp'
+        if kind == 'reso':
+            par_flt = zfit.Parameter(f'{par_name}_reso_flt' , 1.0, 0.20, 5.0)
+            par     = zfit.ComposedParameter(cmp_name, lambda d_par : d_par['par_const'] * d_par['reso' ], params={'par_const' : par_fixed, 'reso'  : par_flt} )
+        else:
+            par_flt = zfit.Parameter(f'{par_name}_scale_flt', 0.0, -100, 100)
+            par     = zfit.ComposedParameter(cmp_name, lambda d_par : d_par['par_const'] + d_par['scale'], params={'par_const' : par_fixed, 'scale' : par_flt} )
+
         return par
     #-----------------------------------------
     @MethodRegistry.register('exp')
     def _get_exponential(self, suffix : str = '') -> zpdf:
+        # Builds an exponential PDF on the factory's observable, registered under the nickname `exp`.
+        # The slope is requested through _get_parameter with the suffix given.
-        c   = self._get_parameter('c_exp', suffix, -0.005, -0.20, 0.00)
+        c   = self._get_parameter('c_exp', suffix, -0.010, -0.020, -0.0001)
         pdf = zfit.pdf.Exponential(c, self._obs, name=f'exp{suffix}')
+        return pdf
+    # ---------------------------------------------
+    @MethodRegistry.register('hypexp')
+    def _get_hypexp(self, suffix : str = '') -> zpdf:
+        '''
+        Builds a HypExp PDF on the factory's observable, registered under the nickname `hypexp`
+
+        suffix: Appended to the name of the PDF and used to name its parameters
+
+        The parameters are requested through `_get_parameter`, as done by the other builders,
+        so that they are named with the suffix, stored in the factory and reparametrized if requested.
+        Building them directly with fixed names would reuse the same names each time this PDF is requested.
+        '''
+        mu = self._get_parameter('mu_hypexp', suffix,  5000,   4000,  6000)
+        ap = self._get_parameter('ap_hypexp', suffix, 0.020,      0,  0.10)
+        bt = self._get_parameter('bt_hypexp', suffix, 0.002, 0.0001, 0.003)
+
+        pdf= HypExp(obs=self._obs, mu=mu, alpha=ap, beta=bt, name=f'hypexp{suffix}')
+
+        return pdf
+    # ---------------------------------------------
+    @MethodRegistry.register('modexp')
+    def _get_modexp(self, suffix : str = '') -> zpdf:
+        '''
+        Builds a ModExp PDF on the factory's observable, registered under the nickname `modexp`
+
+        suffix: Appended to the name of the PDF and used to name its parameters
+
+        The parameters are requested through `_get_parameter`, as done by the other builders,
+        so that they are named with the suffix, stored in the factory and reparametrized if requested.
+        Building them directly with fixed names would reuse the same names each time this PDF is requested.
+        '''
+        mu = self._get_parameter('mu_modexp', suffix,  4250,  4250,  4500)
+        ap = self._get_parameter('ap_modexp', suffix, 0.002, 0.002, 0.026)
+        bt = self._get_parameter('bt_modexp', suffix, 0.002, 0.002, 0.020)
+
+        pdf= ModExp(obs=self._obs, mu=mu, alpha=ap, beta=bt, name=f'modexp{suffix}')
+
         return pdf
     #-----------------------------------------
     @MethodRegistry.register('pol1')
@@ -145,13 +240,22 @@ class ModelFactory:
     def _get_pol2(self, suffix : str = '') -> zpdf:
         a   = self._get_parameter('a_pol2', suffix, -0.005, -0.95, 0.00)
         b   = self._get_parameter('b_pol2', suffix,  0.000, -0.95, 0.95)
-        pdf = zfit.pdf.Chebyshev(obs=self._obs, coeffs=[a, b], name=f'pol2{suffix}')
+        pdf = zfit.pdf.Chebyshev(obs=self._obs, coeffs=[a, b   ], name=f'pol2{suffix}')
+        return pdf
+    # ---------------------------------------------
+    @MethodRegistry.register('pol3')
+    def _get_pol3(self, suffix : str = '') -> zpdf:
+        '''
+        Builds a Chebyshev PDF with three coefficients on the factory's observable,
+        registered under the nickname `pol3`
+
+        suffix: Appended to the name of the PDF and used to name its parameters
+
+        The coefficients are requested through `_get_parameter`, as done by the other builders,
+        so that they are named with the suffix, stored in the factory and reparametrized if requested.
+        Building them directly with fixed names would reuse the same names each time this PDF is requested.
+        '''
+        a   = self._get_parameter('a_pol3', suffix, -0.005, -0.95, 0.00)
+        b   = self._get_parameter('b_pol3', suffix,  0.000, -0.95, 0.95)
+        c   = self._get_parameter('c_pol3', suffix,  0.000, -0.95, 0.95)
+
+        pdf = zfit.pdf.Chebyshev(obs=self._obs, coeffs=[a, b, c], name=f'pol3{suffix}')
+
         return pdf
     #-----------------------------------------
     @MethodRegistry.register('cbr')
     def _get_cbr(self, suffix : str = '') -> zpdf:
-        mu  = self._get_parameter('mu_cbr', suffix, 5300, 5100, 5350)
+        mu  = self._get_parameter('mu_cbr', suffix, 5300, 5100, 5500)
         sg  = self._get_parameter('sg_cbr', suffix,   10,    2,  300)
         ar  = self._get_parameter('ac_cbr', suffix,   -2, -14., -0.1)
         nr  = self._get_parameter('nc_cbr', suffix,    1,  0.5,  150)
@@ -162,10 +266,10 @@ class ModelFactory:
     #-----------------------------------------
     @MethodRegistry.register('suj')
     def _get_suj(self, suffix : str = '') -> zpdf:
-        mu  = self._get_parameter('mu_suj', suffix, 5300, 4000, 6000)
+        mu  = self._get_parameter('mu_suj', suffix, 5300, 5000, 6000)
         sg  = self._get_parameter('sg_suj', suffix,   10,    2, 5000)
         gm  = self._get_parameter('gm_suj', suffix,    1,  -10,   10)
-        dl  = self._get_parameter('dl_suj', suffix,    1,  0.1,   10)
+        dl  = self._get_parameter('dl_suj', suffix,    1,  0.1,   40)
         pdf = zfit.pdf.JohnsonSU(mu, sg, gm, dl, self._obs, name=f'suj{suffix}')
@@ -173,9 +277,9 @@ class ModelFactory:
     #-----------------------------------------
     @MethodRegistry.register('cbl')
     def _get_cbl(self, suffix : str = '') -> zpdf:
-        mu  = self._get_parameter('mu_cbl', suffix, 5300, 5100, 5350)
+        mu  = self._get_parameter('mu_cbl', suffix, 5300, 5100, 5500)
         sg  = self._get_parameter('sg_cbl', suffix,   10,    2,  300)
-        al  = self._get_parameter('ac_cbl', suffix,    2,  0.1,  14.)
+        al  = self._get_parameter('ac_cbl', suffix,    2,  0.0,  14.)
         nl  = self._get_parameter('nc_cbl', suffix,    1,  0.5,  150)
         pdf = zfit.pdf.CrystalBall(mu, sg, al, nl, self._obs, name=f'cbl{suffix}')
@@ -184,7 +288,7 @@ class ModelFactory:
     #-----------------------------------------
     @MethodRegistry.register('gauss')
     def _get_gauss(self, suffix : str = '') -> zpdf:
-        mu  = self._get_parameter('mu_gauss', suffix, 5300, 5100, 5350)
+        mu  = self._get_parameter('mu_gauss', suffix, 5300, 5100, 5500)
         sg  = self._get_parameter('sg_gauss', suffix,   10,    2,  300)
         pdf = zfit.pdf.Gauss(mu, sg, self._obs, name=f'gauss{suffix}')
@@ -193,7 +297,7 @@ class ModelFactory:
     #-----------------------------------------
     @MethodRegistry.register('dscb')
     def _get_dscb(self, suffix : str = '') -> zpdf:
-        mu  = self._get_parameter('mu_dscb', suffix, 4000, 4000, 5400)
+        mu  = self._get_parameter('mu_dscb', suffix, 5300, 5000, 5400)
         sg  = self._get_parameter('sg_dscb', suffix,   10,    2,  500)
         ar  = self._get_parameter('ar_dscb', suffix,    1,    0,    5)
         al  = self._get_parameter('al_dscb', suffix,    1,    0,    5)
@@ -204,6 +308,35 @@ class ModelFactory:
         return pdf
     #-----------------------------------------
+    @MethodRegistry.register('voigt')
+    def _get_voigt(self, suffix : str = '') -> zpdf:
+        '''
+        Builds a Voigt PDF on the factory's observable, registered under the nickname `voigt`
+
+        suffix: Appended to the name of the PDF and used to name its parameters
+
+        The parameters are requested through `_get_parameter`, as done by the other builders,
+        so that they are named with the suffix, stored in the factory and reparametrized if requested.
+        Building them directly with fixed names would reuse the same names each time this PDF is requested.
+        '''
+        mu  = self._get_parameter('mu_voigt', suffix, 5280,  5040, 5500)
+        sg  = self._get_parameter('sg_voigt', suffix,   20,    10,  400)
+        gm  = self._get_parameter('gm_voigt', suffix,    4,   0.1,  100)
+
+        pdf = zfit.pdf.Voigt(m=mu, sigma=sg, gamma=gm, obs=self._obs, name=f'voigt{suffix}')
+
+        return pdf
+    #-----------------------------------------
+    @MethodRegistry.register('qgauss')
+    def _get_qgauss(self, suffix : str = '') -> zpdf:
+        '''
+        Builds a QGauss PDF on the factory's observable, registered under the nickname `qgauss`
+
+        suffix: Appended to the name of the PDF and used to name its parameters
+
+        The parameters are requested through `_get_parameter`, as done by the other builders,
+        so that they are named with the suffix, stored in the factory and reparametrized if requested.
+        Building them directly with fixed names would reuse the same names each time this PDF is requested.
+        '''
+        mu  = self._get_parameter('mu_qgauss', suffix, 5280,  5040, 5500)
+        sg  = self._get_parameter('sg_qgauss', suffix,   20,    10,  400)
+        q   = self._get_parameter( 'q_qgauss', suffix,    1,     1,    3)
+
+        pdf = zfit.pdf.QGauss(q=q, mu=mu, sigma=sg, obs=self._obs, name =f'qgauss{suffix}')
+
+        return pdf
+    #-----------------------------------------
+    @MethodRegistry.register('cauchy')
+    def _get_cauchy(self, suffix : str = '') -> zpdf:
+        '''
+        Builds a Cauchy PDF on the factory's observable, registered under the nickname `cauchy`
+
+        suffix: Appended to the name of the PDF and used to name its parameters
+
+        The parameters are requested through `_get_parameter`, as done by the other builders,
+        and carry the nickname of the PDF (`mu_cauchy`, `gm_cauchy`) instead of the bare `mu` and `gm`,
+        which would be reused each time this PDF is requested and are not specific to it.
+        '''
+        mu  = self._get_parameter('mu_cauchy', suffix, 5280,  5040, 5500)
+        gm  = self._get_parameter('gm_cauchy', suffix,  150,    50,  500)
+
+        pdf = zfit.pdf.Cauchy(obs=self._obs, m=mu, gamma=gm, name=f'cauchy{suffix}')
+
+        return pdf
+    #-----------------------------------------
     def _get_pdf_types(self) -> list[tuple[str,str]]:
         d_name_freq = {}
@@ -234,12 +367,42 @@ class ModelFactory:
             log.debug('Requested only one PDF, skipping sum')
             return l_pdf[0]
-        l_frc= [ zfit.param.Parameter(f'frc_{ifrc + 1}', 0.5, 0, 1) for ifrc in range(nfrc - 1) ]
+        l_frc= [ zfit.param.Parameter(f'frc_{self._preffix}_{ifrc + 1}', 0.5, 0, 1) for ifrc in range(nfrc - 1) ]
         pdf = zfit.pdf.SumPDF(l_pdf, name=self._preffix, fracs=l_frc)
         return pdf
     #-----------------------------------------
+    def _find_par(self, s_par : set[zpar], name_start : str) -> zpar:
+        '''
+        Picks the only parameter whose name begins with `name_start`
+
+        s_par     : Container with the parameters to search
+        name_start: Beginning of the name of the parameter searched
+
+        Raises ValueError if zero or more than one parameter match,
+        the names of all the parameters are logged before raising
+        '''
+        l_match = [ candidate for candidate in s_par if candidate.name.startswith(name_start) ]
+        if len(l_match) == 1:
+            [par] = l_match
+            return par
+
+        for candidate in s_par:
+            log.info(candidate.name)
+
+        raise ValueError(f'Not found one and only one parameter starting with: {name_start}')
+    #-----------------------------------------
+    def _fix_parameters(self, pdf : zpdf) -> zpdf:
+        '''
+        Sets the parameters requested through `d_fix` to the values specified there
+        and stops them from floating
+
+        pdf: PDF whose parameters, as provided by `get_params`, are searched
+
+        Returns the PDF passed, nothing is done to it if `d_fix` was not specified
+        '''
+        d_fix = self._d_fix
+        if d_fix is None:
+            log.debug('Not fixing any parameter')
+            return pdf
+
+        s_par     = pdf.get_params()
+        separator = '-' * 30
+        for message in [separator, 'Fixing parameters', separator]:
+            log.info(message)
+
+        for name_start in d_fix:
+            value = d_fix[name_start]
+            par   = self._find_par(s_par, name_start)
+            par.set_value(value)
+            log.info(f'{name_start:<20}{value:<20.3f}')
+            par.floating = False
+
+        return pdf
+    #-----------------------------------------
     def get_pdf(self) -> zpdf:
         '''
         Given a list of strings representing PDFs returns the a zfit PDF which is
@@ -248,6 +411,7 @@ class ModelFactory:
         l_type=   self._get_pdf_types()
         l_pdf = [ self._get_pdf(kind, preffix) for kind, preffix in l_type ]
         pdf   =   self._add_pdf(l_pdf)
+        pdf   =   self._fix_parameters(pdf)
         return pdf
 #-----------------------------------------

dmu/stats/zfit_models.py ADDED Viewed

@@ -0,0 +1,68 @@
+'''
+Module meant to hold classes defining PDFs that can be used by ZFIT
+'''
+import zfit
+from zfit   import z
+#-------------------------------------------------------------------
+class HypExp(zfit.pdf.ZPDF):
+    '''
+    PDF with unnormalized shape:
+
+    exp(-beta * x) / (1 + exp(-alpha * (x - mu)))
+    '''
+    _N_OBS  = 1
+    _PARAMS = ['mu', 'alpha', 'beta']
+
+    def _unnormalized_pdf(self, x):
+        xval  = z.unstack_x(x)
+        d_par = self.params
+        shift = xval - d_par['mu']
+
+        numerator   = z.exp(-d_par['beta'] * xval)
+        denominator = 1 + z.exp(-d_par['alpha'] * shift)
+
+        return numerator / denominator
+#-------------------------------------------------------------------
+class ModExp(zfit.pdf.ZPDF):
+    '''
+    PDF with unnormalized shape:
+
+    (1 - exp(-alpha * u)) * exp(-beta * u), where u = x - mu
+    '''
+    _N_OBS  = 1
+    _PARAMS = ['mu', 'alpha', 'beta']
+
+    def _unnormalized_pdf(self, x):
+        xval  = z.unstack_x(x)
+        d_par = self.params
+        shift = xval - d_par['mu']
+
+        turn_on = 1 - z.exp(-d_par['alpha'] * shift)
+        decay   = z.exp(-d_par['beta'] * shift)
+
+        return turn_on * decay
+#-------------------------------------------------------------------
+class GenExp(zfit.pdf.ZPDF):
+    '''
+    PDF with unnormalized shape:
+
+    (1 - exp(-alpha * u)) * exp(-beta * u), where u = (x - mu) / sg
+    '''
+    _N_OBS  = 1
+    _PARAMS = ['mu', 'sg', 'alpha', 'beta']
+
+    def _unnormalized_pdf(self, x):
+        xval   = z.unstack_x(x)
+        d_par  = self.params
+        scaled = (xval - d_par['mu']) / d_par['sg']
+
+        turn_on = 1 - z.exp(-d_par['alpha'] * scaled)
+        decay   = z.exp(-d_par['beta'] * scaled)
+
+        return turn_on * decay
+#-------------------------------------------------------------------
+class FermiDirac(zfit.pdf.ZPDF):
+    '''
+    PDF with unnormalized shape:
+
+    1 / (1 + exp((x - mu) / ap))
+    '''
+    _N_OBS  = 1
+    _PARAMS = ['mu', 'ap']
+
+    def _unnormalized_pdf(self, x):
+        xval     = z.unstack_x(x)
+        d_par    = self.params
+        exponent = (xval - d_par['mu']) / d_par['ap']
+
+        return 1. / (1 + z.exp(exponent))
+#-------------------------------------------------------------------

dmu/stats/zfit_plotter.py CHANGED Viewed

@@ -1,7 +1,7 @@
 '''
 Module containing plot class, used to plot fits
 '''
-# pylint: disable=too-many-instance-attributes
+# pylint: disable=too-many-instance-attributes, too-many-arguments
 import warnings
 import pprint
@@ -51,6 +51,8 @@ class ZFitPlotter:
         self._figsize          = None
         self._leg_loc          = None
+        self.dat_xerr : bool
         # zfit.settings.advanced_warnings['extend_wrapped_extended'] = False
         warnings.filterwarnings("ignore")
     #----------------------------------------
@@ -60,17 +62,17 @@ class ZFitPlotter:
         self._l_def_col = list(mcolors.TABLEAU_COLORS.keys())
     #----------------------------------------
     def _data_to_zdata(self, obs, data, weights):
+        '''
+        Provides the data as a zfit dataset
+
+        obs    : Observable used to build the zfit dataset
+        data   : zfit dataset, numpy array, pandas series or pandas dataframe
+        weights: Weights passed to the zfit dataset when it has to be built
+
+        A zfit dataset is returned as it was passed, obs and weights are not used in that case.
+        Raises ValueError if the data is of any other type
+        '''
+        if isinstance(data, zfit.data.Data):
+            return data
         if isinstance(data, np.ndarray):
             data = zfit.Data.from_numpy (obs=obs, array=data           , weights=weights)
         elif isinstance(data, pd.Series):
             data = zfit.Data.from_pandas(obs=obs, df=pd.DataFrame(data), weights=weights)
         elif isinstance(data, pd.DataFrame):
             data = zfit.Data.from_pandas(obs=obs, df=data              , weights=weights)
-        elif isinstance(data, zfit.data.Data):
-            data = data
         else:
-            log.error(f'Passed data is of usupported type {type(data)}')
-            raise
+            raise ValueError(f'Passed data is of usupported type {type(data)}')
         return data
     #----------------------------------------
@@ -200,7 +202,7 @@ class ZFitPlotter:
     #----------------------------------------
     def _get_zfit_gof(self):
         if not hasattr(self._result, 'gof'):
-            return
+            return None
         chi2, ndof, pval = self._result.gof
@@ -211,14 +213,16 @@ class ZFitPlotter:
     def _get_text(self, ext_text):
         gof_text = self._get_zfit_gof()
-        if   ext_text is     None and gof_text is     None:
-            return
-        elif ext_text is not None and gof_text is     None:
+        if ext_text is     None and gof_text is     None:
+            return None
+        if ext_text is not None and gof_text is     None:
             return ext_text
-        elif ext_text is     None and gof_text is not None:
+        if ext_text is     None and gof_text is not None:
             return gof_text
-        else:
-            return f'{ext_text}\n{gof_text}'
+        return f'{ext_text}\n{gof_text}'
     #----------------------------------------
     def _get_pars(self):
         '''
@@ -238,7 +242,7 @@ class ZFitPlotter:
                 name= par if isinstance(par, str) else par.name
                 try:
                     err = d_val['hesse']['error']
-                except:
+                except KeyError:
                     log.warning(f'Cannot extract {name} Hesse errors, using zeros')
                     pprint.pprint(d_val)
                     err = 0
@@ -260,7 +264,7 @@ class ZFitPlotter:
         '''
         d_par = self._get_pars()
-        line = f''
+        line = ''
         for name, [val, err] in d_par.items():
             if add_pars != 'all' and name not in add_pars:
                 continue
@@ -328,7 +332,7 @@ class ZFitPlotter:
             nevt = self._get_component_yield(model, par)
             if   model.name in self._l_plot_components and     hasattr(model, 'pdfs'):
-                l_model = [ (frc, pdf) for pdf, frc in zip(model.pdfs, model.params.values()) ]
+                l_model = [ (frc, pdf) for pdf, frc in zip(model.pdfs, model.params.values()) ]
             elif model.name in self._l_plot_components and not hasattr(model, 'pdfs'):
                 log.warning(f'Cannot plot {model.name} as separate components, despite it was requested')
                 l_model = [ (1, model)]
@@ -347,17 +351,17 @@ class ZFitPlotter:
                 ax.plot(self.x, y, '-',               label=self._leg.get(name, name), color=self._col.get(name))
         if (blind_name is not None) and (was_blinded is False):
-            log.error(f'Blinding was requested, but PDF {blind_name} was not found among:')
             for model in self.total_model.pdfs:
                 log.info(model.name)
-            raise
+            raise ValueError(f'Blinding was requested, but PDF {blind_name} was not found among:')
     #----------------------------------------
     def _get_col(self, name):
         if name in self._col:
             return self._col[name]
         col = self._l_def_col[0]
-        del(self._l_def_col[0])
+        del self._l_def_col[0]
         return col
     #----------------------------------------
@@ -400,9 +404,8 @@ class ZFitPlotter:
         if plot_range is not None:
             try:
                 self.lower, self.upper = plot_range
-            except TypeError:
-                log.error(f'plot_range argument is expected to be a tuple with two numeric values')
-                raise TypeError
+            except TypeError as exc:
+                raise TypeError('plot_range argument is expected to be a tuple with two numeric values') from exc
         return np.linspace(self.lower, self.upper, 2000)
     #----------------------------------------
@@ -439,6 +442,7 @@ class ZFitPlotter:
             add_pars          = None,
             ymax              = None,
             skip_pulls        = False,
+            yscale : str      = None,
             axs               = None,
             figsize:tuple     = (13, 7),
             leg_loc:str       = 'best',
@@ -464,6 +468,7 @@ class ZFitPlotter:
         figsize (tuple)       : Tuple with figure size, default (13, 7)
         leg_loc (str)         : Location of legend, default 'best'
         xerr (bool or float)  : Used to pass xerr to mplhep histplot. True will use error with bin size, False, no error, otherwise it's the size of the xerror bar
+        yscale (str)          : Scale for y axis of main plot, either log or linear
         '''
         # pylint: disable=too-many-locals, too-many-positional-arguments, too-many-arguments
         d_leg           = {} if           d_leg is None else d_leg
@@ -512,6 +517,9 @@ class ZFitPlotter:
         self.axs[0].set(xlabel=xlabel, ylabel=ylabel)
         self.axs[0].set_xlim([self.lower, self.upper])
+        if yscale is not None:
+            self.axs[0].set_yscale(yscale)
         if title is not None:
             self.axs[0].set_title(title)

dmu/testing/utilities.py CHANGED Viewed

@@ -3,16 +3,20 @@ Module containing utility functions needed by unit tests
 '''
 import os
 import math
+import glob
 from typing              import Union
 from dataclasses         import dataclass
 from importlib.resources import files
 from ROOT import RDF, TFile, RDataFrame
+import joblib
 import pandas as pnd
 import numpy
 import yaml
+from dmu.ml.train_mva      import TrainMva
+from dmu.ml.cv_classifier  import CVClassifier
 from dmu.logging.log_store import LogStore
 log = LogStore.add_logger('dmu:testing:utilities')
@@ -22,6 +26,7 @@ class Data:
     '''
     Class storing shared data
     '''
+    out_dir = '/tmp/tests/dmu/ml/cv_predict'
 # -------------------------------
 def _double_data(df_1 : pnd.DataFrame) -> pnd.DataFrame:
     df_2   = df_1.copy()
@@ -39,7 +44,7 @@ def _add_nans(df : pnd.DataFrame, columns : list[str]) -> pnd.DataFrame:
     else:
         l_col_index = [ l_col.index(column) for column in columns ]
-    log.debug('Replacing randomly with {size} NaNs')
+    log.debug(f'Replacing randomly with {size} NaNs')
     for _ in range(size):
         irow = numpy.random.randint(0, df.shape[0])      # Random row index
         icol = numpy.random.choice(l_col_index)      # Random column index
@@ -51,7 +56,7 @@ def _add_nans(df : pnd.DataFrame, columns : list[str]) -> pnd.DataFrame:
 def get_rdf(kind : Union[str,None] = None,
             repeated : bool        = False,
             nentries : int         = 3_000,
-            add_nans : list[str]   = None):
+            columns_with_nans : list[str] = None):
     '''
     Return ROOT dataframe with toy data
     '''
@@ -76,8 +81,8 @@ def get_rdf(kind : Union[str,None] = None,
     if repeated:
         df = _double_data(df)
-    if add_nans:
-        df = _add_nans(df, columns=add_nans)
+    if columns_with_nans is not None:
+        df = _add_nans(df, columns=columns_with_nans)
     rdf = RDF.FromPandas(df)
@@ -126,3 +131,25 @@ def get_file_with_trees(path : str) -> TFile:
         snap.fMode  = 'update'
     return TFile(path)
+# -------------------------------
+def get_models(rdf_sig : RDataFrame, rdf_bkg : RDataFrame) -> list[CVClassifier]:
+    '''
+    Will train models with the settings in `ml/tests/train_mva.yaml` and return
+    what is read back from the files written by the training
+
+    rdf_sig: ROOT dataframe passed to TrainMva as signal
+    rdf_bkg: ROOT dataframe passed to TrainMva as background
+
+    Returns list with the objects loaded from the `model_*.pkl` files found in `Data.out_dir`,
+    ordered by path (lexicographically) such that the order is reproducible
+    '''
+    cfg      = get_config('ml/tests/train_mva.yaml')
+    pkl_path = f'{Data.out_dir}/model.pkl'
+    plt_dir  = f'{Data.out_dir}/cv_predict'
+
+    cfg['saving']['path']                            = pkl_path
+    cfg['plotting']['val_dir']                       = plt_dir
+    cfg['plotting']['features']['saving']['plt_dir'] = plt_dir
+
+    obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
+    obj.run()
+
+    # glob provides the paths in arbitrary order, sorting makes the list of models deterministic
+    # NOTE: Any file from earlier runs matching the wildcard will also be picked up
+    pkl_wc     = pkl_path.replace('.pkl', '_*.pkl')
+    l_pkl_path = sorted(glob.glob(pkl_wc))
+    l_model    = [ joblib.load(model_path) for model_path in l_pkl_path ]
+
+    return l_model
+# -------------------------------

dmu_data/ml/tests/diagnostics_from_file.yaml ADDED Viewed

@@ -0,0 +1,13 @@
+output         : /tmp/tests/dmu/ml/cv_diagnostics/from_rdf
+  # Will assume that the target is already in the input dataframe
+  # and will use it, instead of evaluating models
+score_from_rdf : w
+correlations:
+  # Variables with respect to which the correlations with the features will be measured
+  target :
+    name : z
+  methods:
+    - Pearson
+  figure:
+    title: Scores from file
+    size : [10, 8]

dmu_data/ml/tests/diagnostics_from_model.yaml ADDED Viewed

@@ -0,0 +1,10 @@
+output      : /tmp/tests/dmu/ml/cv_diagnostics/from_model
+correlations:
+  # Variables with respect to which the correlations with the features will be measured
+  target    :
+    name    : z
+  methods:
+    - Pearson
+  figure:
+    size  : [10, 8]
+    rotate: 90

data-manipulation-utilities 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl

data-manipulation-utilities 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl