PyPI - data-manipulation-utilities - Versions diffs - 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl - Mend

data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/METADATA +641 -44
data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD +93 -0
{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/WHEEL +1 -1
{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/entry_points.txt +1 -0
dmu/__init__.py +0 -0
dmu/generic/hashing.py +34 -8
dmu/generic/utilities.py +164 -11
dmu/logging/log_store.py +34 -2
dmu/logging/messages.py +96 -0
dmu/ml/cv_classifier.py +3 -3
dmu/ml/cv_diagnostics.py +3 -0
dmu/ml/cv_performance.py +58 -0
dmu/ml/cv_predict.py +149 -46
dmu/ml/train_mva.py +482 -100
dmu/ml/utilities.py +29 -10
dmu/pdataframe/utilities.py +28 -3
dmu/plotting/fwhm.py +2 -2
dmu/plotting/matrix.py +1 -1
dmu/plotting/plotter.py +23 -3
dmu/plotting/plotter_1d.py +96 -32
dmu/plotting/plotter_2d.py +5 -0
dmu/rdataframe/utilities.py +54 -3
dmu/rfile/ddfgetter.py +102 -0
dmu/stats/fit_stats.py +129 -0
dmu/stats/fitter.py +55 -22
dmu/stats/gof_calculator.py +7 -0
dmu/stats/model_factory.py +153 -62
dmu/stats/parameters.py +100 -0
dmu/stats/utilities.py +443 -12
dmu/stats/wdata.py +187 -0
dmu/stats/zfit.py +17 -0
dmu/stats/zfit_plotter.py +147 -36
dmu/testing/utilities.py +102 -24
dmu/workflow/__init__.py +0 -0
dmu/workflow/cache.py +266 -0
dmu_data/ml/tests/train_mva.yaml +9 -7
dmu_data/ml/tests/train_mva_def.yaml +75 -0
dmu_data/ml/tests/train_mva_with_diagnostics.yaml +10 -5
dmu_data/ml/tests/train_mva_with_preffix.yaml +58 -0
dmu_data/plotting/tests/2d.yaml +5 -5
dmu_data/plotting/tests/line.yaml +15 -0
dmu_data/plotting/tests/styling.yaml +8 -1
dmu_data/rfile/friends.yaml +13 -0
dmu_data/stats/fitter/test_simple.yaml +28 -0
dmu_data/stats/kde_optimizer/control.json +1 -0
dmu_data/stats/kde_optimizer/signal.json +1 -0
dmu_data/stats/parameters/data.yaml +178 -0
dmu_data/tests/config.json +6 -0
dmu_data/tests/config.yaml +4 -0
dmu_data/tests/pdf_to_tex.txt +34 -0
dmu_scripts/kerberos/check_expiration +21 -0
dmu_scripts/kerberos/convert_certificate +22 -0
dmu_scripts/ml/compare_classifiers.py +85 -0
data_manipulation_utilities-0.2.7.dist-info/RECORD +0 -69
{data_manipulation_utilities-0.2.7.data → data_manipulation_utilities-0.2.8.dev714.data}/scripts/publish +0 -0
{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/top_level.txt +0 -0

dmu/stats/fit_stats.py ADDED Viewed

@@ -0,0 +1,129 @@
+'''
+Module with FitStats class
+'''
+import re
+import pprint
+import pickle
+from typing import Union
+import numpy
+import pandas                as pnd
+from zfit.result            import FitResult  as zres
+from dmu.logging.log_store  import LogStore
+log = LogStore.add_logger('dmu:fit_stats')
+# -------------------------------
+class FitStats:
+    '''
+    Class meant to provide fit statistics
+    '''
+    # -------------------------------
+    def __init__(self, fit_dir : str):
+        '''
+        fit_dir :  Path to directory where fit outputs are stored
+        '''
+        self._fit_dir = fit_dir
+        self._regex   = r'^([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s*$'
+        self._sig_yld = 'nsig'
+        # Functions need to be called at the end
+        # When all the needed attributes are already set
+        self._df      = self._get_data()
+    # -------------------------------
+    def _row_from_line(self, line : str) -> Union[list,None]:
+        mtch = re.match(self._regex, line)
+        if not mtch:
+            return None
+        [name, value, low, high, is_floating, mu_sg] = mtch.groups()
+        if mu_sg == 'none':
+            mu = numpy.nan
+            sg = numpy.nan
+        else:
+            [mu, sg] = mu_sg.split('___')
+            mu       = float(mu)
+            sg       = float(sg)
+        is_floating = int(is_floating)  #Direct conversion from '0' to bool will break this
+        is_floating = bool(is_floating)
+        row         = [name, float(value), float(low), float(high), is_floating, mu, sg]
+        return row
+    # -------------------------------
+    def _get_data(self) -> pnd.DataFrame:
+        fit_path = f'{self._fit_dir}/post_fit.txt'
+        with open(fit_path, encoding='utf-8') as ifile:
+            l_line = ifile.read().splitlines()
+        df = pnd.DataFrame(columns=['name', 'value', 'low', 'high', 'float', 'mu', 'sg'])
+        for line in l_line:
+            row = self._row_from_line(line)
+            if row is None:
+                continue
+            df.loc[len(df)] = row
+        df = self._attach_errors(df)
+        log.debug(df)
+        return df
+    # -------------------------------
+    def _error_from_res(self, row : pnd.Series, res : zres) -> float:
+        if not row['float']: # If this parameter is fixed in the fit, the error is zero
+            return 0
+        name = row['name']
+        if name not in res.params:
+            for this_name in res.params:
+                log.info(this_name)
+            raise KeyError(f'{name} not found')
+        d_data = res.params[name]
+        if 'hesse' in d_data:
+            return d_data['hesse']['error']
+        if 'minuit_hesse' in d_data:
+            return d_data['minuit_hesse']['error']
+        pprint.pprint(d_data)
+        raise KeyError(f'Cannot find error in dictionary')
+    # -------------------------------
+    def _attach_errors(self, df : pnd.DataFrame) -> pnd.DataFrame:
+        pkl_path = f'{self._fit_dir}/fit.pkl'
+        with open(pkl_path, 'rb') as ifile:
+            res = pickle.load(ifile)
+        df['error'] = df.apply(lambda row : self._error_from_res(row, res), axis=1)
+        return df
+    # -------------------------------
+    def print_blind_stats(self) -> None:
+        '''
+        Will print statistics, excluding signal information
+        '''
+        df_blind = self._df[self._df['name'] != self._sig_yld]
+        log.info(df_blind)
+    # -------------------------------
+    def get_value(self, name : str, kind : str) -> float:
+        '''
+        Returns float with value associated to fit
+        name : Name of variable, e.g. mu, sg, nsig
+        kind : Type of quantity, e.g. value, error
+        '''
+        log.info(f'Retrieving signal yield from {name} and {kind}')
+        df   = self._df[self._df['name'] == name]
+        nrow = len(df)
+        if nrow != 1:
+            self.print_blind_stats()
+            raise ValueError(f'Cannot retrieve one and only one row, found {nrow}')
+        val = df[kind]
+        return float(val)
+# -------------------------------

dmu/stats/fitter.py CHANGED Viewed

@@ -1,20 +1,23 @@
 '''
 Module holding zfitter class
 '''
+# pylint: disable=wrong-import-order, import-error
 import pprint
 from typing                   import Union
 from functools                import lru_cache
 import numpy
-import zfit
 import pandas as pd
-from scipy                    import stats
+from dmu.logging              import messages  as mes
+from dmu.stats.zfit           import zfit
+from dmu.logging.log_store    import LogStore
 from zfit.minimizers.strategy import FailMinimizeNaN
-from zfit.result              import FitResult
 from zfit.core.data           import Data
-from dmu.logging.log_store    import LogStore
+from zfit.result              import FitResult  as zres
+from scipy                    import stats
 log = LogStore.add_logger('dmu:statistics:fitter')
 #------------------------------
@@ -43,6 +46,15 @@ class Fitter:
         self._obs     : zfit.Space
         self._d_par   : dict
+        # These are substrings found in tensorflow messages
+        # that are pretty useless and need to be hidden
+        self._l_hidden_tf_lines= [
+            'abnormal_detected_host @',
+            'Skipping loop optimization for Merge',
+            'Creating GpuSolver handles for stream',
+            'Loaded cuDNN version',
+            'All log messages before absl::InitializeLog()']
         self._ndof           = 10
         self._pval_threshold = 0.01
         self._initialized    = False
@@ -53,7 +65,7 @@ class Fitter:
         self._check_data()
-        self._intialized = True
+        self._initialized = True
     #------------------------------
     def _check_data(self):
         if   isinstance(self._data_in, numpy.ndarray):
@@ -83,11 +95,9 @@ class Fitter:
         elif len(shp) == 2:
             _, jval = shp
             if jval != 1:
-                log.error(f'Invalid data shape: {shp}')
-                raise
+                raise ValueError(f'Invalid data shape: {shp}')
         else:
-            log.error(f'Invalid data shape: {shp}')
-            raise
+            raise ValueError(f'Invalid data shape: {shp}')
         ival = data.shape[0]
@@ -158,7 +168,7 @@ class Fitter:
         log.debug(f'Ndof: {self._ndof}')
         log.debug(f'pval: {pvalue:<.3e}')
-        return (sum_chi2, self._ndof, pvalue)
+        return sum_chi2, self._ndof, pvalue
     #------------------------------
     def _get_float_pars(self):
         npar     = 0
@@ -240,7 +250,9 @@ class Fitter:
         if 'ranges' not in cfg:
             return [None]
-        ranges = cfg['ranges']
+        ranges_any = cfg['ranges']
+        ranges = [ tuple(elm) for elm in ranges_any ]
         log.info('-' * 30)
         log.info(f'{"Low edge":>15}{"High edge":>15}')
         log.info('-' * 30)
@@ -359,9 +371,13 @@ class Fitter:
         log.info(header)
         log.info(parval)
     #------------------------------
-    def _minimize(self, nll, cfg : dict) -> tuple[FitResult, tuple]:
+    def _minimize(self, nll, cfg : dict) -> tuple[zres, tuple]:
         mnm = zfit.minimize.Minuit()
-        res = mnm.minimize(nll)
+        with mes.filter_stderr(banned_substrings=self._l_hidden_tf_lines):
+            res = mnm.minimize(nll)
+        res = self._calculate_error(res)
         try:
             gof = self._calc_gof()
@@ -376,7 +392,16 @@ class Fitter:
         return res, gof
     #------------------------------
-    def _fit_retries(self, cfg : dict) -> tuple[dict, FitResult]:
+    def _gof_is_bad(self, gof : tuple[float, int, float]) -> bool:
+        chi2, ndof, pval = gof
+        good_ndof = 0 <= ndof < numpy.inf
+        good_chi2 = 0 <= chi2 < numpy.inf
+        good_pval = 0 <= pval < numpy.inf
+        return not (good_chi2 and good_pval and good_ndof)
+    #------------------------------
+    def _fit_retries(self, cfg : dict) -> tuple[dict, zres]:
         ntries       = cfg['strategy']['retry']['ntries']
         pvalue_thresh= cfg['strategy']['retry']['pvalue_thresh']
         ignore_status= cfg['strategy']['retry']['ignore_status']
@@ -401,6 +426,12 @@ class Fitter:
                 continue
             chi2, _, pval   = gof
+            if self._gof_is_bad(gof):
+                log.debug('Reshufling and skipping, found bad gof')
+                self._reshuffle_pdf_pars()
+                continue
             d_pval_res[chi2]=res
             if pval > pvalue_thresh:
@@ -414,7 +445,7 @@ class Fitter:
         return d_pval_res, last_res
     #------------------------------
-    def _pick_best_fit(self, d_pval_res : dict, last_res : FitResult) -> FitResult:
+    def _pick_best_fit(self, d_pval_res : dict, last_res : zres) -> zres:
         nsucc = len(d_pval_res)
         if nsucc == 0:
             log.warning('None of the fits succeeded, returning last result')
@@ -426,7 +457,7 @@ class Fitter:
         l_pval_res.sort()
         _, res = l_pval_res[0]
-        log.debug('Picking out best fit from {nsucc} fits')
+        log.debug(f'Picking out best fit from {nsucc} fits')
         for chi2, _ in l_pval_res:
             log.debug(f'{chi2:.3f}')
@@ -434,7 +465,7 @@ class Fitter:
         return res
     #------------------------------
-    def _fit_in_steps(self, cfg : dict) -> FitResult:
+    def _fit_in_steps(self, cfg : dict) -> zres:
         l_nsample = cfg['strategy']['steps']['nsteps']
         l_nsigma  = cfg['strategy']['steps']['nsigma']
         l_yield   = cfg['strategy']['steps']['yields']
@@ -453,7 +484,6 @@ class Fitter:
         log.info('Fitting full sample')
         nll    = self._get_full_nll(cfg = cfg)
         res, _ = self._minimize(nll, cfg)
-        res.hesse(method='minuit_hesse')
         if res is None:
             nsteps = len(l_nsample)
@@ -461,7 +491,7 @@ class Fitter:
         return res
     #------------------------------
-    def _result_to_value_error(self, res : FitResult) -> dict[str, list[float]]:
+    def _result_to_value_error(self, res : zres) -> dict[str, list[float]]:
         d_par = {}
         for par, d_val in res.params.items():
             try:
@@ -475,7 +505,7 @@ class Fitter:
         return d_par
     #------------------------------
-    def _update_par_bounds(self, res : FitResult, nsigma : float, yields : list[str]) -> None:
+    def _update_par_bounds(self, res : zres, nsigma : float, yields : list[str]) -> None:
         s_shape_par = self._pdf.get_params(is_yield=False, floating=True)
         d_shp_par   = { par.name : par for par in s_shape_par if par.name not in yields}
         d_fit_par   = self._result_to_value_error(res)
@@ -494,6 +524,11 @@ class Fitter:
             log.info(f'{name:<20}{val - err:<20.3e}{val + err:<20.3e}')
     #------------------------------
+    def _calculate_error(self, res : zres) -> zres: # Runs hesse on the fit result and hands the result back
+        res.hesse(name='minuit_hesse') # Called for its effect on res, the return value is dropped
+        return res # Same object that was passed in
+    #------------------------------
     def fit(self, cfg : Union[dict, None] = None):
         '''
         Runs the fit using the configuration specified by the cfg dictionary
@@ -508,7 +543,6 @@ class Fitter:
         if 'strategy' not in cfg:
             nll    = self._get_full_nll(cfg = cfg)
             res, _ = self._minimize(nll, cfg)
-            res.hesse(method='minuit_hesse')
         elif 'retry' in cfg['strategy']:
             d_pval_res, last_res = self._fit_retries(cfg)
             res = self._pick_best_fit(d_pval_res, last_res)
@@ -517,6 +551,5 @@ class Fitter:
         else:
             raise ValueError('Unsupported fitting strategy')
         return res
 #------------------------------

dmu/stats/gof_calculator.py CHANGED Viewed

@@ -110,6 +110,13 @@ class GofCalculator:
         arr_data    = self._get_data_bin_contents()
         arr_modl    = self._get_pdf_bin_contents()
+        log.debug(40 * '-')
+        log.debug(f'{"Data":<20}{"Model":<20}')
+        log.debug(40 * '-')
+        for dval, mval in zip(arr_data, arr_modl):
+            log.debug(f'{dval:<20.3f}{mval:<20.3f}')
+        log.debug(40 * '-')
         norm        = numpy.sum(arr_data) / numpy.sum(arr_modl)
         arr_modl    = norm * arr_modl
         arr_res     = arr_modl - arr_data

data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl