PyPI - data-manipulation-utilities - Versions diffs - 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl - Mend

data-manipulation-utilities 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

{data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/METADATA +179 -10
{data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/RECORD +31 -19
{data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/WHEEL +1 -1
dmu/generic/hashing.py +44 -0
dmu/generic/utilities.py +14 -1
dmu/generic/version_management.py +3 -5
dmu/ml/cv_diagnostics.py +221 -0
dmu/ml/train_mva.py +143 -46
dmu/pdataframe/utilities.py +36 -3
dmu/plotting/fwhm.py +64 -0
dmu/plotting/plotter.py +2 -0
dmu/plotting/plotter_1d.py +87 -6
dmu/stats/fitter.py +1 -1
dmu/stats/minimizers.py +40 -11
dmu/stats/model_factory.py +248 -44
dmu/stats/zfit_models.py +68 -0
dmu/stats/zfit_plotter.py +29 -21
dmu/testing/utilities.py +31 -4
dmu_data/ml/tests/diagnostics_from_file.yaml +13 -0
dmu_data/ml/tests/diagnostics_from_model.yaml +10 -0
dmu_data/ml/tests/diagnostics_multiple_methods.yaml +10 -0
dmu_data/ml/tests/diagnostics_overlay.yaml +33 -0
dmu_data/ml/tests/train_mva.yaml +19 -10
dmu_data/ml/tests/train_mva_with_diagnostics.yaml +82 -0
dmu_data/plotting/tests/plug_fwhm.yaml +24 -0
dmu_data/plotting/tests/plug_stats.yaml +19 -0
dmu_data/plotting/tests/simple.yaml +4 -3
dmu_data/plotting/tests/styling.yaml +11 -0
{data_manipulation_utilities-0.2.5.data → data_manipulation_utilities-0.2.7.data}/scripts/publish +0 -0
{data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/entry_points.txt +0 -0
{data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/top_level.txt +0 -0

dmu/stats/zfit_plotter.py CHANGED Viewed

@@ -1,7 +1,7 @@
 '''
 Module containing plot class, used to plot fits
 '''
-# pylint: disable=too-many-instance-attributes
+# pylint: disable=too-many-instance-attributes, too-many-arguments
 import warnings
 import pprint
@@ -51,6 +51,8 @@ class ZFitPlotter:
         self._figsize          = None
         self._leg_loc          = None
+        self.dat_xerr : bool
         # zfit.settings.advanced_warnings['extend_wrapped_extended'] = False
         warnings.filterwarnings("ignore")
     #----------------------------------------
@@ -60,17 +62,17 @@ class ZFitPlotter:
         self._l_def_col = list(mcolors.TABLEAU_COLORS.keys())
     #----------------------------------------
     def _data_to_zdata(self, obs, data, weights):
+        if isinstance(data, zfit.data.Data):
+            return data
         if isinstance(data, np.ndarray):
             data = zfit.Data.from_numpy (obs=obs, array=data           , weights=weights)
         elif isinstance(data, pd.Series):
             data = zfit.Data.from_pandas(obs=obs, df=pd.DataFrame(data), weights=weights)
         elif isinstance(data, pd.DataFrame):
             data = zfit.Data.from_pandas(obs=obs, df=data              , weights=weights)
-        elif isinstance(data, zfit.data.Data):
-            data = data
         else:
-            log.error(f'Passed data is of usupported type {type(data)}')
-            raise
+            raise ValueError(f'Passed data is of usupported type {type(data)}')
         return data
     #----------------------------------------
@@ -200,7 +202,7 @@ class ZFitPlotter:
     #----------------------------------------
     def _get_zfit_gof(self):
         if not hasattr(self._result, 'gof'):
-            return
+            return None
         chi2, ndof, pval = self._result.gof
@@ -211,14 +213,16 @@ class ZFitPlotter:
     def _get_text(self, ext_text):
         gof_text = self._get_zfit_gof()
-        if   ext_text is     None and gof_text is     None:
-            return
-        elif ext_text is not None and gof_text is     None:
+        if ext_text is     None and gof_text is     None:
+            return None
+        if ext_text is not None and gof_text is     None:
             return ext_text
-        elif ext_text is     None and gof_text is not None:
+        if ext_text is     None and gof_text is not None:
             return gof_text
-        else:
-            return f'{ext_text}\n{gof_text}'
+        return f'{ext_text}\n{gof_text}'
     #----------------------------------------
     def _get_pars(self):
         '''
@@ -238,7 +242,7 @@ class ZFitPlotter:
                 name= par if isinstance(par, str) else par.name
                 try:
                     err = d_val['hesse']['error']
-                except:
+                except KeyError:
                     log.warning(f'Cannot extract {name} Hesse errors, using zeros')
                     pprint.pprint(d_val)
                     err = 0
@@ -260,7 +264,7 @@ class ZFitPlotter:
         '''
         d_par = self._get_pars()
-        line = f''
+        line = ''
         for name, [val, err] in d_par.items():
             if add_pars != 'all' and name not in add_pars:
                 continue
@@ -328,7 +332,7 @@ class ZFitPlotter:
             nevt = self._get_component_yield(model, par)
             if   model.name in self._l_plot_components and     hasattr(model, 'pdfs'):
-                l_model = [ (frc, pdf) for pdf, frc in zip(model.pdfs, model.params.values()) ]
+                l_model = [ (frc, pdf) for pdf, frc in zip(model.pdfs, model.params.values()) ]
             elif model.name in self._l_plot_components and not hasattr(model, 'pdfs'):
                 log.warning(f'Cannot plot {model.name} as separate components, despite it was requested')
                 l_model = [ (1, model)]
@@ -347,17 +351,17 @@ class ZFitPlotter:
                 ax.plot(self.x, y, '-',               label=self._leg.get(name, name), color=self._col.get(name))
         if (blind_name is not None) and (was_blinded is False):
-            log.error(f'Blinding was requested, but PDF {blind_name} was not found among:')
             for model in self.total_model.pdfs:
                 log.info(model.name)
-            raise
+            raise ValueError(f'Blinding was requested, but PDF {blind_name} was not found among:')
     #----------------------------------------
     def _get_col(self, name):
         if name in self._col:
             return self._col[name]
         col = self._l_def_col[0]
-        del(self._l_def_col[0])
+        del self._l_def_col[0]
         return col
     #----------------------------------------
@@ -400,9 +404,8 @@ class ZFitPlotter:
         if plot_range is not None:
             try:
                 self.lower, self.upper = plot_range
-            except TypeError:
-                log.error(f'plot_range argument is expected to be a tuple with two numeric values')
-                raise TypeError
+            except TypeError as exc:
+                raise TypeError('plot_range argument is expected to be a tuple with two numeric values') from exc
         return np.linspace(self.lower, self.upper, 2000)
     #----------------------------------------
@@ -439,6 +442,7 @@ class ZFitPlotter:
             add_pars          = None,
             ymax              = None,
             skip_pulls        = False,
+            yscale : str      = None,
             axs               = None,
             figsize:tuple     = (13, 7),
             leg_loc:str       = 'best',
@@ -464,6 +468,7 @@ class ZFitPlotter:
         figsize (tuple)       : Tuple with figure size, default (13, 7)
         leg_loc (str)         : Location of legend, default 'best'
         xerr (bool or float)  : Used to pass xerr to mplhep histplot. True will use error with bin size, False, no error, otherwise it's the size of the xerror bar
+        yscale (str)          : Scale for y axis of main plot, either log or linear
         '''
         # pylint: disable=too-many-locals, too-many-positional-arguments, too-many-arguments
         d_leg           = {} if           d_leg is None else d_leg
@@ -512,6 +517,9 @@ class ZFitPlotter:
         self.axs[0].set(xlabel=xlabel, ylabel=ylabel)
         self.axs[0].set_xlim([self.lower, self.upper])
+        if yscale is not None:
+            self.axs[0].set_yscale(yscale)
         if title is not None:
             self.axs[0].set_title(title)

dmu/testing/utilities.py CHANGED Viewed

@@ -3,16 +3,20 @@ Module containing utility functions needed by unit tests
 '''
 import os
 import math
+import glob
 from typing              import Union
 from dataclasses         import dataclass
 from importlib.resources import files
 from ROOT import RDF, TFile, RDataFrame
+import joblib
 import pandas as pnd
 import numpy
 import yaml
+from dmu.ml.train_mva      import TrainMva
+from dmu.ml.cv_classifier  import CVClassifier
 from dmu.logging.log_store import LogStore
 log = LogStore.add_logger('dmu:testing:utilities')
@@ -22,6 +26,7 @@ class Data:
     '''
     Class storing shared data
     '''
+    out_dir = '/tmp/tests/dmu/ml/cv_predict'
 # -------------------------------
 def _double_data(df_1 : pnd.DataFrame) -> pnd.DataFrame:
     df_2   = df_1.copy()
@@ -39,7 +44,7 @@ def _add_nans(df : pnd.DataFrame, columns : list[str]) -> pnd.DataFrame:
     else:
         l_col_index = [ l_col.index(column) for column in columns ]
-    log.debug('Replacing randomly with {size} NaNs')
+    log.debug(f'Replacing randomly with {size} NaNs')
     for _ in range(size):
         irow = numpy.random.randint(0, df.shape[0])      # Random row index
         icol = numpy.random.choice(l_col_index)      # Random column index
@@ -51,7 +56,7 @@ def _add_nans(df : pnd.DataFrame, columns : list[str]) -> pnd.DataFrame:
 def get_rdf(kind : Union[str,None] = None,
             repeated : bool        = False,
             nentries : int         = 3_000,
-            add_nans : list[str]   = None):
+            columns_with_nans : list[str] = None):
     '''
     Return ROOT dataframe with toy data
     '''
@@ -76,8 +81,8 @@ def get_rdf(kind : Union[str,None] = None,
     if repeated:
         df = _double_data(df)
-    if add_nans:
-        df = _add_nans(df, columns=add_nans)
+    if columns_with_nans is not None:
+        df = _add_nans(df, columns=columns_with_nans)
     rdf = RDF.FromPandas(df)
@@ -126,3 +131,25 @@ def get_file_with_trees(path : str) -> TFile:
         snap.fMode  = 'update'
     return TFile(path)
+# -------------------------------
+def get_models(rdf_sig : RDataFrame, rdf_bkg : RDataFrame) -> list[CVClassifier]:
+    '''
+    Will train and return models
+    '''
+    cfg                   = get_config('ml/tests/train_mva.yaml')
+    pkl_path              = f'{Data.out_dir}/model.pkl'
+    plt_dir               = f'{Data.out_dir}/cv_predict'
+    cfg['saving']['path'] = pkl_path
+    cfg['plotting']['val_dir'] = plt_dir
+    cfg['plotting']['features']['saving']['plt_dir'] = plt_dir
+    obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
+    obj.run()
+    pkl_wc     = pkl_path.replace('.pkl', '_*.pkl')
+    l_pkl_path = glob.glob(pkl_wc)
+    l_model    = [ joblib.load(pkl_path) for pkl_path in l_pkl_path ]
+    return l_model
+# -------------------------------

dmu_data/ml/tests/diagnostics_from_file.yaml ADDED Viewed

@@ -0,0 +1,13 @@
+output         : /tmp/tests/dmu/ml/cv_diagnostics/from_rdf
+  # Will assume that the target is already in the input dataframe
+  # and will use it, instead of evaluating models
+score_from_rdf : w
+correlations:
+  # Variables with respect to which the correlations with the features will be measured
+  target :
+    name : z
+  methods:
+    - Pearson
+  figure:
+    title: Scores from file
+    size : [10, 8]

dmu_data/ml/tests/diagnostics_from_model.yaml ADDED Viewed

@@ -0,0 +1,10 @@
+output      : /tmp/tests/dmu/ml/cv_diagnostics/from_model
+correlations:
+  # Variables with respect to which the correlations with the features will be measured
+  target    :
+    name    : z
+  methods:
+    - Pearson
+  figure:
+    size  : [10, 8]
+    rotate: 90

dmu_data/ml/tests/diagnostics_multiple_methods.yaml ADDED Viewed

@@ -0,0 +1,10 @@
+output : /tmp/tests/dmu/ml/cv_diagnostics/multiple_methods
+correlations:
+  # Variables with respect to which the correlations with the features will be measured
+  target    :
+    name : z
+  methods:
+    - Pearson
+    - Kendall-$\tau$
+  figure:
+    size : [10, 8]

dmu_data/ml/tests/diagnostics_overlay.yaml ADDED Viewed

@@ -0,0 +1,33 @@
+output         : /tmp/tests/dmu/ml/cv_diagnostics/overlay
+  # Will assume that the target is already in the input dataframe
+  # and will use it, instead of evaluating models
+score_from_rdf : w
+correlations:
+  # Variables with respect to which the correlations with the features will be measured
+  target :
+    name : z
+    overlay :
+      wp :
+        - 0.2
+        - 0.5
+        - 0.7
+        - 0.9
+      general:
+        size : [12, 10]
+      saving:
+        plt_dir : /tmp/tests/dmu/ml/cv_diagnostics/overlay
+      plots:
+        z :
+          binning    : [-4, 4, 10]
+          yscale     : 'linear'
+          labels     : ['$z$', 'Entries']
+          normalized : true
+          styling :
+            linestyle: '-'
+  methods:
+    - Pearson
+    - Kendall-$\tau$
+  figure:
+    title     : Scores from file
+    size      : [12, 10]
+    xlabelsize: 30

dmu_data/ml/tests/train_mva.yaml CHANGED Viewed

@@ -1,10 +1,12 @@
 dataset:
+  define :
+    r : z + x
   nan :
-    x : 1
-    y : 2
+    x : -3
+    y : -3
 training :
     nfold    : 3
-    features : [x, y, z]
+    features : [x, y, r]
     rdm_stat : 1
     hyper    :
       loss              : log_loss
@@ -13,7 +15,7 @@ training :
       learning_rate     : 0.1
       min_samples_split : 2
 saving:
-    path : '/tmp/dmu/ml/tests/train_mva/model.pkl'
+    path : '/tmp/tests/dmu/ml/train_mva/model.pkl'
 plotting:
     roc     :
         min : [0.0, 0.0]
@@ -29,21 +31,28 @@ plotting:
       title      : 'Correlation matrix'
       size       : [10, 10]
       mask_value : 0
-    val_dir : '/tmp/dmu/ml/tests/train_mva'
+    val_dir : '/tmp/tests/dmu/ml/train_mva'
     features:
         saving:
-            plt_dir : '/tmp/dmu/ml/tests/train_mva/features'
+            plt_dir : '/tmp/tests/dmu/ml/train_mva/features'
         plots:
+          r :
+            binning : [-6, 6, 100]
+            yscale  : 'linear'
+            labels  : ['$r$', '']
+          w :
+            binning : [-4, 4, 100]
+            yscale  : 'linear'
+            labels  : ['$w$', '']
           x :
             binning : [-4, 4, 100]
             yscale  : 'linear'
-            labels  : ['x', '']
+            labels  : ['$x$', '']
           y :
             binning : [-4, 4, 100]
             yscale  : 'linear'
-            labels  : ['y', '']
+            labels  : ['$y$', '']
           z :
             binning : [-4, 4, 100]
             yscale  : 'linear'
-            labels  : ['z', '']
+            labels  : ['$z$', '']

dmu_data/ml/tests/train_mva_with_diagnostics.yaml ADDED Viewed

@@ -0,0 +1,82 @@
+dataset:
+  define :
+    r : z + x
+  nan :
+    x : -3
+    y : -3
+training :
+    nfold    : 3
+    features : [x, y, r]
+    rdm_stat : 1
+    hyper    :
+      loss              : log_loss
+      n_estimators      : 100
+      max_depth         : 3
+      learning_rate     : 0.1
+      min_samples_split : 2
+saving:
+    path : '/tmp/tests/dmu/ml/train_mva/model.pkl'
+plotting:
+    roc     :
+        min : [0.0, 0.0]
+        max : [1.2, 1.2]
+        annotate:
+          sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9]
+          form : '{:.2f}'
+          color: 'green'
+          xoff : -15
+          yoff : -15
+          size :  10
+    correlation:
+      title      : 'Correlation matrix'
+      size       : [10, 10]
+      mask_value : 0
+    val_dir : '/tmp/tests/dmu/ml/train_mva'
+    features:
+        saving:
+            plt_dir : '/tmp/tests/dmu/ml/train_mva/features'
+        plots:
+          r :
+            binning : [-6, 6, 100]
+            yscale  : 'linear'
+            labels  : ['$r$', '']
+          w :
+            binning : [-4, 4, 100]
+            yscale  : 'linear'
+            labels  : ['$w$', '']
+          x :
+            binning : [-4, 4, 100]
+            yscale  : 'linear'
+            labels  : ['$x$', '']
+          y :
+            binning : [-4, 4, 100]
+            yscale  : 'linear'
+            labels  : ['$y$', '']
+          z :
+            binning : [-4, 4, 100]
+            yscale  : 'linear'
+            labels  : ['$z$', '']
+diagnostics:
+  output         : /tmp/tests/dmu/ml/train_mva/diagnostics
+  correlations:
+    target :
+      name : z
+      overlay :
+        general:
+          size : [20, 10]
+        saving:
+          plt_dir : /tmp/tests/dmu/ml/train_mva/diagnostics
+        plots:
+          z :
+            binning    : [-4, +4, 30]
+            yscale     : 'linear'
+            labels     : ['z', 'Entries']
+            normalized : true
+            styling :
+              linestyle: '-'
+    methods:
+      - Pearson
+      - Kendall-$\tau$
+    figure:
+      title: Training diagnostics
+      size : [10, 8]

dmu_data/plotting/tests/plug_fwhm.yaml ADDED Viewed

@@ -0,0 +1,24 @@
+saving:
+    plt_dir : plotting/pluggins/fwhm
+plots:
+    x :
+      binning : [-5.0, 8.0, 40]
+      title   : x distribution
+    y :
+      binning : [-5.0, 8.0, 40]
+      title   : y distribution
+plugin:
+  fwhm:
+    x :
+      plot   : true
+      obs    : [-2, 4]
+      plot   : true
+      format : FWHM={:.3f}
+      add_std: True
+    y :
+      plot   : true
+      obs    : [-4, 8]
+      plot   : true
+      format : FWHM={:.3f}
+      add_std: True

dmu_data/plotting/tests/plug_stats.yaml ADDED Viewed

@@ -0,0 +1,19 @@
+saving:
+    plt_dir : plotting/pluggins/stats
+plots:
+    x :
+      binning : [-5.0, 8.0, 40]
+      title   : x distribution
+      styling:
+        linestyle : '-'
+    y :
+      binning : [-5.0, 8.0, 40]
+      title   : y distribution
+      styling:
+        linestyle : '-'
+plugin:
+  stats:
+    x :
+      mean : $\mu$={:.2f}
+      rms  : $\sigma$={:.2f}
+      sum  : $\Sigma$={:.0f}

dmu_data/plotting/tests/simple.yaml CHANGED Viewed

@@ -1,8 +1,9 @@
 saving:
     plt_dir : tests/plotting/simple
 plots:
     x :
-        binning : [-5.0, 8.0, 40]
+      binning : [-5.0, 8.0, 40]
+      title   : x distribution
     y :
-        binning : [-5.0, 8.0, 40]
+      binning : [-5.0, 8.0, 40]
+      title   : y distribution

dmu_data/plotting/tests/styling.yaml ADDED Viewed

@@ -0,0 +1,11 @@
+saving:
+    plt_dir : tests/plotting/styling
+plots:
+  x :
+    binning : [-5.0, 8.0, 40]
+    title   : x distribution
+    styling :
+      histtype : step
+  y :
+    binning : [-5.0, 8.0, 40]
+    title   : y distribution

{data_manipulation_utilities-0.2.5.data → data_manipulation_utilities-0.2.7.data}/scripts/publish RENAMED Viewed

File without changes

{data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

data-manipulation-utilities 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

data-manipulation-utilities 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl