PyPI - data-manipulation-utilities - Versions diffs - 0.2.6__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl - Mend

data-manipulation-utilities 0.2.6__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

{data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/METADATA +800 -34
data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD +93 -0
{data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/WHEEL +1 -1
{data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/entry_points.txt +1 -0
dmu/__init__.py +0 -0
dmu/generic/hashing.py +70 -0
dmu/generic/utilities.py +175 -9
dmu/generic/version_management.py +3 -5
dmu/logging/log_store.py +34 -2
dmu/logging/messages.py +96 -0
dmu/ml/cv_classifier.py +3 -3
dmu/ml/cv_diagnostics.py +224 -0
dmu/ml/cv_performance.py +58 -0
dmu/ml/cv_predict.py +149 -46
dmu/ml/train_mva.py +587 -112
dmu/ml/utilities.py +29 -10
dmu/pdataframe/utilities.py +61 -3
dmu/plotting/fwhm.py +64 -0
dmu/plotting/matrix.py +1 -1
dmu/plotting/plotter.py +25 -3
dmu/plotting/plotter_1d.py +159 -14
dmu/plotting/plotter_2d.py +5 -0
dmu/rdataframe/utilities.py +54 -3
dmu/rfile/ddfgetter.py +102 -0
dmu/stats/fit_stats.py +129 -0
dmu/stats/fitter.py +56 -23
dmu/stats/gof_calculator.py +7 -0
dmu/stats/model_factory.py +305 -50
dmu/stats/parameters.py +100 -0
dmu/stats/utilities.py +443 -12
dmu/stats/wdata.py +187 -0
dmu/stats/zfit.py +17 -0
dmu/stats/zfit_models.py +68 -0
dmu/stats/zfit_plotter.py +175 -56
dmu/testing/utilities.py +120 -15
dmu/workflow/__init__.py +0 -0
dmu/workflow/cache.py +266 -0
dmu_data/ml/tests/diagnostics_from_file.yaml +13 -0
dmu_data/ml/tests/diagnostics_from_model.yaml +10 -0
dmu_data/ml/tests/diagnostics_multiple_methods.yaml +10 -0
dmu_data/ml/tests/diagnostics_overlay.yaml +33 -0
dmu_data/ml/tests/train_mva.yaml +20 -12
dmu_data/ml/tests/train_mva_def.yaml +75 -0
dmu_data/ml/tests/train_mva_with_diagnostics.yaml +87 -0
dmu_data/ml/tests/train_mva_with_preffix.yaml +58 -0
dmu_data/plotting/tests/2d.yaml +5 -5
dmu_data/plotting/tests/line.yaml +15 -0
dmu_data/plotting/tests/plug_fwhm.yaml +24 -0
dmu_data/plotting/tests/plug_stats.yaml +19 -0
dmu_data/plotting/tests/simple.yaml +4 -3
dmu_data/plotting/tests/styling.yaml +18 -0
dmu_data/rfile/friends.yaml +13 -0
dmu_data/stats/fitter/test_simple.yaml +28 -0
dmu_data/stats/kde_optimizer/control.json +1 -0
dmu_data/stats/kde_optimizer/signal.json +1 -0
dmu_data/stats/parameters/data.yaml +178 -0
dmu_data/tests/config.json +6 -0
dmu_data/tests/config.yaml +4 -0
dmu_data/tests/pdf_to_tex.txt +34 -0
dmu_scripts/kerberos/check_expiration +21 -0
dmu_scripts/kerberos/convert_certificate +22 -0
dmu_scripts/ml/compare_classifiers.py +85 -0
data_manipulation_utilities-0.2.6.dist-info/RECORD +0 -57
{data_manipulation_utilities-0.2.6.data → data_manipulation_utilities-0.2.8.dev714.data}/scripts/publish +0 -0
{data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/top_level.txt +0 -0

dmu/stats/zfit_models.py ADDED Viewed

@@ -0,0 +1,68 @@
+'''
+Module meant to hold classes defining PDFs that can be used by ZFIT
+'''
+import zfit
+from zfit   import z
+#-------------------------------------------------------------------
+class HypExp(zfit.pdf.ZPDF):
+    _N_OBS  = 1
+    _PARAMS = ['mu', 'alpha', 'beta']
+    def _unnormalized_pdf(self, x):
+        x    = z.unstack_x(x)
+        mu   = self.params['mu']
+        ap   = self.params['alpha']
+        bt   = self.params['beta']
+        u   = (x - mu)
+        val = z.exp(-bt * x) / (1 + z.exp(-ap * u))
+        return val
+#-------------------------------------------------------------------
+class ModExp(zfit.pdf.ZPDF):
+    _N_OBS  = 1
+    _PARAMS = ['mu', 'alpha', 'beta']
+    def _unnormalized_pdf(self, x):
+        x    = z.unstack_x(x)
+        mu   = self.params['mu']
+        ap   = self.params['alpha']
+        bt   = self.params['beta']
+        u   = x - mu
+        val = (1 - z.exp(-ap * u)) * z.exp(-bt * u)
+        return val
+#-------------------------------------------------------------------
+class GenExp(zfit.pdf.ZPDF):
+    _N_OBS  = 1
+    _PARAMS = ['mu', 'sg', 'alpha', 'beta']
+    def _unnormalized_pdf(self, x):
+        x    = z.unstack_x(x)
+        mu   = self.params['mu']
+        sg   = self.params['sg']
+        ap   = self.params['alpha']
+        bt   = self.params['beta']
+        u   = (x - mu) / sg
+        val = (1 - z.exp(-ap * u)) * z.exp(-bt * u)
+        return val
+#-------------------------------------------------------------------
+class FermiDirac(zfit.pdf.ZPDF):
+    _N_OBS  = 1
+    _PARAMS = ['mu', 'ap']
+    def _unnormalized_pdf(self, x):
+        x    = z.unstack_x(x)
+        mu   = self.params['mu']
+        ap   = self.params['ap']
+        exp  = (x - mu) / ap
+        den  = 1 + z.exp(exp)
+        return 1. / den
+#-------------------------------------------------------------------

dmu/stats/zfit_plotter.py CHANGED Viewed

@@ -1,27 +1,31 @@
 '''
 Module containing plot class, used to plot fits
 '''
-# pylint: disable=too-many-instance-attributes
+# pylint: disable=too-many-instance-attributes, too-many-arguments
+import math
 import warnings
 import pprint
 import zfit
 import hist
 import mplhep
+import tensorflow            as tf
 import pandas                as pd
 import numpy                 as np
 import matplotlib.pyplot     as plt
-import dmu.generic.utilities as gut
+from zfit.core.basepdf      import BasePDF    as zpdf
+import dmu.generic.utilities as gut
 from dmu.logging.log_store  import LogStore
-log = LogStore.add_logger('dmu:fit_plotter')
+log = LogStore.add_logger('dmu:zfit_plotter')
 #----------------------------------------
 class ZFitPlotter:
     '''
     Class used to plot fits done with zfit
     '''
+    #----------------------------------------
     def __init__(self, data=None, model=None, weights=None, result=None, suffix=''):
         '''
         obs: zfit space you are using to define the data and model
@@ -51,6 +55,8 @@ class ZFitPlotter:
         self._figsize          = None
         self._leg_loc          = None
+        self.dat_xerr : bool
         # zfit.settings.advanced_warnings['extend_wrapped_extended'] = False
         warnings.filterwarnings("ignore")
     #----------------------------------------
@@ -60,50 +66,90 @@ class ZFitPlotter:
         self._l_def_col = list(mcolors.TABLEAU_COLORS.keys())
     #----------------------------------------
     def _data_to_zdata(self, obs, data, weights):
+        if isinstance(data, zfit.Data):
+            return data
         if isinstance(data, np.ndarray):
             data = zfit.Data.from_numpy (obs=obs, array=data           , weights=weights)
         elif isinstance(data, pd.Series):
             data = zfit.Data.from_pandas(obs=obs, df=pd.DataFrame(data), weights=weights)
         elif isinstance(data, pd.DataFrame):
             data = zfit.Data.from_pandas(obs=obs, df=data              , weights=weights)
-        elif isinstance(data, zfit.data.Data):
-            data = data
         else:
-            log.error(f'Passed data is of usupported type {type(data)}')
-            raise
+            raise ValueError(f'Passed data is of usupported type {type(data)}')
         return data
     #----------------------------------------
-    def _get_errors(self, nbins=100, l_range=None):
-        dat, wgt  = self._get_range_data(l_range, blind=False)
-        data_hist = hist.Hist.new.Regular(nbins, self.lower, self.upper, name=self.obs.obs[0], underflow=False, overflow=False)
+    def _get_errors(
+            self,
+            nbins  : int = 100,
+            l_range: list[tuple[float,float]]|None = None) -> list[float]:
+        '''
+        Parameters
+        ---------------------
+        nbins  : Number of bins
+        l_range: List of ranges where data should be picked, if None, will pick full range
+        Returns
+        ---------------------
+        list of errors associated to histogram filled with data
+        '''
+        dat, wgt  = self._get_range_data(l_range=l_range, blind=False)
+        data_hist = hist.Hist.new.Regular(
+                nbins,
+                self.lower,
+                self.upper,
+                name       =self.obs.obs[0],
+                underflow  =False,
+                overflow   =False)
         data_hist = data_hist.Weight()
         data_hist.fill(dat, weight=wgt)
         tmp_fig, tmp_ax = plt.subplots()
-        errorbars = mplhep.histplot(
+        errorbars       = mplhep.histplot(
             data_hist,
-            yerr=True,
-            color='white',
-            histtype="errorbar",
-            label=None,
-            ax=tmp_ax,
-        )
-        plt.close(tmp_fig)
-        lines = errorbars[0].errorbar[2]
-        segs = lines[0].get_segments()
+            yerr     =True,
+            color    ='white',
+            histtype ='errorbar',
+            label    =None,
+            ax       =tmp_ax)
+        lines  = errorbars[0].errorbar[2]
+        segs   = lines[0].get_segments()
         values = data_hist.values()
         l_error=[]
         for i in range(nbins):
-            low =  values[i] - segs[i][0][1]
-            up  = -values[i] + segs[i][1][1]
+            seg = segs[i]
+            val = values[i]
+            try:
+                low =  val - seg[0][1]
+                up  = -val + seg[1][1]
+            except IndexError as exc:
+                raise IndexError(f'Cannot read the upper/lower errors, found {seg}') from exc
             l_error.append((low, up))
+        plt.close(tmp_fig)
         return l_error
     #----------------------------------------
-    def _get_range_data(self, l_range, blind=True):
+    def _get_range_data(
+            self,
+            l_range : list[tuple[float,float]]|None,
+            blind   : bool =True) -> tuple[np.ndarray, np.ndarray]:
+        '''
+        Parameters
+        -----------------
+        l_range: List of ranges, i.e. tuples of bounds
+        blind  : If true (default) will blind the range specified, i.e. will exclude it
+        Returns
+        -----------------
+        Tuple with two numpy arrays defined in those ranges, with the observable and the weights.
+        '''
         sdat  = self.data_np
         swgt  = self.data_weight_np
         dmat  = np.array([sdat, swgt]).T
@@ -115,6 +161,8 @@ class ZFitPlotter:
         if l_range is None:
             [dat, wgt] = dmat.T
+            self._check_data(dat=dat, wgt=wgt)
             return dat, wgt
         l_dat = []
@@ -130,23 +178,42 @@ class ZFitPlotter:
         dat_f = np.concatenate(l_dat)
         wgt_f = np.concatenate(l_wgt)
+        self._check_data(dat=dat_f, wgt=wgt_f)
         return dat_f, wgt_f
     #----------------------------------------
+    def _check_data(
+            self,
+            dat : np.ndarray,
+            wgt : np.ndarray) -> None:
+        '''
+        Checks for empty data, etc
+        Parameters
+        ------------
+        Numpy arrays with data and weights
+        '''
+        if dat.shape != wgt.shape:
+            raise ValueError(f'Shapes or data and weights differ: {dat.shape}/{wgt.shape}')
+        if len(dat) == 0:
+            raise ValueError('Dataset is empty')
+    #----------------------------------------
     def _plot_data(self, ax, nbins=100, l_range=None):
         dat, wgt  = self._get_range_data(l_range, blind=True)
         data_hist = hist.Hist.new.Regular(nbins, self.lower, self.upper, name=self.obs.obs[0], underflow=False, overflow=False)
         data_hist = data_hist.Weight()
         data_hist.fill(dat, weight=wgt)
-        _ = mplhep.histplot(
-            data_hist,
-            yerr=True,
-            color="black",
-            histtype="errorbar",
-            label=self._leg.get("Data", "Data"),
-            ax=ax,
-            xerr=self.dat_xerr
-        )
+        mplhep.histplot(
+                data_hist,
+                yerr     = True,
+                color    = 'black',
+                histtype = 'errorbar',
+                label    = self._leg.get('Data', 'Data'),
+                ax       = ax,
+                xerr     = self.dat_xerr)
     #----------------------------------------
     def _pull_hist(self, pdf_hist, nbins, data_yield, l_range=None):
         pdf_values= pdf_hist.values()
@@ -168,8 +235,16 @@ class ZFitPlotter:
             err = low if res > 0 else up
             pul = res / err
-            if abs(pul) > 5:
-                log.warning(f'Large pull: {pul:.1f}=({dat_val:.0f}-{pdf_val:.0f})/{err:.0f}')
+            # If the data is weighted
+            # and the data does not exist
+            # The pulls will have an error of zero => pull is inf
+            # Ignore these cases
+            if math.isinf(pul):
+                pass
+            elif abs(pul) > 5:
+                log.info(f'Pull: {pul:.2f}=({dat_val:.2f}-{pdf_val:.2f})/{err:.2f}')
+            else:
+                log.debug(f'Pull: {pul:.2f}=({dat_val:.2f}-{pdf_val:.2f})/{err:.2f}')
             pulls.append(pul)
             pull_errors[0].append(low / err)
@@ -200,7 +275,7 @@ class ZFitPlotter:
     #----------------------------------------
     def _get_zfit_gof(self):
         if not hasattr(self._result, 'gof'):
-            return
+            return None
         chi2, ndof, pval = self._result.gof
@@ -211,14 +286,16 @@ class ZFitPlotter:
     def _get_text(self, ext_text):
         gof_text = self._get_zfit_gof()
-        if   ext_text is     None and gof_text is     None:
-            return
-        elif ext_text is not None and gof_text is     None:
+        if ext_text is     None and gof_text is     None:
+            return None
+        if ext_text is not None and gof_text is     None:
             return ext_text
-        elif ext_text is     None and gof_text is not None:
+        if ext_text is     None and gof_text is not None:
             return gof_text
-        else:
-            return f'{ext_text}\n{gof_text}'
+        return f'{ext_text}\n{gof_text}'
     #----------------------------------------
     def _get_pars(self):
         '''
@@ -237,8 +314,8 @@ class ZFitPlotter:
                 val = d_val['value']
                 name= par if isinstance(par, str) else par.name
                 try:
-                    err = d_val['hesse']['error']
-                except:
+                    err = d_val['minuit_hesse']['error']
+                except KeyError:
                     log.warning(f'Cannot extract {name} Hesse errors, using zeros')
                     pprint.pprint(d_val)
                     err = 0
@@ -260,7 +337,7 @@ class ZFitPlotter:
         '''
         d_par = self._get_pars()
-        line = f''
+        line = ''
         for name, [val, err] in d_par.items():
             if add_pars != 'all' and name not in add_pars:
                 continue
@@ -328,7 +405,7 @@ class ZFitPlotter:
             nevt = self._get_component_yield(model, par)
             if   model.name in self._l_plot_components and     hasattr(model, 'pdfs'):
-                l_model = [ (frc, pdf) for pdf, frc in zip(model.pdfs, model.params.values()) ]
+                l_model = [ (frc, pdf) for pdf, frc in zip(model.pdfs, model.params.values()) ]
             elif model.name in self._l_plot_components and not hasattr(model, 'pdfs'):
                 log.warning(f'Cannot plot {model.name} as separate components, despite it was requested')
                 l_model = [ (1, model)]
@@ -344,27 +421,51 @@ class ZFitPlotter:
             if stacked:
                 ax.fill_between(self.x, y, alpha=1.0, label=self._leg.get(name, name), color=self._get_col(name))
             else:
-                ax.plot(self.x, y, '-',               label=self._leg.get(name, name), color=self._col.get(name))
+                ax.plot(self.x, y, ':',               label=self._leg.get(name, name), color=self._col.get(name))
         if (blind_name is not None) and (was_blinded is False):
-            log.error(f'Blinding was requested, but PDF {blind_name} was not found among:')
             for model in self.total_model.pdfs:
                 log.info(model.name)
-            raise
+            raise ValueError(f'Blinding was requested, but PDF {blind_name} was not found among:')
     #----------------------------------------
     def _get_col(self, name):
         if name in self._col:
             return self._col[name]
         col = self._l_def_col[0]
-        del(self._l_def_col[0])
+        del self._l_def_col[0]
         return col
     #----------------------------------------
+    def _print_data(self) -> None:
+        log.info(f'Data shape  : {self.data_np.shape}')
+        log.info(f'Weights shape: {self.data_weight_np.shape}')
+        nnans = np.sum(np.isnan(self.data_np))
+        log.info(f'NaNs: {nnans}')
+        # This function will run before program raises
+        # One should be able to drop any plot
+        plt.close('all')
+        plt.hist(self.data_np, weights=self.data_weight_np)
+        plt.show()
+    #----------------------------------------
+    def _evaluate_pdf(self, pdf : zpdf) -> np.ndarray:
+        try:
+            arr_y = pdf.pdf(self.x)
+        except tf.errors.InvalidArgumentError as exc:
+            log.info(f'X values: {self.x}')
+            self._print_data()
+            raise ValueError('Cannot evaluate PDF') from exc
+        return arr_y
+    #----------------------------------------
     def _plot_sub_components(self, y, nbins, stacked, nevt, l_model):
         l_y = []
         for frc, model in l_model:
-            this_y = model.pdf(self.x) * nevt * frc / nbins * (self.upper - self.lower)
+            arr_y  = self._evaluate_pdf(pdf = model)
+            this_y = arr_y * nevt * frc / nbins * (self.upper - self.lower)
             if stacked:
                 y = this_y if y is None else y + this_y
@@ -381,7 +482,13 @@ class ZFitPlotter:
             return
         data_yield = self.data_weight_np.sum()
-        y = model.pdf(self.x) * data_yield / nbins * (self.upper - self.lower)
+        try:
+            arr_y = self._evaluate_pdf(model)
+            y     = arr_y * data_yield / nbins * (self.upper - self.lower)
+        except tf.errors.InvalidArgumentError as exc:
+            log.warning(f'Data yield: {data_yield:.0f}')
+            log.info(self.data_np)
+            raise RuntimeError('Cannot parse PDF') from exc
         name = model.name
         ax.plot(self.x, y, linestyle, label=self._leg.get(name, name), color=self._col.get(name))
@@ -392,7 +499,7 @@ class ZFitPlotter:
         if ylabel == "":
             width  = (self.upper-self.lower)/nbins
-            ylabel = f'Candidates / ({width:.3f} {unit})'
+            ylabel = f'Candidates / ({width:.0f} {unit})'
         return xlabel, ylabel
     #----------------------------------------
@@ -400,9 +507,8 @@ class ZFitPlotter:
         if plot_range is not None:
             try:
                 self.lower, self.upper = plot_range
-            except TypeError:
-                log.error(f'plot_range argument is expected to be a tuple with two numeric values')
-                raise TypeError
+            except TypeError as exc:
+                raise TypeError('plot_range argument is expected to be a tuple with two numeric values') from exc
         return np.linspace(self.lower, self.upper, 2000)
     #----------------------------------------
@@ -439,6 +545,8 @@ class ZFitPlotter:
             add_pars          = None,
             ymax              = None,
             skip_pulls        = False,
+            pull_styling :bool= True,
+            yscale : str      = None,
             axs               = None,
             figsize:tuple     = (13, 7),
             leg_loc:str       = 'best',
@@ -455,6 +563,7 @@ class ZFitPlotter:
         d_leg                 : Customize legend
         d_col                 : Customize color
         plot_range            : Set plot_range
+        pull_styling(bool)    : Will add lines at +/-3 and set range to +/-5 for pull plots, by default True
         plot_components (list): List of strings, with names of PDFs, which are expected to be sums of PDFs and whose components should be plotted separately
         ext_text              : Text that can be added to plot
         add_pars (list|str)   : List of names of parameters to be added or string with value 'all' to add all fit parameters. If this is used, plot won't use LHCb style.
@@ -464,6 +573,7 @@ class ZFitPlotter:
         figsize (tuple)       : Tuple with figure size, default (13, 7)
         leg_loc (str)         : Location of legend, default 'best'
         xerr (bool or float)  : Used to pass xerr to mplhep histplot. True will use error with bin size, False, no error, otherwise it's the size of the xerror bar
+        yscale (str)          : Scale for y axis of main plot, either log or linear
         '''
         # pylint: disable=too-many-locals, too-many-positional-arguments, too-many-arguments
         d_leg           = {} if           d_leg is None else d_leg
@@ -512,6 +622,9 @@ class ZFitPlotter:
         self.axs[0].set(xlabel=xlabel, ylabel=ylabel)
         self.axs[0].set_xlim([self.lower, self.upper])
+        if yscale is not None:
+            self.axs[0].set_yscale(yscale)
         if title is not None:
             self.axs[0].set_title(title)
@@ -524,4 +637,10 @@ class ZFitPlotter:
         for ax in self.axs:
             ax.label_outer()
+        if pull_styling and not skip_pulls:
+            self.axs[1].axhline(y=-3, color='red' , linestyle='-', lw=2)
+            self.axs[1].axhline(y= 0, color='gray', linestyle='-', lw=1)
+            self.axs[1].axhline(y=+3, color='red' , linestyle='-', lw=2)
+            self.axs[1].set_ylim(-5, 5)
 #----------------------------------------

dmu/testing/utilities.py CHANGED Viewed

@@ -3,16 +3,21 @@ Module containing utility functions needed by unit tests
 '''
 import os
 import math
+import glob
 from typing              import Union
 from dataclasses         import dataclass
 from importlib.resources import files
 from ROOT import RDF, TFile, RDataFrame
+import uproot
+import joblib
 import pandas as pnd
 import numpy
 import yaml
+from dmu.ml.train_mva      import TrainMva
+from dmu.ml.cv_classifier  import CVClassifier
 from dmu.logging.log_store import LogStore
 log = LogStore.add_logger('dmu:testing:utilities')
@@ -22,6 +27,15 @@ class Data:
     '''
     Class storing shared data
     '''
+    out_dir = '/tmp/tests/dmu/ml/cv_predict'
+    d_col   = {
+            'main' : ['index', 'a0', 'b0'],
+            'frn1' : ['index', 'a1', 'b1'],
+            'frn2' : ['index', 'a2', 'b2'],
+            'frn3' : ['index', 'a3', 'b3'],
+            'frn4' : ['index', 'a4', 'b4'],
+            }
 # -------------------------------
 def _double_data(df_1 : pnd.DataFrame) -> pnd.DataFrame:
     df_2   = df_1.copy()
@@ -39,7 +53,7 @@ def _add_nans(df : pnd.DataFrame, columns : list[str]) -> pnd.DataFrame:
     else:
         l_col_index = [ l_col.index(column) for column in columns ]
-    log.debug('Replacing randomly with {size} NaNs')
+    log.debug(f'Replacing randomly with {size} NaNs')
     for _ in range(size):
         irow = numpy.random.randint(0, df.shape[0])      # Random row index
         icol = numpy.random.choice(l_col_index)      # Random column index
@@ -48,25 +62,39 @@ def _add_nans(df : pnd.DataFrame, columns : list[str]) -> pnd.DataFrame:
     return df
 # -------------------------------
-def get_rdf(kind : Union[str,None] = None,
-            repeated : bool        = False,
-            nentries : int         = 3_000,
-            add_nans : list[str]   = None):
+def get_rdf(
+        kind              : Union[str,None] = None,
+        repeated          : bool        = False,
+        nentries          : int         = 3_000,
+        use_preffix       : bool        = False,
+        columns_with_nans : list[str]   = None):
     '''
     Return ROOT dataframe with toy data
+    kind              : sig, bkg or bkg_alt
+    repeated          : Will add repeated rows
+    nentries          : Number of rows
+    columns_with_nans : List of column names in [w, y, z]
     '''
+    # Needed for a specific test
+    xnm = 'preffix.x.suffix' if use_preffix else 'x'
     d_data = {}
     if   kind == 'sig':
-        d_data['w'] = numpy.random.normal(0, 1, size=nentries)
-        d_data['x'] = numpy.random.normal(0, 1, size=nentries)
-        d_data['y'] = numpy.random.normal(0, 1, size=nentries)
-        d_data['z'] = numpy.random.normal(0, 1, size=nentries)
+        d_data[xnm] = numpy.random.normal(0.0, 1.0, size=nentries)
+        d_data['w'] = numpy.random.normal(0.0, 1.0, size=nentries)
+        d_data['y'] = numpy.random.normal(0.0, 1.0, size=nentries)
+        d_data['z'] = numpy.random.normal(0.0, 1.0, size=nentries)
     elif kind == 'bkg':
-        d_data['w'] = numpy.random.normal(1, 1, size=nentries)
-        d_data['x'] = numpy.random.normal(1, 1, size=nentries)
-        d_data['y'] = numpy.random.normal(1, 1, size=nentries)
-        d_data['z'] = numpy.random.normal(1, 1, size=nentries)
+        d_data[xnm] = numpy.random.normal(1.0, 1.0, size=nentries)
+        d_data['w'] = numpy.random.normal(1.0, 1.0, size=nentries)
+        d_data['y'] = numpy.random.normal(1.0, 1.0, size=nentries)
+        d_data['z'] = numpy.random.normal(1.0, 1.0, size=nentries)
+    elif kind == 'bkg_alt':
+        d_data[xnm] = numpy.random.normal(1.3, 1.3, size=nentries)
+        d_data['w'] = numpy.random.normal(1.3, 1.3, size=nentries)
+        d_data['y'] = numpy.random.normal(1.3, 1.3, size=nentries)
+        d_data['z'] = numpy.random.normal(1.3, 1.3, size=nentries)
     else:
         log.error(f'Invalid kind: {kind}')
         raise ValueError
@@ -76,8 +104,8 @@ def get_rdf(kind : Union[str,None] = None,
     if repeated:
         df = _double_data(df)
-    if add_nans:
-        df = _add_nans(df, columns=add_nans)
+    if columns_with_nans is not None:
+        df = _add_nans(df, columns=columns_with_nans)
     rdf = RDF.FromPandas(df)
@@ -126,3 +154,80 @@ def get_file_with_trees(path : str) -> TFile:
         snap.fMode  = 'update'
     return TFile(path)
+# -------------------------------
+def get_models(
+        rdf_sig : RDataFrame,
+        rdf_bkg : RDataFrame,
+        name    : str        = 'train_mva',
+        out_dir : str | None = None) -> tuple[list[CVClassifier], float]:
+    '''
+    Will train and return models together with the AUC in a tuple
+    rdf_xxx : Signal or background dataframe used for training
+    name    : Name of config file, e.g. train_mva
+    out_dir : Directory where the training output will go, optional.
+    '''
+    out_dir = Data.out_dir if out_dir is None else out_dir
+    cfg                     = get_config(f'ml/tests/{name}.yaml')
+    cfg['saving']['output'] = out_dir
+    obj = TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
+    auc = obj.run()
+    pkl_wc     = f'{out_dir}/model*.pkl'
+    l_pkl_path = glob.glob(pkl_wc)
+    l_model    = [ joblib.load(pkl_path) for pkl_path in l_pkl_path ]
+    return l_model, auc
+# -------------------------------
+def _make_file(
+        fpath    : str,
+        tree     : str,
+        nentries : int) -> None:
+    fdir       = os.path.dirname(fpath)
+    sample     = os.path.basename(fdir)
+    l_col_name = Data.d_col[sample]
+    data       = {}
+    for col_name in l_col_name:
+        if col_name == 'index':
+            data[col_name] = numpy.arange(nentries)
+            continue
+        data[col_name] = numpy.random.normal(0, 1, nentries)
+    with uproot.recreate(fpath) as ofile:
+        log.debug(f'Saving to: {fpath}:{tree}')
+        ofile[tree] = data
+# -------------------------------
+def build_friend_structure(file_name : str, nentries : int) -> None:
+    '''
+    Will load YAML file with file structure needed to
+    test code that relies on friend trees, e.g. DDFGetter
+    Parameters:
+    -------------------
+    file_name (str): Name of YAML file with wanted structure, e.g. friends.yaml
+    nentries (int) : Number of entries in file
+    '''
+    cfg_path = files('dmu_data').joinpath(f'rfile/{file_name}')
+    with open(cfg_path, encoding='utf=8') as ifile:
+        data = yaml.safe_load(ifile)
+    if 'tree' not in data:
+        raise ValueError('tree entry missing in: {cfg_path}')
+    tree_name = data['tree']
+    if 'samples' not in data:
+        raise ValueError('Samples section missing in: {cfg_path}')
+    if 'files' not in data:
+        raise ValueError('Files section missing in: {cfg_path}')
+    for fdir in data['samples']:
+        for fname in data['files']:
+            path = f'{fdir}/{fname}'
+            _make_file(fpath=path, tree=tree_name, nentries=nentries)
+# ----------------------------------------------

dmu/workflow/__init__.py ADDED Viewed

File without changes

data-manipulation-utilities 0.2.6__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

data-manipulation-utilities 0.2.6__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl