PyPI - data-manipulation-utilities - Versions diffs - 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl - Mend

data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/METADATA +641 -44
data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD +93 -0
{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/WHEEL +1 -1
{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/entry_points.txt +1 -0
dmu/__init__.py +0 -0
dmu/generic/hashing.py +34 -8
dmu/generic/utilities.py +164 -11
dmu/logging/log_store.py +34 -2
dmu/logging/messages.py +96 -0
dmu/ml/cv_classifier.py +3 -3
dmu/ml/cv_diagnostics.py +3 -0
dmu/ml/cv_performance.py +58 -0
dmu/ml/cv_predict.py +149 -46
dmu/ml/train_mva.py +482 -100
dmu/ml/utilities.py +29 -10
dmu/pdataframe/utilities.py +28 -3
dmu/plotting/fwhm.py +2 -2
dmu/plotting/matrix.py +1 -1
dmu/plotting/plotter.py +23 -3
dmu/plotting/plotter_1d.py +96 -32
dmu/plotting/plotter_2d.py +5 -0
dmu/rdataframe/utilities.py +54 -3
dmu/rfile/ddfgetter.py +102 -0
dmu/stats/fit_stats.py +129 -0
dmu/stats/fitter.py +55 -22
dmu/stats/gof_calculator.py +7 -0
dmu/stats/model_factory.py +153 -62
dmu/stats/parameters.py +100 -0
dmu/stats/utilities.py +443 -12
dmu/stats/wdata.py +187 -0
dmu/stats/zfit.py +17 -0
dmu/stats/zfit_plotter.py +147 -36
dmu/testing/utilities.py +102 -24
dmu/workflow/__init__.py +0 -0
dmu/workflow/cache.py +266 -0
dmu_data/ml/tests/train_mva.yaml +9 -7
dmu_data/ml/tests/train_mva_def.yaml +75 -0
dmu_data/ml/tests/train_mva_with_diagnostics.yaml +10 -5
dmu_data/ml/tests/train_mva_with_preffix.yaml +58 -0
dmu_data/plotting/tests/2d.yaml +5 -5
dmu_data/plotting/tests/line.yaml +15 -0
dmu_data/plotting/tests/styling.yaml +8 -1
dmu_data/rfile/friends.yaml +13 -0
dmu_data/stats/fitter/test_simple.yaml +28 -0
dmu_data/stats/kde_optimizer/control.json +1 -0
dmu_data/stats/kde_optimizer/signal.json +1 -0
dmu_data/stats/parameters/data.yaml +178 -0
dmu_data/tests/config.json +6 -0
dmu_data/tests/config.yaml +4 -0
dmu_data/tests/pdf_to_tex.txt +34 -0
dmu_scripts/kerberos/check_expiration +21 -0
dmu_scripts/kerberos/convert_certificate +22 -0
dmu_scripts/ml/compare_classifiers.py +85 -0
data_manipulation_utilities-0.2.7.dist-info/RECORD +0 -69
{data_manipulation_utilities-0.2.7.data → data_manipulation_utilities-0.2.8.dev714.data}/scripts/publish +0 -0
{data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/top_level.txt +0 -0

dmu/stats/zfit_plotter.py CHANGED Viewed

@@ -3,25 +3,29 @@ Module containing plot class, used to plot fits
 '''
 # pylint: disable=too-many-instance-attributes, too-many-arguments
+import math
 import warnings
 import pprint
 import zfit
 import hist
 import mplhep
+import tensorflow            as tf
 import pandas                as pd
 import numpy                 as np
 import matplotlib.pyplot     as plt
-import dmu.generic.utilities as gut
+from zfit.core.basepdf      import BasePDF    as zpdf
+import dmu.generic.utilities as gut
 from dmu.logging.log_store  import LogStore
-log = LogStore.add_logger('dmu:fit_plotter')
+log = LogStore.add_logger('dmu:zfit_plotter')
 #----------------------------------------
 class ZFitPlotter:
     '''
     Class used to plot fits done with zfit
     '''
+    #----------------------------------------
     def __init__(self, data=None, model=None, weights=None, result=None, suffix=''):
         '''
         obs: zfit space you are using to define the data and model
@@ -62,7 +66,7 @@ class ZFitPlotter:
         self._l_def_col = list(mcolors.TABLEAU_COLORS.keys())
     #----------------------------------------
     def _data_to_zdata(self, obs, data, weights):
-        if isinstance(data, zfit.data.Data):
+        if isinstance(data, zfit.Data):
             return data
         if isinstance(data, np.ndarray):
@@ -76,36 +80,76 @@ class ZFitPlotter:
         return data
     #----------------------------------------
-    def _get_errors(self, nbins=100, l_range=None):
-        dat, wgt  = self._get_range_data(l_range, blind=False)
-        data_hist = hist.Hist.new.Regular(nbins, self.lower, self.upper, name=self.obs.obs[0], underflow=False, overflow=False)
+    def _get_errors(
+            self,
+            nbins  : int = 100,
+            l_range: list[tuple[float,float]]|None = None) -> list[float]:
+        '''
+        Parameters
+        ---------------------
+        nbins  : Number of bins
+        l_range: List of ranges where data should be picked, if None, will pick full range
+        Returns
+        ---------------------
+        list of errors associated to histogram filled with data
+        '''
+        dat, wgt  = self._get_range_data(l_range=l_range, blind=False)
+        data_hist = hist.Hist.new.Regular(
+                nbins,
+                self.lower,
+                self.upper,
+                name       =self.obs.obs[0],
+                underflow  =False,
+                overflow   =False)
         data_hist = data_hist.Weight()
         data_hist.fill(dat, weight=wgt)
         tmp_fig, tmp_ax = plt.subplots()
-        errorbars = mplhep.histplot(
+        errorbars       = mplhep.histplot(
             data_hist,
-            yerr=True,
-            color='white',
-            histtype="errorbar",
-            label=None,
-            ax=tmp_ax,
-        )
-        plt.close(tmp_fig)
-        lines = errorbars[0].errorbar[2]
-        segs = lines[0].get_segments()
+            yerr     =True,
+            color    ='white',
+            histtype ='errorbar',
+            label    =None,
+            ax       =tmp_ax)
+        lines  = errorbars[0].errorbar[2]
+        segs   = lines[0].get_segments()
         values = data_hist.values()
         l_error=[]
         for i in range(nbins):
-            low =  values[i] - segs[i][0][1]
-            up  = -values[i] + segs[i][1][1]
+            seg = segs[i]
+            val = values[i]
+            try:
+                low =  val - seg[0][1]
+                up  = -val + seg[1][1]
+            except IndexError as exc:
+                raise IndexError(f'Cannot read the upper/lower errors, found {seg}') from exc
             l_error.append((low, up))
+        plt.close(tmp_fig)
         return l_error
     #----------------------------------------
-    def _get_range_data(self, l_range, blind=True):
+    def _get_range_data(
+            self,
+            l_range : list[tuple[float,float]]|None,
+            blind   : bool =True) -> tuple[np.ndarray, np.ndarray]:
+        '''
+        Parameters
+        -----------------
+        l_range: List of ranges, i.e. tuples of bounds
+        blind  : If true (default) will blind the range specified, i.e. will exclude it
+        Returns
+        -----------------
+        Tuple with two numpy arrays defined in those ranges, with the observable and the weights.
+        '''
         sdat  = self.data_np
         swgt  = self.data_weight_np
         dmat  = np.array([sdat, swgt]).T
@@ -117,6 +161,8 @@ class ZFitPlotter:
         if l_range is None:
             [dat, wgt] = dmat.T
+            self._check_data(dat=dat, wgt=wgt)
             return dat, wgt
         l_dat = []
@@ -132,23 +178,42 @@ class ZFitPlotter:
         dat_f = np.concatenate(l_dat)
         wgt_f = np.concatenate(l_wgt)
+        self._check_data(dat=dat_f, wgt=wgt_f)
         return dat_f, wgt_f
     #----------------------------------------
+    def _check_data(
+            self,
+            dat : np.ndarray,
+            wgt : np.ndarray) -> None:
+        '''
+        Checks for empty data, etc
+        Parameters
+        ------------
+        Numpy arrays with data and weights
+        '''
+        if dat.shape != wgt.shape:
+            raise ValueError(f'Shapes or data and weights differ: {dat.shape}/{wgt.shape}')
+        if len(dat) == 0:
+            raise ValueError('Dataset is empty')
+    #----------------------------------------
     def _plot_data(self, ax, nbins=100, l_range=None):
         dat, wgt  = self._get_range_data(l_range, blind=True)
         data_hist = hist.Hist.new.Regular(nbins, self.lower, self.upper, name=self.obs.obs[0], underflow=False, overflow=False)
         data_hist = data_hist.Weight()
         data_hist.fill(dat, weight=wgt)
-        _ = mplhep.histplot(
-            data_hist,
-            yerr=True,
-            color="black",
-            histtype="errorbar",
-            label=self._leg.get("Data", "Data"),
-            ax=ax,
-            xerr=self.dat_xerr
-        )
+        mplhep.histplot(
+                data_hist,
+                yerr     = True,
+                color    = 'black',
+                histtype = 'errorbar',
+                label    = self._leg.get('Data', 'Data'),
+                ax       = ax,
+                xerr     = self.dat_xerr)
     #----------------------------------------
     def _pull_hist(self, pdf_hist, nbins, data_yield, l_range=None):
         pdf_values= pdf_hist.values()
@@ -170,8 +235,16 @@ class ZFitPlotter:
             err = low if res > 0 else up
             pul = res / err
-            if abs(pul) > 5:
-                log.warning(f'Large pull: {pul:.1f}=({dat_val:.0f}-{pdf_val:.0f})/{err:.0f}')
+            # If the data is weighted
+            # and the data does not exist
+            # The pulls will have an error of zero => pull is inf
+            # Ignore these cases
+            if math.isinf(pul):
+                pass
+            elif abs(pul) > 5:
+                log.info(f'Pull: {pul:.2f}=({dat_val:.2f}-{pdf_val:.2f})/{err:.2f}')
+            else:
+                log.debug(f'Pull: {pul:.2f}=({dat_val:.2f}-{pdf_val:.2f})/{err:.2f}')
             pulls.append(pul)
             pull_errors[0].append(low / err)
@@ -241,7 +314,7 @@ class ZFitPlotter:
                 val = d_val['value']
                 name= par if isinstance(par, str) else par.name
                 try:
-                    err = d_val['hesse']['error']
+                    err = d_val['minuit_hesse']['error']
                 except KeyError:
                     log.warning(f'Cannot extract {name} Hesse errors, using zeros')
                     pprint.pprint(d_val)
@@ -348,7 +421,7 @@ class ZFitPlotter:
             if stacked:
                 ax.fill_between(self.x, y, alpha=1.0, label=self._leg.get(name, name), color=self._get_col(name))
             else:
-                ax.plot(self.x, y, '-',               label=self._leg.get(name, name), color=self._col.get(name))
+                ax.plot(self.x, y, ':',               label=self._leg.get(name, name), color=self._col.get(name))
         if (blind_name is not None) and (was_blinded is False):
             for model in self.total_model.pdfs:
@@ -365,10 +438,34 @@ class ZFitPlotter:
         return col
     #----------------------------------------
+    def _print_data(self) -> None:
+        log.info(f'Data shape  : {self.data_np.shape}')
+        log.info(f'Weights shape: {self.data_weight_np.shape}')
+        nnans = np.sum(np.isnan(self.data_np))
+        log.info(f'NaNs: {nnans}')
+        # This function will run before program raises
+        # One should be able to drop any plot
+        plt.close('all')
+        plt.hist(self.data_np, weights=self.data_weight_np)
+        plt.show()
+    #----------------------------------------
+    def _evaluate_pdf(self, pdf : zpdf) -> np.ndarray:
+        try:
+            arr_y = pdf.pdf(self.x)
+        except tf.errors.InvalidArgumentError as exc:
+            log.info(f'X values: {self.x}')
+            self._print_data()
+            raise ValueError('Cannot evaluate PDF') from exc
+        return arr_y
+    #----------------------------------------
     def _plot_sub_components(self, y, nbins, stacked, nevt, l_model):
         l_y = []
         for frc, model in l_model:
-            this_y = model.pdf(self.x) * nevt * frc / nbins * (self.upper - self.lower)
+            arr_y  = self._evaluate_pdf(pdf = model)
+            this_y = arr_y * nevt * frc / nbins * (self.upper - self.lower)
             if stacked:
                 y = this_y if y is None else y + this_y
@@ -385,7 +482,13 @@ class ZFitPlotter:
             return
         data_yield = self.data_weight_np.sum()
-        y = model.pdf(self.x) * data_yield / nbins * (self.upper - self.lower)
+        try:
+            arr_y = self._evaluate_pdf(model)
+            y     = arr_y * data_yield / nbins * (self.upper - self.lower)
+        except tf.errors.InvalidArgumentError as exc:
+            log.warning(f'Data yield: {data_yield:.0f}')
+            log.info(self.data_np)
+            raise RuntimeError('Cannot parse PDF') from exc
         name = model.name
         ax.plot(self.x, y, linestyle, label=self._leg.get(name, name), color=self._col.get(name))
@@ -396,7 +499,7 @@ class ZFitPlotter:
         if ylabel == "":
             width  = (self.upper-self.lower)/nbins
-            ylabel = f'Candidates / ({width:.3f} {unit})'
+            ylabel = f'Candidates / ({width:.0f} {unit})'
         return xlabel, ylabel
     #----------------------------------------
@@ -442,6 +545,7 @@ class ZFitPlotter:
             add_pars          = None,
             ymax              = None,
             skip_pulls        = False,
+            pull_styling :bool= True,
             yscale : str      = None,
             axs               = None,
             figsize:tuple     = (13, 7),
@@ -459,6 +563,7 @@ class ZFitPlotter:
         d_leg                 : Customize legend
         d_col                 : Customize color
         plot_range            : Set plot_range
+        pull_styling(bool)    : Will add lines at +/-3 and set range to +/-5 for pull plots, by default True
         plot_components (list): List of strings, with names of PDFs, which are expected to be sums of PDFs and whose components should be plotted separately
         ext_text              : Text that can be added to plot
         add_pars (list|str)   : List of names of parameters to be added or string with value 'all' to add all fit parameters. If this is used, plot won't use LHCb style.
@@ -532,4 +637,10 @@ class ZFitPlotter:
         for ax in self.axs:
             ax.label_outer()
+        if pull_styling and not skip_pulls:
+            self.axs[1].axhline(y=-3, color='red' , linestyle='-', lw=2)
+            self.axs[1].axhline(y= 0, color='gray', linestyle='-', lw=1)
+            self.axs[1].axhline(y=+3, color='red' , linestyle='-', lw=2)
+            self.axs[1].set_ylim(-5, 5)
 #----------------------------------------

dmu/testing/utilities.py CHANGED Viewed

@@ -10,6 +10,7 @@ from importlib.resources import files
 from ROOT import RDF, TFile, RDataFrame
+import uproot
 import joblib
 import pandas as pnd
 import numpy
@@ -27,6 +28,14 @@ class Data:
     Class storing shared data
     '''
     out_dir = '/tmp/tests/dmu/ml/cv_predict'
+    d_col   = {
+            'main' : ['index', 'a0', 'b0'],
+            'frn1' : ['index', 'a1', 'b1'],
+            'frn2' : ['index', 'a2', 'b2'],
+            'frn3' : ['index', 'a3', 'b3'],
+            'frn4' : ['index', 'a4', 'b4'],
+            }
 # -------------------------------
 def _double_data(df_1 : pnd.DataFrame) -> pnd.DataFrame:
     df_2   = df_1.copy()
@@ -53,25 +62,39 @@ def _add_nans(df : pnd.DataFrame, columns : list[str]) -> pnd.DataFrame:
     return df
 # -------------------------------
-def get_rdf(kind : Union[str,None] = None,
-            repeated : bool        = False,
-            nentries : int         = 3_000,
-            columns_with_nans : list[str] = None):
+def get_rdf(
+        kind              : Union[str,None] = None,
+        repeated          : bool        = False,
+        nentries          : int         = 3_000,
+        use_preffix       : bool        = False,
+        columns_with_nans : list[str]   = None):
     '''
     Return ROOT dataframe with toy data
+    kind              : sig, bkg or bkg_alt
+    repeated          : Will add repeated rows
+    nentries          : Number of rows
+    columns_with_nans : List of column names in [w, y, z]
     '''
+    # Needed for a specific test
+    xnm = 'preffix.x.suffix' if use_preffix else 'x'
     d_data = {}
     if   kind == 'sig':
-        d_data['w'] = numpy.random.normal(0, 1, size=nentries)
-        d_data['x'] = numpy.random.normal(0, 1, size=nentries)
-        d_data['y'] = numpy.random.normal(0, 1, size=nentries)
-        d_data['z'] = numpy.random.normal(0, 1, size=nentries)
+        d_data[xnm] = numpy.random.normal(0.0, 1.0, size=nentries)
+        d_data['w'] = numpy.random.normal(0.0, 1.0, size=nentries)
+        d_data['y'] = numpy.random.normal(0.0, 1.0, size=nentries)
+        d_data['z'] = numpy.random.normal(0.0, 1.0, size=nentries)
     elif kind == 'bkg':
-        d_data['w'] = numpy.random.normal(1, 1, size=nentries)
-        d_data['x'] = numpy.random.normal(1, 1, size=nentries)
-        d_data['y'] = numpy.random.normal(1, 1, size=nentries)
-        d_data['z'] = numpy.random.normal(1, 1, size=nentries)
+        d_data[xnm] = numpy.random.normal(1.0, 1.0, size=nentries)
+        d_data['w'] = numpy.random.normal(1.0, 1.0, size=nentries)
+        d_data['y'] = numpy.random.normal(1.0, 1.0, size=nentries)
+        d_data['z'] = numpy.random.normal(1.0, 1.0, size=nentries)
+    elif kind == 'bkg_alt':
+        d_data[xnm] = numpy.random.normal(1.3, 1.3, size=nentries)
+        d_data['w'] = numpy.random.normal(1.3, 1.3, size=nentries)
+        d_data['y'] = numpy.random.normal(1.3, 1.3, size=nentries)
+        d_data['z'] = numpy.random.normal(1.3, 1.3, size=nentries)
     else:
         log.error(f'Invalid kind: {kind}')
         raise ValueError
@@ -132,24 +155,79 @@ def get_file_with_trees(path : str) -> TFile:
     return TFile(path)
 # -------------------------------
-def get_models(rdf_sig : RDataFrame, rdf_bkg : RDataFrame) -> list[CVClassifier]:
+def get_models(
+        rdf_sig : RDataFrame,
+        rdf_bkg : RDataFrame,
+        name    : str        = 'train_mva',
+        out_dir : str | None = None) -> tuple[list[CVClassifier], float]:
     '''
-    Will train and return models
+    Will train and return models together with the AUC in a tuple
+    rdf_xxx : Signal or background dataframe used for training
+    name    : Name of config file, e.g. train_mva
+    out_dir : Directory where the training output will go, optional.
     '''
+    out_dir = Data.out_dir if out_dir is None else out_dir
-    cfg                   = get_config('ml/tests/train_mva.yaml')
-    pkl_path              = f'{Data.out_dir}/model.pkl'
-    plt_dir               = f'{Data.out_dir}/cv_predict'
-    cfg['saving']['path'] = pkl_path
-    cfg['plotting']['val_dir'] = plt_dir
-    cfg['plotting']['features']['saving']['plt_dir'] = plt_dir
+    cfg                     = get_config(f'ml/tests/{name}.yaml')
+    cfg['saving']['output'] = out_dir
-    obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
-    obj.run()
+    obj = TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
+    auc = obj.run()
-    pkl_wc     = pkl_path.replace('.pkl', '_*.pkl')
+    pkl_wc     = f'{out_dir}/model*.pkl'
     l_pkl_path = glob.glob(pkl_wc)
     l_model    = [ joblib.load(pkl_path) for pkl_path in l_pkl_path ]
-    return l_model
+    return l_model, auc
 # -------------------------------
+def _make_file(
+        fpath    : str,
+        tree     : str,
+        nentries : int) -> None:
+    fdir       = os.path.dirname(fpath)
+    sample     = os.path.basename(fdir)
+    l_col_name = Data.d_col[sample]
+    data       = {}
+    for col_name in l_col_name:
+        if col_name == 'index':
+            data[col_name] = numpy.arange(nentries)
+            continue
+        data[col_name] = numpy.random.normal(0, 1, nentries)
+    with uproot.recreate(fpath) as ofile:
+        log.debug(f'Saving to: {fpath}:{tree}')
+        ofile[tree] = data
+# -------------------------------
+def build_friend_structure(file_name : str, nentries : int) -> None:
+    '''
+    Will load YAML file with file structure needed to
+    test code that relies on friend trees, e.g. DDFGetter
+    Parameters:
+    -------------------
+    file_name (str): Name of YAML file with wanted structure, e.g. friends.yaml
+    nentries (int) : Number of entries in file
+    '''
+    cfg_path = files('dmu_data').joinpath(f'rfile/{file_name}')
+    with open(cfg_path, encoding='utf=8') as ifile:
+        data = yaml.safe_load(ifile)
+    if 'tree' not in data:
+        raise ValueError('tree entry missing in: {cfg_path}')
+    tree_name = data['tree']
+    if 'samples' not in data:
+        raise ValueError('Samples section missing in: {cfg_path}')
+    if 'files' not in data:
+        raise ValueError('Files section missing in: {cfg_path}')
+    for fdir in data['samples']:
+        for fname in data['files']:
+            path = f'{fdir}/{fname}'
+            _make_file(fpath=path, tree=tree_name, nentries=nentries)
+# ----------------------------------------------

dmu/workflow/__init__.py ADDED Viewed

File without changes

data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl