data-manipulation-utilities 0.2.6__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/METADATA +800 -34
  2. data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD +93 -0
  3. {data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/WHEEL +1 -1
  4. {data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/entry_points.txt +1 -0
  5. dmu/__init__.py +0 -0
  6. dmu/generic/hashing.py +70 -0
  7. dmu/generic/utilities.py +175 -9
  8. dmu/generic/version_management.py +3 -5
  9. dmu/logging/log_store.py +34 -2
  10. dmu/logging/messages.py +96 -0
  11. dmu/ml/cv_classifier.py +3 -3
  12. dmu/ml/cv_diagnostics.py +224 -0
  13. dmu/ml/cv_performance.py +58 -0
  14. dmu/ml/cv_predict.py +149 -46
  15. dmu/ml/train_mva.py +587 -112
  16. dmu/ml/utilities.py +29 -10
  17. dmu/pdataframe/utilities.py +61 -3
  18. dmu/plotting/fwhm.py +64 -0
  19. dmu/plotting/matrix.py +1 -1
  20. dmu/plotting/plotter.py +25 -3
  21. dmu/plotting/plotter_1d.py +159 -14
  22. dmu/plotting/plotter_2d.py +5 -0
  23. dmu/rdataframe/utilities.py +54 -3
  24. dmu/rfile/ddfgetter.py +102 -0
  25. dmu/stats/fit_stats.py +129 -0
  26. dmu/stats/fitter.py +56 -23
  27. dmu/stats/gof_calculator.py +7 -0
  28. dmu/stats/model_factory.py +305 -50
  29. dmu/stats/parameters.py +100 -0
  30. dmu/stats/utilities.py +443 -12
  31. dmu/stats/wdata.py +187 -0
  32. dmu/stats/zfit.py +17 -0
  33. dmu/stats/zfit_models.py +68 -0
  34. dmu/stats/zfit_plotter.py +175 -56
  35. dmu/testing/utilities.py +120 -15
  36. dmu/workflow/__init__.py +0 -0
  37. dmu/workflow/cache.py +266 -0
  38. dmu_data/ml/tests/diagnostics_from_file.yaml +13 -0
  39. dmu_data/ml/tests/diagnostics_from_model.yaml +10 -0
  40. dmu_data/ml/tests/diagnostics_multiple_methods.yaml +10 -0
  41. dmu_data/ml/tests/diagnostics_overlay.yaml +33 -0
  42. dmu_data/ml/tests/train_mva.yaml +20 -12
  43. dmu_data/ml/tests/train_mva_def.yaml +75 -0
  44. dmu_data/ml/tests/train_mva_with_diagnostics.yaml +87 -0
  45. dmu_data/ml/tests/train_mva_with_preffix.yaml +58 -0
  46. dmu_data/plotting/tests/2d.yaml +5 -5
  47. dmu_data/plotting/tests/line.yaml +15 -0
  48. dmu_data/plotting/tests/plug_fwhm.yaml +24 -0
  49. dmu_data/plotting/tests/plug_stats.yaml +19 -0
  50. dmu_data/plotting/tests/simple.yaml +4 -3
  51. dmu_data/plotting/tests/styling.yaml +18 -0
  52. dmu_data/rfile/friends.yaml +13 -0
  53. dmu_data/stats/fitter/test_simple.yaml +28 -0
  54. dmu_data/stats/kde_optimizer/control.json +1 -0
  55. dmu_data/stats/kde_optimizer/signal.json +1 -0
  56. dmu_data/stats/parameters/data.yaml +178 -0
  57. dmu_data/tests/config.json +6 -0
  58. dmu_data/tests/config.yaml +4 -0
  59. dmu_data/tests/pdf_to_tex.txt +34 -0
  60. dmu_scripts/kerberos/check_expiration +21 -0
  61. dmu_scripts/kerberos/convert_certificate +22 -0
  62. dmu_scripts/ml/compare_classifiers.py +85 -0
  63. data_manipulation_utilities-0.2.6.dist-info/RECORD +0 -57
  64. {data_manipulation_utilities-0.2.6.data → data_manipulation_utilities-0.2.8.dev714.data}/scripts/publish +0 -0
  65. {data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/top_level.txt +0 -0
dmu/ml/utilities.py CHANGED
@@ -14,11 +14,24 @@ log = LogStore.add_logger('dmu:ml:utilities')
  # ---------------------------------------------
  # Patch dataframe with features
  # ---------------------------------------------
- def patch_and_tag(df : pnd.DataFrame, value : float = 0) -> pnd.DataFrame:
+ def tag_nans(
+         df      : pnd.DataFrame,
+         indexes : str) -> pnd.DataFrame:
      '''
-     Takes pandas dataframe, replaces NaNs with value introduced, by default 0
-     Returns array of indices where the replacement happened
+
+     Parameters
+     ----------------
+     df      : Pandas dataframe
+     indexes : Name of dataframe attribute where array of indices of NaN rows should go
+
+     Returns
+     ----------------
+     Dataframe:
+
+     - After filtering, i.e. with dropped rows.
+     - With array of indices dropped as attribute at `patched_indices`
      '''
+
      l_nan = df.index[df.isna().any(axis=1)].tolist()
      nnan  = len(l_nan)
      if nnan == 0:
@@ -29,15 +42,21 @@ def patch_and_tag(df : pnd.DataFrame, value : float = 0) -> pnd.DataFrame:
 
      df_nan_frq = df.isna().sum()
      df_nan_frq = df_nan_frq[df_nan_frq > 0]
-     print(df_nan_frq)
 
+     log.info(df_nan_frq)
      log.warning(f'Attaching array with NaN {nnan} indexes and removing NaNs from dataframe')
 
-     df_pa = df.fillna(value)
+     arr_index_2 = numpy.array(l_nan)
+     if indexes in df.attrs:
+         arr_index_1 = df.attrs[indexes]
+         arr_index   = numpy.concatenate((arr_index_1, arr_index_2))
+         arr_index   = numpy.unique(arr_index)
+     else:
+         arr_index   = arr_index_2
 
-     df_pa.attrs['patched_indices'] = numpy.array(l_nan)
+     df.attrs[indexes] = arr_index
 
-     return df_pa
+     return df
  # ---------------------------------------------
  # Cleanup of dataframe with features
  # ---------------------------------------------
@@ -96,7 +115,7 @@ def _remove_repeated(df : pnd.DataFrame) -> pnd.DataFrame:
      return df_clean
  # ----------------------------------
  # ---------------------------------------------
- def get_hashes(df_ft : pnd.DataFrame, rvalue : str ='set') -> Union[set, list]:
+ def get_hashes(df_ft : pnd.DataFrame, rvalue : str ='set') -> Union[set[str], list[str]]:
      '''
      Will return hashes for each row in the feature dataframe
 
@@ -113,9 +132,9 @@ def get_hashes(df_ft : pnd.DataFrame, rvalue : str ='set') -> Union[set, list]:
 
      return res
  # ----------------------------------
- def hash_from_row(row):
+ def hash_from_row(row : pnd.Series) -> str:
      '''
-     Will return a hash from a pandas dataframe row
+     Will return a hash in the form of a string from a pandas dataframe row
      corresponding to an event
      '''
      l_val = [ str(val) for val in row ]
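Below, a minimal sketch of how the renamed `tag_nans` helper could be used. The data and the attribute name `nan_rows` are made up for illustration; per the docstring, the returned dataframe has the NaN rows dropped and their indices recorded under the chosen attribute.

import numpy
import pandas as pnd
from dmu.ml.utilities import tag_nans

# Toy dataframe with one NaN row (illustrative data only)
df = pnd.DataFrame({'x' : [1.0, numpy.nan, 3.0], 'y' : [4.0, 5.0, 6.0]})

# Record indices of NaN rows in df.attrs['nan_rows']; repeated calls merge the indices
df = tag_nans(df, indexes='nan_rows')

print(df.attrs['nan_rows'])   # e.g. array([1])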
dmu/pdataframe/utilities.py CHANGED
@@ -2,20 +2,28 @@
  Module containing utilities for pandas dataframes
  '''
  import os
+ import yaml
  import pandas as pnd
 
  from dmu.logging.log_store import LogStore
 
  log=LogStore.add_logger('dmu:pdataframe:utilities')
-
  # -------------------------------------
- def df_to_tex(df : pnd.DataFrame, path : str, hide_index : bool = True, d_format : dict[str,str]=None, caption : str =None) -> None:
+ def df_to_tex(df         : pnd.DataFrame,
+               path       : str,
+               hide_index : bool = True,
+               d_format   : dict[str,str]= None,
+               **kwargs   : str ) -> None:
      '''
      Saves pandas dataframe to latex
 
      Parameters
      -------------
+     df         : Dataframe with data
+     path (str) : Path to latex file
+     hide_index : If true (default), index of dataframe won't appear in table
      d_format (dict) : Dictionary specifying the formatting of the table, e.g. `{'col1': '{}', 'col2': '{:.3f}', 'col3' : '{:.3f}'}`
+     kwargs     : Arguments needed in `to_latex`
      '''
 
      if path is not None:
@@ -30,7 +38,57 @@ def df_to_tex(df : pnd.DataFrame, path : str, hide_index : bool = True, d_format
          st=st.format(formatter=d_format)
 
      log.info(f'Saving to: {path}')
-     buf = st.to_latex(buf=path, caption=caption, hrules=True)
+     buf = st.to_latex(buf=path, hrules=True, **kwargs)
 
      return buf
  # -------------------------------------
+ def to_yaml(df : pnd.DataFrame, path : str):
+     '''
+     Takes a dataframe and the path to a yaml file
+     Makes the directory path if not found and saves data in YAML file
+     '''
+     dir_path = os.path.dirname(path)
+     if dir_path != '':
+         os.makedirs(dir_path, exist_ok=True)
+
+     data = df.to_dict()
+
+     with open(path, 'w', encoding='utf-8') as ofile:
+         yaml.dump(data, ofile, Dumper=yaml.CDumper)
+ # -------------------------------------
+ def from_yaml(path : str) -> pnd.DataFrame:
+     '''
+     Takes path to a yaml file
+     Makes dataframe from it and returns it
+     '''
+     with open(path, encoding='utf-8') as ifile:
+         data = yaml.load(ifile, Loader=yaml.CSafeLoader)
+
+     df = pnd.DataFrame(data)
+
+     return df
+ # -------------------------------------
+ def dropna(df : pnd.DataFrame, max_frac : float = 0.02) -> pnd.DataFrame:
+     '''
+     Parameters
+     ----------------
+     df      : Pandas dataframe potentially with NaNs
+     max_frac: Maximum fraction of the data that can be dropped, will raise exception beyond
+     '''
+
+     ini = len(df)
+     df  = df.dropna()
+     fin = len(df)
+
+     if ini == fin:
+         log.debug('No NaNs were found')
+         return df
+
+     # If too many rows were dropped, raise
+     if fin < ini * (1 - max_frac):
+         raise ValueError(f'Too many NaNs were detected: {ini} --> {fin}')
+
+     log.info(f'Found NaNs: {ini} --> {fin}')
+
+     return df
+ # -------------------------------------
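A minimal sketch of the new pandas helpers in use; the paths, column names, and the `put` alias are illustrative. Note that `caption` is now expected to arrive through `**kwargs`, which `df_to_tex` forwards to the pandas Styler's `to_latex`.

import pandas as pnd
import dmu.pdataframe.utilities as put

df = pnd.DataFrame({'name' : ['a', 'b'], 'value' : [1.0, 2.0]})  # toy data

put.to_yaml(df, '/tmp/dmu_example/table.yaml')      # directory is created if missing
df2 = put.from_yaml('/tmp/dmu_example/table.yaml')  # round-trip back to a dataframe

df3 = put.dropna(df2, max_frac=0.02)                # raises if more than 2% of rows contain NaNs

# caption is forwarded to Styler.to_latex through **kwargs
put.df_to_tex(df3, '/tmp/dmu_example/table.tex', caption='Example table')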
dmu/plotting/fwhm.py ADDED
@@ -0,0 +1,64 @@
+ '''
+ Module with FWHM plugin class
+ '''
+ import numpy
+ import matplotlib.pyplot as plt
+
+ from dmu.stats.zfit        import zfit
+ from dmu.logging.log_store import LogStore
+
+ log = LogStore.add_logger('dmu:plotting:fwhm')
+ # --------------------------------------------
+ class FWHM:
+     '''
+     Class meant to be used to calculate Full Width at Half Maximum
+     as a Plotter1d plugin
+     '''
+     # -------------------------
+     def __init__(self, cfg : dict, val : numpy.ndarray, wgt : numpy.ndarray, maxy : float):
+         self._cfg     = cfg
+         self._arr_val = val
+         self._arr_wgt = wgt
+         self._maxy    = maxy
+     # -------------------------
+     def _normalize_yval(self, arr_pdf_val : numpy.ndarray) -> numpy.ndarray:
+         max_pdf_val  = numpy.max(arr_pdf_val)
+         arr_pdf_val *= self._maxy / max_pdf_val
+
+         return arr_pdf_val
+     # -------------------------
+     def _get_fwhm(self, arr_x : numpy.ndarray, arr_y : numpy.ndarray) -> float:
+         maxy = numpy.max(arr_y)
+         arry = numpy.where(arr_y > maxy/2.)[0]
+         imax = arry[ 0]
+         imin = arry[-1]
+
+         x1 = arr_x[imax]
+         x2 = arr_x[imin]
+
+         if self._cfg['plot']:
+             plt.plot([x1, x2], [maxy/2, maxy/2], linestyle=':', linewidth=1, color='k')
+
+         return x2 - x1
+     # -------------------------
+     def run(self) -> float:
+         '''
+         Runs plugin and returns the FWHM
+         '''
+         [minx, maxx] = self._cfg['obs']
+
+         log.info('Running FWHM plugin')
+         obs = zfit.Space('mass', limits=(minx, maxx))
+         pdf = zfit.pdf.KDE1DimISJ(obs=obs, data=self._arr_val, weights=self._arr_wgt)
+
+         xval = numpy.linspace(minx, maxx, 200)
+         yval = pdf.pdf(xval)
+         yval = self._normalize_yval(yval)
+
+         if self._cfg['plot']:
+             plt.plot(xval, yval, linestyle='-', linewidth=2, color='gray')
+
+         fwhm = self._get_fwhm(xval, yval)
+
+         return fwhm
+ # --------------------------------------------
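A hedged sketch of how the FWHM plugin could be driven on its own, outside of Plotter1D. The toy data and the observable range are made up; only the constructor signature and the `obs`/`plot` configuration keys come from the class above.

import numpy
from dmu.plotting.fwhm import FWHM

val = numpy.random.normal(loc=5280, scale=30, size=10_000)  # toy mass-like values
wgt = numpy.ones_like(val)                                   # unit weights

cfg  = {'obs' : [5100, 5500], 'plot' : False}  # keys read by FWHM.run and _get_fwhm
obj  = FWHM(cfg=cfg, val=val, wgt=wgt, maxy=1.0)
fwhm = obj.run()                               # roughly 2.355 * 30 for Gaussian toy data

print(f'FWHM = {fwhm:.1f}')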
dmu/plotting/matrix.py CHANGED
@@ -102,7 +102,7 @@ class MatrixPlotter:
 
          fig, ax = plt.subplots() if fsize is None else plt.subplots(figsize=fsize)
 
-         palette = plt.cm.viridis
+         palette = plt.cm.viridis #pylint: disable=no-member
          im      = ax.imshow(self._mat, cmap=palette, vmin=zmin, vmax=zmax)
          self._set_axes(ax)
 
dmu/plotting/plotter.py CHANGED
@@ -3,6 +3,7 @@ Module containing plotter class
  '''
 
  import os
+ import json
  import math
  from typing import Union
 
@@ -29,6 +30,8 @@ class Plotter:
          self._d_cfg = cfg
          self._d_rdf : dict[str, RDataFrame] = { name : self._preprocess_rdf(rdf) for name, rdf in d_rdf.items()}
          self._d_wgt : Union[dict[str, Union[numpy.ndarray, None]], None]
+
+         self._title : str = ''
      #-------------------------------------
      def _check_quantile(self, qnt : float):
          '''
@@ -183,14 +186,17 @@ class Plotter:
 
          return d_weight
      # --------------------------------------------
-     def _read_weights(self, name : str, rdf : RDataFrame) -> Union[numpy.ndarray, None]:
+     def _read_weights(self, name : str, rdf : RDataFrame) -> numpy.ndarray:
          v_col = rdf.GetColumnNames()
          l_col = [ col.c_str() for col in v_col ]
 
          if name not in l_col:
-             log.debug(f'Weight {name} not found')
-             return None
+             nentries = rdf.Count().GetValue()
+             log.debug(f'Weight {name} not found, using ones')
+
+             return numpy.ones(nentries)
 
+         log.debug(f'Weight {name} found')
          arr_wgt = rdf.AsNumpy([name])[name]
 
          return arr_wgt
@@ -228,4 +234,20 @@ class Plotter:
          plt.tight_layout()
          plt.savefig(plot_path)
          plt.close(var)
+     #-------------------------------------
+     def _data_to_json(self,
+                       data : dict[str,float],
+                       name : str) -> None:
+
+         # In case the values are numpy objects, which are not JSON
+         # serializable
+         data = { key : float(value) for key, value in data.items() }
+
+         plt_dir = self._d_cfg['saving']['plt_dir']
+         os.makedirs(plt_dir, exist_ok=True)
+
+         name      = name.replace(' ', '_')
+         json_path = f'{plt_dir}/{name}.json'
+         with open(json_path, 'w', encoding='utf-8') as ofile:
+             json.dump(data, ofile, indent=2, sort_keys=True)
  # --------------------------------------------
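For reference, a minimal sketch of what the new `_data_to_json` helper ends up writing. The directory and the statistics values are made up; only the float conversion, the space-to-underscore renaming, and the `<plt_dir>/<name>.json` destination follow from the hunk above.

import json
import os
import numpy

plt_dir = '/tmp/dmu_plots'                                               # assumed cfg['saving']['plt_dir']
data    = {'mean' : numpy.float64(5279.5), 'rms' : numpy.float64(30.2)}  # numpy scalars from the plugins
name    = 'stats_B_M signal'

data = {key : float(value) for key, value in data.items()}  # make values JSON serializable
os.makedirs(plt_dir, exist_ok=True)

with open(f'{plt_dir}/{name.replace(" ", "_")}.json', 'w', encoding='utf-8') as ofile:
    json.dump(data, ofile, indent=2, sort_keys=True)
# -> /tmp/dmu_plots/stats_B_M_signal.json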
dmu/plotting/plotter_1d.py CHANGED
@@ -1,7 +1,9 @@
  '''
  Module containing plotter class
  '''
+ # pylint: disable=too-many-positional-arguments, too-many-arguments
 
+ import cppyy
  from hist import Hist
 
  import numpy
@@ -9,6 +11,7 @@ import matplotlib.pyplot as plt
 
  from dmu.logging.log_store import LogStore
  from dmu.plotting.plotter  import Plotter
+ from dmu.plotting.fwhm     import FWHM
 
  log = LogStore.add_logger('dmu:plotting:Plotter1D')
  # --------------------------------------------
@@ -55,6 +58,110 @@ class Plotter1D(Plotter):
 
          return minx, maxx, bins
      #-------------------------------------
+     def _run_plugins(
+             self,
+             arr_val : numpy.ndarray,
+             arr_wgt : numpy.ndarray,
+             hst     : Hist,
+             name    : str,
+             varname : str) -> None:
+
+         if 'plugin' not in self._d_cfg:
+             log.debug('No plugins found')
+             return
+
+         if 'fwhm' in self._d_cfg['plugin']:
+             if varname not in self._d_cfg['plugin']['fwhm']:
+                 log.debug(f'No FWHM plugin found for variable {varname}')
+                 return
+
+             log.debug(f'FWHM plugin found for variable {varname}')
+             cfg = self._d_cfg['plugin']['fwhm'][varname]
+             self._run_fwhm(
+                 arr_val = arr_val,
+                 arr_wgt = arr_wgt,
+                 hst     = hst,
+                 name    = name,
+                 varname = varname,
+                 cfg     = cfg)
+
+         if 'stats' in self._d_cfg['plugin']:
+             if varname not in self._d_cfg['plugin']['stats']:
+                 log.debug(f'No stats plugin found for variable {varname}')
+                 return
+
+             log.debug(f'stats plugin found for variable {varname}')
+             cfg = self._d_cfg['plugin']['stats'][varname]
+             self._run_stats(
+                 arr_val = arr_val,
+                 arr_wgt = arr_wgt,
+                 name    = name,
+                 varname = varname,
+                 cfg     = cfg)
+     #-------------------------------------
+     def _run_stats(
+             self,
+             arr_val : numpy.ndarray,
+             arr_wgt : numpy.ndarray,
+             varname : str,
+             name    : str,
+             cfg     : dict[str,str]) -> None:
+
+         this_title = ''
+         data       = {}
+         if 'sum' in cfg:
+             form         = cfg['sum']
+             sumv         = numpy.sum(arr_wgt)
+             this_title  += form.format(sumv) + '; '
+             data['sum']  = sumv
+
+         if 'mean' in cfg:
+             form         = cfg['mean']
+             mean         = numpy.average(arr_val, weights=arr_wgt)
+             this_title  += form.format(mean) + '; '
+             data['mean'] = mean
+
+         if 'rms' in cfg:
+             form         = cfg['rms']
+             mean         = numpy.average(arr_val, weights=arr_wgt)
+             rms          = numpy.sqrt(numpy.average((arr_val - mean) ** 2, weights=arr_wgt))
+             this_title  += form.format(rms ) + '; '
+             data['rms']  = rms
+
+         self._data_to_json(data = data, name = f'stats_{varname}_{name}')
+
+         self._title += f'\n{name}: {this_title}'
+     #-------------------------------------
+     def _run_fwhm(
+             self,
+             arr_val : numpy.ndarray,
+             arr_wgt : numpy.ndarray,
+             hst     : Hist,
+             varname : str,
+             name    : str,
+             cfg     : dict) -> None:
+
+         arr_bin_cnt = hst.values()
+         maxy        = numpy.max(arr_bin_cnt)
+         obj         = FWHM(cfg=cfg, val=arr_val, wgt=arr_wgt, maxy=maxy)
+         fwhm        = obj.run()
+
+         form       = cfg['format']
+         this_title = form.format(fwhm)
+         data       = {}
+
+         if 'add_std' in cfg and cfg['add_std']:
+             mu   = numpy.average(arr_val            , weights=arr_wgt)
+             var  = numpy.average((arr_val - mu) ** 2, weights=arr_wgt)
+             std  = numpy.sqrt(var)
+             form = form.replace('FWHM', 'STD')
+             this_title += '; ' + form.format(std)
+             data        = {'mu' : mu, 'std' : std, 'fwhm' : fwhm}
+
+         self._data_to_json(data = data, name = f'fwhm_{varname}_{name}')
+
+         self._title += f'\n{name}: {this_title}'
+     #-------------------------------------
      def _plot_var(self, var : str) -> float:
          '''
          Will plot a variable from a dictionary of dataframes
@@ -70,39 +177,70 @@ class Plotter1D(Plotter):
 
          d_data = {}
          for name, rdf in self._d_rdf.items():
-             d_data[name] = rdf.AsNumpy([var])[var]
+             try:
+                 log.debug(f'Plotting: {var}/{name}')
+                 d_data[name] = rdf.AsNumpy([var])[var]
+             except cppyy.gbl.std.runtime_error as exc:
+                 raise ValueError(f'Cannot find variable {var} in category {name}') from exc
 
          minx, maxx, bins = self._get_binning(var, d_data)
          d_wgt            = self._get_weights(var)
 
          l_bc_all = []
          for name, arr_val in d_data.items():
-             label   = self._label_from_name(name, arr_val)
+             label   = self._label_from_name(name)
              arr_wgt = d_wgt[name] if d_wgt is not None else numpy.ones_like(arr_val)
              arr_wgt = self._normalize_weights(arr_wgt, var)
              hst     = Hist.new.Reg(bins=bins, start=minx, stop=maxx, name='x').Weight()
              hst.fill(x=arr_val, weight=arr_wgt)
-             hst.plot(label=label)
+             self._run_plugins(arr_val, arr_wgt, hst, name, var)
+             style = self._get_style_config(var=var, label=label)
+
+             log.debug(f'Style: {style}')
+             hst.plot(**style)
+
              l_bc_all += hst.values().tolist()
 
          max_y = max(l_bc_all)
 
          return max_y
      # --------------------------------------------
-     def _label_from_name(self, name : str, arr_val : numpy.ndarray) -> str:
+     def _get_style_config(self, var : str, label : str) -> dict[str,str]:
+         style = {
+             'label'     : label,
+             'histtype'  : 'errorbar',
+             'linestyle' : 'none'}
+
+         if 'styling' not in self._d_cfg['plots'][var]:
+             log.debug(f'Styling not specified for {var}')
+             return style
+
+         if label not in self._d_cfg['plots'][var]['styling']:
+             log.debug(f'Styling not specified for {var}/{label}')
+             return style
+
+         custom_style = self._d_cfg['plots'][var]['styling'][label]
+         style.update(custom_style)
+         log.debug(f'Using custom styling for {var}/{label}')
+
+         return style
+     # --------------------------------------------
+     def _label_from_name(self, name : str) -> str:
          if 'stats' not in self._d_cfg:
              return name
 
          d_stat = self._d_cfg['stats']
-         if 'nentries' not in d_stat:
+         if 'sumw' not in d_stat:
              return name
 
-         form = d_stat['nentries']
+         form = d_stat['sumw']
 
-         nentries = len(arr_val)
-         nentries = form.format(nentries)
+         arr_wgt  = self._d_wgt[name]
+         arr_wgt  = numpy.nan_to_num(arr_wgt, nan=0.0)
+         sumw     = numpy.sum(arr_wgt)
+         nentries = form.format(sumw)
 
-         return f'{name}{nentries}'
+         return f'{name:<15}{nentries:<10}'
      # --------------------------------------------
      def _normalize_weights(self, arr_wgt : numpy.ndarray, var : str) -> numpy.ndarray:
          cfg_var = self._d_cfg['plots'][var]
@@ -131,9 +269,12 @@ class Plotter1D(Plotter):
          if yscale == 'linear':
              plt.ylim(bottom=0)
 
-         title = ''
+         title = self._title
          if 'title' in d_cfg:
-             title = d_cfg['title']
+             this_title = d_cfg['title']
+             title     += f'\n {this_title}'
+
+         title = title.lstrip('\n')
 
          plt.ylim(top=1.2 * max_y)
          plt.legend()
@@ -145,10 +286,15 @@ class Plotter1D(Plotter):
 
          var (str) : name of variable
          '''
+         var_cfg = self._d_cfg['plots'][var]
+         if 'vline' in var_cfg:
+             line_cfg = var_cfg['vline']
+             plt.axvline(**line_cfg)
+
          if 'style' in self._d_cfg and 'skip_lines' in self._d_cfg['style'] and self._d_cfg['style']['skip_lines']:
              return
 
-         if var in ['B_const_mass_M', 'B_M']:
+         if var in ['B_const_mass_M', 'B_M', 'B_Mass', 'B_Mass_smr']:
              plt.axvline(x=5280, color='r', label=r'$B^+$' , linestyle=':')
          elif var == 'Jpsi_M':
              plt.axvline(x=3096, color='r', label=r'$J/\psi$', linestyle=':')
@@ -160,8 +306,7 @@ class Plotter1D(Plotter):
 
          fig_size = self._get_fig_size()
          for var in self._d_cfg['plots']:
-             log.debug(f'Plotting: {var}')
-
+             self._title = ''
              plt.figure(var, figsize=fig_size)
              max_y = self._plot_var(var)
              self._style_plot(var, max_y)
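Putting the Plotter1D changes together, a hedged sketch of the configuration fragments the new code paths read. The variable name `B_M`, the category label `signal`, and all numeric values are illustrative; only the keys (`saving.plt_dir`, `plugin.fwhm`, `plugin.stats`, `plots.<var>.styling`, `plots.<var>.vline`) follow from the hunks above. A `stats.sumw` format string, when present, switches the legend label to show the sum of weights, which also becomes the key used for `styling` lookups.

# Hypothetical Plotter1D configuration fragment
cfg = {
    'saving' : {'plt_dir' : '/tmp/dmu_plots'},   # plugins write their JSON summaries here
    'plots'  : {
        'B_M' : {
            # ... the usual per-variable settings (binning, labels, ...) go here
            'styling' : {'signal' : {'histtype' : 'step', 'linestyle' : '-'}},  # overrides the errorbar default
            'vline'   : {'x' : 5280, 'color' : 'r', 'linestyle' : ':'},         # forwarded to plt.axvline
        },
    },
    'plugin' : {
        'fwhm'  : {'B_M' : {'obs'     : [5100, 5500],
                            'plot'    : True,
                            'format'  : 'FWHM={:.1f}',
                            'add_std' : True}},
        'stats' : {'B_M' : {'sum'  : 'S={:.0f}',
                            'mean' : 'mean={:.1f}',
                            'rms'  : 'rms={:.1f}'}},
    },
}
# Plugin results end up both in the plot title (via self._title) and in
# <plt_dir>/fwhm_B_M_<category>.json and <plt_dir>/stats_B_M_<category>.json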
dmu/plotting/plotter_2d.py CHANGED
@@ -70,6 +70,11 @@ class Plotter2D(Plotter):
          hst = Hist(ax_x, ax_y)
          hst.fill(arr_x, arr_y, weight=arr_w)
 
+         if hst.values().sum() == 0:
+             log.warning('Empty histogram, not using log scale')
+             mplhep.hist2dplot(hst)
+             return
+
          if use_log:
              mplhep.hist2dplot(hst, norm=LogNorm())
          else:
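The guard above avoids handing an all-zero histogram to `LogNorm`, which has no positive bin contents to autoscale against. A minimal sketch of the same check outside the plotter, with toy (unfilled) data:

import mplhep
from hist import Hist
from matplotlib.colors import LogNorm

hst = Hist.new.Reg(10, 0, 1, name='x').Reg(10, 0, 1, name='y').Weight()
# nothing filled: every bin is zero

if hst.values().sum() == 0:
    mplhep.hist2dplot(hst)                  # fall back to a linear colour scale
else:
    mplhep.hist2dplot(hst, norm=LogNorm())  # safe only with positive bin contents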
dmu/rdataframe/utilities.py CHANGED
@@ -16,7 +16,6 @@ from ROOT import RDataFrame, RDF, Numba
  from dmu.logging.log_store import LogStore
 
  log = LogStore.add_logger('dmu:rdataframe:utilities')
-
  # ---------------------------------------------------------------------
  @dataclass
  class Data:
@@ -98,12 +97,17 @@ def add_column_with_numba(
 
      return rdf
  # ---------------------------------------------------------------------
- def rdf_report_to_df(rep : RDF.RCutFlowReport) -> pnd.DataFrame:
+ def rdf_report_to_df(rep : RDF.RCutFlowReport) -> Union[pnd.DataFrame, None]:
      '''
      Takes the output of rdf.Report(), i.e. an RDataFrame cutflow report.
 
-     Produces a pandas dataframe with
+     Produces a pandas dataframe with the total, failed, efficiency, and cumulative efficiency
+     If no cut was applied, i.e. the cutflow is empty, will return None and show a warning
      '''
+     if rep.begin() == rep.end():
+         log.warning('Empty cutflow')
+         return None
+
      d_data = {'cut' : [], 'All' : [], 'Passed' : []}
      for cut in rep:
          name=cut.GetName()
@@ -119,3 +123,50 @@ def rdf_report_to_df(rep : RDF.RCutFlowReport) -> pnd.DataFrame:
      df['Cummulative'] = df['Efficiency'].cumprod()
 
      return df
+ # ---------------------------------------------------------------------
+ def random_filter(rdf : RDataFrame, entries : int) -> RDataFrame:
+     '''
+     Filters a dataframe, such that the output has **approximately** `entries` entries
+     '''
+     ntot = rdf.Count().GetValue()
+
+     if entries <= 0 or entries >= ntot:
+         log.warning(f'Requested {entries}/{ntot} random entries, not filtering')
+         return rdf
+
+     prob = float(entries) / ntot
+     name = f'filter_{entries}'
+
+     rdf  = rdf.Define(name, 'gRandom->Rndm();')
+     rdf  = rdf.Filter(f'{name} < {prob}', name)
+     nres = rdf.Count().GetValue()
+
+     log.debug(f'Requested {ntot}, picked {nres}')
+
+     return rdf
+ # ---------------------------------------------------------------------
+ def rdf_to_df(
+         rdf     : RDataFrame,
+         columns : list[str]) -> pnd.DataFrame:
+     '''
+     Parameters
+     ---------------
+     rdf     : ROOT dataframe
+     columns : List of columns to keep in pandas dataframe
+
+     Returns
+     ---------------
+     Pandas dataframe with subset of columns
+     '''
+     log.debug('Storing branches')
+     data = rdf.AsNumpy(columns)
+     df   = pnd.DataFrame(data)
+
+     if len(df) == 0:
+         rep     = rdf.Report()
+         cutflow = rdf_report_to_df(rep)
+         log.warning('Empty dataset:\n')
+         log.info(cutflow)
+
+     return df
+ # ---------------------------------------------------------------------
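A minimal sketch of the new RDataFrame helpers in use; the file name, tree name, and branch names are placeholders.

from ROOT import RDataFrame

import dmu.rdataframe.utilities as ut

# Hypothetical input: a ROOT file 'input.root' containing a tree 'tree'
rdf = RDataFrame('tree', 'input.root')

# Keep roughly 1000 randomly chosen entries (the exact count fluctuates)
rdf = ut.random_filter(rdf, entries=1000)

# Materialize a subset of branches as a pandas dataframe; if nothing survives
# the selection, rdf_to_df logs the cutflow obtained from rdf_report_to_df
df = ut.rdf_to_df(rdf, columns=['B_M', 'B_PT'])
print(df.head())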