data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev720__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev720.dist-info}/METADATA +669 -42
  2. data_manipulation_utilities-0.2.8.dev720.dist-info/RECORD +45 -0
  3. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev720.dist-info}/WHEEL +1 -2
  4. data_manipulation_utilities-0.2.8.dev720.dist-info/entry_points.txt +8 -0
  5. dmu/generic/hashing.py +34 -8
  6. dmu/generic/utilities.py +164 -11
  7. dmu/logging/log_store.py +34 -2
  8. dmu/logging/messages.py +96 -0
  9. dmu/ml/cv_classifier.py +3 -3
  10. dmu/ml/cv_diagnostics.py +3 -0
  11. dmu/ml/cv_performance.py +58 -0
  12. dmu/ml/cv_predict.py +149 -46
  13. dmu/ml/train_mva.py +482 -100
  14. dmu/ml/utilities.py +29 -10
  15. dmu/pdataframe/utilities.py +28 -3
  16. dmu/plotting/fwhm.py +2 -2
  17. dmu/plotting/matrix.py +1 -1
  18. dmu/plotting/plotter.py +23 -3
  19. dmu/plotting/plotter_1d.py +96 -32
  20. dmu/plotting/plotter_2d.py +5 -0
  21. dmu/rdataframe/utilities.py +54 -3
  22. dmu/rfile/ddfgetter.py +102 -0
  23. dmu/stats/fit_stats.py +129 -0
  24. dmu/stats/fitter.py +55 -22
  25. dmu/stats/gof_calculator.py +7 -0
  26. dmu/stats/model_factory.py +153 -62
  27. dmu/stats/parameters.py +100 -0
  28. dmu/stats/utilities.py +443 -12
  29. dmu/stats/wdata.py +187 -0
  30. dmu/stats/zfit.py +17 -0
  31. dmu/stats/zfit_plotter.py +147 -36
  32. dmu/testing/utilities.py +102 -24
  33. dmu/workflow/__init__.py +0 -0
  34. dmu/workflow/cache.py +266 -0
  35. data_manipulation_utilities-0.2.7.data/scripts/publish +0 -89
  36. data_manipulation_utilities-0.2.7.dist-info/RECORD +0 -69
  37. data_manipulation_utilities-0.2.7.dist-info/entry_points.txt +0 -6
  38. data_manipulation_utilities-0.2.7.dist-info/top_level.txt +0 -3
  39. dmu_data/ml/tests/diagnostics_from_file.yaml +0 -13
  40. dmu_data/ml/tests/diagnostics_from_model.yaml +0 -10
  41. dmu_data/ml/tests/diagnostics_multiple_methods.yaml +0 -10
  42. dmu_data/ml/tests/diagnostics_overlay.yaml +0 -33
  43. dmu_data/ml/tests/train_mva.yaml +0 -58
  44. dmu_data/ml/tests/train_mva_with_diagnostics.yaml +0 -82
  45. dmu_data/plotting/tests/2d.yaml +0 -24
  46. dmu_data/plotting/tests/fig_size.yaml +0 -13
  47. dmu_data/plotting/tests/high_stat.yaml +0 -22
  48. dmu_data/plotting/tests/legend.yaml +0 -12
  49. dmu_data/plotting/tests/name.yaml +0 -14
  50. dmu_data/plotting/tests/no_bounds.yaml +0 -12
  51. dmu_data/plotting/tests/normalized.yaml +0 -9
  52. dmu_data/plotting/tests/plug_fwhm.yaml +0 -24
  53. dmu_data/plotting/tests/plug_stats.yaml +0 -19
  54. dmu_data/plotting/tests/simple.yaml +0 -9
  55. dmu_data/plotting/tests/stats.yaml +0 -9
  56. dmu_data/plotting/tests/styling.yaml +0 -11
  57. dmu_data/plotting/tests/title.yaml +0 -14
  58. dmu_data/plotting/tests/weights.yaml +0 -13
  59. dmu_data/text/transform.toml +0 -4
  60. dmu_data/text/transform.txt +0 -6
  61. dmu_data/text/transform_set.toml +0 -8
  62. dmu_data/text/transform_set.txt +0 -6
  63. dmu_data/text/transform_trf.txt +0 -12
  64. dmu_scripts/git/publish +0 -89
  65. dmu_scripts/physics/check_truth.py +0 -121
  66. dmu_scripts/rfile/compare_root_files.py +0 -299
  67. dmu_scripts/rfile/print_trees.py +0 -35
  68. dmu_scripts/ssh/coned.py +0 -168
  69. dmu_scripts/text/transform_text.py +0 -46
  70. {dmu_data → dmu}/__init__.py +0 -0
dmu/ml/utilities.py CHANGED
@@ -14,11 +14,24 @@ log = LogStore.add_logger('dmu:ml:utilities')
 # ---------------------------------------------
 # Patch dataframe with features
 # ---------------------------------------------
-def patch_and_tag(df : pnd.DataFrame, value : float = 0) -> pnd.DataFrame:
+def tag_nans(
+        df      : pnd.DataFrame,
+        indexes : str) -> pnd.DataFrame:
     '''
-    Takes pandas dataframe, replaces NaNs with value introduced, by default 0
-    Returns array of indices where the replacement happened
+
+    Parameters
+    ----------------
+    df      : Pandas dataframe
+    indexes : Name of dataframe attribute where array of indices of NaN rows should go
+
+    Returns
+    ----------------
+    Dataframe:
+
+    - After filtering, i.e. with dropped rows.
+    - With array of indices dropped as attribute at `patched_indices`
     '''
+
     l_nan = df.index[df.isna().any(axis=1)].tolist()
     nnan  = len(l_nan)
     if nnan == 0:
@@ -29,15 +42,21 @@ def patch_and_tag(df : pnd.DataFrame, value : float = 0) -> pnd.DataFrame:

     df_nan_frq = df.isna().sum()
     df_nan_frq = df_nan_frq[df_nan_frq > 0]
-    print(df_nan_frq)

+    log.info(df_nan_frq)
     log.warning(f'Attaching array with NaN {nnan} indexes and removing NaNs from dataframe')

-    df_pa = df.fillna(value)
+    arr_index_2 = numpy.array(l_nan)
+    if indexes in df.attrs:
+        arr_index_1 = df.attrs[indexes]
+        arr_index   = numpy.concatenate((arr_index_1, arr_index_2))
+        arr_index   = numpy.unique(arr_index)
+    else:
+        arr_index   = arr_index_2

-    df_pa.attrs['patched_indices'] = numpy.array(l_nan)
+    df.attrs[indexes] = arr_index

-    return df_pa
+    return df
 # ---------------------------------------------
 # Cleanup of dataframe with features
 # ---------------------------------------------
@@ -96,7 +115,7 @@ def _remove_repeated(df : pnd.DataFrame) -> pnd.DataFrame:
     return df_clean
 # ----------------------------------
 # ---------------------------------------------
-def get_hashes(df_ft : pnd.DataFrame, rvalue : str ='set') -> Union[set, list]:
+def get_hashes(df_ft : pnd.DataFrame, rvalue : str ='set') -> Union[set[str], list[str]]:
     '''
     Will return hashes for each row in the feature dataframe

@@ -113,9 +132,9 @@ def get_hashes(df_ft : pnd.DataFrame, rvalue : str ='set') -> Union[set, list]:

     return res
 # ----------------------------------
-def hash_from_row(row):
+def hash_from_row(row : pnd.Series) -> str:
     '''
-    Will return a hash from a pandas dataframe row
+    Will return a hash in the form or a string from a pandas dataframe row
     corresponding to an event
     '''
     l_val = [ str(val) for val in row ]
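Editor's note: a minimal usage sketch of the renamed tag_nans, inferred only from the hunks above. It assumes the function is importable from dmu.ml.utilities (per the file list) and that repeated calls merge NaN-row indices under df.attrs[indexes], as the concatenate/unique branch suggests.

    import pandas as pnd
    from dmu.ml.utilities import tag_nans

    # Two rows contain NaNs; tag_nans records their indices in df.attrs
    df = pnd.DataFrame({'x' : [1.0, float('nan'), 3.0], 'y' : [4.0, 5.0, float('nan')]})
    df = tag_nans(df, indexes='nan_rows')

    print(df.attrs['nan_rows'])   # indices of the rows that contained NaNs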
dmu/pdataframe/utilities.py CHANGED
@@ -48,12 +48,13 @@ def to_yaml(df : pnd.DataFrame, path : str):
     Makes the directory path if not found and saves data in YAML file
     '''
     dir_path = os.path.dirname(path)
-    os.makedirs(dir_path, exist_ok=True)
+    if dir_path != '':
+        os.makedirs(dir_path, exist_ok=True)

     data = df.to_dict()

     with open(path, 'w', encoding='utf-8') as ofile:
-        yaml.safe_dump(data, ofile)
+        yaml.dump(data, ofile, Dumper=yaml.CDumper)
 # -------------------------------------
 def from_yaml(path : str) -> pnd.DataFrame:
     '''
@@ -61,9 +62,33 @@ def from_yaml(path : str) -> pnd.DataFrame:
     Makes dataframe from it and returns it
     '''
     with open(path, encoding='utf-8') as ifile:
-        data = yaml.safe_load(ifile)
+        data = yaml.load(ifile, Loader=yaml.CSafeLoader)

     df = pnd.DataFrame(data)

     return df
 # -------------------------------------
+def dropna(df : pnd.DataFrame, max_frac : float = 0.02) -> pnd.DataFrame:
+    '''
+    Parameters
+    ----------------
+    df      : Pandas dataframe potentially with NaNs
+    max_frac: Maximum fraction of the data that can be dropped, will raise exception beyond
+    '''
+
+    ini = len(df)
+    df  = df.dropna()
+    fin = len(df)
+
+    if ini == fin:
+        log.debug('No NaNs were found')
+        return df
+
+    # If fewer elements survive the filter, raise
+    if fin < ini * (1 - max_frac):
+        raise ValueError(f'Too man NaNs were detected: {ini} --> {fin}')
+
+    log.info(f'Found NaNs: {ini} --> {fin}')
+
+    return df
+# -------------------------------------
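Editor's note: a short, hedged usage sketch for the changed to_yaml/from_yaml and the new dropna; the module path comes from the file list and the values are made up. Note that yaml.CDumper and yaml.CSafeLoader only exist when PyYAML is built against LibYAML.

    import pandas as pnd
    import dmu.pdataframe.utilities as put

    df = pnd.DataFrame({'a' : [1.0, None, 3.0, 4.0], 'b' : [1.0, 2.0, 3.0, 4.0]})

    # One of four rows (25%) has a NaN: fine with max_frac=0.3,
    # but the default max_frac=0.02 would raise ValueError here
    df = put.dropna(df, max_frac=0.3)

    # Round trip through YAML using the C-backed dumper/loader
    put.to_yaml(df, '/tmp/df.yaml')
    df2 = put.from_yaml('/tmp/df.yaml')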
dmu/plotting/fwhm.py CHANGED
@@ -1,10 +1,10 @@
 '''
 Module with FWHM plugin class
 '''
-import zfit
 import numpy
 import matplotlib.pyplot as plt

+from dmu.stats.zfit import zfit
 from dmu.logging.log_store import LogStore

 log = LogStore.add_logger('dmu:plotting:fwhm')
@@ -49,7 +49,7 @@ class FWHM:

         log.info('Running FWHM pluggin')
         obs = zfit.Space('mass', limits=(minx, maxx))
-        pdf= zfit.pdf.KDE1DimExact(obs=obs, data=self._arr_val, weights=self._arr_wgt)
+        pdf= zfit.pdf.KDE1DimISJ(obs=obs, data=self._arr_val, weights=self._arr_wgt)

         xval = numpy.linspace(minx, maxx, 200)
         yval = pdf.pdf(xval)
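Editor's note: as far as I can tell from zfit's documentation, KDE1DimISJ selects its bandwidth with the Improved Sheather-Jones algorithm on a binned grid, whereas KDE1DimExact keeps one kernel per data point, so the ISJ variant typically scales much better for large samples.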
dmu/plotting/matrix.py CHANGED
@@ -102,7 +102,7 @@ class MatrixPlotter:

         fig, ax = plt.subplots() if fsize is None else plt.subplots(figsize=fsize)

-        palette = plt.cm.viridis
+        palette = plt.cm.viridis #pylint: disable=no-member
         im = ax.imshow(self._mat, cmap=palette, vmin=zmin, vmax=zmax)
         self._set_axes(ax)
dmu/plotting/plotter.py CHANGED
@@ -3,6 +3,7 @@ Module containing plotter class
 '''

 import os
+import json
 import math
 from typing import Union
@@ -185,14 +186,17 @@ class Plotter:

         return d_weight
    # --------------------------------------------
-    def _read_weights(self, name : str, rdf : RDataFrame) -> Union[numpy.ndarray, None]:
+    def _read_weights(self, name : str, rdf : RDataFrame) -> numpy.ndarray:
         v_col = rdf.GetColumnNames()
         l_col = [ col.c_str() for col in v_col ]

         if name not in l_col:
-            log.debug(f'Weight {name} not found')
-            return None
+            nentries = rdf.Count().GetValue()
+            log.debug(f'Weight {name} not found, using ones')
+
+            return numpy.ones(nentries)

+        log.debug(f'Weight {name} found')
         arr_wgt = rdf.AsNumpy([name])[name]

         return arr_wgt
@@ -230,4 +234,20 @@ class Plotter:
         plt.tight_layout()
         plt.savefig(plot_path)
         plt.close(var)
+    #-------------------------------------
+    def _data_to_json(self,
+                      data : dict[str,float],
+                      name : str) -> None:
+
+        # In case the values are numpy objects, which are not JSON
+        # serializable
+        data = { key : float(value) for key, value in data.items() }
+
+        plt_dir = self._d_cfg['saving']['plt_dir']
+        os.makedirs(plt_dir, exist_ok=True)
+
+        name      = name.replace(' ', '_')
+        json_path = f'{plt_dir}/{name}.json'
+        with open(json_path, 'w', encoding='utf-8') as ofile:
+            json.dump(data, ofile, indent=2, sort_keys=True)
    # --------------------------------------------
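Editor's note: the new _data_to_json persists each plugin's numbers next to the plots. A hedged sketch of reading one such file back; the path and keys are illustrative, following the stats_{varname}_{name} naming used in the plotter_1d hunks below.

    import json

    # Hypothetical file written by the stats plugin for variable 'mass',
    # category 'signal', under cfg['saving']['plt_dir'] = 'plots'
    with open('plots/stats_mass_signal.json', encoding='utf-8') as ifile:
        stats = json.load(ifile)

    print(stats.get('sum'), stats.get('mean'), stats.get('rms'))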
dmu/plotting/plotter_1d.py CHANGED
@@ -1,7 +1,9 @@
 '''
 Module containing plotter class
 '''
-import copy
+# pylint: disable=too-many-positional-arguments, too-many-arguments
+
+import cppyy
 from hist import Hist

 import numpy
@@ -56,12 +58,14 @@ class Plotter1D(Plotter):

         return minx, maxx, bins
    #-------------------------------------
-    def _run_plugins(self,
-                     arr_val : numpy.ndarray,
-                     arr_wgt : numpy.ndarray,
-                     hst,
-                     name : str,
-                     varname : str) -> None:
+    def _run_plugins(
+            self,
+            arr_val : numpy.ndarray,
+            arr_wgt : numpy.ndarray,
+            hst     : Hist,
+            name    : str,
+            varname : str) -> None:
+
         if 'plugin' not in self._d_cfg:
             log.debug('No plugins found')
             return
@@ -73,7 +77,13 @@ class Plotter1D(Plotter):

             log.debug(f'FWHM plugin found for variable {varname}')
             cfg = self._d_cfg['plugin']['fwhm'][varname]
-            self._run_fwhm(arr_val = arr_val, arr_wgt=arr_wgt, hst=hst, name=name, cfg = cfg)
+            self._run_fwhm(
+                arr_val = arr_val,
+                arr_wgt = arr_wgt,
+                hst     = hst,
+                name    = name,
+                varname = varname,
+                cfg     = cfg)

         if 'stats' in self._d_cfg['plugin']:
             if varname not in self._d_cfg['plugin']['stats']:
@@ -82,29 +92,55 @@ class Plotter1D(Plotter):

             log.debug(f'stats plugin found for variable {varname}')
             cfg = self._d_cfg['plugin']['stats'][varname]
-            self._run_stats(arr_val = arr_val, arr_wgt=arr_wgt, name=name, cfg = cfg)
+            self._run_stats(
+                arr_val = arr_val,
+                arr_wgt = arr_wgt,
+                name    = name,
+                varname = varname,
+                cfg     = cfg)
    #-------------------------------------
-    def _run_stats(self, arr_val : numpy.ndarray, arr_wgt : numpy.ndarray, name : str, cfg : dict[str:str]) -> None:
+    def _run_stats(
+            self,
+            arr_val : numpy.ndarray,
+            arr_wgt : numpy.ndarray,
+            varname : str,
+            name    : str,
+            cfg     : dict[str:str]) -> None:
+
         this_title = ''
+        data = {}
         if 'sum' in cfg:
             form = cfg['sum']
             sumv = numpy.sum(arr_wgt)
             this_title += form.format(sumv) + '; '
+            data['sum'] = sumv

         if 'mean' in cfg:
             form = cfg['mean']
             mean = numpy.average(arr_val, weights=arr_wgt)
             this_title += form.format(mean) + '; '
+            data['mean'] = mean

         if 'rms' in cfg:
             form = cfg['rms']
             mean = numpy.average(arr_val, weights=arr_wgt)
             rms  = numpy.sqrt(numpy.average((arr_val - mean) ** 2, weights=arr_wgt))
             this_title += form.format(rms ) + '; '
+            data['rms'] = rms
+
+        self._data_to_json(data = data, name = f'stats_{varname}_{name}')

         self._title+= f'\n{name}: {this_title}'
    #-------------------------------------
-    def _run_fwhm(self, arr_val : numpy.ndarray, arr_wgt : numpy.ndarray, hst, name : str, cfg : dict) -> None:
+    def _run_fwhm(
+            self,
+            arr_val : numpy.ndarray,
+            arr_wgt : numpy.ndarray,
+            hst     : Hist,
+            varname : str,
+            name    : str,
+            cfg     : dict) -> None:
+
         arr_bin_cnt = hst.values()
         maxy        = numpy.max(arr_bin_cnt)
         obj         = FWHM(cfg=cfg, val=arr_val, wgt=arr_wgt, maxy=maxy)
@@ -112,13 +148,17 @@ class Plotter1D(Plotter):

         form       = cfg['format']
         this_title = form.format(fwhm)
+        data       = {}

         if 'add_std' in cfg and cfg['add_std']:
             mu  = numpy.average(arr_val , weights=arr_wgt)
-            avg = numpy.average((arr_val - mu) ** 2, weights=arr_wgt)
-            std = numpy.sqrt(avg)
+            var = numpy.average((arr_val - mu) ** 2, weights=arr_wgt)
+            std = numpy.sqrt(var)
             form = form.replace('FWHM', 'STD')
             this_title+= '; ' + form.format(std)
+            data = {'mu' : mu, 'std' : std, 'fwhm' : fwhm}
+
+        self._data_to_json(data = data, name = f'fwhm_{varname}_{name}')

         self._title+= f'\n{name}: {this_title}'
    #-------------------------------------
@@ -137,51 +177,70 @@ class Plotter1D(Plotter):

         d_data = {}
         for name, rdf in self._d_rdf.items():
-            log.debug(f'Plotting: {var}/{name}')
-            d_data[name] = rdf.AsNumpy([var])[var]
+            try:
+                log.debug(f'Plotting: {var}/{name}')
+                d_data[name] = rdf.AsNumpy([var])[var]
+            except cppyy.gbl.std.runtime_error as exc:
+                raise ValueError(f'Cannot find variable {var} in category {name}') from exc

         minx, maxx, bins = self._get_binning(var, d_data)
         d_wgt = self._get_weights(var)

         l_bc_all = []
         for name, arr_val in d_data.items():
-            label   = self._label_from_name(name, arr_val)
+            label   = self._label_from_name(name)
             arr_wgt = d_wgt[name] if d_wgt is not None else numpy.ones_like(arr_val)
             arr_wgt = self._normalize_weights(arr_wgt, var)
             hst = Hist.new.Reg(bins=bins, start=minx, stop=maxx, name='x').Weight()
             hst.fill(x=arr_val, weight=arr_wgt)
             self._run_plugins(arr_val, arr_wgt, hst, name, var)
+            style = self._get_style_config(var=var, label=label)

-            if 'styling' in self._d_cfg['plots'][var]:
-                style = self._d_cfg['plots'][var]['styling']
-                style = copy.deepcopy(style)
-            else:
-                style = {'label' : label, 'histtype' : 'errorbar', 'marker' : '.', 'linestyle' : 'none'}
-
-            if 'label' not in style:
-                style['label'] = label
-
+            log.debug(f'Style: {style}')
             hst.plot(**style)
+
             l_bc_all += hst.values().tolist()

         max_y = max(l_bc_all)

         return max_y
    # --------------------------------------------
-    def _label_from_name(self, name : str, arr_val : numpy.ndarray) -> str:
+    def _get_style_config(self, var : str, label : str) -> dict[str,str]:
+        style = {
+            'label'     : label,
+            'histtype'  : 'errorbar',
+            'linestyle' : 'none'}
+
+        if 'styling' not in self._d_cfg['plots'][var]:
+            log.debug(f'Styling not specified for {var}')
+            return style
+
+        if label not in self._d_cfg['plots'][var]['styling']:
+            log.debug(f'Styling not specified for {var}/{label}')
+            return style
+
+        custom_style = self._d_cfg['plots'][var]['styling'][label]
+        style.update(custom_style)
+        log.debug(f'Using custom styling for {var}/{label}')
+
+        return style
+    # --------------------------------------------
+    def _label_from_name(self, name : str) -> str:
         if 'stats' not in self._d_cfg:
             return name

         d_stat = self._d_cfg['stats']
-        if 'nentries' not in d_stat:
+        if 'sumw' not in d_stat:
             return name

-        form = d_stat['nentries']
+        form = d_stat['sumw']

-        nentries = len(arr_val)
-        nentries = form.format(nentries)
+        arr_wgt  = self._d_wgt[name]
+        arr_wgt  = numpy.nan_to_num(arr_wgt, nan=0.0)
+        sumw     = numpy.sum(arr_wgt)
+        nentries = form.format(sumw)

-        return f'{name}{nentries}'
+        return f'{name:<15}{nentries:<10}'
    # --------------------------------------------
     def _normalize_weights(self, arr_wgt : numpy.ndarray, var : str) -> numpy.ndarray:
         cfg_var = self._d_cfg['plots'][var]
@@ -227,10 +286,15 @@ class Plotter1D(Plotter):

         var (str) : name of variable
         '''
+        var_cfg = self._d_cfg['plots'][var]
+        if 'vline' in var_cfg:
+            line_cfg = var_cfg['vline']
+            plt.axvline(**line_cfg)
+
         if 'style' in self._d_cfg and 'skip_lines' in self._d_cfg['style'] and self._d_cfg['style']['skip_lines']:
             return

-        if var in ['B_const_mass_M', 'B_M']:
+        if var in ['B_const_mass_M', 'B_M', 'B_Mass', 'B_Mass_smr']:
             plt.axvline(x=5280, color='r', label=r'$B^+$' , linestyle=':')
         elif var == 'Jpsi_M':
             plt.axvline(x=3096, color='r', label=r'$J/\psi$', linestyle=':')
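Editor's note: pulling the plotter_1d changes together, a hedged sketch of a configuration fragment that exercises the new code paths. Only keys read in the hunks above appear; the variable and category names are invented.

    cfg_fragment = {
        'plots'  : {
            'mass' : {
                # Per-label overrides, merged over the errorbar defaults
                # by the new _get_style_config
                'styling' : {
                    'signal' : {'histtype' : 'step', 'color' : 'red'},
                },
                # Forwarded verbatim to plt.axvline by the new vline branch
                'vline'   : {'x' : 5280, 'color' : 'gray', 'linestyle' : ':'},
            },
        },
        'saving' : {'plt_dir' : 'plots'},   # where _data_to_json writes its files
    }

One interplay worth noting: when stats.sumw is configured, _label_from_name appends the formatted sum of weights to the category name, and that decorated string is what _get_style_config looks up under styling, so the styling keys must then match the full label rather than the bare category name.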
dmu/plotting/plotter_2d.py CHANGED
@@ -70,6 +70,11 @@ class Plotter2D(Plotter):

         hst = Hist(ax_x, ax_y)
         hst.fill(arr_x, arr_y, weight=arr_w)

+        if hst.values().sum() == 0:
+            log.warning('Empty histogram, not using log scale')
+            mplhep.hist2dplot(hst)
+            return
+
         if use_log:
             mplhep.hist2dplot(hst, norm=LogNorm())
         else:
dmu/rdataframe/utilities.py CHANGED
@@ -16,7 +16,6 @@ from ROOT import RDataFrame, RDF, Numba
 from dmu.logging.log_store import LogStore

 log = LogStore.add_logger('dmu:rdataframe:utilities')
-
 # ---------------------------------------------------------------------
 @dataclass
 class Data:
@@ -98,12 +97,17 @@ def add_column_with_numba(

     return rdf
 # ---------------------------------------------------------------------
-def rdf_report_to_df(rep : RDF.RCutFlowReport) -> pnd.DataFrame:
+def rdf_report_to_df(rep : RDF.RCutFlowReport) -> Union[pnd.DataFrame, None]:
     '''
     Takes the output of rdf.Report(), i.e. an RDataFrame cutflow report.

-    Produces a pandas dataframe with
+    Produces a pandas dataframe with the total, failed, efficiency, and cummulative efficiency
+    If no cut was applied, i.e. the cutflow is empty, will return None and show warning
     '''
+    if rep.begin() == rep.end():
+        log.warning('Empty cutflow')
+        return None
+
     d_data = {'cut' : [], 'All' : [], 'Passed' : []}
     for cut in rep:
         name=cut.GetName()
@@ -119,3 +123,50 @@ def rdf_report_to_df(rep : RDF.RCutFlowReport) -> pnd.DataFrame:

     df['Cummulative'] = df['Efficiency'].cumprod()

     return df
+# ---------------------------------------------------------------------
+def random_filter(rdf : RDataFrame, entries : int) -> RDataFrame:
+    '''
+    Filters a dataframe, such that the output has **approximately** `entries` entries
+    '''
+    ntot = rdf.Count().GetValue()
+
+    if entries <= 0 or entries >= ntot:
+        log.warning(f'Requested {entries}/{ntot} random entries, not filtering')
+        return rdf
+
+    prob = float(entries) / ntot
+    name = f'filter_{entries}'
+
+    rdf  = rdf.Define(name, 'gRandom->Rndm();')
+    rdf  = rdf.Filter(f'{name} < {prob}', name)
+    nres = rdf.Count().GetValue()
+
+    log.debug(f'Requested {ntot}, picked {nres}')
+
+    return rdf
+# ---------------------------------------------------------------------
+def rdf_to_df(
+        rdf     : RDataFrame,
+        columns : list[str]) -> pnd.DataFrame:
+    '''
+    Parameters
+    ---------------
+    rdf      : ROOT dataframe
+    branches : List of columns to keep in pandas dataframe
+
+    Returns
+    ---------------
+    Pandas dataframe with subset of columns
+    '''
+    log.debug('Storing branches')
+    data = rdf.AsNumpy(columns)
+    df   = pnd.DataFrame(data)
+
+    if len(df) == 0:
+        rep     = rdf.Report()
+        cutflow = rdf_report_to_df(rep)
+        log.warning('Empty dataset:\n')
+        log.info(cutflow)
+
+    return df
+# ---------------------------------------------------------------------
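Editor's note: a hedged usage sketch of the two new helpers, assuming a working ROOT installation and the module path from the file list.

    from ROOT import RDataFrame
    import dmu.rdataframe.utilities as rut

    rdf = RDataFrame(1000)
    rdf = rdf.Define('x', 'gRandom->Rndm()')

    # Keep roughly 100 entries; the exact count fluctuates because each
    # entry is kept with probability entries/ntot
    rdf = rut.random_filter(rdf, entries=100)

    # Materialize a subset of columns as a pandas dataframe
    df = rut.rdf_to_df(rdf, columns=['x'])
    print(len(df))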
dmu/rfile/ddfgetter.py ADDED
@@ -0,0 +1,102 @@
+'''
+Module holding DDFGetter class
+'''
+# pylint: disable=unnecessary-lambda-assignment
+
+from functools import reduce
+
+import dask
+import dask.dataframe as ddf
+
+import uproot
+import yaml
+import pandas as pnd
+from dmu.logging.log_store import LogStore
+
+log=LogStore.add_logger('dmu:rfile:ddfgetter')
+# -------------------------------
+class DDFGetter:
+    '''
+    Class used to provide Dask DataFrames from YAML config files. It should handle:
+
+    - Friend trees
+    - Multiple files
+    '''
+    # ----------------------
+    def __init__(
+            self,
+            cfg         : dict      = None,
+            config_path : str       = None,
+            columns     : list[str] = None):
+        '''
+        Params
+        --------------
+        cfg         : Dictionary storing the configuration (optional)
+        config_path : Path to YAML configuration file (optional)
+        colums      : If passed, will only use these columns to build dataframe
+        '''
+        self._cfg     = self._load_config(path=config_path) if cfg is None else cfg
+        self._columns = columns
+    # ----------------------
+    def _load_config(self, path : str) -> dict:
+        with open(path, encoding='utf-8') as ifile:
+            data = yaml.safe_load(ifile)
+
+        return data
+    # ----------------------
+    def _get_columns_to_keep(self, tree) -> list[str]:
+        if self._columns is None:
+            return None
+
+        columns   = self._columns + self._cfg['primary_keys']
+        columns   = set(columns)
+        available = set(tree.keys())
+        columns   = columns & available
+
+        log.debug(f'Keeping only {columns}')
+
+        return list(columns)
+    # ----------------------
+    def _get_file_df(self, fpath : str) -> pnd.DataFrame:
+        with uproot.open(fpath) as file:
+            tname   = self._cfg['tree']
+            tree    = file[tname]
+            columns = self._get_columns_to_keep(tree)
+            df      = tree.arrays(columns, library='pd')
+
+        return df
+    # ----------------------
+    def _get_file_dfs(self, fname : str) -> list[pnd.DataFrame]:
+        log.debug(f'Getting dataframes for: {fname}')
+
+        l_fpath = [ f'{sample_dir}/{fname}' for sample_dir in self._cfg['samples'] ]
+        l_df    = [ self._get_file_df(fpath = fpath) for fpath in l_fpath ]
+
+        return l_df
+    # ----------------------
+    def _load_root_file(self, fname : str, ifname : int, size : int) -> pnd.DataFrame:
+        keys = self._cfg['primary_keys']
+        l_df = self._get_file_dfs(fname=fname)
+        fun  = lambda df_l, df_r : pnd.merge(df_l, df_r, on=keys)
+
+        log.info(f'Merging dataframes: {ifname}/{size}')
+        df = reduce(fun, l_df)
+        df = df.drop(columns=keys)
+
+        return df
+    # ----------------------
+    def get_dataframe(self) -> ddf:
+        '''
+        Returns dask dataframe
+        '''
+        l_fname = self._cfg['files']
+        nfiles  = len(l_fname)
+
+        log.debug('Building dataframes for single files')
+        l_dfs = [ dask.delayed(self._load_root_file)(fname = fname, ifname=ifname, size=nfiles) for ifname, fname in enumerate(l_fname) ]
+
+        log.debug('Bulding full dask dataframe')
+        output = ddf.from_delayed(l_dfs)
+
+        return output
+# -------------------------------
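Editor's note: a hedged usage sketch for DDFGetter. The configuration keys (tree, files, samples, primary_keys) are exactly the ones the class reads above; the concrete names and paths are invented.

    from dmu.rfile.ddfgetter import DDFGetter

    cfg = {
        'tree'         : 'DecayTree',                        # tree opened in every file
        'files'        : ['file_001.root', 'file_002.root'], # file names common to all samples
        'samples'      : ['/data/main', '/data/friend'],     # one directory per (friend) copy
        'primary_keys' : ['EVENTNUMBER', 'RUNNUMBER'],       # columns the friend copies are merged on
    }

    gtr     = DDFGetter(cfg=cfg, columns=['mass', 'pt'])
    dask_df = gtr.get_dataframe()

    # Dask is lazy: compute() triggers the actual reading and merging
    df = dask_df.compute()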