PyPI - data-manipulation-utilities - Versions diffs - 0.0.1__py3-none-any.whl - Mend

data-manipulation-utilities 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

data_manipulation_utilities-0.0.1.dist-info/METADATA +713 -0
data_manipulation_utilities-0.0.1.dist-info/RECORD +45 -0
data_manipulation_utilities-0.0.1.dist-info/WHEEL +5 -0
data_manipulation_utilities-0.0.1.dist-info/entry_points.txt +6 -0
data_manipulation_utilities-0.0.1.dist-info/top_level.txt +3 -0
dmu/arrays/utilities.py +55 -0
dmu/dataframe/dataframe.py +36 -0
dmu/generic/utilities.py +69 -0
dmu/logging/log_store.py +129 -0
dmu/ml/cv_classifier.py +122 -0
dmu/ml/cv_predict.py +152 -0
dmu/ml/train_mva.py +257 -0
dmu/ml/utilities.py +132 -0
dmu/plotting/plotter.py +227 -0
dmu/plotting/plotter_1d.py +113 -0
dmu/plotting/plotter_2d.py +87 -0
dmu/rdataframe/atr_mgr.py +79 -0
dmu/rdataframe/utilities.py +72 -0
dmu/rfile/rfprinter.py +91 -0
dmu/rfile/utilities.py +34 -0
dmu/stats/fitter.py +515 -0
dmu/stats/function.py +314 -0
dmu/stats/utilities.py +134 -0
dmu/testing/utilities.py +119 -0
dmu/text/transformer.py +182 -0
dmu_data/__init__.py +0 -0
dmu_data/ml/tests/train_mva.yaml +37 -0
dmu_data/plotting/tests/2d.yaml +14 -0
dmu_data/plotting/tests/fig_size.yaml +13 -0
dmu_data/plotting/tests/high_stat.yaml +22 -0
dmu_data/plotting/tests/name.yaml +14 -0
dmu_data/plotting/tests/no_bounds.yaml +12 -0
dmu_data/plotting/tests/simple.yaml +8 -0
dmu_data/plotting/tests/title.yaml +14 -0
dmu_data/plotting/tests/weights.yaml +13 -0
dmu_data/text/transform.toml +4 -0
dmu_data/text/transform.txt +6 -0
dmu_data/text/transform_set.toml +8 -0
dmu_data/text/transform_set.txt +6 -0
dmu_data/text/transform_trf.txt +12 -0
dmu_scripts/physics/check_truth.py +121 -0
dmu_scripts/rfile/compare_root_files.py +299 -0
dmu_scripts/rfile/print_trees.py +35 -0
dmu_scripts/ssh/coned.py +168 -0
dmu_scripts/text/transform_text.py +46 -0

dmu/text/transformer.py ADDED Viewed

@@ -0,0 +1,182 @@
+'''
+Module used to hold transformer class
+'''
+import os
+import pprint
+import toml
+import numpy
+from dmu.logging.log_store import LogStore
+log = LogStore.add_logger('dmu:text:transformer')
+# -------------------------------------------------------------------------------------------
+class transformer:
+    # pyling disable = invalid-name
+    '''
+    Class used to apply transformations to text files
+    '''
+    # -----------------------------------------
+    def __init__(self, txt_path=None, cfg_path=None):
+        '''
+        txt_path (str): Path to text file to be transformed, can have any extension, py, txt, log, etc
+        cfg_path (str): Path to TOML file holding configuration needed for transformations
+        '''
+        self._txt_path = txt_path
+        self._cfg_path = cfg_path
+        self._suffix   = 'trf'
+        self._l_line   = None
+        self._cfg      = None
+        self._initialized = False
+    # -----------------------------------------
+    def _initialize(self):
+        if self._initialized:
+            return
+        self._check_file(self._txt_path)
+        self._check_file(self._cfg_path)
+        self._load_input()
+        self._cfg = toml.load(self._cfg_path)
+        self._initialized=True
+    # -----------------------------------------
+    def _check_file(self, file_path):
+        '''
+        Will raise exception if path not found
+        file_path (str): path to file
+        '''
+        if not os.path.isfile(file_path):
+            raise FileNotFoundError(f'File not found: {file_path}')
+        log.debug(f'Found: {file_path}')
+    # -----------------------------------------
+    def _load_input(self):
+        '''
+        Will open  self._txt_path and put the lines in self._l_line
+        '''
+        with open(self._txt_path) as ifile:
+            self._l_line = ifile.read().splitlines()
+            nline = len(self._l_line)
+            log.info(f'Found {nline} lines in {self._txt_path}')
+    # -----------------------------------------
+    def _get_out_path(self, out_path):
+        '''
+        Will return name of output file
+        If arg is not None, will make directory (in case it does not exist) and return arg
+        If arg is None, will rename input path using suffix  and return
+        '''
+        if out_path is not None:
+            dir_name = os.path.dirname(out_path)
+            os.makedirs(dir_name, exist_ok=True)
+            return out_path
+        file_name = os.path.basename(self._txt_path)
+        if '.' not in file_name:
+            return f'{file_name}_{self._suffix}'
+        l_part     = file_name.split('.')
+        bef_ext    = l_part[-2]
+        l_part[-2] = f'{bef_ext}_{self._suffix}'
+        file_name  = '.'.join(l_part)
+        file_dir   = os.path.dirname(self._txt_path)
+        return f'{file_dir}/{file_name}'
+    # -----------------------------------------
+    def _transform(self, l_line, trf):
+        log.info(f'{"":<4}{trf}')
+        if trf == 'append':
+            return self._apply_append(l_line)
+        else:
+            raise ValueError(f'Invalid transformation: {trf}')
+        return l_line
+    # -----------------------------------------
+    def _apply_append(self, l_line):
+        '''
+        Will take list of lines
+        and return list of lines with extra lines appended
+        according to config file
+        '''
+        d_append = self._cfg['trf']['append']
+        for target, l_to_be_added in d_append.items():
+            l_to_be_added = self._format_lines(l_to_be_added)
+            arr_line      = numpy.array(self._l_line)
+            arr_index,    = numpy.where(self._find_append_index(arr_line, target))
+            if arr_index.size  == 0:
+                pprint.pprint(self._l_line)
+                raise RuntimeError(f'No instance of \"{target}\" found in \"{self._txt_path}\"')
+            for index in arr_index:
+                org_line      = l_line[index]
+                ext_line      = '\n'.join(l_to_be_added)
+                l_line[index] = f'{org_line}\n{ext_line}'
+        return l_line
+    # -----------------------------------------
+    def _find_append_index(self, l_line, target):
+        '''
+        Returns list of flags denoting if target was or not fouund in list l_line
+        target can be exact or included in the l_line elements
+        '''
+        is_subst = False
+        try:
+            is_subst = self._cfg['settings']['as_substring']
+        except:
+            pass
+        if not is_subst:
+            log.debug(f'Searching exact matches for target: {target}')
+            l_flag = [ target == element for element in l_line ]
+        else:
+            log.debug(f'Searching with substrings for target: {target}')
+            l_flag = [ target in element for element in l_line ]
+        return l_flag
+    # -----------------------------------------
+    def _format_lines(self, l_line):
+        '''
+        If format was specified in the settings section, will format the
+        elements of the input list of lines
+        '''
+        if 'settings' not in self._cfg:
+            return l_line
+        if 'format'   not in self._cfg['settings']:
+            return l_line
+        fmt         = self._cfg['settings']['format']
+        l_formatted = [ fmt.format(line) for line in l_line ]
+        return l_formatted
+    # -----------------------------------------
+    def save_as(self, out_path=None):
+        '''
+        Saves text file after transformation to `out_path`
+        If no path is passed, will name as:
+        /some/dir/file.txt -> /some/dir/file_trf.txt
+        '''
+        self._initialize()
+        log.info(20 * '-')
+        log.info('Applying transformations')
+        log.info(20 * '-')
+        for trf in  self._cfg['trf']:
+            self._l_line = self._transform(self._l_line, trf)
+        out_path = self._get_out_path(out_path)
+        log.info(f'Saving to: {out_path}')
+        with open(out_path, 'w') as ofile:
+            text = '\n'.join(self._l_line)
+            ofile.write(text)
+# -------------------------------------------------------------------------------------------

dmu_data/__init__.py ADDED Viewed

File without changes

dmu_data/ml/tests/train_mva.yaml ADDED Viewed

@@ -0,0 +1,37 @@
+training :
+    nfold    : 3
+    features : [x, y, z]
+    rdm_stat : 1
+    hyper    :
+      loss              : log_loss
+      n_estimators      : 100
+      max_depth         : 3
+      learning_rate     : 0.1
+      min_samples_split : 2
+saving:
+    path : 'tests/ml/train_mva/model.pkl'
+plotting:
+    roc     :
+        min : [0, 0]
+    val_dir : 'tests/ml/train_mva'
+    features:
+        saving:
+            plt_dir : 'tests/ml/train_mva/features'
+        plots:
+          w :
+            binning : [-4, 4, 100]
+            yscale  : 'linear'
+            labels  : ['w', '']
+          x :
+            binning : [-4, 4, 100]
+            yscale  : 'linear'
+            labels  : ['x', '']
+          y :
+            binning : [-4, 4, 100]
+            yscale  : 'linear'
+            labels  : ['y', '']
+          z :
+            binning : [-4, 4, 100]
+            yscale  : 'linear'
+            labels  : ['z', '']

dmu_data/plotting/tests/2d.yaml ADDED Viewed

@@ -0,0 +1,14 @@
+saving:
+    plt_dir : tests/plotting/2d_weighted
+general:
+    size : [20, 10]
+plots_2d:
+    - [x, y, weights, 'xy_w']
+    - [x, y,    null, 'xy_r']
+axes:
+    x :
+        binning : [-5.0, 8.0, 40]
+        label   : 'x'
+    y :
+        binning : [-5.0, 8.0, 40]
+        label   : 'y'

dmu_data/plotting/tests/fig_size.yaml ADDED Viewed

@@ -0,0 +1,13 @@
+saving:
+    plt_dir : tests/plotting/fig_size
+general:
+    size : [20, 10]
+plots:
+    x :
+        binning : [-5.0, 8.0, 40]
+        yscale  : 'linear'
+        labels  : ['x', 'Entries']
+    y :
+        binning : [-5.0, 8.0, 40]
+        yscale  : 'linear'
+        labels  : ['y', 'Entries']

dmu_data/plotting/tests/high_stat.yaml ADDED Viewed

@@ -0,0 +1,22 @@
+selection:
+    max_ran_entries : 50000
+    cuts:
+      z : 'z > 0'
+saving:
+    plt_dir : tests/plotting/high_stat
+definitions:
+    z : 'x + y'
+plots:
+    x :
+        binning    : [-5.0, 8.0, 40]
+        yscale     : 'linear'
+        labels     : ['x', 'Entries']
+    y :
+        binning    : [-5.0, 8.0, 40]
+        yscale     : 'linear'
+        labels     : ['y', 'Entries']
+    z :
+        binning    : [-5.0, 8.0, 40]
+        yscale     : 'linear'
+        labels     : ['x + y', 'Normalized']
+        normalized : true

dmu_data/plotting/tests/name.yaml ADDED Viewed

@@ -0,0 +1,14 @@
+saving:
+    plt_dir : tests/plotting/name
+plots:
+    x :
+        binning : [-5.0, 8.0, 40]
+        yscale  : 'linear'
+        labels  : ['x', 'Entries']
+        name    : 'xvar'
+    y :
+        binning : [-5.0, 8.0, 40]
+        yscale  : 'linear'
+        labels  : ['y', 'Entries']
+        name    : 'yvar'

dmu_data/plotting/tests/no_bounds.yaml ADDED Viewed

@@ -0,0 +1,12 @@
+saving:
+    plt_dir : tests/plotting/no_bounds
+plots:
+    x :
+        binning : [1, 1, 40]
+        yscale  : 'linear'
+        labels  : ['x', 'Entries']
+    y :
+        binning : [1, 1, 40]
+        yscale  : 'linear'
+        labels  : ['y', 'Entries']

dmu_data/plotting/tests/simple.yaml ADDED Viewed

@@ -0,0 +1,8 @@
+saving:
+    plt_dir : tests/plotting/simple
+plots:
+    x :
+        binning : [-5.0, 8.0, 40]
+    y :
+        binning : [-5.0, 8.0, 40]

dmu_data/plotting/tests/title.yaml ADDED Viewed

@@ -0,0 +1,14 @@
+saving:
+    plt_dir : tests/plotting/title
+plots:
+    x :
+        binning : [-5.0, 8.0, 40]
+        yscale  : 'linear'
+        labels  : ['x', 'Entries']
+        title   : 'Title for X plot'
+    y :
+        binning : [-5.0, 8.0, 40]
+        yscale  : 'linear'
+        labels  : ['y', 'Entries']
+        title   : 'Title for Y plot'

dmu_data/plotting/tests/weights.yaml ADDED Viewed

@@ -0,0 +1,13 @@
+saving:
+    plt_dir : tests/plotting/weights
+plots:
+    x :
+        weights : weights
+        binning : [-5.0, 8.0, 40]
+        yscale  : 'linear'
+        labels  : ['x', 'Entries']
+    y :
+        weights : weights
+        binning : [-5.0, 8.0, 40]
+        yscale  : 'linear'
+        labels  : ['y', 'Entries']

dmu_data/text/transform.toml ADDED Viewed

@@ -0,0 +1,4 @@
+[trf]
+[trf.append]
+'primes are'=['2', '3', '5']
+'days are'=['Monday', 'Tuesday', 'Wednesday']

dmu_data/text/transform.txt ADDED Viewed

@@ -0,0 +1,6 @@
+the
+first
+primes are
+and
+the first
+days are

dmu_data/text/transform_set.toml ADDED Viewed

@@ -0,0 +1,8 @@
+[settings]
+as_substring=true
+format      ='--> {} <--'
+[trf]
+[trf.append]
+'primes are'=['2', '3', '5']
+'days are'=['Monday', 'Tuesday', 'Wednesday']

dmu_data/text/transform_set.txt ADDED Viewed

@@ -0,0 +1,6 @@
+the
+first
+primes are:
+and
+the first
+days are:

dmu_data/text/transform_trf.txt ADDED Viewed

@@ -0,0 +1,12 @@
+the
+first
+primes are
+2
+3
+5
+and
+the first
+days are
+Monday
+Tuesday
+Wednesday

dmu_scripts/physics/check_truth.py ADDED Viewed

@@ -0,0 +1,121 @@
+'''
+Script meant to do truth matching checks
+'''
+import os
+import copy
+import argparse
+import yaml
+import mplhep
+import matplotlib.pyplot as plt
+from ROOT import RDataFrame
+from dmu.logging.log_store   import LogStore
+from dmu.plotting.plotter_1d import Plotter1D as Plotter
+log=LogStore.add_logger('dmu:physics:check_truth')
+# ----------------------------------
+def _set_logs() -> None:
+    LogStore.set_level('dmu:plotting:Plotter'  , 30)
+    LogStore.set_level('dmu:plotting:Plotter1D', 30)
+# ----------------------------------
+def _get_args() -> argparse.Namespace:
+    '''
+    Parse args
+    '''
+    parser = argparse.ArgumentParser(description='Script used to carry out checks on truth matching mechanisms for MC')
+    parser.add_argument('-c', '--conf' , type=str, help='Path to config file', required=True)
+    args = parser.parse_args()
+    return args
+# ----------------------------------
+def _get_config(args : argparse.Namespace) -> dict:
+    path = args.conf
+    if not os.path.isfile(path):
+        raise FileNotFoundError(f'Cannot find {path}')
+    with open(path, encoding='utf-8') as ifile:
+        cfg = yaml.safe_load(ifile)
+    return cfg
+# ----------------------------------
+def _get_rdf(file_path : str, tree_path : str) -> RDataFrame:
+    log.debug(f'Picking inputs from: {file_path}/{tree_path}')
+    rdf = RDataFrame(tree_path, file_path)
+    nentries = rdf.Count().GetValue()
+    log.debug(f'Found {nentries} entries')
+    return rdf
+# ----------------------------------
+def _preprocess_rdf(rdf : RDataFrame, cfg : dict) -> RDataFrame:
+    if 'max_entries' in cfg:
+        max_entries = cfg['max_entries']
+        rdf = rdf.Range(max_entries)
+    return rdf
+# ----------------------------------
+def _check(cfg : dict) -> None:
+    log.info(110 * '-')
+    log.info(f'{"Sample":<20}{"Method":<20}{"Initial":<15}{"":<15}{"Final":<15}{"":15}{"Efficiency":<10}')
+    log.info(110 * '-')
+    for sample_name in cfg['samples']:
+        file_path = cfg['samples'][sample_name]['file_path']
+        tree_path = cfg['samples'][sample_name]['tree_path']
+        rdf = _get_rdf(file_path, tree_path)
+        rdf = _preprocess_rdf(rdf, cfg)
+        d_cut_true = {}
+        d_cut_fake = {}
+        for method, cut in cfg['samples'][sample_name]['methods'].items():
+            _check_kind(rdf, sample_name, method, cut)
+            d_cut_true[method] = cut
+            d_cut_fake[method] = f'({cut}) == 0'
+        log.info('')
+        _plot_distributions(cfg, sample_name, rdf, d_cut_true, kind='matched')
+        _plot_distributions(cfg, sample_name, rdf, d_cut_fake, kind='anti_matched')
+# ----------------------------------
+def _plot_distributions(cfg : dict, sample_name : str, rdf : RDataFrame, d_cut : dict[str,str], kind : str) -> None:
+    cfg     = copy.deepcopy(cfg)
+    cfg_plt = cfg['samples'][sample_name]['plot']
+    cfg_plt = _add_suffix(cfg_plt, sample_name, kind)
+    d_rdf   = { method : rdf.Filter(cut) for method, cut in d_cut.items() }
+    ptr=Plotter(d_rdf=d_rdf, cfg=cfg_plt)
+    ptr.run()
+# ----------------------------------
+def _add_suffix(cfg : dict, sample_name : str, kind : str) -> dict:
+    d_var = cfg['plots']
+    for var in d_var:
+        d_var[var]['name']  = f'{var}_{kind}'
+        d_var[var]['title'] = f'{sample_name}; {kind}'
+    cfg['plots'] = d_var
+    return cfg
+# ----------------------------------
+def _check_kind(rdf : RDataFrame, sample : str, name : str, cut : str) -> RDataFrame:
+    nini = rdf.Count().GetValue()
+    rdf  = rdf.Filter(cut, name)
+    nfnl = rdf.Count().GetValue()
+    eff  = nfnl / nini * 100
+    log.info(f'{sample:<20}{name:<20}{nini:<15}{"":<15}{nfnl:<15}{"-->":15}{eff:10.2f}')
+# ----------------------------------
+def main():
+    '''
+    Script starts here
+    '''
+    _set_logs()
+    args = _get_args()
+    cfg  = _get_config(args)
+    plt.style.use(mplhep.style.LHCb2)
+    _check(cfg)
+# ----------------------------------
+if __name__ == '__main__':
+    main()