PyPI - data-manipulation-utilities - Versions diffs - 0.0.1__py3-none-any.whl - Mend

data-manipulation-utilities 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

data_manipulation_utilities-0.0.1.dist-info/METADATA +713 -0
data_manipulation_utilities-0.0.1.dist-info/RECORD +45 -0
data_manipulation_utilities-0.0.1.dist-info/WHEEL +5 -0
data_manipulation_utilities-0.0.1.dist-info/entry_points.txt +6 -0
data_manipulation_utilities-0.0.1.dist-info/top_level.txt +3 -0
dmu/arrays/utilities.py +55 -0
dmu/dataframe/dataframe.py +36 -0
dmu/generic/utilities.py +69 -0
dmu/logging/log_store.py +129 -0
dmu/ml/cv_classifier.py +122 -0
dmu/ml/cv_predict.py +152 -0
dmu/ml/train_mva.py +257 -0
dmu/ml/utilities.py +132 -0
dmu/plotting/plotter.py +227 -0
dmu/plotting/plotter_1d.py +113 -0
dmu/plotting/plotter_2d.py +87 -0
dmu/rdataframe/atr_mgr.py +79 -0
dmu/rdataframe/utilities.py +72 -0
dmu/rfile/rfprinter.py +91 -0
dmu/rfile/utilities.py +34 -0
dmu/stats/fitter.py +515 -0
dmu/stats/function.py +314 -0
dmu/stats/utilities.py +134 -0
dmu/testing/utilities.py +119 -0
dmu/text/transformer.py +182 -0
dmu_data/__init__.py +0 -0
dmu_data/ml/tests/train_mva.yaml +37 -0
dmu_data/plotting/tests/2d.yaml +14 -0
dmu_data/plotting/tests/fig_size.yaml +13 -0
dmu_data/plotting/tests/high_stat.yaml +22 -0
dmu_data/plotting/tests/name.yaml +14 -0
dmu_data/plotting/tests/no_bounds.yaml +12 -0
dmu_data/plotting/tests/simple.yaml +8 -0
dmu_data/plotting/tests/title.yaml +14 -0
dmu_data/plotting/tests/weights.yaml +13 -0
dmu_data/text/transform.toml +4 -0
dmu_data/text/transform.txt +6 -0
dmu_data/text/transform_set.toml +8 -0
dmu_data/text/transform_set.txt +6 -0
dmu_data/text/transform_trf.txt +12 -0
dmu_scripts/physics/check_truth.py +121 -0
dmu_scripts/rfile/compare_root_files.py +299 -0
dmu_scripts/rfile/print_trees.py +35 -0
dmu_scripts/ssh/coned.py +168 -0
dmu_scripts/text/transform_text.py +46 -0

dmu/stats/function.py ADDED Viewed

@@ -0,0 +1,314 @@
+'''
+Module containing the Function class
+'''
+import os
+import json
+from typing import Any
+import numpy
+import matplotlib.pyplot as plt
+from scipy.interpolate     import interp1d
+from dmu.logging.log_store import LogStore
+log = LogStore.add_logger('dmu:stats:function')
+#---------------------------------------------------------
+class FunOutOfBounds(Exception):
+    '''
+    Will be raised when function defined between [a, b] is evaluated outside
+    '''
+#---------------------------------------------------------
+class Function:
+    '''
+    Class meant to represent a 1D function created from (x, y) coordinates
+    '''
+    #------------------------------------------------
+    def __init__(self, x : list | numpy.ndarray, y : list | numpy.ndarray, kind : str = 'cubic'):
+        '''
+        x (list) : List with x coordinates
+        y (list) : List with y coordinates
+        '''
+        x = self._array_to_list(x)
+        y = self._array_to_list(y)
+        if len(x) != len(y):
+            raise ValueError('X and Y coordinates have different lengths')
+        npoint = len(x)
+        if npoint < 4:
+            raise ValueError('Need at least four points, found {npoint}')
+        x, y = self._remove_duplicates(x=x, y=y)
+        self._max_entries = 400
+        self._l_x = x
+        self._l_y = y
+        self._kind= kind
+        self._tag = 'no_tag'
+        self._interpolator = interp1d(self._l_x, self._l_y, kind=self._kind)
+        self._update_data()
+    #------------------------------------------------
+    def __eq__(self, othr):
+        if not isinstance(othr, Function):
+            log.warning('Comparison not done with instance of Function')
+            return False
+        d_self = self.__dict__
+        d_othr = othr.__dict__
+        if '_interpolator' in d_self:
+            del d_self['_interpolator']
+        if '_interpolator' in d_othr:
+            del d_othr['_interpolator']
+        return d_self == d_othr
+    #------------------------------------------------
+    def __str__(self):
+        npoints = len(self._l_x)
+        max_x   = max(self._l_x)
+        min_x   = min(self._l_x)
+        max_y   = max(self._l_y)
+        min_y   = min(self._l_y)
+        line = f'\n{"Points":<20}{npoints:<20}\n'
+        line+= '-------------------------\n'
+        line+= f'{"x-max":<20}{max_x:<20}\n'
+        line+= f'{"x-min":<20}{min_x:<20}\n'
+        line+= f'{"y-max":<20}{max_y:<20}\n'
+        line+= f'{"y-min":<20}{min_y:<20}'
+        return line
+    #------------------------------------------------
+    def __call__(self, xval : float | numpy.ndarray | list, off_bounds_raise : bool = False) -> numpy.ndarray:
+        '''
+        Class taking value of x coordinates as a float, numpy array or list
+        It will interpolate y value and return value
+        '''
+        if not off_bounds_raise:
+            xval = self._push_in_bounds(xval)
+        self._check_xval_validity(xval)
+        return self._interpolator(xval)
+    #------------------------------------------------
+    def _push_in_bounds(self, xval : float | numpy.ndarray | list) -> numpy.ndarray:
+        '''
+        If the xval container, has elements above (below) the upper (lower) bound, these events will be set to the closest bound
+        '''
+        xval = numpy.array(xval).flatten().astype(float)
+        max_x = max(self._l_x)
+        min_x = min(self._l_x)
+        if ((min_x <= xval) & (xval <= max_x)).all():
+            log.debug('Input array within bounds, will not push elements')
+            return xval
+        xmod = numpy.clip(xval, min_x, max_x)
+        arr_diff = xval != xmod
+        arr_indx = numpy.where(arr_diff)[0]
+        ndiff    = numpy.sum(arr_diff)
+        arr_indx = arr_indx[:20]
+        log.warning(f'Sending {ndiff} entries inside bounds [{min_x:.3e}, {max_x:.3e}]')
+        for indx in arr_indx:
+            org = xval[indx]
+            mod = xmod[indx]
+            log.info(f'{org:<20.5e}{"-->":<20}{mod:<20.5}')
+        return xmod
+    #------------------------------------------------
+    @staticmethod
+    def json_decoder(d_attr):
+        '''
+        Takes dictionary of attributes from JSON serialization
+        Returns instance of Function
+        '''
+        if '_l_x' not in d_attr:
+            raise KeyError('X values not found')
+        if '_l_y' not in d_attr:
+            raise KeyError('Y values not found')
+        if '_tag' not in d_attr:
+            raise KeyError('tag not found')
+        x    = d_attr['_l_x' ]
+        y    = d_attr['_l_y' ]
+        kind = d_attr['_kind']
+        tag  = d_attr['_tag' ]
+        fun  = Function(x=x, y=y, kind=kind)
+        fun.tag = tag
+        return fun
+    #------------------------------------------------
+    @property
+    def tag(self):
+        '''
+        Returns string simbolyzing tag of function
+        '''
+        return self._tag
+    @tag.setter
+    def tag(self, value : str):
+        '''
+        This sets the _tag property of the function
+        '''
+        self._tag = value
+    #------------------------------------------------
+    @staticmethod
+    def load(path : str):
+        '''
+        Will take path to JSON file with serialized function
+        Will return function instance
+        '''
+        if not os.path.isfile(path):
+            raise FileNotFoundError(f'Cannot find: {path}')
+        with open(path, encoding='utf-8') as ifile:
+            fun = json.loads(ifile.read(), object_hook=Function.json_decoder)
+        log.info(f'Loaded from: {path}')
+        return fun
+    #------------------------------------------------
+    def _array_to_list(self, x : Any):
+        '''
+        Transform from ndarray to list
+        Return x if already list
+        Raise otherwise
+        '''
+        if isinstance(x, list):
+            log.debug('Already found list')
+            return x
+        if isinstance(x, numpy.ndarray):
+            log.debug('Transforming argument to list')
+            return x.tolist()
+        raise ValueError('Object introduced is neither a list nor a numpy array')
+    #------------------------------------------------
+    def _update_data(self):
+        '''
+        If number of entries in dataset is larger than _max_entries:
+        Use interpolator to scan function and get new (x, y) pairs.
+        '''
+        norg = len(self._l_x)
+        if norg <= self._max_entries:
+            return
+        log.info(f'Trimming dataset: {norg} -> {self._max_entries}')
+        min_x = min(self._l_x)
+        max_x = max(self._l_x)
+        arr_x = numpy.linspace(min_x, max_x, self._max_entries)
+        arr_y = self(arr_x)
+        self._l_x = arr_x.tolist()
+        self._l_y = arr_y.tolist()
+    #------------------------------------------------
+    def _remove_duplicates(self, x : list, y : list):
+        '''
+        Takes two lists with the same sizes and remove (x, y) points with repeated
+        x coordinates.
+        Return tuple with x and y after removal
+        '''
+        norg  = len(x)
+        d_tmp = dict(zip(x, y))
+        x = list(d_tmp.keys())
+        y = list(d_tmp.values())
+        nfnl  = len(x)
+        if norg != nfnl:
+            log.warning(f'Found duplicates: {norg} -> {nfnl}')
+        return x, y
+    #------------------------------------------------
+    def _check_xval_validity(self, xval : float | numpy.ndarray | list):
+        '''
+        Will check that xval is an acceptable value for the function to be evaluated at
+        '''
+        if isinstance(xval, list):
+            xval = numpy.array(xval)
+        if not isinstance(xval, (float, numpy.ndarray)):
+            raise ValueError(f'x value is not a float or numpy array: {xval}')
+        check_within_bounds_vect = numpy.vectorize(self._check_within_bounds)
+        check_within_bounds_vect(xval)
+    #------------------------------------------------
+    def _check_within_bounds(self, xval : float):
+        '''
+        Check that xval is within bounds of function
+        '''
+        if xval < min(self._l_x) or xval > max(self._l_x):
+            print(self)
+            raise FunOutOfBounds(f'x value outside bounds: {xval}')
+    #------------------------------------------------
+    def _json_encoder(self, obj):
+        '''
+        Takes Function object
+        Returns dictionary of attributes for encoding
+        '''
+        d_data = obj.__dict__
+        if '_interpolator' in d_data:
+            del d_data['_interpolator']
+        return d_data
+    #------------------------------------------------
+    def _save_plot(self, path : str):
+        '''
+        Takes path to PNG, saves scatter plot of l_y vs l_x
+        '''
+        plt.plot(self._l_x, self._l_y)
+        plt.savefig(path)
+        plt.close()
+        log.info(f'Saved to: {path}')
+    #------------------------------------------------
+    def save(self, path : str, plot : bool = False):
+        '''
+        Saves current object to JSON
+        path (str): Path to file, needs to end in .json
+        '''
+        if not path.endswith('.json'):
+            raise ValueError(f'Output path does not end in .json: {path}')
+        dir_name = os.path.dirname(path)
+        os.makedirs(dir_name, exist_ok=True)
+        with open(path, 'w', encoding='utf-8') as ofile:
+            json.dump(self, ofile, indent=4, default=self._json_encoder)
+        if plot:
+            path = path.replace('.json', '.png')
+            self._save_plot(path)
+        log.info(f'Saved to: {path}')
+#------------------------------------------------

dmu/stats/utilities.py ADDED Viewed

@@ -0,0 +1,134 @@
+'''
+Module with utility functions related to the dmu.stats project
+'''
+import os
+import re
+from typing import Union
+import zfit
+from dmu.logging.log_store import LogStore
+log = LogStore.add_logger('dmu:stats:utilities')
+#-------------------------------------------------------
+#Zfit/print_pdf
+#-------------------------------------------------------
+def _get_const(par : zfit.Parameter, d_const : Union[None, dict[str, list[float]]]) -> str:
+    '''
+    Takes zfit parameter and dictionary of constraints
+    Returns a formatted string with the value of the constraint on that parameter
+    '''
+    if d_const is None or par.name not in d_const:
+        return 'none'
+    obj = d_const[par.name]
+    if isinstance(obj, (list, tuple)):
+        [mu, sg] = obj
+        val      = f'{mu:.3e}; {sg:.3e}'
+    else:
+        val      = str(obj)
+    return val
+#-------------------------------------------------------
+def _blind_vars(s_par : set, l_blind : Union[list[str], None] = None) -> set[zfit.Parameter]:
+    '''
+    Takes set of zfit parameters and list of parameter names to blind
+    returns set of zfit parameters that should be blinded
+    '''
+    if l_blind is None:
+        return s_par
+    rgx_ors = '|'.join(l_blind)
+    regex   = f'({rgx_ors})'
+    s_par_blind = { par for par in s_par if not re.match(regex, par.name) }
+    return s_par_blind
+#-------------------------------------------------------
+def _get_pars(
+        pdf : zfit.pdf.BasePDF,
+        blind : Union[None, list[str]]) -> tuple[list, list]:
+    s_par_flt = pdf.get_params(floating= True)
+    s_par_fix = pdf.get_params(floating=False)
+    s_par_flt = _blind_vars(s_par_flt, l_blind=blind)
+    s_par_fix = _blind_vars(s_par_fix, l_blind=blind)
+    l_par_flt = list(s_par_flt)
+    l_par_fix = list(s_par_fix)
+    l_par_flt = sorted(l_par_flt, key=lambda par: par.name)
+    l_par_fix = sorted(l_par_fix, key=lambda par: par.name)
+    return l_par_flt, l_par_fix
+#-------------------------------------------------------
+def _get_messages(
+        pdf       : zfit.pdf.BasePDF,
+        l_par_flt : list,
+        l_par_fix : list,
+        d_const   : Union[None, dict[str,list[float]]] = None) -> list[str]:
+    str_space = str(pdf.space)
+    l_msg=[]
+    l_msg.append('-' * 20)
+    l_msg.append(f'PDF: {pdf.name}')
+    l_msg.append(f'OBS: {str_space}')
+    l_msg.append(f'{"Name":<50}{"Value":>15}{"Low":>15}{"High":>15}{"Floating":>5}{"Constraint":>25}')
+    l_msg.append('-' * 20)
+    for par in l_par_flt:
+        value = par.value().numpy()
+        low   = par.lower
+        hig   = par.upper
+        const = _get_const(par, d_const)
+        l_msg.append(f'{par.name:<50}{value:>15.3e}{low:>15.3e}{hig:>15.3e}{par.floating:>5}{const:>25}')
+    l_msg.append('')
+    for par in l_par_fix:
+        value = par.value().numpy()
+        low   = par.lower
+        hig   = par.upper
+        const = _get_const(par, d_const)
+        l_msg.append(f'{par.name:<50}{value:>15.3e}{low:>15.3e}{hig:>15.3e}{par.floating:>5}{const:>25}')
+    return l_msg
+#-------------------------------------------------------
+def print_pdf(
+        pdf      : zfit.pdf.BasePDF,
+        d_const  : Union[None, dict[str,list[float]]] = None,
+        txt_path : Union[str,None]                    = None,
+        level    : int                                = 20,
+        blind    : Union[None, list[str]]             = None):
+    '''
+    Function used to print zfit PDFs
+    Parameters
+    -------------------
+    pdf (zfit.PDF): PDF
+    d_const (dict): Optional dictionary mapping {par_name : [mu, sg]}
+    txt_path (str): Optionally, dump output to text in this path
+    level (str)   : Optionally set the level at which the printing happens in screen, default info
+    blind (list)  : List of regular expressions matching variable names to blind in printout
+    '''
+    l_par_flt, l_par_fix = _get_pars(pdf, blind)
+    l_msg                = _get_messages(pdf, l_par_flt, l_par_fix, d_const)
+    if txt_path is not None:
+        log.debug(f'Saving to: {txt_path}')
+        message  = '\n'.join(l_msg)
+        dir_path = os.path.dirname(txt_path)
+        os.makedirs(dir_path, exist_ok=True)
+        with open(txt_path, 'w', encoding='utf-8') as ofile:
+            ofile.write(message)
+        return
+    for msg in l_msg:
+        if   level == 20:
+            log.info(msg)
+        elif level == 30:
+            log.debug(msg)
+        else:
+            raise ValueError(f'Invalid level: {level}')
+#-------------------------------------------------------

dmu/testing/utilities.py ADDED Viewed

@@ -0,0 +1,119 @@
+'''
+Module containing utility functions needed by unit tests
+'''
+import os
+from typing              import Union
+from dataclasses         import dataclass
+from importlib.resources import files
+from ROOT import RDF, TFile, RDataFrame
+import pandas as pnd
+import numpy
+import yaml
+from dmu.logging.log_store import LogStore
+log = LogStore.add_logger('dmu:testing:utilities')
+# -------------------------------
+@dataclass
+class Data:
+    '''
+    Class storing shared data
+    '''
+    # Number of entries per generated column, read by get_rdf through Data.nentries.
+    # It has no type annotation, so it is a plain class attribute and not a dataclass field.
+    nentries = 3000
+def _double_data(d_data : dict) -> dict:
+    df_1   = pnd.DataFrame(d_data)
+    df_2   = pnd.DataFrame(d_data)
+    df     = pnd.concat([df_1, df_2], axis=0)
+    d_data = { name : df[name].to_numpy() for name in df.columns }
+    return d_data
+# -------------------------------
+def _add_nans(d_data : dict) -> dict:
+    df_good   = pnd.DataFrame(d_data)
+    df_bad    = pnd.DataFrame(d_data)
+    df_bad[:] = numpy.nan
+    df        = pnd.concat([df_good, df_bad])
+    d_data    = { name : df[name].to_numpy() for name in df.columns }
+    return d_data
+# -------------------------------
+def get_rdf(kind : Union[str,None] = None,
+            repeated : bool        = False,
+            add_nans : bool        = False):
+    '''
+    Return ROOT dataframe with toy data
+    '''
+    d_data = {}
+    if   kind == 'sig':
+        d_data['w'] = numpy.random.normal(0, 1, size=Data.nentries)
+        d_data['x'] = numpy.random.normal(0, 1, size=Data.nentries)
+        d_data['y'] = numpy.random.normal(0, 1, size=Data.nentries)
+        d_data['z'] = numpy.random.normal(0, 1, size=Data.nentries)
+    elif kind == 'bkg':
+        d_data['w'] = numpy.random.normal(1, 1, size=Data.nentries)
+        d_data['x'] = numpy.random.normal(1, 1, size=Data.nentries)
+        d_data['y'] = numpy.random.normal(1, 1, size=Data.nentries)
+        d_data['z'] = numpy.random.normal(1, 1, size=Data.nentries)
+    else:
+        log.error(f'Invalid kind: {kind}')
+        raise ValueError
+    if repeated:
+        d_data = _double_data(d_data)
+    if add_nans:
+        d_data = _add_nans(d_data)
+    rdf = RDF.FromNumpy(d_data)
+    return rdf
+# -------------------------------
+def get_config(name : Union[str,None] = None):
+    '''
+    Takes path to the YAML config file, after `dmu_data`
+    Returns dictionary with config
+    '''
+    if name is None:
+        raise ValueError('Name not pased')
+    cfg_path = files('dmu_data').joinpath(name)
+    cfg_path = str(cfg_path)
+    with open(cfg_path, encoding='utf-8') as ifile:
+        d_cfg = yaml.safe_load(ifile)
+    return d_cfg
+# -------------------------------
+def _get_rdf(nentries : int) -> RDataFrame:
+    rdf = RDataFrame(nentries)
+    rdf = rdf.Define('x', '0')
+    rdf = rdf.Define('y', '1')
+    rdf = rdf.Define('z', '2')
+    return rdf
+# -------------------------------
+def get_file_with_trees(path : str) -> TFile:
+    '''
+    Picks full path to toy ROOT file, in the form of /a/b/c/file.root
+    returns handle to it
+    '''
+    dir_name    = os.path.dirname(path)
+    os.makedirs(dir_name, exist_ok=True)
+    snap        = RDF.RSnapshotOptions()
+    snap.fMode  = 'recreate'
+    l_tree_name = ['odir/idir/a', 'dir/b', 'c']
+    l_nevt      = [    100, 200, 300]
+    l_rdf = [ _get_rdf(nevt) for nevt in l_nevt ]
+    for rdf, tree_name in zip(l_rdf, l_tree_name):
+        rdf.Snapshot(tree_name, path, ['x', 'y', 'z'], snap)
+        snap.fMode  = 'update'
+    return TFile(path)