data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/METADATA +641 -44
  2. data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD +93 -0
  3. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/WHEEL +1 -1
  4. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/entry_points.txt +1 -0
  5. dmu/__init__.py +0 -0
  6. dmu/generic/hashing.py +34 -8
  7. dmu/generic/utilities.py +164 -11
  8. dmu/logging/log_store.py +34 -2
  9. dmu/logging/messages.py +96 -0
  10. dmu/ml/cv_classifier.py +3 -3
  11. dmu/ml/cv_diagnostics.py +3 -0
  12. dmu/ml/cv_performance.py +58 -0
  13. dmu/ml/cv_predict.py +149 -46
  14. dmu/ml/train_mva.py +482 -100
  15. dmu/ml/utilities.py +29 -10
  16. dmu/pdataframe/utilities.py +28 -3
  17. dmu/plotting/fwhm.py +2 -2
  18. dmu/plotting/matrix.py +1 -1
  19. dmu/plotting/plotter.py +23 -3
  20. dmu/plotting/plotter_1d.py +96 -32
  21. dmu/plotting/plotter_2d.py +5 -0
  22. dmu/rdataframe/utilities.py +54 -3
  23. dmu/rfile/ddfgetter.py +102 -0
  24. dmu/stats/fit_stats.py +129 -0
  25. dmu/stats/fitter.py +55 -22
  26. dmu/stats/gof_calculator.py +7 -0
  27. dmu/stats/model_factory.py +153 -62
  28. dmu/stats/parameters.py +100 -0
  29. dmu/stats/utilities.py +443 -12
  30. dmu/stats/wdata.py +187 -0
  31. dmu/stats/zfit.py +17 -0
  32. dmu/stats/zfit_plotter.py +147 -36
  33. dmu/testing/utilities.py +102 -24
  34. dmu/workflow/__init__.py +0 -0
  35. dmu/workflow/cache.py +266 -0
  36. dmu_data/ml/tests/train_mva.yaml +9 -7
  37. dmu_data/ml/tests/train_mva_def.yaml +75 -0
  38. dmu_data/ml/tests/train_mva_with_diagnostics.yaml +10 -5
  39. dmu_data/ml/tests/train_mva_with_preffix.yaml +58 -0
  40. dmu_data/plotting/tests/2d.yaml +5 -5
  41. dmu_data/plotting/tests/line.yaml +15 -0
  42. dmu_data/plotting/tests/styling.yaml +8 -1
  43. dmu_data/rfile/friends.yaml +13 -0
  44. dmu_data/stats/fitter/test_simple.yaml +28 -0
  45. dmu_data/stats/kde_optimizer/control.json +1 -0
  46. dmu_data/stats/kde_optimizer/signal.json +1 -0
  47. dmu_data/stats/parameters/data.yaml +178 -0
  48. dmu_data/tests/config.json +6 -0
  49. dmu_data/tests/config.yaml +4 -0
  50. dmu_data/tests/pdf_to_tex.txt +34 -0
  51. dmu_scripts/kerberos/check_expiration +21 -0
  52. dmu_scripts/kerberos/convert_certificate +22 -0
  53. dmu_scripts/ml/compare_classifiers.py +85 -0
  54. data_manipulation_utilities-0.2.7.dist-info/RECORD +0 -69
  55. {data_manipulation_utilities-0.2.7.data → data_manipulation_utilities-0.2.8.dev714.data}/scripts/publish +0 -0
  56. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/top_level.txt +0 -0
dmu/workflow/cache.py ADDED
@@ -0,0 +1,266 @@
1
+ '''
2
+ This module contains the Cache class, used to cache the outputs of workflows by hashing their inputs
3
+ '''
4
+ import os
5
+ import sys
6
+ import shutil
7
+ from types import NoneType
8
+ from pathlib import Path
9
+ from contextlib import contextmanager
10
+
11
+ from dmu.generic import hashing
12
+ from dmu.logging.log_store import LogStore
13
+
14
+ log=LogStore.add_logger('dmu:workflow:cache')
15
+ # ---------------------------
16
+ class Cache:
17
+ '''
18
+ Class meant to wrap other classes in order to
19
+
20
+ - Keep track of the inputs through hashes
21
+ - Load cached data, if found, and prevent calculations
22
+
23
+ The following directories will be important:
24
+
25
+ out_dir : Directory where the outputs will go, specified by the user
26
+ cache_dir: Subdirectory of out_dir, ${out_dir}/.cache
27
+ hash_dir : Subdirectory of out_dir, ${out_dir}/.cache/{hash}
28
+ Where {hash} is a 10 alphanumeric representing the has of the inputs
29
+
30
+ # On skipping caching
31
+
32
+ This is controlled by `_l_skip_class` which is a list of class names:
33
+
34
+ - These classes will have the caching turned off
35
+ - If the list is empty, caching runs for everything
36
+ - If the list is None, caching is turned off for everything
37
+ '''
38
+ _cache_root : str|None = None
39
+ _l_skip_class : list[str]|None = []
40
+ # ---------------------------
41
+ def __init__(self, out_path : str, **kwargs):
42
+ '''
43
+ Parameters
44
+ ---------------
45
+ out_path: Path to directory where outputs will go
46
+ kwargs : Key word arguments symbolizing identity of inputs, used for hashing
47
+ '''
48
+ if Cache._cache_root is None:
49
+ raise ValueError('Caching directory not set')
50
+
51
+ log.debug(f'Using {Cache._cache_root} root directory for caching')
52
+ if 'code' in kwargs:
53
+ raise ValueError('Cannot append hashing data with key "code", already used')
54
+
55
+ kwargs['code'] = self._get_code_hash()
56
+
57
+ self._out_path = os.path.normpath(f'{Cache._cache_root}/{out_path}')
58
+ log.debug(f'Using {self._out_path} output path')
59
+ os.makedirs(self._out_path, exist_ok=True)
60
+
61
+ self._dat_hash = kwargs
62
+
63
+ self._cache_dir = self._get_dir(kind='cache')
64
+ self._hash_dir : str
65
+ # ---------------------------
66
+ @classmethod
67
+ def set_cache_root(cls, root : str) -> None:
68
+ '''
69
+ Sets the path to the directory WRT which the _out_path_
70
+ will be placed
71
+ '''
72
+ if cls._cache_root is not None:
73
+ raise ValueError(f'Trying to set {root}, but already found {cls._cache_root}')
74
+
75
+ os.makedirs(root, exist_ok=True)
76
+
77
+ cls._cache_root = root
78
+ # ---------------------------
79
+ def _get_code_hash(self) -> str:
80
+ '''
81
+ If `MyTool` inherits from `Cache`. `mytool.py` git commit hash
82
+ should be returned
83
+ '''
84
+ cls = self.__class__
85
+ mod = sys.modules.get(cls.__module__)
86
+ if mod is None:
87
+ raise ValueError(f'Module not found: {cls.__module__}')
88
+
89
+ if mod.__file__ is None:
90
+ raise ValueError(f'Cannot extract file path for module: {cls.__module__}')
91
+
92
+ fname = mod.__file__
93
+ fpath = os.path.abspath(fname)
94
+ val = hashing.hash_file(path=fpath)
95
+
96
+ log.debug(f'Using hash for: {fpath} = {val}')
97
+
98
+ return val
99
+ # ---------------------------
100
+ def _get_dir(
101
+ self,
102
+ kind : str,
103
+ make : bool = True) -> str:
104
+ '''
105
+ Parameters
106
+ --------------
107
+ kind : Kind of directory, cash, hash
108
+ make : If True (default) will try to make directory
109
+ '''
110
+ if kind == 'cache':
111
+ dir_path = f'{self._out_path}/.cache'
112
+ elif kind == 'hash':
113
+ cache_dir = self._get_dir(kind='cache')
114
+ hsh = hashing.hash_object(self._dat_hash)
115
+ dir_path = f'{cache_dir}/{hsh}'
116
+ else:
117
+ raise ValueError(f'Invalid directory kind: {kind}')
118
+
119
+ if make:
120
+ os.makedirs(dir_path, exist_ok=True)
121
+
122
+ return dir_path
123
+ # ---------------------------
124
+ def _cache(self) -> None:
125
+ '''
126
+ Meant to be called after all the calculations finish
127
+ It will copy all the outputs of the processing
128
+ to a hashed directory
129
+ '''
130
+ self._hash_dir = self._get_dir(kind= 'hash')
131
+ log.info(f'Caching outputs to: {self._hash_dir}')
132
+
133
+ for source in Path(self._out_path).glob('*'):
134
+ if str(source) == self._cache_dir:
135
+ continue
136
+
137
+ log.debug(str(source))
138
+ log.debug('-->')
139
+ log.debug(self._hash_dir)
140
+ log.debug('')
141
+
142
+ if source.is_dir():
143
+ shutil.copytree(source, self._hash_dir+'/'+source.name, dirs_exist_ok=True)
144
+ else:
145
+ shutil.copy2(source, self._hash_dir)
146
+
147
+ self._delete_from_output(only_links=False)
148
+ self._copy_from_hashdir()
149
+ # ---------------------------
150
+ def _delete_from_output(self, only_links : bool) -> None:
151
+ '''
152
+ Delete all objects from _out_path directory, except for `.cache`
153
+
154
+ only_links: If true will only delete links
155
+ '''
156
+ for path in Path(self._out_path).iterdir():
157
+ if str(path) == self._cache_dir:
158
+ log.debug(f'Skipping cache dir: {self._cache_dir}')
159
+ continue
160
+
161
+ # These will always be symbolic links
162
+ if only_links and not path.is_symlink():
163
+ log.warning(f'Found a non-symlink not deleting: {path}')
164
+ continue
165
+
166
+ log.debug(f'Deleting {path}')
167
+ if path.is_dir() and not path.is_symlink():
168
+ shutil.rmtree(path)
169
+ else:
170
+ path.unlink()
171
+ # ---------------------------
172
+ def _copy_from_hashdir(self) -> None:
173
+ '''
174
+ Copies all the objects from _hash_dir to _out_path
175
+ '''
176
+ for source in Path(self._hash_dir).iterdir():
177
+ target = f'{self._out_path}/{source.name}'
178
+ log.debug(f'{str(source):<50}{"-->"}{target}')
179
+
180
+ os.symlink(source, target)
181
+ # ---------------------------
182
+ def _dont_cache(self) -> bool:
183
+ '''
184
+ Returns
185
+ ---------------
186
+ Flag that if:
187
+
188
+ True : Will stop the derived class from using caching (i.e. caching is off)
189
+ False: Cache
190
+ '''
191
+ if Cache._l_skip_class is None:
192
+ log.info('No class will be cached')
193
+ return True
194
+
195
+ if len(Cache._l_skip_class) == 0:
196
+ log.debug('All classes will be cached')
197
+ return False
198
+
199
+ class_name = self.__class__.__name__
200
+
201
+ skip = class_name in Cache._l_skip_class
202
+
203
+ if skip:
204
+ log.warning(f'Caching turned off for {class_name}')
205
+ else:
206
+ log.debug(f'Caching turned on for {class_name}')
207
+
208
+ return skip
209
+ # ---------------------------
210
+ def _copy_from_cache(self) -> bool:
211
+ '''
212
+ Checks if hash directory exists:
213
+
214
+ No : Returns False
215
+ Yes:
216
+ - Removes contents of `out_path`, except for .cache
217
+ - Copies the contents of `hash_dir` to `out_dir`
218
+
219
+ Returns
220
+ ---------------
221
+ True if the object, cached was found, false otherwise.
222
+ '''
223
+ if self._dont_cache():
224
+ # If not copying from cache, will need to remove what is
225
+ # in the output directory, so that it gets replaced with
226
+ # new outputs
227
+ self._delete_from_output(only_links=False)
228
+ log.info('Not picking already cached outputs, remaking them')
229
+ return False
230
+
231
+ hash_dir = self._get_dir(kind='hash', make=False)
232
+ if not os.path.isdir(hash_dir):
233
+ log.debug(f'Hash directory {hash_dir} not found, not caching')
234
+ self._delete_from_output(only_links=False)
235
+ return False
236
+
237
+ self._hash_dir = hash_dir
238
+ log.debug(f'Data found in hash directory: {self._hash_dir}')
239
+
240
+ self._delete_from_output(only_links=False)
241
+ self._copy_from_hashdir()
242
+
243
+ return True
244
+ # ---------------------------
245
+ @contextmanager
246
+ @staticmethod
247
+ def turn_off_cache(val : list[str]|None):
248
+ '''
249
+ Parameters
250
+ ------------------
251
+ val: List of names of classes that inherit from `Cache`.
252
+ If None, will not cache for any class.
253
+ By default this is an empty list and it will cache for every class
254
+ '''
255
+ if not isinstance(val, (NoneType, list)):
256
+ log.error('This manager expects: list[str]|None')
257
+ raise ValueError(f'Invalid value: {val}')
258
+
259
+ old_val = Cache._l_skip_class
260
+
261
+ Cache._l_skip_class = val
262
+ try:
263
+ yield
264
+ finally:
265
+ Cache._l_skip_class = old_val
266
+ # ---------------------------
@@ -6,16 +6,21 @@ dataset:
6
6
  y : -3
7
7
  training :
8
8
  nfold : 3
9
- features : [x, y, r]
9
+ features :
10
+ - x
11
+ - y
12
+ - r
10
13
  rdm_stat : 1
11
14
  hyper :
12
15
  loss : log_loss
16
+ max_features : sqrt
13
17
  n_estimators : 100
14
- max_depth : 3
15
- learning_rate : 0.1
18
+ max_depth : 5
16
19
  min_samples_split : 2
20
+ subsample : 0.8
21
+ learning_rate : 0.1
17
22
  saving:
18
- path : '/tmp/tests/dmu/ml/train_mva/model.pkl'
23
+ output : /tmp/tests/dmu/ml/train_mva
19
24
  plotting:
20
25
  roc :
21
26
  min : [0.0, 0.0]
@@ -31,10 +36,7 @@ plotting:
31
36
  title : 'Correlation matrix'
32
37
  size : [10, 10]
33
38
  mask_value : 0
34
- val_dir : '/tmp/tests/dmu/ml/train_mva'
35
39
  features:
36
- saving:
37
- plt_dir : '/tmp/tests/dmu/ml/train_mva/features'
38
40
  plots:
39
41
  r :
40
42
  binning : [-6, 6, 100]
@@ -0,0 +1,75 @@
1
+ # This config file is used for testing training and evaluation
2
+ # when there is a variable that is defined in different ways
3
+ # for the `sig` and `bkg` samples
4
+
5
+ dataset:
6
+ samples:
7
+ sig :
8
+ definitions:
9
+ n : x + y
10
+ bkg :
11
+ definitions:
12
+ n : x - y
13
+ define :
14
+ r : z + x
15
+ nan :
16
+ n : -3
17
+ y : -3
18
+ training :
19
+ nfold : 3
20
+ features :
21
+ - n
22
+ - y
23
+ - r
24
+ rdm_stat : 1
25
+ hyper :
26
+ loss : log_loss
27
+ max_features : sqrt
28
+ n_estimators : 100
29
+ max_depth : 5
30
+ min_samples_split : 2
31
+ subsample : 0.8
32
+ learning_rate : 0.1
33
+ saving:
34
+ output : /tmp/tests/dmu/ml/train_mva
35
+ plotting:
36
+ roc :
37
+ min : [0.0, 0.0]
38
+ max : [1.2, 1.2]
39
+ annotate:
40
+ sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9]
41
+ form : '{:.2f}'
42
+ color: 'green'
43
+ xoff : -15
44
+ yoff : -15
45
+ size : 10
46
+ correlation:
47
+ title : 'Correlation matrix'
48
+ size : [10, 10]
49
+ mask_value : 0
50
+ features:
51
+ plots:
52
+ r :
53
+ binning : [-6, 6, 100]
54
+ yscale : 'linear'
55
+ labels : ['$r$', '']
56
+ n :
57
+ binning : [-4, 4, 100]
58
+ yscale : 'linear'
59
+ labels : ['$n$', '']
60
+ w :
61
+ binning : [-4, 4, 100]
62
+ yscale : 'linear'
63
+ labels : ['$w$', '']
64
+ x :
65
+ binning : [-4, 4, 100]
66
+ yscale : 'linear'
67
+ labels : ['$x$', '']
68
+ y :
69
+ binning : [-4, 4, 100]
70
+ yscale : 'linear'
71
+ labels : ['$y$', '']
72
+ z :
73
+ binning : [-4, 4, 100]
74
+ yscale : 'linear'
75
+ labels : ['$z$', '']
@@ -15,7 +15,7 @@ training :
15
15
  learning_rate : 0.1
16
16
  min_samples_split : 2
17
17
  saving:
18
- path : '/tmp/tests/dmu/ml/train_mva/model.pkl'
18
+ output : /tmp/tests/dmu/ml/train_mva
19
19
  plotting:
20
20
  roc :
21
21
  min : [0.0, 0.0]
@@ -60,15 +60,20 @@ diagnostics:
60
60
  output : /tmp/tests/dmu/ml/train_mva/diagnostics
61
61
  correlations:
62
62
  target :
63
- name : z
64
- overlay :
63
+ name : z
64
+ overlay :
65
+ wp :
66
+ - 0.2
67
+ - 0.6
68
+ - 0.8
69
+ - 0.9
65
70
  general:
66
71
  size : [20, 10]
67
72
  saving:
68
73
  plt_dir : /tmp/tests/dmu/ml/train_mva/diagnostics
69
74
  plots:
70
75
  z :
71
- binning : [-4, +4, 30]
76
+ binning : [-4, +4, 30]
72
77
  yscale : 'linear'
73
78
  labels : ['z', 'Entries']
74
79
  normalized : true
@@ -78,5 +83,5 @@ diagnostics:
78
83
  - Pearson
79
84
  - Kendall-$\tau$
80
85
  figure:
81
- title: Training diagnostics
86
+ title: Training diagnostics
82
87
  size : [10, 8]
@@ -0,0 +1,58 @@
1
+ dataset:
2
+ define :
3
+ r : z + preffix.x.suffix
4
+ nan :
5
+ preffix.x.suffix : -3
6
+ y : -3
7
+ training :
8
+ nfold : 2
9
+ features :
10
+ - preffix.x.suffix
11
+ - y
12
+ - r
13
+ rdm_stat : 1
14
+ hyper :
15
+ loss : log_loss
16
+ n_estimators : 100
17
+ max_depth : 3
18
+ learning_rate : 0.1
19
+ min_samples_split : 2
20
+ saving:
21
+ output : /tmp/tests/dmu/ml/train_mva
22
+ plotting:
23
+ roc :
24
+ min : [0.0, 0.0]
25
+ max : [1.2, 1.2]
26
+ annotate:
27
+ sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9]
28
+ form : '{:.2f}'
29
+ color: 'green'
30
+ xoff : -15
31
+ yoff : -15
32
+ size : 10
33
+ correlation:
34
+ title : 'Correlation matrix'
35
+ size : [10, 10]
36
+ mask_value : 0
37
+ features:
38
+ plots:
39
+ r :
40
+ binning : [-6, 6, 100]
41
+ yscale : 'linear'
42
+ labels : ['$r$', '']
43
+ w :
44
+ binning : [-4, 4, 100]
45
+ yscale : 'linear'
46
+ labels : ['$w$', '']
47
+ preffix.x.suffix :
48
+ binning : [-4, 4, 100]
49
+ yscale : 'linear'
50
+ labels : ['$x$', '']
51
+ y :
52
+ binning : [-4, 4, 100]
53
+ yscale : 'linear'
54
+ labels : ['$y$', '']
55
+ z :
56
+ binning : [-4, 4, 100]
57
+ yscale : 'linear'
58
+ labels : ['$z$', '']
@@ -1,5 +1,5 @@
1
1
  saving:
2
- plt_dir : /tmp/dmu/tests/plotting/2d_weighted
2
+ plt_dir : /tmp/tests/dmu/plotting
3
3
  selection:
4
4
  cuts:
5
5
  xlow : x > -1.5
@@ -8,10 +8,10 @@ definitions:
8
8
  general:
9
9
  size : [20, 10]
10
10
  plots_2d:
11
- - [x, y, weights, 'xy_wgt', false]
12
- - [x, y, null, 'xy_raw', false]
13
- - [x, z, null, 'xz_raw', false]
14
- - [x, z, null, 'xz_log', true]
11
+ - [x, y, weights, 'xy_wgt_lin', false]
12
+ - [x, z, weights, 'xz_wgt_log', true]
13
+ - [x, y, null, 'xy_raw_lin', false]
14
+ - [x, z, null, 'xz_raw_log', true]
15
15
  axes:
16
16
  x :
17
17
  binning : [-3.0, 3.0, 40]
@@ -0,0 +1,15 @@
1
+ saving:
2
+ plt_dir : tests/plotting/line
3
+ plots:
4
+ x :
5
+ binning : [-5.0, 8.0, 40]
6
+ title : x distribution
7
+ vline :
8
+ x : 0
9
+ label : label
10
+ ls : --
11
+ c : blue
12
+ lw : 1
13
+ y :
14
+ binning : [-5.0, 8.0, 40]
15
+ title : y distribution
@@ -5,7 +5,14 @@ plots:
5
5
  binning : [-5.0, 8.0, 40]
6
6
  title : x distribution
7
7
  styling :
8
- histtype : step
8
+ class A:
9
+ histtype : fill
10
+ color : gray
11
+ alpha : 0.3
12
+ class B:
13
+ color : red
14
+ histtype : step
15
+ linestyle: '-'
9
16
  y :
10
17
  binning : [-5.0, 8.0, 40]
11
18
  title : y distribution
@@ -0,0 +1,13 @@
1
+ tree : tree_name
2
+ primary_keys:
3
+ - index
4
+ files :
5
+ - file_001.root
6
+ - file_002.root
7
+ - file_003.root
8
+ samples:
9
+ - /tmp/tests/dmu/rfile/main
10
+ - /tmp/tests/dmu/rfile/frn1
11
+ - /tmp/tests/dmu/rfile/frn2
12
+ - /tmp/tests/dmu/rfile/frn3
13
+ - /tmp/tests/dmu/rfile/frn4
@@ -0,0 +1,28 @@
1
+ # The strategies below are exclusive, only one should be used at a time
2
+ strategy :
3
+ # This strategy will fit multiple times and retry the fit until either
4
+ # ntries is exhausted or the pvalue is reached.
5
+ retry :
6
+ ntries : 4 #Number of tries
7
+ pvalue_thresh : 0.05 #Pvalue threshold, if the fit is better than this, the loop ends
8
+ ignore_status : true #Will pick invalid fits if this is true, otherwise only valid fits will be counted
9
+ # This will fit smaller datasets and get the value of the shape parameters to allow
10
+ # these shapes to float only around this value and within nsigma
11
+ # Fit can be carried out multiple times with larger and larger samples to tighten parameters
12
+ steps :
13
+ nsteps : [1e3, 1e4] #Number of entries to use
14
+ nsigma : [5.0, 2.0] #Number of sigmas for the range of the parameter, for each step
15
+ # The lines below will split the range of the data [0-10] into two subranges, such that the NLL is built
16
+ # only in those ranges. The ranges need to be tuples
17
+ ranges :
18
+ - !!python/tuple [0, 3]
19
+ - !!python/tuple [6, 9]
20
+ #The lines below will allow using constraints for each parameter, where the first element is the mean and the second
21
+ #the width of a Gaussian constraint. No correlations are implemented, yet.
22
+ constraints :
23
+ mu : [5.0, 1.0]
24
+ sg : [1.0, 0.1]
25
+ #After each fit, the parameters specified below will be printed, for debugging purposes
26
+ print_pars : ['mu', 'sg']
27
+ likelihood :
28
+ binned : false