PyPI - data-manipulation-utilities - Versions diffs - 0.1.9__tar.gz → 0.2.1__tar.gz - Mend

data-manipulation-utilities 0.1.9tar.gz → 0.2.1tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/PKG-INFO RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.2
 Name: data_manipulation_utilities
-Version: 0.1.9
+Version: 0.2.1
 Description-Content-Type: text/markdown
 Requires-Dist: logzero
 Requires-Dist: PyYAML
 Requires-Dist: scipy
-Requires-Dist: awkward
+Requires-Dist: awkward==2.4.6
 Requires-Dist: tqdm
 Requires-Dist: joblib
 Requires-Dist: scikit-learn
@@ -204,6 +204,33 @@ print_pdf(pdf,
 The `Fitter` class is a wrapper to zfit, used to make fitting easier.
+### Goodness of fits
+Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
+This is done by:
+- Binning the data and PDF.
+- Calculating the reduced $\chi^2$.
+- Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
+This class is used as shown below:
+```python
+from dmu.stats.gof_calculator import GofCalculator
+nll = _get_nll()
+res = Data.minimizer.minimize(nll)
+gcl = GofCalculator(nll, ndof=10)
+gof = gcl.get_gof(kind='pvalue')
+```
+where:
+- `ndof`: The number of degrees of freedom used in the reduced $\chi^2$ calculation.
+It is needed to know how many bins to use to make the histogram. The recommended value is 10.
+- `kind`: The kind of goodness-of-fit value to return; it can be `pvalue` or `chi2/ndof`.
 ### Simplest fit
 ```python
@@ -396,6 +423,14 @@ obj.run()
 where the settings for the training go in a config dictionary, which when written to YAML looks like:
 ```yaml
+dataset:
+    # If the key is found to be NaN, replace its value with the number provided
+    # This will be used in the training.
+    # Otherwise the entries with NaNs will be dropped
+    nan:
+        x : 0
+        y : 0
+        z : -999
 training :
     nfold    : 10
     features : [w, x, y, z]
@@ -406,8 +441,25 @@ training :
       learning_rate     : 0.1
       min_samples_split : 2
 saving:
+    # The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
     path : 'tests/ml/train_mva/model.pkl'
 plotting:
+    roc :
+        min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
+        max : [1.2, 1.2] # By default it goes from 0 to 1 in both axes
+        # The section below is optional and will annotate the ROC curve with
+        # values for the score at different signal efficiencies
+        annotate:
+          sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9] # Values of signal efficiency at which to show the scores
+          form    : '{:.2f}' # Use two decimals for scores
+          color   : 'green'  # Color for text and marker
+          xoff    : -15      # Offsets in X and Y
+          yoff    : -15
+          size    :  10      # Size of text
+    correlation: # Adds correlation matrix for training datasets
+      title      : 'Correlation matrix'
+      size       : [10, 10]
+      mask_value : 0                # Where correlation is zero, the bin will appear white
     val_dir : 'tests/ml/train_mva'
     features:
         saving:
@@ -475,6 +527,36 @@ When evaluating the model with real data, problems might occur, we deal with the
 - **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_  with zeros and evaluated. However, before returning, the probabilities will be
 saved as -1. I.e. entries with NaNs will have probabilities of -1.
+# Pandas dataframes
+## Utilities
+These are thin layers of code that take pandas dataframes and carry out specific tasks
+### Dataframe to latex
+One can save a dataframe to latex with:
+```python
+import pandas as pnd
+import dmu.pdataframe.utilities as put
+d_data = {}
+d_data['a'] = [1,2,3]
+d_data['b'] = [4,5,6]
+df = pnd.DataFrame(d_data)
+d_format = {
+        'a' : '{:.0f}',
+        'b' : '{:.3f}'}
+df = _get_df()
+put.df_to_tex(df,
+        './table.tex',
+        d_format = d_format,
+        caption  = 'some caption')
+```
 # Rdataframes
 These are utility functions meant to be used with ROOT dataframes.
@@ -626,6 +708,43 @@ axes:
         label   : 'y'
 ```
+# Other plots
+## Matrices
+This can be done with `MatrixPlotter`, whose usage is illustrated below:
+```python
+import numpy
+import matplotlib.pyplot as plt
+from dmu.plotting.matrix import MatrixPlotter
+cfg = {
+        'labels'     : ['x', 'y', 'z'], # Used to label the matrix axes
+        'title'      : 'Some title',    # Optional, title of plot
+        'label_angle': 45,              # Labels will be rotated by 45 degrees
+        'upper'      : True,            # Useful in case this is a symmetric matrix
+        'zrange'     : [0, 10],         # Controls the z axis range
+        'size'       : [7, 7],          # Plot size
+        'format'     : '{:.3f}',        # Optional, if used will add numerical values to the contents, otherwise a color bar is used
+        'fontsize'   : 12,              # Font size associated to `format`
+        'mask_value' : 0,               # These values will appear white in the plot
+        }
+mat = [
+        [1, 2, 3],
+        [2, 0, 4],
+        [3, 4, numpy.nan]
+        ]
+mat = numpy.array(mat)
+obj = MatrixPlotter(mat=mat, cfg=cfg)
+obj.plot()
+plt.show()
+```
 # Manipulating ROOT files
 ## Getting trees from file

{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/README.md RENAMED Viewed

@@ -184,6 +184,33 @@ print_pdf(pdf,
 The `Fitter` class is a wrapper to zfit, used to make fitting easier.
+### Goodness of fits
+Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
+This is done by:
+- Binning the data and PDF.
+- Calculating the reduced $\chi^2$.
+- Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
+This class is used as shown below:
+```python
+from dmu.stats.gof_calculator import GofCalculator
+nll = _get_nll()
+res = Data.minimizer.minimize(nll)
+gcl = GofCalculator(nll, ndof=10)
+gof = gcl.get_gof(kind='pvalue')
+```
+where:
+- `ndof`: The number of degrees of freedom used in the reduced $\chi^2$ calculation.
+It is needed to know how many bins to use to make the histogram. The recommended value is 10.
+- `kind`: The kind of goodness-of-fit value to return; it can be `pvalue` or `chi2/ndof`.
 ### Simplest fit
 ```python
@@ -376,6 +403,14 @@ obj.run()
 where the settings for the training go in a config dictionary, which when written to YAML looks like:
 ```yaml
+dataset:
+    # If the key is found to be NaN, replace its value with the number provided
+    # This will be used in the training.
+    # Otherwise the entries with NaNs will be dropped
+    nan:
+        x : 0
+        y : 0
+        z : -999
 training :
     nfold    : 10
     features : [w, x, y, z]
@@ -386,8 +421,25 @@ training :
       learning_rate     : 0.1
       min_samples_split : 2
 saving:
+    # The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
     path : 'tests/ml/train_mva/model.pkl'
 plotting:
+    roc :
+        min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
+        max : [1.2, 1.2] # By default it goes from 0 to 1 in both axes
+        # The section below is optional and will annotate the ROC curve with
+        # values for the score at different signal efficiencies
+        annotate:
+          sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9] # Values of signal efficiency at which to show the scores
+          form    : '{:.2f}' # Use two decimals for scores
+          color   : 'green'  # Color for text and marker
+          xoff    : -15      # Offsets in X and Y
+          yoff    : -15
+          size    :  10      # Size of text
+    correlation: # Adds correlation matrix for training datasets
+      title      : 'Correlation matrix'
+      size       : [10, 10]
+      mask_value : 0                # Where correlation is zero, the bin will appear white
     val_dir : 'tests/ml/train_mva'
     features:
         saving:
@@ -455,6 +507,36 @@ When evaluating the model with real data, problems might occur, we deal with the
 - **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_  with zeros and evaluated. However, before returning, the probabilities will be
 saved as -1. I.e. entries with NaNs will have probabilities of -1.
+# Pandas dataframes
+## Utilities
+These are thin layers of code that take pandas dataframes and carry out specific tasks
+### Dataframe to latex
+One can save a dataframe to latex with:
+```python
+import pandas as pnd
+import dmu.pdataframe.utilities as put
+d_data = {}
+d_data['a'] = [1,2,3]
+d_data['b'] = [4,5,6]
+df = pnd.DataFrame(d_data)
+d_format = {
+        'a' : '{:.0f}',
+        'b' : '{:.3f}'}
+df = _get_df()
+put.df_to_tex(df,
+        './table.tex',
+        d_format = d_format,
+        caption  = 'some caption')
+```
 # Rdataframes
 These are utility functions meant to be used with ROOT dataframes.
@@ -606,6 +688,43 @@ axes:
         label   : 'y'
 ```
+# Other plots
+## Matrices
+This can be done with `MatrixPlotter`, whose usage is illustrated below:
+```python
+import numpy
+import matplotlib.pyplot as plt
+from dmu.plotting.matrix import MatrixPlotter
+cfg = {
+        'labels'     : ['x', 'y', 'z'], # Used to label the matrix axes
+        'title'      : 'Some title',    # Optional, title of plot
+        'label_angle': 45,              # Labels will be rotated by 45 degrees
+        'upper'      : True,            # Useful in case this is a symmetric matrix
+        'zrange'     : [0, 10],         # Controls the z axis range
+        'size'       : [7, 7],          # Plot size
+        'format'     : '{:.3f}',        # Optional, if used will add numerical values to the contents, otherwise a color bar is used
+        'fontsize'   : 12,              # Font size associated to `format`
+        'mask_value' : 0,               # These values will appear white in the plot
+        }
+mat = [
+        [1, 2, 3],
+        [2, 0, 4],
+        [3, 4, numpy.nan]
+        ]
+mat = numpy.array(mat)
+obj = MatrixPlotter(mat=mat, cfg=cfg)
+obj.plot()
+plt.show()
+```
 # Manipulating ROOT files
 ## Getting trees from file

{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/pyproject.toml RENAMED Viewed

@@ -1,12 +1,12 @@
 [project]
 name        = 'data_manipulation_utilities'
-version     = '0.1.9'
+version     = '0.2.1'
 readme      = 'README.md'
 dependencies= [
 'logzero',
 'PyYAML',
 'scipy',
-'awkward',
+'awkward==2.4.6',
 'tqdm',
 'joblib',
 'scikit-learn',

{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/PKG-INFO RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.2
 Name: data_manipulation_utilities
-Version: 0.1.9
+Version: 0.2.1
 Description-Content-Type: text/markdown
 Requires-Dist: logzero
 Requires-Dist: PyYAML
 Requires-Dist: scipy
-Requires-Dist: awkward
+Requires-Dist: awkward==2.4.6
 Requires-Dist: tqdm
 Requires-Dist: joblib
 Requires-Dist: scikit-learn
@@ -204,6 +204,33 @@ print_pdf(pdf,
 The `Fitter` class is a wrapper to zfit, used to make fitting easier.
+### Goodness of fits
+Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
+This is done by:
+- Binning the data and PDF.
+- Calculating the reduced $\chi^2$.
+- Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
+This class is used as shown below:
+```python
+from dmu.stats.gof_calculator import GofCalculator
+nll = _get_nll()
+res = Data.minimizer.minimize(nll)
+gcl = GofCalculator(nll, ndof=10)
+gof = gcl.get_gof(kind='pvalue')
+```
+where:
+- `ndof`: The number of degrees of freedom used in the reduced $\chi^2$ calculation.
+It is needed to know how many bins to use to make the histogram. The recommended value is 10.
+- `kind`: The kind of goodness-of-fit value to return; it can be `pvalue` or `chi2/ndof`.
 ### Simplest fit
 ```python
@@ -396,6 +423,14 @@ obj.run()
 where the settings for the training go in a config dictionary, which when written to YAML looks like:
 ```yaml
+dataset:
+    # If the key is found to be NaN, replace its value with the number provided
+    # This will be used in the training.
+    # Otherwise the entries with NaNs will be dropped
+    nan:
+        x : 0
+        y : 0
+        z : -999
 training :
     nfold    : 10
     features : [w, x, y, z]
@@ -406,8 +441,25 @@ training :
       learning_rate     : 0.1
       min_samples_split : 2
 saving:
+    # The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
     path : 'tests/ml/train_mva/model.pkl'
 plotting:
+    roc :
+        min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
+        max : [1.2, 1.2] # By default it goes from 0 to 1 in both axes
+        # The section below is optional and will annotate the ROC curve with
+        # values for the score at different signal efficiencies
+        annotate:
+          sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9] # Values of signal efficiency at which to show the scores
+          form    : '{:.2f}' # Use two decimals for scores
+          color   : 'green'  # Color for text and marker
+          xoff    : -15      # Offsets in X and Y
+          yoff    : -15
+          size    :  10      # Size of text
+    correlation: # Adds correlation matrix for training datasets
+      title      : 'Correlation matrix'
+      size       : [10, 10]
+      mask_value : 0                # Where correlation is zero, the bin will appear white
     val_dir : 'tests/ml/train_mva'
     features:
         saving:
@@ -475,6 +527,36 @@ When evaluating the model with real data, problems might occur, we deal with the
 - **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_  with zeros and evaluated. However, before returning, the probabilities will be
 saved as -1. I.e. entries with NaNs will have probabilities of -1.
+# Pandas dataframes
+## Utilities
+These are thin layers of code that take pandas dataframes and carry out specific tasks
+### Dataframe to latex
+One can save a dataframe to latex with:
+```python
+import pandas as pnd
+import dmu.pdataframe.utilities as put
+d_data = {}
+d_data['a'] = [1,2,3]
+d_data['b'] = [4,5,6]
+df = pnd.DataFrame(d_data)
+d_format = {
+        'a' : '{:.0f}',
+        'b' : '{:.3f}'}
+df = _get_df()
+put.df_to_tex(df,
+        './table.tex',
+        d_format = d_format,
+        caption  = 'some caption')
+```
 # Rdataframes
 These are utility functions meant to be used with ROOT dataframes.
@@ -626,6 +708,43 @@ axes:
         label   : 'y'
 ```
+# Other plots
+## Matrices
+This can be done with `MatrixPlotter`, whose usage is illustrated below:
+```python
+import numpy
+import matplotlib.pyplot as plt
+from dmu.plotting.matrix import MatrixPlotter
+cfg = {
+        'labels'     : ['x', 'y', 'z'], # Used to label the matrix axes
+        'title'      : 'Some title',    # Optional, title of plot
+        'label_angle': 45,              # Labels will be rotated by 45 degrees
+        'upper'      : True,            # Useful in case this is a symmetric matrix
+        'zrange'     : [0, 10],         # Controls the z axis range
+        'size'       : [7, 7],          # Plot size
+        'format'     : '{:.3f}',        # Optional, if used will add numerical values to the contents, otherwise a color bar is used
+        'fontsize'   : 12,              # Font size associated to `format`
+        'mask_value' : 0,               # These values will appear white in the plot
+        }
+mat = [
+        [1, 2, 3],
+        [2, 0, 4],
+        [3, 4, numpy.nan]
+        ]
+mat = numpy.array(mat)
+obj = MatrixPlotter(mat=mat, cfg=cfg)
+obj.plot()
+plt.show()
+```
 # Manipulating ROOT files
 ## Getting trees from file

{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/SOURCES.txt RENAMED Viewed

@@ -13,9 +13,12 @@ src/dmu/ml/cv_classifier.py
 src/dmu/ml/cv_predict.py
 src/dmu/ml/train_mva.py
 src/dmu/ml/utilities.py
+src/dmu/pdataframe/utilities.py
+src/dmu/plotting/matrix.py
 src/dmu/plotting/plotter.py
 src/dmu/plotting/plotter_1d.py
 src/dmu/plotting/plotter_2d.py
+src/dmu/plotting/utilities.py
 src/dmu/rdataframe/atr_mgr.py
 src/dmu/rdataframe/utilities.py
 src/dmu/rfile/rfprinter.py
@@ -36,6 +39,7 @@ src/dmu_data/plotting/tests/fig_size.yaml
 src/dmu_data/plotting/tests/high_stat.yaml
 src/dmu_data/plotting/tests/name.yaml
 src/dmu_data/plotting/tests/no_bounds.yaml
+src/dmu_data/plotting/tests/normalized.yaml
 src/dmu_data/plotting/tests/simple.yaml
 src/dmu_data/plotting/tests/title.yaml
 src/dmu_data/plotting/tests/weights.yaml

{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/requires.txt RENAMED Viewed

@@ -1,7 +1,7 @@
 logzero
 PyYAML
 scipy
-awkward
+awkward==2.4.6
 tqdm
 joblib
 scikit-learn

{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/ml/cv_classifier.py RENAMED Viewed

@@ -2,6 +2,7 @@
 Module holding cv_classifier class
 '''
+from typing                  import Union
 from sklearn.ensemble        import GradientBoostingClassifier
 from dmu.logging.log_store import LogStore
@@ -22,7 +23,7 @@ class CVClassifier(GradientBoostingClassifier):
     '''
     # pylint: disable = too-many-ancestors, abstract-method
     # ----------------------------------
-    def __init__(self, cfg : dict | None = None):
+    def __init__(self, cfg : Union[dict,None] = None):
         '''
         cfg (dict) : Dictionary with configuration, specially the hyperparameters set in the `hyper` field
         '''

{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/ml/cv_predict.py RENAMED Viewed

@@ -10,8 +10,8 @@ import tqdm
 from ROOT import RDataFrame
 import dmu.ml.utilities     as ut
-import dmu.ml.cv_classifier as CVClassifier
+from dmu.ml.cv_classifier  import CVClassifier
 from dmu.logging.log_store import LogStore
 log = LogStore.add_logger('dmu:ml:cv_predict')
@@ -147,6 +147,7 @@ class CVPredict:
             arr_prb = self._predict_with_overlap(df_ft)
         arr_prb = self._patch_probabilities(arr_prb)
+        arr_prb = arr_prb.T[1]
         return arr_prb
 # ---------------------------------------

data-manipulation-utilities 0.1.9__tar.gz → 0.2.1__tar.gz

data-manipulation-utilities 0.1.9tar.gz → 0.2.1tar.gz