PyPI - data-manipulation-utilities - Versions diffs - 0.1.9__tar.gz → 0.2.0__tar.gz - Mend

data-manipulation-utilities 0.1.9.tar.gz → 0.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)

{data_manipulation_utilities-0.1.9/src/data_manipulation_utilities.egg-info → data_manipulation_utilities-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.2
 Name: data_manipulation_utilities
-Version: 0.1.9
+Version: 0.2.0
 Description-Content-Type: text/markdown
 Requires-Dist: logzero
 Requires-Dist: PyYAML
 Requires-Dist: scipy
-Requires-Dist: awkward
+Requires-Dist: awkward==2.4.6
 Requires-Dist: tqdm
 Requires-Dist: joblib
 Requires-Dist: scikit-learn
@@ -204,6 +204,33 @@ print_pdf(pdf,
 The `Fitter` class is a wrapper to zfit, use to make fitting easier.
+### Goodness of fits
+Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
+This is done by:
+- Binning the data and PDF.
+- Calculating the reduced $\chi^2$.
+- Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
+This class is used as shown below:
+```python
+from dmu.stats.gof_calculator import GofCalculator
+nll = _get_nll()
+res = Data.minimizer.minimize(nll)
+gcl = GofCalculator(nll, ndof=10)
+gof = gcl.get_gof(kind='pvalue')
+```
+where:
+- `ndof` Is the number of degrees of freedom used in the reduced $\chi^2$ calculation
+It is needed to know how many bins to use to make the histogram. The recommended value is 10.
+- `kind` The argument can be `pvalue` or `chi2/ndof`.
 ### Simplest fit
 ```python

data_manipulation_utilities-0.1.9/PKG-INFO → data_manipulation_utilities-0.2.0/README.md RENAMED Viewed

@@ -1,23 +1,3 @@
-Metadata-Version: 2.2
-Name: data_manipulation_utilities
-Version: 0.1.9
-Description-Content-Type: text/markdown
-Requires-Dist: logzero
-Requires-Dist: PyYAML
-Requires-Dist: scipy
-Requires-Dist: awkward
-Requires-Dist: tqdm
-Requires-Dist: joblib
-Requires-Dist: scikit-learn
-Requires-Dist: toml
-Requires-Dist: numpy
-Requires-Dist: matplotlib
-Requires-Dist: mplhep
-Requires-Dist: hist[plot]
-Requires-Dist: pandas
-Provides-Extra: dev
-Requires-Dist: pytest; extra == "dev"
 # D(ata) M(anipulation) U(tilities)
 These are tools that can be used for different data analysis tasks.
@@ -204,6 +184,33 @@ print_pdf(pdf,
 The `Fitter` class is a wrapper to zfit, use to make fitting easier.
+### Goodness of fits
+Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
+This is done by:
+- Binning the data and PDF.
+- Calculating the reduced $\chi^2$.
+- Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
+This class is used as shown below:
+```python
+from dmu.stats.gof_calculator import GofCalculator
+nll = _get_nll()
+res = Data.minimizer.minimize(nll)
+gcl = GofCalculator(nll, ndof=10)
+gof = gcl.get_gof(kind='pvalue')
+```
+where:
+- `ndof` Is the number of degrees of freedom used in the reduced $\chi^2$ calculation
+It is needed to know how many bins to use to make the histogram. The recommended value is 10.
+- `kind` The argument can be `pvalue` or `chi2/ndof`.
 ### Simplest fit
 ```python

{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/pyproject.toml RENAMED Viewed

@@ -1,12 +1,12 @@
 [project]
 name        = 'data_manipulation_utilities'
-version     = '0.1.9'
+version     = '0.2.0'
 readme      = 'README.md'
 dependencies= [
 'logzero',
 'PyYAML',
 'scipy',
-'awkward',
+'awkward==2.4.6',
 'tqdm',
 'joblib',
 'scikit-learn',

data_manipulation_utilities-0.1.9/README.md → data_manipulation_utilities-0.2.0/src/data_manipulation_utilities.egg-info/PKG-INFO RENAMED Viewed

@@ -1,3 +1,23 @@
+Metadata-Version: 2.2
+Name: data_manipulation_utilities
+Version: 0.2.0
+Description-Content-Type: text/markdown
+Requires-Dist: logzero
+Requires-Dist: PyYAML
+Requires-Dist: scipy
+Requires-Dist: awkward==2.4.6
+Requires-Dist: tqdm
+Requires-Dist: joblib
+Requires-Dist: scikit-learn
+Requires-Dist: toml
+Requires-Dist: numpy
+Requires-Dist: matplotlib
+Requires-Dist: mplhep
+Requires-Dist: hist[plot]
+Requires-Dist: pandas
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
 # D(ata) M(anipulation) U(tilities)
 These are tools that can be used for different data analysis tasks.
@@ -184,6 +204,33 @@ print_pdf(pdf,
 The `Fitter` class is a wrapper to zfit, use to make fitting easier.
+### Goodness of fits
+Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
+This is done by:
+- Binning the data and PDF.
+- Calculating the reduced $\chi^2$.
+- Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
+This class is used as shown below:
+```python
+from dmu.stats.gof_calculator import GofCalculator
+nll = _get_nll()
+res = Data.minimizer.minimize(nll)
+gcl = GofCalculator(nll, ndof=10)
+gof = gcl.get_gof(kind='pvalue')
+```
+where:
+- `ndof` Is the number of degrees of freedom used in the reduced $\chi^2$ calculation
+It is needed to know how many bins to use to make the histogram. The recommended value is 10.
+- `kind` The argument can be `pvalue` or `chi2/ndof`.
 ### Simplest fit
 ```python

{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/data_manipulation_utilities.egg-info/SOURCES.txt RENAMED Viewed

@@ -36,6 +36,7 @@ src/dmu_data/plotting/tests/fig_size.yaml
 src/dmu_data/plotting/tests/high_stat.yaml
 src/dmu_data/plotting/tests/name.yaml
 src/dmu_data/plotting/tests/no_bounds.yaml
+src/dmu_data/plotting/tests/normalized.yaml
 src/dmu_data/plotting/tests/simple.yaml
 src/dmu_data/plotting/tests/title.yaml
 src/dmu_data/plotting/tests/weights.yaml

{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/data_manipulation_utilities.egg-info/requires.txt RENAMED Viewed

@@ -1,7 +1,7 @@
 logzero
 PyYAML
 scipy
-awkward
+awkward==2.4.6
 tqdm
 joblib
 scikit-learn

{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/ml/cv_predict.py RENAMED Viewed

@@ -10,8 +10,8 @@ import tqdm
 from ROOT import RDataFrame
 import dmu.ml.utilities     as ut
-import dmu.ml.cv_classifier as CVClassifier
+from dmu.ml.cv_classifier  import CVClassifier
 from dmu.logging.log_store import LogStore
 log = LogStore.add_logger('dmu:ml:cv_predict')
@@ -147,6 +147,7 @@ class CVPredict:
             arr_prb = self._predict_with_overlap(df_ft)
         arr_prb = self._patch_probabilities(arr_prb)
+        arr_prb = arr_prb.T[1]
         return arr_prb
 # ---------------------------------------

{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/plotting/plotter_1d.py RENAMED Viewed

@@ -2,7 +2,6 @@
 Module containing plotter class
 '''
-import hist
 from hist import Hist
 import numpy
@@ -79,6 +78,7 @@ class Plotter1D(Plotter):
         l_bc_all = []
         for name, arr_val in d_data.items():
             arr_wgt      = d_wgt[name] if d_wgt is not None else numpy.ones_like(arr_val)
+            arr_wgt      = self._normalize_weights(arr_wgt, var)
             hst          = Hist.new.Reg(bins=bins, start=minx, stop=maxx, name='x', label=name).Weight()
             hst.fill(x=arr_val, weight=arr_wgt)
             hst.plot(label=name)
@@ -88,6 +88,23 @@ class Plotter1D(Plotter):
         return max_y
     # --------------------------------------------
+    def _normalize_weights(self, arr_wgt : numpy.ndarray, var : str) -> numpy.ndarray:
+        cfg_var = self._d_cfg['plots'][var]
+        if 'normalized' not in cfg_var:
+            log.debug(f'Not normalizing for variable: {var}')
+            return arr_wgt
+        if not cfg_var['normalized']:
+            log.debug(f'Not normalizing for variable: {var}')
+            return arr_wgt
+        log.debug(f'Normalizing for variable: {var}')
+        total   = numpy.sum(arr_wgt)
+        arr_wgt = arr_wgt / total
+        return arr_wgt
+    # --------------------------------------------
     def _style_plot(self, var : str, max_y : float) -> None:
         d_cfg  = self._d_cfg['plots'][var]
         yscale = d_cfg['yscale' ] if 'yscale' in d_cfg else 'linear'

data_manipulation_utilities-0.2.0/src/dmu_data/plotting/tests/normalized.yaml ADDED Viewed

@@ -0,0 +1,9 @@
+saving:
+    plt_dir : tests/plotting/normalized
+plots:
+    x :
+      normalized : true
+      binning    : [-5.0, 8.0, 40]
+    y :
+      normalized : false
+      binning    : [-5.0, 8.0, 40]