data-manipulation-utilities 0.1.9__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {data_manipulation_utilities-0.1.9/src/data_manipulation_utilities.egg-info → data_manipulation_utilities-0.2.0}/PKG-INFO +29 -2
  2. data_manipulation_utilities-0.1.9/PKG-INFO → data_manipulation_utilities-0.2.0/README.md +27 -20
  3. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/pyproject.toml +2 -2
  4. data_manipulation_utilities-0.1.9/README.md → data_manipulation_utilities-0.2.0/src/data_manipulation_utilities.egg-info/PKG-INFO +47 -0
  5. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/data_manipulation_utilities.egg-info/SOURCES.txt +1 -0
  6. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/data_manipulation_utilities.egg-info/requires.txt +1 -1
  7. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/ml/cv_predict.py +2 -1
  8. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/plotting/plotter_1d.py +18 -1
  9. data_manipulation_utilities-0.2.0/src/dmu_data/plotting/tests/normalized.yaml +9 -0
  10. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/setup.cfg +0 -0
  11. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/data_manipulation_utilities.egg-info/dependency_links.txt +0 -0
  12. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/data_manipulation_utilities.egg-info/entry_points.txt +0 -0
  13. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/data_manipulation_utilities.egg-info/top_level.txt +0 -0
  14. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/arrays/utilities.py +0 -0
  15. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/generic/utilities.py +0 -0
  16. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/logging/log_store.py +0 -0
  17. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/ml/cv_classifier.py +0 -0
  18. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/ml/train_mva.py +0 -0
  19. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/ml/utilities.py +0 -0
  20. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/plotting/plotter.py +0 -0
  21. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/plotting/plotter_2d.py +0 -0
  22. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/rdataframe/atr_mgr.py +0 -0
  23. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/rdataframe/utilities.py +0 -0
  24. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/rfile/rfprinter.py +0 -0
  25. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/rfile/utilities.py +0 -0
  26. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/stats/fitter.py +0 -0
  27. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/stats/function.py +0 -0
  28. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/stats/gof_calculator.py +0 -0
  29. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/stats/minimizers.py +0 -0
  30. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/stats/model_factory.py +0 -0
  31. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/stats/utilities.py +0 -0
  32. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/stats/zfit_plotter.py +0 -0
  33. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/testing/utilities.py +0 -0
  34. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu/text/transformer.py +0 -0
  35. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/__init__.py +0 -0
  36. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/ml/tests/train_mva.yaml +0 -0
  37. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/plotting/tests/2d.yaml +0 -0
  38. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/plotting/tests/fig_size.yaml +0 -0
  39. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/plotting/tests/high_stat.yaml +0 -0
  40. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/plotting/tests/name.yaml +0 -0
  41. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/plotting/tests/no_bounds.yaml +0 -0
  42. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/plotting/tests/simple.yaml +0 -0
  43. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/plotting/tests/title.yaml +0 -0
  44. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/plotting/tests/weights.yaml +0 -0
  45. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/text/transform.toml +0 -0
  46. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/text/transform.txt +0 -0
  47. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/text/transform_set.toml +0 -0
  48. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/text/transform_set.txt +0 -0
  49. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_data/text/transform_trf.txt +0 -0
  50. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_scripts/git/publish +0 -0
  51. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_scripts/physics/check_truth.py +0 -0
  52. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_scripts/rfile/compare_root_files.py +0 -0
  53. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_scripts/rfile/print_trees.py +0 -0
  54. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_scripts/ssh/coned.py +0 -0
  55. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.0}/src/dmu_scripts/text/transform_text.py +0 -0
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: data_manipulation_utilities
3
- Version: 0.1.9
3
+ Version: 0.2.0
4
4
  Description-Content-Type: text/markdown
5
5
  Requires-Dist: logzero
6
6
  Requires-Dist: PyYAML
7
7
  Requires-Dist: scipy
8
- Requires-Dist: awkward
8
+ Requires-Dist: awkward==2.4.6
9
9
  Requires-Dist: tqdm
10
10
  Requires-Dist: joblib
11
11
  Requires-Dist: scikit-learn
@@ -204,6 +204,33 @@ print_pdf(pdf,
204
204
 
205
205
  The `Fitter` class is a wrapper to zfit, used to make fitting easier.
206
206
 
207
+ ### Goodness of fit
208
+
209
+ Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
210
+ This is done by:
211
+
212
+ - Binning the data and PDF.
213
+ - Calculating the reduced $\chi^2$.
214
+ - Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
215
+
216
+ This class is used as shown below:
217
+
218
+ ```python
219
+ from dmu.stats.gof_calculator import GofCalculator
220
+
221
+ nll = _get_nll()
222
+ res = Data.minimizer.minimize(nll)
223
+
224
+ gcl = GofCalculator(nll, ndof=10)
225
+ gof = gcl.get_gof(kind='pvalue')
226
+ ```
227
+
228
+ where:
229
+
230
+ - `ndof`: the number of degrees of freedom used in the reduced $\chi^2$ calculation.
231
+ It is needed to know how many bins to use to make the histogram. The recommended value is 10.
232
+ - `kind`: can be either `pvalue` or `chi2/ndof`.
233
+
207
234
  ### Simplest fit
208
235
 
209
236
  ```python
@@ -1,23 +1,3 @@
1
- Metadata-Version: 2.2
2
- Name: data_manipulation_utilities
3
- Version: 0.1.9
4
- Description-Content-Type: text/markdown
5
- Requires-Dist: logzero
6
- Requires-Dist: PyYAML
7
- Requires-Dist: scipy
8
- Requires-Dist: awkward
9
- Requires-Dist: tqdm
10
- Requires-Dist: joblib
11
- Requires-Dist: scikit-learn
12
- Requires-Dist: toml
13
- Requires-Dist: numpy
14
- Requires-Dist: matplotlib
15
- Requires-Dist: mplhep
16
- Requires-Dist: hist[plot]
17
- Requires-Dist: pandas
18
- Provides-Extra: dev
19
- Requires-Dist: pytest; extra == "dev"
20
-
21
1
  # D(ata) M(anipulation) U(tilities)
22
2
 
23
3
  These are tools that can be used for different data analysis tasks.
@@ -204,6 +184,33 @@ print_pdf(pdf,
204
184
 
205
185
  The `Fitter` class is a wrapper to zfit, used to make fitting easier.
206
186
 
187
+ ### Goodness of fit
188
+
189
+ Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
190
+ This is done by:
191
+
192
+ - Binning the data and PDF.
193
+ - Calculating the reduced $\chi^2$.
194
+ - Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
195
+
196
+ This class is used as shown below:
197
+
198
+ ```python
199
+ from dmu.stats.gof_calculator import GofCalculator
200
+
201
+ nll = _get_nll()
202
+ res = Data.minimizer.minimize(nll)
203
+
204
+ gcl = GofCalculator(nll, ndof=10)
205
+ gof = gcl.get_gof(kind='pvalue')
206
+ ```
207
+
208
+ where:
209
+
210
+ - `ndof`: the number of degrees of freedom used in the reduced $\chi^2$ calculation.
211
+ It is needed to know how many bins to use to make the histogram. The recommended value is 10.
212
+ - `kind`: can be either `pvalue` or `chi2/ndof`.
213
+
207
214
  ### Simplest fit
208
215
 
209
216
  ```python
@@ -1,12 +1,12 @@
1
1
  [project]
2
2
  name = 'data_manipulation_utilities'
3
- version = '0.1.9'
3
+ version = '0.2.0'
4
4
  readme = 'README.md'
5
5
  dependencies= [
6
6
  'logzero',
7
7
  'PyYAML',
8
8
  'scipy',
9
- 'awkward',
9
+ 'awkward==2.4.6',
10
10
  'tqdm',
11
11
  'joblib',
12
12
  'scikit-learn',
@@ -1,3 +1,23 @@
1
+ Metadata-Version: 2.2
2
+ Name: data_manipulation_utilities
3
+ Version: 0.2.0
4
+ Description-Content-Type: text/markdown
5
+ Requires-Dist: logzero
6
+ Requires-Dist: PyYAML
7
+ Requires-Dist: scipy
8
+ Requires-Dist: awkward==2.4.6
9
+ Requires-Dist: tqdm
10
+ Requires-Dist: joblib
11
+ Requires-Dist: scikit-learn
12
+ Requires-Dist: toml
13
+ Requires-Dist: numpy
14
+ Requires-Dist: matplotlib
15
+ Requires-Dist: mplhep
16
+ Requires-Dist: hist[plot]
17
+ Requires-Dist: pandas
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest; extra == "dev"
20
+
1
21
  # D(ata) M(anipulation) U(tilities)
2
22
 
3
23
  These are tools that can be used for different data analysis tasks.
@@ -184,6 +204,33 @@ print_pdf(pdf,
184
204
 
185
205
  The `Fitter` class is a wrapper to zfit, used to make fitting easier.
186
206
 
207
+ ### Goodness of fit
208
+
209
+ Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
210
+ This is done by:
211
+
212
+ - Binning the data and PDF.
213
+ - Calculating the reduced $\chi^2$.
214
+ - Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
215
+
216
+ This class is used as shown below:
217
+
218
+ ```python
219
+ from dmu.stats.gof_calculator import GofCalculator
220
+
221
+ nll = _get_nll()
222
+ res = Data.minimizer.minimize(nll)
223
+
224
+ gcl = GofCalculator(nll, ndof=10)
225
+ gof = gcl.get_gof(kind='pvalue')
226
+ ```
227
+
228
+ where:
229
+
230
+ - `ndof`: the number of degrees of freedom used in the reduced $\chi^2$ calculation.
231
+ It is needed to know how many bins to use to make the histogram. The recommended value is 10.
232
+ - `kind`: can be either `pvalue` or `chi2/ndof`.
233
+
187
234
  ### Simplest fit
188
235
 
189
236
  ```python
@@ -36,6 +36,7 @@ src/dmu_data/plotting/tests/fig_size.yaml
36
36
  src/dmu_data/plotting/tests/high_stat.yaml
37
37
  src/dmu_data/plotting/tests/name.yaml
38
38
  src/dmu_data/plotting/tests/no_bounds.yaml
39
+ src/dmu_data/plotting/tests/normalized.yaml
39
40
  src/dmu_data/plotting/tests/simple.yaml
40
41
  src/dmu_data/plotting/tests/title.yaml
41
42
  src/dmu_data/plotting/tests/weights.yaml
@@ -1,7 +1,7 @@
1
1
  logzero
2
2
  PyYAML
3
3
  scipy
4
- awkward
4
+ awkward==2.4.6
5
5
  tqdm
6
6
  joblib
7
7
  scikit-learn
@@ -10,8 +10,8 @@ import tqdm
10
10
  from ROOT import RDataFrame
11
11
 
12
12
  import dmu.ml.utilities as ut
13
- import dmu.ml.cv_classifier as CVClassifier
14
13
 
14
+ from dmu.ml.cv_classifier import CVClassifier
15
15
  from dmu.logging.log_store import LogStore
16
16
 
17
17
  log = LogStore.add_logger('dmu:ml:cv_predict')
@@ -147,6 +147,7 @@ class CVPredict:
147
147
  arr_prb = self._predict_with_overlap(df_ft)
148
148
 
149
149
  arr_prb = self._patch_probabilities(arr_prb)
150
+ arr_prb = arr_prb.T[1]
150
151
 
151
152
  return arr_prb
152
153
  # ---------------------------------------
@@ -2,7 +2,6 @@
2
2
  Module containing plotter class
3
3
  '''
4
4
 
5
- import hist
6
5
  from hist import Hist
7
6
 
8
7
  import numpy
@@ -79,6 +78,7 @@ class Plotter1D(Plotter):
79
78
  l_bc_all = []
80
79
  for name, arr_val in d_data.items():
81
80
  arr_wgt = d_wgt[name] if d_wgt is not None else numpy.ones_like(arr_val)
81
+ arr_wgt = self._normalize_weights(arr_wgt, var)
82
82
  hst = Hist.new.Reg(bins=bins, start=minx, stop=maxx, name='x', label=name).Weight()
83
83
  hst.fill(x=arr_val, weight=arr_wgt)
84
84
  hst.plot(label=name)
@@ -88,6 +88,23 @@ class Plotter1D(Plotter):
88
88
 
89
89
  return max_y
90
90
  # --------------------------------------------
91
+ def _normalize_weights(self, arr_wgt : numpy.ndarray, var : str) -> numpy.ndarray:
92
+ cfg_var = self._d_cfg['plots'][var]
93
+ if 'normalized' not in cfg_var:
94
+ log.debug(f'Not normalizing for variable: {var}')
95
+ return arr_wgt
96
+
97
+ if not cfg_var['normalized']:
98
+ log.debug(f'Not normalizing for variable: {var}')
99
+ return arr_wgt
100
+
101
+ log.debug(f'Normalizing for variable: {var}')
102
+ total = numpy.sum(arr_wgt)
103
+ arr_wgt = arr_wgt / total
104
+
105
+ return arr_wgt
106
+ # --------------------------------------------
107
+
91
108
  def _style_plot(self, var : str, max_y : float) -> None:
92
109
  d_cfg = self._d_cfg['plots'][var]
93
110
  yscale = d_cfg['yscale' ] if 'yscale' in d_cfg else 'linear'
@@ -0,0 +1,9 @@
1
+ saving:
2
+ plt_dir : tests/plotting/normalized
3
+ plots:
4
+ x :
5
+ normalized : true
6
+ binning : [-5.0, 8.0, 40]
7
+ y :
8
+ normalized : false
9
+ binning : [-5.0, 8.0, 40]