data-manipulation-utilities 0.1.9__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: data_manipulation_utilities
3
- Version: 0.1.9
3
+ Version: 0.2.0
4
4
  Description-Content-Type: text/markdown
5
5
  Requires-Dist: logzero
6
6
  Requires-Dist: PyYAML
7
7
  Requires-Dist: scipy
8
- Requires-Dist: awkward
8
+ Requires-Dist: awkward==2.4.6
9
9
  Requires-Dist: tqdm
10
10
  Requires-Dist: joblib
11
11
  Requires-Dist: scikit-learn
@@ -204,6 +204,33 @@ print_pdf(pdf,
204
204
 
205
205
  The `Fitter` class is a wrapper around zfit, used to make fitting easier.
206
206
 
207
+ ### Goodness of fit
208
+
209
+ Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
210
+ This is done by:
211
+
212
+ - Binning the data and PDF.
213
+ - Calculating the reduced $\chi^2$.
214
+ - Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
215
+
216
+ This class is used as shown below:
217
+
218
+ ```python
219
+ from dmu.stats.gof_calculator import GofCalculator
220
+
221
+ nll = _get_nll()
222
+ res = Data.minimizer.minimize(nll)
223
+
224
+ gcl = GofCalculator(nll, ndof=10)
225
+ gof = gcl.get_gof(kind='pvalue')
226
+ ```
227
+
228
+ where:
229
+
230
+ - `ndof` is the number of degrees of freedom used in the reduced $\chi^2$ calculation.
231
+ It is needed to determine how many bins to use when making the histogram. The recommended value is 10.
232
+ - `kind`: this argument can be either `pvalue` or `chi2/ndof`.
233
+
207
234
  ### Simplest fit
208
235
 
209
236
  ```python
@@ -1,13 +1,13 @@
1
- data_manipulation_utilities-0.1.9.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
1
+ data_manipulation_utilities-0.2.0.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
2
2
  dmu/arrays/utilities.py,sha256=PKoYyybPptA2aU-V3KLnJXBudWxTXu4x1uGdIMQ49HY,1722
3
3
  dmu/generic/utilities.py,sha256=0Xnq9t35wuebAqKxbyAiMk1ISB7IcXK4cFH25MT1fgw,1741
4
4
  dmu/logging/log_store.py,sha256=umdvjNDuV3LdezbG26b0AiyTglbvkxST19CQu9QATbA,4184
5
5
  dmu/ml/cv_classifier.py,sha256=n81m7i2M6Zq96AEd9EZGwXSrbG5m9jkS5RdeXvbsAXU,3712
6
- dmu/ml/cv_predict.py,sha256=Bqxu-f6qquKJokFljhCzL_kiGcjLJLQFhVBD130fsyw,4893
6
+ dmu/ml/cv_predict.py,sha256=AhCsCnHWPWGIRVTdGS1NxA2m4yH7t2lV_OdALwQAcAE,4927
7
7
  dmu/ml/train_mva.py,sha256=d_n-A07DFweikz5nXap4OE_Mqx8VprFT7zbxmnQAbac,9638
8
8
  dmu/ml/utilities.py,sha256=Nue7O9zi1QXgjGRPH6wnSAW9jusMQ2ZOSDJzBqJKIi0,3687
9
9
  dmu/plotting/plotter.py,sha256=ytMxtzHEY8ZFU0ZKEBE-ROjMszXl5kHTMnQnWe173nU,7208
10
- dmu/plotting/plotter_1d.py,sha256=O7rTgCBlpCko1RSpj2TzcUIfx9sKoz2jAgw73Pz7Ynk,4472
10
+ dmu/plotting/plotter_1d.py,sha256=g6H2xAgsL9a6vRkpbqHICb3qwV_qMiQPZxxw_oOSf9M,5115
11
11
  dmu/plotting/plotter_2d.py,sha256=J-gKnagoHGfJFU7HBrhDFpGYH5Rxy0_zF5l8eE_7ZHE,2944
12
12
  dmu/rdataframe/atr_mgr.py,sha256=FdhaQWVpsm4OOe1IRbm7rfrq8VenTNdORyI-lZ2Bs1M,2386
13
13
  dmu/rdataframe/utilities.py,sha256=x8r379F2-vZPYzAdMFCn_V4Kx2Tx9t9pn_QHcZ1euew,2756
@@ -29,6 +29,7 @@ dmu_data/plotting/tests/fig_size.yaml,sha256=7ROq49nwZ1A2EbPiySmu6n3G-Jq6YAOkc3d
29
29
  dmu_data/plotting/tests/high_stat.yaml,sha256=bLglBLCZK6ft0xMhQ5OltxE76cWsBMPMjO6GG0OkDr8,522
30
30
  dmu_data/plotting/tests/name.yaml,sha256=mkcPAVg8wBAmlSbSRQ1bcaMl4vOS6LXMtpqQeDrrtO4,312
31
31
  dmu_data/plotting/tests/no_bounds.yaml,sha256=8e1QdphBjz-suDr857DoeUC2DXiy6SE-gvkORJQYv80,257
32
+ dmu_data/plotting/tests/normalized.yaml,sha256=Y0eKtyV5pvlSxvqfsLjytYtv8xYF3HZ5WEdCJdeHGQI,193
32
33
  dmu_data/plotting/tests/simple.yaml,sha256=N_TvNBh_2dU0-VYgu_LMrtY0kV_hg2HxVuEoDlr1HX8,138
33
34
  dmu_data/plotting/tests/title.yaml,sha256=bawKp9aGpeRrHzv69BOCbFX8sq9bb3Es9tdsPTE7jIk,333
34
35
  dmu_data/plotting/tests/weights.yaml,sha256=RWQ1KxbCq-uO62WJ2AoY4h5Umc37zG35s-TpKnNMABI,312
@@ -43,8 +44,8 @@ dmu_scripts/rfile/compare_root_files.py,sha256=T8lDnQxsRNMr37x1Y7YvWD8ySHrJOWZki
43
44
  dmu_scripts/rfile/print_trees.py,sha256=Ze4Ccl_iUldl4eVEDVnYBoe4amqBT1fSBR1zN5WSztk,941
44
45
  dmu_scripts/ssh/coned.py,sha256=lhilYNHWRCGxC-jtyJ3LQ4oUgWW33B2l1tYCcyHHsR0,4858
45
46
  dmu_scripts/text/transform_text.py,sha256=9akj1LB0HAyopOvkLjNOJiptZw5XoOQLe17SlcrGMD0,1456
46
- data_manipulation_utilities-0.1.9.dist-info/METADATA,sha256=sxu2cZc14f4VfDD2J3MLGmW0jRHXJBpmDspXUt1D_0k,23046
47
- data_manipulation_utilities-0.1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
48
- data_manipulation_utilities-0.1.9.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
49
- data_manipulation_utilities-0.1.9.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
50
- data_manipulation_utilities-0.1.9.dist-info/RECORD,,
47
+ data_manipulation_utilities-0.2.0.dist-info/METADATA,sha256=TJhGYcpEMs08J-Jw-Q9UT6PivCSnKo5APqPZLoFOM7g,23800
48
+ data_manipulation_utilities-0.2.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
49
+ data_manipulation_utilities-0.2.0.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
50
+ data_manipulation_utilities-0.2.0.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
51
+ data_manipulation_utilities-0.2.0.dist-info/RECORD,,
dmu/ml/cv_predict.py CHANGED
@@ -10,8 +10,8 @@ import tqdm
10
10
  from ROOT import RDataFrame
11
11
 
12
12
  import dmu.ml.utilities as ut
13
- import dmu.ml.cv_classifier as CVClassifier
14
13
 
14
+ from dmu.ml.cv_classifier import CVClassifier
15
15
  from dmu.logging.log_store import LogStore
16
16
 
17
17
  log = LogStore.add_logger('dmu:ml:cv_predict')
@@ -147,6 +147,7 @@ class CVPredict:
147
147
  arr_prb = self._predict_with_overlap(df_ft)
148
148
 
149
149
  arr_prb = self._patch_probabilities(arr_prb)
150
+ arr_prb = arr_prb.T[1]
150
151
 
151
152
  return arr_prb
152
153
  # ---------------------------------------
@@ -2,7 +2,6 @@
2
2
  Module containing plotter class
3
3
  '''
4
4
 
5
- import hist
6
5
  from hist import Hist
7
6
 
8
7
  import numpy
@@ -79,6 +78,7 @@ class Plotter1D(Plotter):
79
78
  l_bc_all = []
80
79
  for name, arr_val in d_data.items():
81
80
  arr_wgt = d_wgt[name] if d_wgt is not None else numpy.ones_like(arr_val)
81
+ arr_wgt = self._normalize_weights(arr_wgt, var)
82
82
  hst = Hist.new.Reg(bins=bins, start=minx, stop=maxx, name='x', label=name).Weight()
83
83
  hst.fill(x=arr_val, weight=arr_wgt)
84
84
  hst.plot(label=name)
@@ -88,6 +88,23 @@ class Plotter1D(Plotter):
88
88
 
89
89
  return max_y
90
90
  # --------------------------------------------
91
+ def _normalize_weights(self, arr_wgt : numpy.ndarray, var : str) -> numpy.ndarray:
92
+ cfg_var = self._d_cfg['plots'][var]
93
+ if 'normalized' not in cfg_var:
94
+ log.debug(f'Not normalizing for variable: {var}')
95
+ return arr_wgt
96
+
97
+ if not cfg_var['normalized']:
98
+ log.debug(f'Not normalizing for variable: {var}')
99
+ return arr_wgt
100
+
101
+ log.debug(f'Normalizing for variable: {var}')
102
+ total = numpy.sum(arr_wgt)
103
+ arr_wgt = arr_wgt / total
104
+
105
+ return arr_wgt
106
+ # --------------------------------------------
107
+
91
108
  def _style_plot(self, var : str, max_y : float) -> None:
92
109
  d_cfg = self._d_cfg['plots'][var]
93
110
  yscale = d_cfg['yscale' ] if 'yscale' in d_cfg else 'linear'
@@ -0,0 +1,9 @@
1
+ saving:
2
+ plt_dir : tests/plotting/normalized
3
+ plots:
4
+ x :
5
+ normalized : true
6
+ binning : [-5.0, 8.0, 40]
7
+ y :
8
+ normalized : false
9
+ binning : [-5.0, 8.0, 40]