data-manipulation-utilities 0.1.6__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_manipulation_utilities-0.1.6.dist-info → data_manipulation_utilities-0.1.9.dist-info}/METADATA +102 -3
- {data_manipulation_utilities-0.1.6.dist-info → data_manipulation_utilities-0.1.9.dist-info}/RECORD +15 -11
- {data_manipulation_utilities-0.1.6.dist-info → data_manipulation_utilities-0.1.9.dist-info}/WHEEL +1 -1
- dmu/plotting/plotter.py +1 -1
- dmu/plotting/plotter_1d.py +54 -32
- dmu/plotting/plotter_2d.py +1 -1
- dmu/stats/fitter.py +16 -9
- dmu/stats/gof_calculator.py +145 -0
- dmu/stats/minimizers.py +183 -0
- dmu/stats/model_factory.py +207 -0
- dmu/stats/zfit_plotter.py +527 -0
- dmu_data/plotting/tests/2d.yaml +7 -1
- {data_manipulation_utilities-0.1.6.data → data_manipulation_utilities-0.1.9.data}/scripts/publish +0 -0
- {data_manipulation_utilities-0.1.6.dist-info → data_manipulation_utilities-0.1.9.dist-info}/entry_points.txt +0 -0
- {data_manipulation_utilities-0.1.6.dist-info → data_manipulation_utilities-0.1.9.dist-info}/top_level.txt +0 -0
{data_manipulation_utilities-0.1.6.dist-info → data_manipulation_utilities-0.1.9.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: data_manipulation_utilities
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.9
|
4
4
|
Description-Content-Type: text/markdown
|
5
5
|
Requires-Dist: logzero
|
6
6
|
Requires-Dist: PyYAML
|
@@ -41,7 +41,7 @@ such that:
|
|
41
41
|
|
42
42
|
Then, for each remote it pushes the tags and the commits.
|
43
43
|
|
44
|
-
*Why?*
|
44
|
+
*Why?*
|
45
45
|
|
46
46
|
1. Tags should be named as the project's version
|
47
47
|
1. As soon as a new version is created, that version needs to be tagged.
|
@@ -121,6 +121,24 @@ samples:
|
|
121
121
|
|
122
122
|
## PDFs
|
123
123
|
|
124
|
+
### Model building
|
125
|
+
|
126
|
+
In order to do complex fits, one often needs PDFs with many parameters, which need to be added.
|
127
|
+
In these PDFs certain parameters (e.g. $\mu$ or $\sigma$) need to be shared. This project provides
|
128
|
+
`ModelFactory`, which can do this as shown below:
|
129
|
+
|
130
|
+
```python
|
131
|
+
from dmu.stats.model_factory import ModelFactory
|
132
|
+
|
133
|
+
l_pdf = ['cbr'] + 2 * ['cbl']
|
134
|
+
l_shr = ['mu', 'sg']
|
135
|
+
mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr)
|
136
|
+
pdf = mod.get_pdf()
|
137
|
+
```
|
138
|
+
|
139
|
+
where the model is a sum of three `CrystalBall` PDFs, one with a right tail and two with a left tail.
|
140
|
+
The `mu` and `sg` parameters are shared.
|
141
|
+
|
124
142
|
### Printing PDFs
|
125
143
|
|
126
144
|
One can print a zfit PDF by doing:
|
@@ -231,6 +249,87 @@ likelihood :
|
|
231
249
|
nbins : 100 #If specified, will do binned likelihood fit instead of unbinned
|
232
250
|
```
|
233
251
|
|
252
|
+
## Minimizers
|
253
|
+
|
254
|
+
These are alternative implementations of the minimizers in zfit meant to be used for special types of fits.
|
255
|
+
|
256
|
+
### Annealing minimizer
|
257
|
+
|
258
|
+
This minimizer is meant to be used for fits to models with many parameters, where multiple minima are expected in the
|
259
|
+
likelihood. Its usage is illustrated below:
|
260
|
+
|
261
|
+
```python
|
262
|
+
from dmu.stats.minimizers import AnealingMinimizer
|
263
|
+
|
264
|
+
nll = _get_nll()
|
265
|
+
minimizer = AnealingMinimizer(ntries=10, pvalue=0.05)
|
266
|
+
res = minimizer.minimize(nll)
|
267
|
+
```
|
268
|
+
|
269
|
+
This will:
|
270
|
+
|
271
|
+
- Take the `NLL` object.
|
272
|
+
- Try fitting at most 10 times
|
273
|
+
- After each fit, calculate the goodness of fit (in this case the p-value)
|
274
|
+
- Stop when the number of tries has been exhausted or the p-value reached is higher than `0.05`
|
275
|
+
- If the fit has not succeeded because of convergence, validity or goodness of fit issues,
|
276
|
+
randomize the parameters and try again.
|
277
|
+
- If the desired goodness of fit has not been achieved, pick the best result.
|
278
|
+
- Return the `FitResult` object and set the PDF to the final fit result.
|
279
|
+
|
280
|
+
The $\chi^2/Ndof$ can also be used as in:
|
281
|
+
|
282
|
+
```python
|
283
|
+
from dmu.stats.minimizers import AnealingMinimizer
|
284
|
+
|
285
|
+
nll = _get_nll()
|
286
|
+
minimizer = AnealingMinimizer(ntries=10, chi2ndof=1.00)
|
287
|
+
res = minimizer.minimize(nll)
|
288
|
+
```
|
289
|
+
|
290
|
+
## Fit plotting
|
291
|
+
|
292
|
+
The class `ZFitPlotter` can be used to plot fits done with zfit. For a complete set of examples of how to use
|
293
|
+
this class check the [tests](tests/stats/test_fit_plotter.py). A simple example of its usage is below:
|
294
|
+
|
295
|
+
```python
|
296
|
+
from dmu.stats.zfit_plotter import ZFitPlotter
|
297
|
+
|
298
|
+
obs = zfit.Space('m', limits=(0, 10))
|
299
|
+
|
300
|
+
# Create signal PDF
|
301
|
+
mu = zfit.Parameter("mu", 5.0, 0, 10)
|
302
|
+
sg = zfit.Parameter("sg", 0.5, 0, 5)
|
303
|
+
sig = zfit.pdf.Gauss(obs=obs, mu=mu, sigma=sg)
|
304
|
+
nsg = zfit.Parameter('nsg', 1000, 0, 10000)
|
305
|
+
esig= sig.create_extended(nsg, name='gauss')
|
306
|
+
|
307
|
+
# Create background PDF
|
308
|
+
lm = zfit.Parameter('lm', -0.1, -1, 0)
|
309
|
+
bkg = zfit.pdf.Exponential(obs=obs, lam=lm)
|
310
|
+
nbk = zfit.Parameter('nbk', 1000, 0, 10000)
|
311
|
+
ebkg= bkg.create_extended(nbk, name='expo')
|
312
|
+
|
313
|
+
# Add them
|
314
|
+
pdf = zfit.pdf.SumPDF([ebkg, esig])
|
315
|
+
sam = pdf.create_sampler()
|
316
|
+
|
317
|
+
# Plot them
|
318
|
+
obj = ZFitPlotter(data=sam, model=pdf)
|
319
|
+
d_leg = {'gauss': 'New Gauss'}
|
320
|
+
obj.plot(nbins=50, d_leg=d_leg, stacked=True, plot_range=(0, 10), ext_text='Extra text here')
|
321
|
+
|
322
|
+
# add a line to pull hist
|
323
|
+
obj.axs[1].plot([0, 10], [0, 0], linestyle='--', color='black')
|
324
|
+
```
|
325
|
+
|
326
|
+
This class supports:
|
327
|
+
|
328
|
+
- Handling of title, legend and plot size.
|
329
|
+
- Adding pulls.
|
330
|
+
- Stacking and overlaying of PDFs.
|
331
|
+
- Blinding.
|
332
|
+
|
234
333
|
## Arrays
|
235
334
|
|
236
335
|
### Scaling by non-integer
|
{data_manipulation_utilities-0.1.6.dist-info → data_manipulation_utilities-0.1.9.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
|
|
1
|
-
data_manipulation_utilities-0.1.
|
1
|
+
data_manipulation_utilities-0.1.9.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
|
2
2
|
dmu/arrays/utilities.py,sha256=PKoYyybPptA2aU-V3KLnJXBudWxTXu4x1uGdIMQ49HY,1722
|
3
3
|
dmu/generic/utilities.py,sha256=0Xnq9t35wuebAqKxbyAiMk1ISB7IcXK4cFH25MT1fgw,1741
|
4
4
|
dmu/logging/log_store.py,sha256=umdvjNDuV3LdezbG26b0AiyTglbvkxST19CQu9QATbA,4184
|
@@ -6,21 +6,25 @@ dmu/ml/cv_classifier.py,sha256=n81m7i2M6Zq96AEd9EZGwXSrbG5m9jkS5RdeXvbsAXU,3712
|
|
6
6
|
dmu/ml/cv_predict.py,sha256=Bqxu-f6qquKJokFljhCzL_kiGcjLJLQFhVBD130fsyw,4893
|
7
7
|
dmu/ml/train_mva.py,sha256=d_n-A07DFweikz5nXap4OE_Mqx8VprFT7zbxmnQAbac,9638
|
8
8
|
dmu/ml/utilities.py,sha256=Nue7O9zi1QXgjGRPH6wnSAW9jusMQ2ZOSDJzBqJKIi0,3687
|
9
|
-
dmu/plotting/plotter.py,sha256=
|
10
|
-
dmu/plotting/plotter_1d.py,sha256=
|
11
|
-
dmu/plotting/plotter_2d.py,sha256=
|
9
|
+
dmu/plotting/plotter.py,sha256=ytMxtzHEY8ZFU0ZKEBE-ROjMszXl5kHTMnQnWe173nU,7208
|
10
|
+
dmu/plotting/plotter_1d.py,sha256=O7rTgCBlpCko1RSpj2TzcUIfx9sKoz2jAgw73Pz7Ynk,4472
|
11
|
+
dmu/plotting/plotter_2d.py,sha256=J-gKnagoHGfJFU7HBrhDFpGYH5Rxy0_zF5l8eE_7ZHE,2944
|
12
12
|
dmu/rdataframe/atr_mgr.py,sha256=FdhaQWVpsm4OOe1IRbm7rfrq8VenTNdORyI-lZ2Bs1M,2386
|
13
13
|
dmu/rdataframe/utilities.py,sha256=x8r379F2-vZPYzAdMFCn_V4Kx2Tx9t9pn_QHcZ1euew,2756
|
14
14
|
dmu/rfile/rfprinter.py,sha256=mp5jd-oCJAnuokbdmGyL9i6tK2lY72jEfROuBIZ_ums,3941
|
15
15
|
dmu/rfile/utilities.py,sha256=XuYY7HuSBj46iSu3c60UYBHtI6KIPoJU_oofuhb-be0,945
|
16
|
-
dmu/stats/fitter.py,sha256=
|
16
|
+
dmu/stats/fitter.py,sha256=vHNZ16U3apoQyeyM8evq-if49doF48sKB3q9wmA96Fw,18387
|
17
17
|
dmu/stats/function.py,sha256=yzi_Fvp_ASsFzbWFivIf-comquy21WoeY7is6dgY0Go,9491
|
18
|
+
dmu/stats/gof_calculator.py,sha256=4EN6OhULcztFvsAZ00rxgohJemnjtDNB5o0IBcv6kbk,4657
|
19
|
+
dmu/stats/minimizers.py,sha256=f9cilFY9Kp9UvbSIUsKBGFzOOg7EEWZJLPod-4k-LAQ,6216
|
20
|
+
dmu/stats/model_factory.py,sha256=LyDOf0f9I5dNUTS0MXHtSivD8aAcTLIagvMPtoXtThk,7426
|
18
21
|
dmu/stats/utilities.py,sha256=LQy4kd3xSXqpApcWuYfZxkGQyjowaXv2Wr1c4Bj-4ys,4523
|
22
|
+
dmu/stats/zfit_plotter.py,sha256=Xs6kisNEmNQXhYRCcjowxO6xHuyAyrfyQIFhGAR61U4,19719
|
19
23
|
dmu/testing/utilities.py,sha256=WbMM4e9Cn3-B-12Vr64mB5qTKkV32joStlRkD-48lG0,3460
|
20
24
|
dmu/text/transformer.py,sha256=4lrGknbAWRm0-rxbvgzOO-eR1-9bkYk61boJUEV3cQ0,6100
|
21
25
|
dmu_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
26
|
dmu_data/ml/tests/train_mva.yaml,sha256=TCniCVpXMEFxZcHa8IIqollKA7ci4OkBnRznLEkXM9o,925
|
23
|
-
dmu_data/plotting/tests/2d.yaml,sha256=
|
27
|
+
dmu_data/plotting/tests/2d.yaml,sha256=VApcAfJFbjNcjMCTBSRm2P37MQlGavMZv6msbZwLSgw,402
|
24
28
|
dmu_data/plotting/tests/fig_size.yaml,sha256=7ROq49nwZ1A2EbPiySmu6n3G-Jq6YAOkc3d2X3YNZv0,294
|
25
29
|
dmu_data/plotting/tests/high_stat.yaml,sha256=bLglBLCZK6ft0xMhQ5OltxE76cWsBMPMjO6GG0OkDr8,522
|
26
30
|
dmu_data/plotting/tests/name.yaml,sha256=mkcPAVg8wBAmlSbSRQ1bcaMl4vOS6LXMtpqQeDrrtO4,312
|
@@ -39,8 +43,8 @@ dmu_scripts/rfile/compare_root_files.py,sha256=T8lDnQxsRNMr37x1Y7YvWD8ySHrJOWZki
|
|
39
43
|
dmu_scripts/rfile/print_trees.py,sha256=Ze4Ccl_iUldl4eVEDVnYBoe4amqBT1fSBR1zN5WSztk,941
|
40
44
|
dmu_scripts/ssh/coned.py,sha256=lhilYNHWRCGxC-jtyJ3LQ4oUgWW33B2l1tYCcyHHsR0,4858
|
41
45
|
dmu_scripts/text/transform_text.py,sha256=9akj1LB0HAyopOvkLjNOJiptZw5XoOQLe17SlcrGMD0,1456
|
42
|
-
data_manipulation_utilities-0.1.
|
43
|
-
data_manipulation_utilities-0.1.
|
44
|
-
data_manipulation_utilities-0.1.
|
45
|
-
data_manipulation_utilities-0.1.
|
46
|
-
data_manipulation_utilities-0.1.
|
46
|
+
data_manipulation_utilities-0.1.9.dist-info/METADATA,sha256=sxu2cZc14f4VfDD2J3MLGmW0jRHXJBpmDspXUt1D_0k,23046
|
47
|
+
data_manipulation_utilities-0.1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
48
|
+
data_manipulation_utilities-0.1.9.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
|
49
|
+
data_manipulation_utilities-0.1.9.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
|
50
|
+
data_manipulation_utilities-0.1.9.dist-info/RECORD,,
|
dmu/plotting/plotter.py
CHANGED
dmu/plotting/plotter_1d.py
CHANGED
@@ -2,6 +2,9 @@
|
|
2
2
|
Module containing plotter class
|
3
3
|
'''
|
4
4
|
|
5
|
+
import hist
|
6
|
+
from hist import Hist
|
7
|
+
|
5
8
|
import numpy
|
6
9
|
import matplotlib.pyplot as plt
|
7
10
|
|
@@ -33,58 +36,75 @@ class Plotter1D(Plotter):
|
|
33
36
|
|
34
37
|
return xname, yname
|
35
38
|
#-------------------------------------
|
36
|
-
def
|
39
|
+
def _is_normalized(self, var : str) -> bool:
|
40
|
+
d_cfg = self._d_cfg['plots'][var]
|
41
|
+
normalized=False
|
42
|
+
if 'normalized' in d_cfg:
|
43
|
+
normalized = d_cfg['normalized']
|
44
|
+
|
45
|
+
return normalized
|
46
|
+
#-------------------------------------
|
47
|
+
def _get_binning(self, var : str, d_data : dict[str, numpy.ndarray]) -> tuple[float, float, int]:
    '''
    Picks up the binning of a variable from the configuration

    Parameters
    --------------------
    var (str)     : name of column, used as key in the `plots` section of the configuration
    d_data (dict) : arrays with the values of the variable, one per sample, only used if the bounds need to be found

    Return
    --------------------
    Tuple with lower bound, upper bound and number of bins
    '''
    minx, maxx, nbins = self._d_cfg['plots'][var]['binning']

    # An upper bound not above the lower one (within 1e-5) signals that the bounds were left unset,
    # in that case the first entry of the binning is passed on to _find_bounds as `qnt`
    if maxx <= minx + 1e-5:
        log.info(f'Bounds not set for {var}, will calculated them')
        minx, maxx = self._find_bounds(d_data = d_data, qnt=minx)
        log.info(f'Using bounds [{minx:.3e}, {maxx:.3e}]')

        return minx, maxx, nbins

    log.debug(f'Using bounds [{minx:.3e}, {maxx:.3e}]')

    return minx, maxx, nbins
|
58
|
+
#-------------------------------------
|
59
|
+
def _plot_var(self, var : str) -> float:
|
37
60
|
'''
|
38
61
|
Will plot a variable from a dictionary of dataframes
|
39
62
|
Parameters
|
40
63
|
--------------------
|
41
64
|
var (str) : name of column
|
65
|
+
|
66
|
+
Return
|
67
|
+
--------------------
|
68
|
+
Largest bin content among all bins and among all histograms plotted
|
42
69
|
'''
|
43
70
|
# pylint: disable=too-many-locals
|
44
71
|
|
45
|
-
d_cfg = self._d_cfg['plots'][var]
|
46
|
-
|
47
|
-
minx, maxx, bins = d_cfg['binning']
|
48
|
-
yscale = d_cfg['yscale' ] if 'yscale' in d_cfg else 'linear'
|
49
|
-
xname, yname = self._get_labels(var)
|
50
|
-
|
51
|
-
normalized=False
|
52
|
-
if 'normalized' in d_cfg:
|
53
|
-
normalized = d_cfg['normalized']
|
54
|
-
|
55
|
-
title = ''
|
56
|
-
if 'title' in d_cfg:
|
57
|
-
title = d_cfg['title']
|
58
|
-
|
59
72
|
d_data = {}
|
60
73
|
for name, rdf in self._d_rdf.items():
|
61
74
|
d_data[name] = rdf.AsNumpy([var])[var]
|
62
75
|
|
63
|
-
|
64
|
-
|
65
|
-
minx, maxx = self._find_bounds(d_data = d_data, qnt=minx)
|
66
|
-
log.info(f'Using bounds [{minx:.3e}, {maxx:.3e}]')
|
67
|
-
else:
|
68
|
-
log.debug(f'Using bounds [{minx:.3e}, {maxx:.3e}]')
|
76
|
+
minx, maxx, bins = self._get_binning(var, d_data)
|
77
|
+
d_wgt = self._get_weights(var)
|
69
78
|
|
70
79
|
l_bc_all = []
|
71
|
-
d_wgt = self._get_weights(var)
|
72
80
|
for name, arr_val in d_data.items():
|
73
|
-
arr_wgt
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
l_bc_all
|
81
|
+
arr_wgt = d_wgt[name] if d_wgt is not None else numpy.ones_like(arr_val)
|
82
|
+
hst = Hist.new.Reg(bins=bins, start=minx, stop=maxx, name='x', label=name).Weight()
|
83
|
+
hst.fill(x=arr_val, weight=arr_wgt)
|
84
|
+
hst.plot(label=name)
|
85
|
+
l_bc_all += hst.values().tolist()
|
78
86
|
|
79
|
-
|
80
|
-
plt.xlabel(xname)
|
81
|
-
plt.ylabel(yname)
|
87
|
+
max_y = max(l_bc_all)
|
82
88
|
|
89
|
+
return max_y
|
90
|
+
# --------------------------------------------
|
91
|
+
def _style_plot(self, var : str, max_y : float) -> None:
|
92
|
+
d_cfg = self._d_cfg['plots'][var]
|
93
|
+
yscale = d_cfg['yscale' ] if 'yscale' in d_cfg else 'linear'
|
94
|
+
|
95
|
+
xname, yname = self._get_labels(var)
|
96
|
+
plt.xlabel(xname)
|
97
|
+
plt.ylabel(yname)
|
98
|
+
plt.yscale(yscale)
|
83
99
|
if yscale == 'linear':
|
84
100
|
plt.ylim(bottom=0)
|
85
101
|
|
86
|
-
|
102
|
+
title = ''
|
103
|
+
if 'title' in d_cfg:
|
104
|
+
title = d_cfg['title']
|
105
|
+
|
87
106
|
plt.ylim(top=1.2 * max_y)
|
107
|
+
plt.legend()
|
88
108
|
plt.title(title)
|
89
109
|
# --------------------------------------------
|
90
110
|
def _plot_lines(self, var : str):
|
@@ -106,8 +126,10 @@ class Plotter1D(Plotter):
|
|
106
126
|
fig_size = self._get_fig_size()
|
107
127
|
for var in self._d_cfg['plots']:
|
108
128
|
log.debug(f'Plotting: {var}')
|
129
|
+
|
109
130
|
plt.figure(var, figsize=fig_size)
|
110
|
-
self._plot_var(var)
|
131
|
+
max_y = self._plot_var(var)
|
132
|
+
self._style_plot(var, max_y)
|
111
133
|
self._plot_lines(var)
|
112
134
|
self._save_plot(var)
|
113
135
|
# --------------------------------------------
|
dmu/plotting/plotter_2d.py
CHANGED
@@ -31,8 +31,8 @@ class Plotter2D(Plotter):
|
|
31
31
|
if not isinstance(cfg, dict):
|
32
32
|
raise ValueError('Config dictionary not passed')
|
33
33
|
|
34
|
-
self._rdf : RDataFrame = rdf
|
35
34
|
self._d_cfg : dict = cfg
|
35
|
+
self._rdf : RDataFrame = super()._preprocess_rdf(rdf)
|
36
36
|
|
37
37
|
self._wgt : numpy.ndarray
|
38
38
|
# --------------------------------------------
|
dmu/stats/fitter.py
CHANGED
@@ -4,6 +4,7 @@ Module holding zfitter class
|
|
4
4
|
|
5
5
|
import pprint
|
6
6
|
from typing import Union
|
7
|
+
from functools import lru_cache
|
7
8
|
|
8
9
|
import numpy
|
9
10
|
import zfit
|
@@ -100,8 +101,8 @@ class Fitter:
|
|
100
101
|
|
101
102
|
return data
|
102
103
|
#------------------------------
|
103
|
-
def _bin_pdf(self
|
104
|
-
|
104
|
+
def _bin_pdf(self):
|
105
|
+
nbins, min_x, max_x = self._get_binning()
|
105
106
|
_, arr_edg = numpy.histogram(self._data_np, bins = nbins, range=(min_x, max_x))
|
106
107
|
|
107
108
|
size = arr_edg.size
|
@@ -117,23 +118,29 @@ class Fitter:
|
|
117
118
|
|
118
119
|
return numpy.array(l_bc)
|
119
120
|
#------------------------------
|
121
|
+
def _bin_data(self):
|
122
|
+
nbins, min_x, max_x = self._get_binning()
|
123
|
+
arr_data, _ = numpy.histogram(self._data_np, bins = nbins, range=(min_x, max_x))
|
124
|
+
arr_data = arr_data.astype(float)
|
125
|
+
|
126
|
+
return arr_data
|
127
|
+
#------------------------------
|
128
|
+
# NOTE(review): lru_cache on a method keys on `self`, so the cache holds a reference to the instance
@lru_cache(maxsize=10)
def _get_binning(self):
    '''
    Calculates the binning used for the goodness of fit

    Return
    --------------------
    Tuple with number of bins, lowest value in the data and highest value in the data.
    The number of bins is the number of degrees of freedom plus the value returned by `_get_float_pars`
    '''
    min_x, max_x = numpy.min(self._data_np), numpy.max(self._data_np)
    nbins        = self._ndof + self._get_float_pars()

    log.debug(f'Nbins: {nbins}')
    log.debug(f'Range: [{min_x:.3f}, {max_x:.3f}]')

    return nbins, min_x, max_x
|
126
138
|
#------------------------------
|
127
139
|
def _calc_gof(self):
|
128
140
|
log.debug('Calculating GOF')
|
129
|
-
nbins, min_x, max_x = self._get_binning()
|
130
141
|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
arr_data, _ = numpy.histogram(self._data_np, bins = nbins, range=(min_x, max_x))
|
135
|
-
arr_data = arr_data.astype(float)
|
136
|
-
arr_modl = self._bin_pdf(nbins)
|
142
|
+
arr_data = self._bin_data()
|
143
|
+
arr_modl = self._bin_pdf()
|
137
144
|
norm = numpy.sum(arr_data) / numpy.sum(arr_modl)
|
138
145
|
arr_modl = norm * arr_modl
|
139
146
|
arr_res = arr_modl - arr_data
|
@@ -0,0 +1,145 @@
|
|
1
|
+
'''
|
2
|
+
Module holding GofCalculator class
|
3
|
+
'''
|
4
|
+
from functools import lru_cache
|
5
|
+
|
6
|
+
import zfit
|
7
|
+
import numpy
|
8
|
+
import pandas as pnd
|
9
|
+
|
10
|
+
from scipy import stats
|
11
|
+
from zfit.core.basepdf import BasePDF as zpdf
|
12
|
+
from zfit.core.parameter import Parameter as zpar
|
13
|
+
from dmu.logging.log_store import LogStore
|
14
|
+
|
15
|
+
log = LogStore.add_logger('dmu:stats:gofcalculator')
|
16
|
+
# ------------------------
|
17
|
+
class GofCalculator:
    '''
    Class used to calculate goodness of fit from zfit NLL

    The data held by the likelihood is binned, the PDF is integrated in each bin
    and a chi2 is built from the differences. The binning and the goodness of fit
    are calculated once per instance and reused in later calls.
    '''
    # ---------------------
    def __init__(self, nll, ndof : int = 10):
        '''
        nll : Likelihood, it needs to hold one and only one model and one and only one dataset
        ndof: Number of degrees of freedom of the chi2 test, the data will be split in
              ndof + (number of floating parameters) bins
        '''
        self._nll     = nll
        self._ndof    = ndof

        # Caches are kept in the instance, a module level lru_cache on the methods
        # would keep the instance (and the data it holds) alive after it is dropped
        self._binning = None
        self._gof     = None

        self._pdf     = self._pdf_from_nll()
        self._data_in = self._data_from_nll()
        self._data_np = self._data_np_from_data(self._data_in)
        self._data_zf = zfit.Data.from_numpy(obs=self._pdf.space, array=self._data_np)
    # ---------------------
    def _data_np_from_data(self, dat) -> numpy.ndarray:
        '''
        Takes data as a numpy array, zfit data, pandas dataframe or pandas series
        and returns it as a numpy array. Raises ValueError for any other type
        '''
        if isinstance(dat, numpy.ndarray):
            return dat

        if isinstance(dat, zfit.Data):
            return zfit.run(zfit.z.unstack_x(dat))

        if isinstance(dat, pnd.DataFrame):
            return dat.to_numpy()

        if isinstance(dat, pnd.Series):
            return pnd.DataFrame(dat).to_numpy()

        data_type = str(type(dat))
        raise ValueError(f'Data is not a numpy array, zfit.Data or pandas.DataFrame, but {data_type}')
    # ---------------------
    def _pdf_from_nll(self) -> 'zpdf':
        '''
        Returns the model of the likelihood, raises ValueError unless there is exactly one
        '''
        l_model = self._nll.model
        if len(l_model) != 1:
            raise ValueError('Not found one and only one model')

        return l_model[0]
    # ---------------------
    def _data_from_nll(self):
        '''
        Returns the dataset of the likelihood, raises ValueError unless there is exactly one
        '''
        l_data = self._nll.data
        if len(l_data) != 1:
            raise ValueError('Not found one and only one dataset')

        return l_data[0]
    # ---------------------
    def _get_float_pars(self) -> int:
        '''
        Returns number of floating parameters of the PDF
        '''
        return sum(1 for par in self._pdf.get_params() if par.floating)
    # ---------------------
    def _get_binning(self) -> tuple[int, float, float]:
        '''
        Returns tuple with number of bins, lowest and highest value in the data.
        The result is calculated in the first call and reused afterwards
        '''
        if self._binning is None:
            min_x = numpy.min(self._data_np)
            max_x = numpy.max(self._data_np)
            nbins = self._ndof + self._get_float_pars()

            log.debug(f'Nbins: {nbins}')
            log.debug(f'Range: [{min_x:.3f}, {max_x:.3f}]')

            self._binning = nbins, min_x, max_x

        return self._binning
    # ---------------------
    def _get_pdf_bin_contents(self) -> numpy.ndarray:
        '''
        Returns array with the integral of the PDF in each bin, scaled by the size of the dataset
        '''
        nbins, min_x, max_x = self._get_binning()
        # Only the edges are needed here, they are the same ones used to bin the data
        _, arr_edg = numpy.histogram(self._data_np, bins = nbins, range=(min_x, max_x))

        l_bc = []
        for low, hig in zip(arr_edg[:-1], arr_edg[1:]):
            var : zpar = self._pdf.integrate(limits = [low, hig])
            val = var.numpy()[0]
            l_bc.append(val * self._data_np.size)

        return numpy.array(l_bc)
    #------------------------------
    def _get_data_bin_contents(self) -> numpy.ndarray:
        '''
        Returns array of floats with the number of entries of the data in each bin
        '''
        nbins, min_x, max_x = self._get_binning()
        arr_data, _ = numpy.histogram(self._data_np, bins = nbins, range=(min_x, max_x))

        return arr_data.astype(float)
    #------------------------------
    def _calculate_gof(self) -> tuple[float, int, float]:
        '''
        Returns tuple with chi2, number of degrees of freedom and p-value.
        The result is calculated in the first call and reused afterwards
        '''
        if self._gof is not None:
            return self._gof

        log.debug('Calculating GOF')

        arr_data = self._get_data_bin_contents()
        arr_modl = self._get_pdf_bin_contents()

        # The model is scaled to the same total yield as the data
        norm     = numpy.sum(arr_data) / numpy.sum(arr_modl)
        arr_modl = norm * arr_modl
        arr_res  = arr_modl - arr_data

        # Bins without data contribute zero instead of dividing by zero
        arr_chi2 = numpy.divide(arr_res ** 2, arr_data, out=numpy.zeros_like(arr_data), where=arr_data!=0)
        sum_chi2 = numpy.sum(arr_chi2)

        # The survival function is used instead of 1 - cdf, the latter
        # rounds to exactly zero for large chi2, which makes bad fits indistinguishable
        pvalue   = float(stats.chi2.sf(sum_chi2, self._ndof))

        log.debug(f'Chi2: {sum_chi2:.3f}')
        log.debug(f'Ndof: {self._ndof}')
        log.debug(f'pval: {pvalue:<.3e}')

        self._gof = sum_chi2, self._ndof, pvalue

        return self._gof
    # ---------------------
    def get_gof(self, kind : str) -> float:
        '''
        Returns goodness of fit of a given kind

        kind: Type of goodness of fit, either `pvalue` or `chi2/ndof`,
              anything else raises NotImplementedError
        '''

        chi2, ndof, pval = self._calculate_gof()

        if kind == 'pvalue':
            return pval

        if kind == 'chi2/ndof':
            return chi2/ndof

        raise NotImplementedError(f'Invalid goodness of fit: {kind}')
|
145
|
+
# ------------------------
|