data-manipulation-utilities 0.1.6__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: data_manipulation_utilities
3
- Version: 0.1.6
3
+ Version: 0.1.9
4
4
  Description-Content-Type: text/markdown
5
5
  Requires-Dist: logzero
6
6
  Requires-Dist: PyYAML
@@ -41,7 +41,7 @@ such that:
41
41
 
42
42
  Then, for each remote it pushes the tags and the commits.
43
43
 
44
- *Why?*
44
+ *Why?*
45
45
 
46
46
  1. Tags should be named as the project's version
47
47
  1. As soon as a new version is created, that version needs to be tagged.
@@ -121,6 +121,24 @@ samples:
121
121
 
122
122
  ## PDFs
123
123
 
124
+ ### Model building
125
+
126
+ In order to do complex fits, one often needs to add together several PDFs, each with many parameters.
127
+ In these PDFs certain parameters (e.g. $\mu$ or $\sigma$) need to be shared. This project provides
128
+ `ModelFactory`, which can do this as shown below:
129
+
130
+ ```python
131
+ from dmu.stats.model_factory import ModelFactory
132
+
133
+ l_pdf = ['cbr'] + 2 * ['cbl']
134
+ l_shr = ['mu', 'sg']
135
+ mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr)
136
+ pdf = mod.get_pdf()
137
+ ```
138
+
139
+ where the model is a sum of three `CrystalBall` PDFs, one with a right tail and two with a left tail.
140
+ The `mu` and `sg` parameters are shared.
141
+
124
142
  ### Printing PDFs
125
143
 
126
144
  One can print a zfit PDF by doing:
@@ -231,6 +249,87 @@ likelihood :
231
249
  nbins : 100 #If specified, will do binned likelihood fit instead of unbinned
232
250
  ```
233
251
 
252
+ ## Minimizers
253
+
254
+ These are alternative implementations of the minimizers in zfit meant to be used for special types of fits.
255
+
256
+ ### Annealing minimizer
257
+
258
+ This minimizer is meant to be used for fits to models with many parameters, where multiple minima are expected in the
259
+ likelihood. The use of the minimizer is illustrated below:
260
+
261
+ ```python
262
+ from dmu.stats.minimizers import AnealingMinimizer
263
+
264
+ nll = _get_nll()
265
+ minimizer = AnealingMinimizer(ntries=10, pvalue=0.05)
266
+ res = minimizer.minimize(nll)
267
+ ```
268
+
269
+ This will:
270
+
271
+ - Take the `NLL` object.
272
+ - Try fitting at most 10 times
273
+ - After each fit, calculate the goodness of fit (in this case the p-value)
274
+ - Stop when the number of tries has been exhausted or the p-value reached is higher than `0.05`
275
+ - If the fit has not succeeded because of convergence, validity or goodness of fit issues,
276
+ randomize the parameters and try again.
277
+ - If the desired goodness of fit has not been achieved, pick the best result.
278
+ - Return the `FitResult` object and set the PDF to the final fit result.
279
+
280
+ The $\chi^2/N_{dof}$ can also be used as the goodness-of-fit criterion, as in:
281
+
282
+ ```python
283
+ from dmu.stats.minimizers import AnealingMinimizer
284
+
285
+ nll = _get_nll()
286
+ minimizer = AnealingMinimizer(ntries=10, chi2ndof=1.00)
287
+ res = minimizer.minimize(nll)
288
+ ```
289
+
290
+ ## Fit plotting
291
+
292
+ The class `ZFitPlotter` can be used to plot fits done with zfit. For a complete set of examples of how to use
293
+ this class check the [tests](tests/stats/test_fit_plotter.py). A simple example of its usage is below:
294
+
295
+ ```python
296
+ from dmu.stats.zfit_plotter import ZFitPlotter
297
+
298
+ obs = zfit.Space('m', limits=(0, 10))
299
+
300
+ # Create signal PDF
301
+ mu = zfit.Parameter("mu", 5.0, 0, 10)
302
+ sg = zfit.Parameter("sg", 0.5, 0, 5)
303
+ sig = zfit.pdf.Gauss(obs=obs, mu=mu, sigma=sg)
304
+ nsg = zfit.Parameter('nsg', 1000, 0, 10000)
305
+ esig= sig.create_extended(nsg, name='gauss')
306
+
307
+ # Create background PDF
308
+ lm = zfit.Parameter('lm', -0.1, -1, 0)
309
+ bkg = zfit.pdf.Exponential(obs=obs, lam=lm)
310
+ nbk = zfit.Parameter('nbk', 1000, 0, 10000)
311
+ ebkg= bkg.create_extended(nbk, name='expo')
312
+
313
+ # Add them
314
+ pdf = zfit.pdf.SumPDF([ebkg, esig])
315
+ sam = pdf.create_sampler()
316
+
317
+ # Plot them
318
+ obj = ZFitPlotter(data=sam, model=pdf)
319
+ d_leg = {'gauss': 'New Gauss'}
320
+ obj.plot(nbins=50, d_leg=d_leg, stacked=True, plot_range=(0, 10), ext_text='Extra text here')
321
+
322
+ # add a line to pull hist
323
+ obj.axs[1].plot([0, 10], [0, 0], linestyle='--', color='black')
324
+ ```
325
+
326
+ This class supports:
327
+
328
+ - Handling of the title, legend and plot size.
329
+ - Adding pulls.
330
+ - Stacking and overlaying of PDFs.
331
+ - Blinding.
332
+
234
333
  ## Arrays
235
334
 
236
335
  ### Scaling by non-integer
@@ -1,4 +1,4 @@
1
- data_manipulation_utilities-0.1.6.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
1
+ data_manipulation_utilities-0.1.9.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
2
2
  dmu/arrays/utilities.py,sha256=PKoYyybPptA2aU-V3KLnJXBudWxTXu4x1uGdIMQ49HY,1722
3
3
  dmu/generic/utilities.py,sha256=0Xnq9t35wuebAqKxbyAiMk1ISB7IcXK4cFH25MT1fgw,1741
4
4
  dmu/logging/log_store.py,sha256=umdvjNDuV3LdezbG26b0AiyTglbvkxST19CQu9QATbA,4184
@@ -6,21 +6,25 @@ dmu/ml/cv_classifier.py,sha256=n81m7i2M6Zq96AEd9EZGwXSrbG5m9jkS5RdeXvbsAXU,3712
6
6
  dmu/ml/cv_predict.py,sha256=Bqxu-f6qquKJokFljhCzL_kiGcjLJLQFhVBD130fsyw,4893
7
7
  dmu/ml/train_mva.py,sha256=d_n-A07DFweikz5nXap4OE_Mqx8VprFT7zbxmnQAbac,9638
8
8
  dmu/ml/utilities.py,sha256=Nue7O9zi1QXgjGRPH6wnSAW9jusMQ2ZOSDJzBqJKIi0,3687
9
- dmu/plotting/plotter.py,sha256=laa6Kl7P-ZOIhaOFBVjOH4XQ4kPCV7wBNvLIMBnyCwM,7181
10
- dmu/plotting/plotter_1d.py,sha256=G-i94uzm2TjNaog1A4agAKar_G0qNdkAqIPCmzhe85Y,3660
11
- dmu/plotting/plotter_2d.py,sha256=SWPKns-CfpUZHgBXvwm3gceH3k2eL_mKGXQ8sWpZJB0,2919
9
+ dmu/plotting/plotter.py,sha256=ytMxtzHEY8ZFU0ZKEBE-ROjMszXl5kHTMnQnWe173nU,7208
10
+ dmu/plotting/plotter_1d.py,sha256=O7rTgCBlpCko1RSpj2TzcUIfx9sKoz2jAgw73Pz7Ynk,4472
11
+ dmu/plotting/plotter_2d.py,sha256=J-gKnagoHGfJFU7HBrhDFpGYH5Rxy0_zF5l8eE_7ZHE,2944
12
12
  dmu/rdataframe/atr_mgr.py,sha256=FdhaQWVpsm4OOe1IRbm7rfrq8VenTNdORyI-lZ2Bs1M,2386
13
13
  dmu/rdataframe/utilities.py,sha256=x8r379F2-vZPYzAdMFCn_V4Kx2Tx9t9pn_QHcZ1euew,2756
14
14
  dmu/rfile/rfprinter.py,sha256=mp5jd-oCJAnuokbdmGyL9i6tK2lY72jEfROuBIZ_ums,3941
15
15
  dmu/rfile/utilities.py,sha256=XuYY7HuSBj46iSu3c60UYBHtI6KIPoJU_oofuhb-be0,945
16
- dmu/stats/fitter.py,sha256=LDvFNyhgO0OzXN7aH3kfHe6LzuPqdQfPcKR_IegDcaU,18204
16
+ dmu/stats/fitter.py,sha256=vHNZ16U3apoQyeyM8evq-if49doF48sKB3q9wmA96Fw,18387
17
17
  dmu/stats/function.py,sha256=yzi_Fvp_ASsFzbWFivIf-comquy21WoeY7is6dgY0Go,9491
18
+ dmu/stats/gof_calculator.py,sha256=4EN6OhULcztFvsAZ00rxgohJemnjtDNB5o0IBcv6kbk,4657
19
+ dmu/stats/minimizers.py,sha256=f9cilFY9Kp9UvbSIUsKBGFzOOg7EEWZJLPod-4k-LAQ,6216
20
+ dmu/stats/model_factory.py,sha256=LyDOf0f9I5dNUTS0MXHtSivD8aAcTLIagvMPtoXtThk,7426
18
21
  dmu/stats/utilities.py,sha256=LQy4kd3xSXqpApcWuYfZxkGQyjowaXv2Wr1c4Bj-4ys,4523
22
+ dmu/stats/zfit_plotter.py,sha256=Xs6kisNEmNQXhYRCcjowxO6xHuyAyrfyQIFhGAR61U4,19719
19
23
  dmu/testing/utilities.py,sha256=WbMM4e9Cn3-B-12Vr64mB5qTKkV32joStlRkD-48lG0,3460
20
24
  dmu/text/transformer.py,sha256=4lrGknbAWRm0-rxbvgzOO-eR1-9bkYk61boJUEV3cQ0,6100
21
25
  dmu_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
26
  dmu_data/ml/tests/train_mva.yaml,sha256=TCniCVpXMEFxZcHa8IIqollKA7ci4OkBnRznLEkXM9o,925
23
- dmu_data/plotting/tests/2d.yaml,sha256=lTMNheK3DB8klp4O5QjMDwBI1A1Oh2_Wp2F2Ro9VQKM,282
27
+ dmu_data/plotting/tests/2d.yaml,sha256=VApcAfJFbjNcjMCTBSRm2P37MQlGavMZv6msbZwLSgw,402
24
28
  dmu_data/plotting/tests/fig_size.yaml,sha256=7ROq49nwZ1A2EbPiySmu6n3G-Jq6YAOkc3d2X3YNZv0,294
25
29
  dmu_data/plotting/tests/high_stat.yaml,sha256=bLglBLCZK6ft0xMhQ5OltxE76cWsBMPMjO6GG0OkDr8,522
26
30
  dmu_data/plotting/tests/name.yaml,sha256=mkcPAVg8wBAmlSbSRQ1bcaMl4vOS6LXMtpqQeDrrtO4,312
@@ -39,8 +43,8 @@ dmu_scripts/rfile/compare_root_files.py,sha256=T8lDnQxsRNMr37x1Y7YvWD8ySHrJOWZki
39
43
  dmu_scripts/rfile/print_trees.py,sha256=Ze4Ccl_iUldl4eVEDVnYBoe4amqBT1fSBR1zN5WSztk,941
40
44
  dmu_scripts/ssh/coned.py,sha256=lhilYNHWRCGxC-jtyJ3LQ4oUgWW33B2l1tYCcyHHsR0,4858
41
45
  dmu_scripts/text/transform_text.py,sha256=9akj1LB0HAyopOvkLjNOJiptZw5XoOQLe17SlcrGMD0,1456
42
- data_manipulation_utilities-0.1.6.dist-info/METADATA,sha256=1ttATABwWcdqqPJM72_4s_ZQjtbFp9MzkfsprkDJTv8,19946
43
- data_manipulation_utilities-0.1.6.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
44
- data_manipulation_utilities-0.1.6.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
45
- data_manipulation_utilities-0.1.6.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
46
- data_manipulation_utilities-0.1.6.dist-info/RECORD,,
46
+ data_manipulation_utilities-0.1.9.dist-info/METADATA,sha256=sxu2cZc14f4VfDD2J3MLGmW0jRHXJBpmDspXUt1D_0k,23046
47
+ data_manipulation_utilities-0.1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
48
+ data_manipulation_utilities-0.1.9.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
49
+ data_manipulation_utilities-0.1.9.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
50
+ data_manipulation_utilities-0.1.9.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.6.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
dmu/plotting/plotter.py CHANGED
@@ -65,7 +65,7 @@ class Plotter:
65
65
 
66
66
  return minx, maxx
67
67
  #-------------------------------------
68
- def _preprocess_rdf(self, rdf):
68
+ def _preprocess_rdf(self, rdf : RDataFrame) -> RDataFrame:
69
69
  '''
70
70
  rdf (RDataFrame): ROOT dataframe
71
71
 
@@ -2,6 +2,9 @@
2
2
  Module containing plotter class
3
3
  '''
4
4
 
5
+ import hist
6
+ from hist import Hist
7
+
5
8
  import numpy
6
9
  import matplotlib.pyplot as plt
7
10
 
@@ -33,58 +36,75 @@ class Plotter1D(Plotter):
33
36
 
34
37
  return xname, yname
35
38
  #-------------------------------------
36
- def _plot_var(self, var):
39
+ def _is_normalized(self, var : str) -> bool:
40
+ d_cfg = self._d_cfg['plots'][var]
41
+ normalized=False
42
+ if 'normalized' in d_cfg:
43
+ normalized = d_cfg['normalized']
44
+
45
+ return normalized
46
+ #-------------------------------------
47
+ def _get_binning(self, var : str, d_data : dict[str, numpy.ndarray]) -> tuple[float, float, int]:
48
+ d_cfg = self._d_cfg['plots'][var]
49
+ minx, maxx, bins = d_cfg['binning']
50
+ if maxx <= minx + 1e-5:
51
+ log.info(f'Bounds not set for {var}, will calculated them')
52
+ minx, maxx = self._find_bounds(d_data = d_data, qnt=minx)
53
+ log.info(f'Using bounds [{minx:.3e}, {maxx:.3e}]')
54
+ else:
55
+ log.debug(f'Using bounds [{minx:.3e}, {maxx:.3e}]')
56
+
57
+ return minx, maxx, bins
58
+ #-------------------------------------
59
+ def _plot_var(self, var : str) -> float:
37
60
  '''
38
61
  Will plot a variable from a dictionary of dataframes
39
62
  Parameters
40
63
  --------------------
41
64
  var (str) : name of column
65
+
66
+ Return
67
+ --------------------
68
+ Largest bin content among all bins and among all histograms plotted
42
69
  '''
43
70
  # pylint: disable=too-many-locals
44
71
 
45
- d_cfg = self._d_cfg['plots'][var]
46
-
47
- minx, maxx, bins = d_cfg['binning']
48
- yscale = d_cfg['yscale' ] if 'yscale' in d_cfg else 'linear'
49
- xname, yname = self._get_labels(var)
50
-
51
- normalized=False
52
- if 'normalized' in d_cfg:
53
- normalized = d_cfg['normalized']
54
-
55
- title = ''
56
- if 'title' in d_cfg:
57
- title = d_cfg['title']
58
-
59
72
  d_data = {}
60
73
  for name, rdf in self._d_rdf.items():
61
74
  d_data[name] = rdf.AsNumpy([var])[var]
62
75
 
63
- if maxx <= minx + 1e-5:
64
- log.info(f'Bounds not set for {var}, will calculated them')
65
- minx, maxx = self._find_bounds(d_data = d_data, qnt=minx)
66
- log.info(f'Using bounds [{minx:.3e}, {maxx:.3e}]')
67
- else:
68
- log.debug(f'Using bounds [{minx:.3e}, {maxx:.3e}]')
76
+ minx, maxx, bins = self._get_binning(var, d_data)
77
+ d_wgt = self._get_weights(var)
69
78
 
70
79
  l_bc_all = []
71
- d_wgt = self._get_weights(var)
72
80
  for name, arr_val in d_data.items():
73
- arr_wgt = d_wgt[name] if d_wgt is not None else None
74
-
75
- self._print_weights(arr_wgt, var, name)
76
- l_bc, _, _ = plt.hist(arr_val, weights=arr_wgt, bins=bins, range=(minx, maxx), density=normalized, histtype='step', label=name)
77
- l_bc_all += numpy.array(l_bc).tolist()
81
+ arr_wgt = d_wgt[name] if d_wgt is not None else numpy.ones_like(arr_val)
82
+ hst = Hist.new.Reg(bins=bins, start=minx, stop=maxx, name='x', label=name).Weight()
83
+ hst.fill(x=arr_val, weight=arr_wgt)
84
+ hst.plot(label=name)
85
+ l_bc_all += hst.values().tolist()
78
86
 
79
- plt.yscale(yscale)
80
- plt.xlabel(xname)
81
- plt.ylabel(yname)
87
+ max_y = max(l_bc_all)
82
88
 
89
+ return max_y
90
+ # --------------------------------------------
91
+ def _style_plot(self, var : str, max_y : float) -> None:
92
+ d_cfg = self._d_cfg['plots'][var]
93
+ yscale = d_cfg['yscale' ] if 'yscale' in d_cfg else 'linear'
94
+
95
+ xname, yname = self._get_labels(var)
96
+ plt.xlabel(xname)
97
+ plt.ylabel(yname)
98
+ plt.yscale(yscale)
83
99
  if yscale == 'linear':
84
100
  plt.ylim(bottom=0)
85
101
 
86
- max_y = max(l_bc_all)
102
+ title = ''
103
+ if 'title' in d_cfg:
104
+ title = d_cfg['title']
105
+
87
106
  plt.ylim(top=1.2 * max_y)
107
+ plt.legend()
88
108
  plt.title(title)
89
109
  # --------------------------------------------
90
110
  def _plot_lines(self, var : str):
@@ -106,8 +126,10 @@ class Plotter1D(Plotter):
106
126
  fig_size = self._get_fig_size()
107
127
  for var in self._d_cfg['plots']:
108
128
  log.debug(f'Plotting: {var}')
129
+
109
130
  plt.figure(var, figsize=fig_size)
110
- self._plot_var(var)
131
+ max_y = self._plot_var(var)
132
+ self._style_plot(var, max_y)
111
133
  self._plot_lines(var)
112
134
  self._save_plot(var)
113
135
  # --------------------------------------------
@@ -31,8 +31,8 @@ class Plotter2D(Plotter):
31
31
  if not isinstance(cfg, dict):
32
32
  raise ValueError('Config dictionary not passed')
33
33
 
34
- self._rdf : RDataFrame = rdf
35
34
  self._d_cfg : dict = cfg
35
+ self._rdf : RDataFrame = super()._preprocess_rdf(rdf)
36
36
 
37
37
  self._wgt : numpy.ndarray
38
38
  # --------------------------------------------
dmu/stats/fitter.py CHANGED
@@ -4,6 +4,7 @@ Module holding zfitter class
4
4
 
5
5
  import pprint
6
6
  from typing import Union
7
+ from functools import lru_cache
7
8
 
8
9
  import numpy
9
10
  import zfit
@@ -100,8 +101,8 @@ class Fitter:
100
101
 
101
102
  return data
102
103
  #------------------------------
103
- def _bin_pdf(self, nbins):
104
- [[min_x]], [[max_x]] = self._pdf.space.limits
104
+ def _bin_pdf(self):
105
+ nbins, min_x, max_x = self._get_binning()
105
106
  _, arr_edg = numpy.histogram(self._data_np, bins = nbins, range=(min_x, max_x))
106
107
 
107
108
  size = arr_edg.size
@@ -117,23 +118,29 @@ class Fitter:
117
118
 
118
119
  return numpy.array(l_bc)
119
120
  #------------------------------
121
+ def _bin_data(self):
122
+ nbins, min_x, max_x = self._get_binning()
123
+ arr_data, _ = numpy.histogram(self._data_np, bins = nbins, range=(min_x, max_x))
124
+ arr_data = arr_data.astype(float)
125
+
126
+ return arr_data
127
+ #------------------------------
128
+ @lru_cache(maxsize=10)
120
129
  def _get_binning(self):
121
130
  min_x = numpy.min(self._data_np)
122
131
  max_x = numpy.max(self._data_np)
123
132
  nbins = self._ndof + self._get_float_pars()
124
133
 
134
+ log.debug(f'Nbins: {nbins}')
135
+ log.debug(f'Range: [{min_x:.3f}, {max_x:.3f}]')
136
+
125
137
  return nbins, min_x, max_x
126
138
  #------------------------------
127
139
  def _calc_gof(self):
128
140
  log.debug('Calculating GOF')
129
- nbins, min_x, max_x = self._get_binning()
130
141
 
131
- log.debug(f'Nbins: {nbins}')
132
- log.debug(f'Range: [{min_x:.3f}, {max_x:.3f}]')
133
-
134
- arr_data, _ = numpy.histogram(self._data_np, bins = nbins, range=(min_x, max_x))
135
- arr_data = arr_data.astype(float)
136
- arr_modl = self._bin_pdf(nbins)
142
+ arr_data = self._bin_data()
143
+ arr_modl = self._bin_pdf()
137
144
  norm = numpy.sum(arr_data) / numpy.sum(arr_modl)
138
145
  arr_modl = norm * arr_modl
139
146
  arr_res = arr_modl - arr_data
@@ -0,0 +1,145 @@
1
+ '''
2
+ Module holding GofCalculator class
3
+ '''
4
+ from functools import lru_cache
5
+
6
+ import zfit
7
+ import numpy
8
+ import pandas as pnd
9
+
10
+ from scipy import stats
11
+ from zfit.core.basepdf import BasePDF as zpdf
12
+ from zfit.core.parameter import Parameter as zpar
13
+ from dmu.logging.log_store import LogStore
14
+
15
+ log = LogStore.add_logger('dmu:stats:gofcalculator')
16
+ # ------------------------
17
+ class GofCalculator:
18
+ '''
19
+ Class used to calculate goodness of fit from zfit NLL
20
+ '''
21
+ # ---------------------
22
+ def __init__(self, nll, ndof : int = 10):
23
+ self._nll = nll
24
+ self._ndof = ndof
25
+
26
+ self._pdf = self._pdf_from_nll()
27
+ self._data_in = self._data_from_nll()
28
+ self._data_np = self._data_np_from_data(self._data_in)
29
+ self._data_zf = zfit.Data.from_numpy(obs=self._pdf.space, array=self._data_np)
30
+ # ---------------------
31
+ def _data_np_from_data(self, dat) -> numpy.ndarray:
32
+ if isinstance(dat, numpy.ndarray):
33
+ return dat
34
+
35
+ if isinstance(dat, zfit.Data):
36
+ return zfit.run(zfit.z.unstack_x(dat))
37
+
38
+ if isinstance(dat, pnd.DataFrame):
39
+ return dat.to_numpy()
40
+
41
+ if isinstance(dat, pnd.Series):
42
+ dat = pnd.DataFrame(dat)
43
+ return dat.to_numpy()
44
+
45
+ data_type = str(type(dat))
46
+ raise ValueError(f'Data is not a numpy array, zfit.Data or pandas.DataFrame, but {data_type}')
47
+ # ---------------------
48
+ def _pdf_from_nll(self) -> zpdf:
49
+ l_model = self._nll.model
50
+ if len(l_model) != 1:
51
+ raise ValueError('Not found one and only one model')
52
+
53
+ return l_model[0]
54
+ # ---------------------
55
+ def _data_from_nll(self) -> zpdf:
56
+ l_data = self._nll.data
57
+ if len(l_data) != 1:
58
+ raise ValueError('Not found one and only one dataset')
59
+
60
+ return l_data[0]
61
+ # ---------------------
62
+ def _get_float_pars(self) -> int:
63
+ npar = 0
64
+ s_par = self._pdf.get_params()
65
+ for par in s_par:
66
+ if par.floating:
67
+ npar+=1
68
+
69
+ return npar
70
+ # ---------------------
71
+ @lru_cache(maxsize=10)
72
+ def _get_binning(self) -> tuple[int, float, float]:
73
+ min_x = numpy.min(self._data_np)
74
+ max_x = numpy.max(self._data_np)
75
+ nbins = self._ndof + self._get_float_pars()
76
+
77
+ log.debug(f'Nbins: {nbins}')
78
+ log.debug(f'Range: [{min_x:.3f}, {max_x:.3f}]')
79
+
80
+ return nbins, min_x, max_x
81
+ # ---------------------
82
+ def _get_pdf_bin_contents(self) -> numpy.ndarray:
83
+ nbins, min_x, max_x = self._get_binning()
84
+ _, arr_edg = numpy.histogram(self._data_np, bins = nbins, range=(min_x, max_x))
85
+
86
+ size = arr_edg.size
87
+
88
+ l_bc = []
89
+ for i_edg in range(size - 1):
90
+ low = arr_edg[i_edg + 0]
91
+ hig = arr_edg[i_edg + 1]
92
+
93
+ var : zpar = self._pdf.integrate(limits = [low, hig])
94
+ val = var.numpy()[0]
95
+ l_bc.append(val * self._data_np.size)
96
+
97
+ return numpy.array(l_bc)
98
+ #------------------------------
99
+ def _get_data_bin_contents(self) -> numpy.ndarray:
100
+ nbins, min_x, max_x = self._get_binning()
101
+ arr_data, _ = numpy.histogram(self._data_np, bins = nbins, range=(min_x, max_x))
102
+ arr_data = arr_data.astype(float)
103
+
104
+ return arr_data
105
+ #------------------------------
106
+ @lru_cache(maxsize=30)
107
+ def _calculate_gof(self) -> tuple[float, int, float]:
108
+ log.debug('Calculating GOF')
109
+
110
+ arr_data = self._get_data_bin_contents()
111
+ arr_modl = self._get_pdf_bin_contents()
112
+
113
+ norm = numpy.sum(arr_data) / numpy.sum(arr_modl)
114
+ arr_modl = norm * arr_modl
115
+ arr_res = arr_modl - arr_data
116
+
117
+ arr_chi2 = numpy.divide(arr_res ** 2, arr_data, out=numpy.zeros_like(arr_data), where=arr_data!=0)
118
+ sum_chi2 = numpy.sum(arr_chi2)
119
+
120
+ pvalue = 1 - stats.chi2.cdf(sum_chi2, self._ndof)
121
+ pvalue = float(pvalue)
122
+
123
+ log.debug(f'Chi2: {sum_chi2:.3f}')
124
+ log.debug(f'Ndof: {self._ndof}')
125
+ log.debug(f'pval: {pvalue:<.3e}')
126
+
127
+ return sum_chi2, self._ndof, pvalue
128
+ # ---------------------
129
+ def get_gof(self, kind : str) -> float:
130
+ '''
131
+ Returns good ness of fit of a given kind
132
+
133
+ kind: Type of goodness of fit, e.g. pvalue
134
+ '''
135
+
136
+ chi2, ndof, pval = self._calculate_gof()
137
+
138
+ if kind == 'pvalue':
139
+ return pval
140
+
141
+ if kind == 'chi2/ndof':
142
+ return chi2/ndof
143
+
144
+ raise NotImplementedError(f'Invalid goodness of fit: {kind}')
145
+ # ------------------------