data-manipulation-utilities 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/METADATA +179 -10
  2. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/RECORD +31 -19
  3. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/WHEEL +1 -1
  4. dmu/generic/hashing.py +44 -0
  5. dmu/generic/utilities.py +14 -1
  6. dmu/generic/version_management.py +3 -5
  7. dmu/ml/cv_diagnostics.py +221 -0
  8. dmu/ml/train_mva.py +143 -46
  9. dmu/pdataframe/utilities.py +36 -3
  10. dmu/plotting/fwhm.py +64 -0
  11. dmu/plotting/plotter.py +2 -0
  12. dmu/plotting/plotter_1d.py +87 -6
  13. dmu/stats/fitter.py +1 -1
  14. dmu/stats/minimizers.py +40 -11
  15. dmu/stats/model_factory.py +248 -44
  16. dmu/stats/zfit_models.py +68 -0
  17. dmu/stats/zfit_plotter.py +29 -21
  18. dmu/testing/utilities.py +31 -4
  19. dmu_data/ml/tests/diagnostics_from_file.yaml +13 -0
  20. dmu_data/ml/tests/diagnostics_from_model.yaml +10 -0
  21. dmu_data/ml/tests/diagnostics_multiple_methods.yaml +10 -0
  22. dmu_data/ml/tests/diagnostics_overlay.yaml +33 -0
  23. dmu_data/ml/tests/train_mva.yaml +19 -10
  24. dmu_data/ml/tests/train_mva_with_diagnostics.yaml +82 -0
  25. dmu_data/plotting/tests/plug_fwhm.yaml +24 -0
  26. dmu_data/plotting/tests/plug_stats.yaml +19 -0
  27. dmu_data/plotting/tests/simple.yaml +4 -3
  28. dmu_data/plotting/tests/styling.yaml +11 -0
  29. {data_manipulation_utilities-0.2.5.data → data_manipulation_utilities-0.2.7.data}/scripts/publish +0 -0
  30. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/entry_points.txt +0 -0
  31. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/top_level.txt +0 -0
dmu/stats/zfit_plotter.py CHANGED
@@ -1,7 +1,7 @@
1
1
  '''
2
2
  Module containing plot class, used to plot fits
3
3
  '''
4
- # pylint: disable=too-many-instance-attributes
4
+ # pylint: disable=too-many-instance-attributes, too-many-arguments
5
5
 
6
6
  import warnings
7
7
  import pprint
@@ -51,6 +51,8 @@ class ZFitPlotter:
51
51
  self._figsize = None
52
52
  self._leg_loc = None
53
53
 
54
+ self.dat_xerr : bool
55
+
54
56
  # zfit.settings.advanced_warnings['extend_wrapped_extended'] = False
55
57
  warnings.filterwarnings("ignore")
56
58
  #----------------------------------------
@@ -60,17 +62,17 @@ class ZFitPlotter:
60
62
  self._l_def_col = list(mcolors.TABLEAU_COLORS.keys())
61
63
  #----------------------------------------
62
64
  def _data_to_zdata(self, obs, data, weights):
65
+ if isinstance(data, zfit.data.Data):
66
+ return data
67
+
63
68
  if isinstance(data, np.ndarray):
64
69
  data = zfit.Data.from_numpy (obs=obs, array=data , weights=weights)
65
70
  elif isinstance(data, pd.Series):
66
71
  data = zfit.Data.from_pandas(obs=obs, df=pd.DataFrame(data), weights=weights)
67
72
  elif isinstance(data, pd.DataFrame):
68
73
  data = zfit.Data.from_pandas(obs=obs, df=data , weights=weights)
69
- elif isinstance(data, zfit.data.Data):
70
- data = data
71
74
  else:
72
- log.error(f'Passed data is of usupported type {type(data)}')
73
- raise
75
+ raise ValueError(f'Passed data is of usupported type {type(data)}')
74
76
 
75
77
  return data
76
78
  #----------------------------------------
@@ -200,7 +202,7 @@ class ZFitPlotter:
200
202
  #----------------------------------------
201
203
  def _get_zfit_gof(self):
202
204
  if not hasattr(self._result, 'gof'):
203
- return
205
+ return None
204
206
 
205
207
  chi2, ndof, pval = self._result.gof
206
208
 
@@ -211,14 +213,16 @@ class ZFitPlotter:
211
213
  def _get_text(self, ext_text):
212
214
  gof_text = self._get_zfit_gof()
213
215
 
214
- if ext_text is None and gof_text is None:
215
- return
216
- elif ext_text is not None and gof_text is None:
216
+ if ext_text is None and gof_text is None:
217
+ return None
218
+
219
+ if ext_text is not None and gof_text is None:
217
220
  return ext_text
218
- elif ext_text is None and gof_text is not None:
221
+
222
+ if ext_text is None and gof_text is not None:
219
223
  return gof_text
220
- else:
221
- return f'{ext_text}\n{gof_text}'
224
+
225
+ return f'{ext_text}\n{gof_text}'
222
226
  #----------------------------------------
223
227
  def _get_pars(self):
224
228
  '''
@@ -238,7 +242,7 @@ class ZFitPlotter:
238
242
  name= par if isinstance(par, str) else par.name
239
243
  try:
240
244
  err = d_val['hesse']['error']
241
- except:
245
+ except KeyError:
242
246
  log.warning(f'Cannot extract {name} Hesse errors, using zeros')
243
247
  pprint.pprint(d_val)
244
248
  err = 0
@@ -260,7 +264,7 @@ class ZFitPlotter:
260
264
  '''
261
265
  d_par = self._get_pars()
262
266
 
263
- line = f''
267
+ line = ''
264
268
  for name, [val, err] in d_par.items():
265
269
  if add_pars != 'all' and name not in add_pars:
266
270
  continue
@@ -328,7 +332,7 @@ class ZFitPlotter:
328
332
  nevt = self._get_component_yield(model, par)
329
333
 
330
334
  if model.name in self._l_plot_components and hasattr(model, 'pdfs'):
331
- l_model = [ (frc, pdf) for pdf, frc in zip(model.pdfs, model.params.values()) ]
335
+ l_model = [ (frc, pdf) for pdf, frc in zip(model.pdfs, model.params.values()) ]
332
336
  elif model.name in self._l_plot_components and not hasattr(model, 'pdfs'):
333
337
  log.warning(f'Cannot plot {model.name} as separate components, despite it was requested')
334
338
  l_model = [ (1, model)]
@@ -347,17 +351,17 @@ class ZFitPlotter:
347
351
  ax.plot(self.x, y, '-', label=self._leg.get(name, name), color=self._col.get(name))
348
352
 
349
353
  if (blind_name is not None) and (was_blinded is False):
350
- log.error(f'Blinding was requested, but PDF {blind_name} was not found among:')
351
354
  for model in self.total_model.pdfs:
352
355
  log.info(model.name)
353
- raise
356
+
357
+ raise ValueError(f'Blinding was requested, but PDF {blind_name} was not found among:')
354
358
  #----------------------------------------
355
359
  def _get_col(self, name):
356
360
  if name in self._col:
357
361
  return self._col[name]
358
362
 
359
363
  col = self._l_def_col[0]
360
- del(self._l_def_col[0])
364
+ del self._l_def_col[0]
361
365
 
362
366
  return col
363
367
  #----------------------------------------
@@ -400,9 +404,8 @@ class ZFitPlotter:
400
404
  if plot_range is not None:
401
405
  try:
402
406
  self.lower, self.upper = plot_range
403
- except TypeError:
404
- log.error(f'plot_range argument is expected to be a tuple with two numeric values')
405
- raise TypeError
407
+ except TypeError as exc:
408
+ raise TypeError('plot_range argument is expected to be a tuple with two numeric values') from exc
406
409
 
407
410
  return np.linspace(self.lower, self.upper, 2000)
408
411
  #----------------------------------------
@@ -439,6 +442,7 @@ class ZFitPlotter:
439
442
  add_pars = None,
440
443
  ymax = None,
441
444
  skip_pulls = False,
445
+ yscale : str = None,
442
446
  axs = None,
443
447
  figsize:tuple = (13, 7),
444
448
  leg_loc:str = 'best',
@@ -464,6 +468,7 @@ class ZFitPlotter:
464
468
  figsize (tuple) : Tuple with figure size, default (13, 7)
465
469
  leg_loc (str) : Location of legend, default 'best'
466
470
  xerr (bool or float) : Passed as xerr to mplhep histplot. True uses the bin size as the x error, False draws no x error, and any other value is taken as the size of the x error bar
471
+ yscale (str) : Scale for y axis of main plot, either log or linear
467
472
  '''
468
473
  # pylint: disable=too-many-locals, too-many-positional-arguments, too-many-arguments
469
474
  d_leg = {} if d_leg is None else d_leg
@@ -512,6 +517,9 @@ class ZFitPlotter:
512
517
  self.axs[0].set(xlabel=xlabel, ylabel=ylabel)
513
518
  self.axs[0].set_xlim([self.lower, self.upper])
514
519
 
520
+ if yscale is not None:
521
+ self.axs[0].set_yscale(yscale)
522
+
515
523
  if title is not None:
516
524
  self.axs[0].set_title(title)
517
525
 
dmu/testing/utilities.py CHANGED
@@ -3,16 +3,20 @@ Module containing utility functions needed by unit tests
3
3
  '''
4
4
  import os
5
5
  import math
6
+ import glob
6
7
  from typing import Union
7
8
  from dataclasses import dataclass
8
9
  from importlib.resources import files
9
10
 
10
11
  from ROOT import RDF, TFile, RDataFrame
11
12
 
13
+ import joblib
12
14
  import pandas as pnd
13
15
  import numpy
14
16
  import yaml
15
17
 
18
+ from dmu.ml.train_mva import TrainMva
19
+ from dmu.ml.cv_classifier import CVClassifier
16
20
  from dmu.logging.log_store import LogStore
17
21
 
18
22
  log = LogStore.add_logger('dmu:testing:utilities')
@@ -22,6 +26,7 @@ class Data:
22
26
  '''
23
27
  Class storing shared data
24
28
  '''
29
+ out_dir = '/tmp/tests/dmu/ml/cv_predict'
25
30
  # -------------------------------
26
31
  def _double_data(df_1 : pnd.DataFrame) -> pnd.DataFrame:
27
32
  df_2 = df_1.copy()
@@ -39,7 +44,7 @@ def _add_nans(df : pnd.DataFrame, columns : list[str]) -> pnd.DataFrame:
39
44
  else:
40
45
  l_col_index = [ l_col.index(column) for column in columns ]
41
46
 
42
- log.debug('Replacing randomly with {size} NaNs')
47
+ log.debug(f'Replacing randomly with {size} NaNs')
43
48
  for _ in range(size):
44
49
  irow = numpy.random.randint(0, df.shape[0]) # Random row index
45
50
  icol = numpy.random.choice(l_col_index) # Random column index
@@ -51,7 +56,7 @@ def _add_nans(df : pnd.DataFrame, columns : list[str]) -> pnd.DataFrame:
51
56
  def get_rdf(kind : Union[str,None] = None,
52
57
  repeated : bool = False,
53
58
  nentries : int = 3_000,
54
- add_nans : list[str] = None):
59
+ columns_with_nans : list[str] = None):
55
60
  '''
56
61
  Return ROOT dataframe with toy data
57
62
  '''
@@ -76,8 +81,8 @@ def get_rdf(kind : Union[str,None] = None,
76
81
  if repeated:
77
82
  df = _double_data(df)
78
83
 
79
- if add_nans:
80
- df = _add_nans(df, columns=add_nans)
84
+ if columns_with_nans is not None:
85
+ df = _add_nans(df, columns=columns_with_nans)
81
86
 
82
87
  rdf = RDF.FromPandas(df)
83
88
 
@@ -126,3 +131,25 @@ def get_file_with_trees(path : str) -> TFile:
126
131
  snap.fMode = 'update'
127
132
 
128
133
  return TFile(path)
134
+ # -------------------------------
135
+ def get_models(rdf_sig : RDataFrame, rdf_bkg : RDataFrame) -> list[CVClassifier]:
136
+ '''
137
+ Will train and return models
138
+ '''
139
+
140
+ cfg = get_config('ml/tests/train_mva.yaml')
141
+ pkl_path = f'{Data.out_dir}/model.pkl'
142
+ plt_dir = f'{Data.out_dir}/cv_predict'
143
+ cfg['saving']['path'] = pkl_path
144
+ cfg['plotting']['val_dir'] = plt_dir
145
+ cfg['plotting']['features']['saving']['plt_dir'] = plt_dir
146
+
147
+ obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
148
+ obj.run()
149
+
150
+ pkl_wc = pkl_path.replace('.pkl', '_*.pkl')
151
+ l_pkl_path = glob.glob(pkl_wc)
152
+ l_model = [ joblib.load(pkl_path) for pkl_path in l_pkl_path ]
153
+
154
+ return l_model
155
+ # -------------------------------
@@ -0,0 +1,13 @@
1
+ output : /tmp/tests/dmu/ml/cv_diagnostics/from_rdf
2
+ # Will assume that the target is already in the input dataframe
3
+ # and will use it, instead of evaluating models
4
+ score_from_rdf : w
5
+ correlations:
6
+ # Variables with respect to which the correlations with the features will be measured
7
+ target :
8
+ name : z
9
+ methods:
10
+ - Pearson
11
+ figure:
12
+ title: Scores from file
13
+ size : [10, 8]
@@ -0,0 +1,10 @@
1
+ output : /tmp/tests/dmu/ml/cv_diagnostics/from_model
2
+ correlations:
3
+ # Variables with respect to which the correlations with the features will be measured
4
+ target :
5
+ name : z
6
+ methods:
7
+ - Pearson
8
+ figure:
9
+ size : [10, 8]
10
+ rotate: 90
@@ -0,0 +1,10 @@
1
+ output : /tmp/tests/dmu/ml/cv_diagnostics/multiple_methods
2
+ correlations:
3
+ # Variables with respect to which the correlations with the features will be measured
4
+ target :
5
+ name : z
6
+ methods:
7
+ - Pearson
8
+ - Kendall-$\tau$
9
+ figure:
10
+ size : [10, 8]
@@ -0,0 +1,33 @@
1
+ output : /tmp/tests/dmu/ml/cv_diagnostics/overlay
2
+ # Will assume that the target is already in the input dataframe
3
+ # and will use it, instead of evaluating models
4
+ score_from_rdf : w
5
+ correlations:
6
+ # Variables with respect to which the correlations with the features will be measured
7
+ target :
8
+ name : z
9
+ overlay :
10
+ wp :
11
+ - 0.2
12
+ - 0.5
13
+ - 0.7
14
+ - 0.9
15
+ general:
16
+ size : [12, 10]
17
+ saving:
18
+ plt_dir : /tmp/tests/dmu/ml/cv_diagnostics/overlay
19
+ plots:
20
+ z :
21
+ binning : [-4, 4, 10]
22
+ yscale : 'linear'
23
+ labels : ['$z$', 'Entries']
24
+ normalized : true
25
+ styling :
26
+ linestyle: '-'
27
+ methods:
28
+ - Pearson
29
+ - Kendall-$\tau$
30
+ figure:
31
+ title : Scores from file
32
+ size : [12, 10]
33
+ xlabelsize: 30
@@ -1,10 +1,12 @@
1
1
  dataset:
2
+ define :
3
+ r : z + x
2
4
  nan :
3
- x : 1
4
- y : 2
5
+ x : -3
6
+ y : -3
5
7
  training :
6
8
  nfold : 3
7
- features : [x, y, z]
9
+ features : [x, y, r]
8
10
  rdm_stat : 1
9
11
  hyper :
10
12
  loss : log_loss
@@ -13,7 +15,7 @@ training :
13
15
  learning_rate : 0.1
14
16
  min_samples_split : 2
15
17
  saving:
16
- path : '/tmp/dmu/ml/tests/train_mva/model.pkl'
18
+ path : '/tmp/tests/dmu/ml/train_mva/model.pkl'
17
19
  plotting:
18
20
  roc :
19
21
  min : [0.0, 0.0]
@@ -29,21 +31,28 @@ plotting:
29
31
  title : 'Correlation matrix'
30
32
  size : [10, 10]
31
33
  mask_value : 0
32
- val_dir : '/tmp/dmu/ml/tests/train_mva'
34
+ val_dir : '/tmp/tests/dmu/ml/train_mva'
33
35
  features:
34
36
  saving:
35
- plt_dir : '/tmp/dmu/ml/tests/train_mva/features'
37
+ plt_dir : '/tmp/tests/dmu/ml/train_mva/features'
36
38
  plots:
39
+ r :
40
+ binning : [-6, 6, 100]
41
+ yscale : 'linear'
42
+ labels : ['$r$', '']
43
+ w :
44
+ binning : [-4, 4, 100]
45
+ yscale : 'linear'
46
+ labels : ['$w$', '']
37
47
  x :
38
48
  binning : [-4, 4, 100]
39
49
  yscale : 'linear'
40
- labels : ['x', '']
50
+ labels : ['$x$', '']
41
51
  y :
42
52
  binning : [-4, 4, 100]
43
53
  yscale : 'linear'
44
- labels : ['y', '']
54
+ labels : ['$y$', '']
45
55
  z :
46
56
  binning : [-4, 4, 100]
47
57
  yscale : 'linear'
48
- labels : ['z', '']
49
-
58
+ labels : ['$z$', '']
@@ -0,0 +1,82 @@
1
+ dataset:
2
+ define :
3
+ r : z + x
4
+ nan :
5
+ x : -3
6
+ y : -3
7
+ training :
8
+ nfold : 3
9
+ features : [x, y, r]
10
+ rdm_stat : 1
11
+ hyper :
12
+ loss : log_loss
13
+ n_estimators : 100
14
+ max_depth : 3
15
+ learning_rate : 0.1
16
+ min_samples_split : 2
17
+ saving:
18
+ path : '/tmp/tests/dmu/ml/train_mva/model.pkl'
19
+ plotting:
20
+ roc :
21
+ min : [0.0, 0.0]
22
+ max : [1.2, 1.2]
23
+ annotate:
24
+ sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9]
25
+ form : '{:.2f}'
26
+ color: 'green'
27
+ xoff : -15
28
+ yoff : -15
29
+ size : 10
30
+ correlation:
31
+ title : 'Correlation matrix'
32
+ size : [10, 10]
33
+ mask_value : 0
34
+ val_dir : '/tmp/tests/dmu/ml/train_mva'
35
+ features:
36
+ saving:
37
+ plt_dir : '/tmp/tests/dmu/ml/train_mva/features'
38
+ plots:
39
+ r :
40
+ binning : [-6, 6, 100]
41
+ yscale : 'linear'
42
+ labels : ['$r$', '']
43
+ w :
44
+ binning : [-4, 4, 100]
45
+ yscale : 'linear'
46
+ labels : ['$w$', '']
47
+ x :
48
+ binning : [-4, 4, 100]
49
+ yscale : 'linear'
50
+ labels : ['$x$', '']
51
+ y :
52
+ binning : [-4, 4, 100]
53
+ yscale : 'linear'
54
+ labels : ['$y$', '']
55
+ z :
56
+ binning : [-4, 4, 100]
57
+ yscale : 'linear'
58
+ labels : ['$z$', '']
59
+ diagnostics:
60
+ output : /tmp/tests/dmu/ml/train_mva/diagnostics
61
+ correlations:
62
+ target :
63
+ name : z
64
+ overlay :
65
+ general:
66
+ size : [20, 10]
67
+ saving:
68
+ plt_dir : /tmp/tests/dmu/ml/train_mva/diagnostics
69
+ plots:
70
+ z :
71
+ binning : [-4, +4, 30]
72
+ yscale : 'linear'
73
+ labels : ['z', 'Entries']
74
+ normalized : true
75
+ styling :
76
+ linestyle: '-'
77
+ methods:
78
+ - Pearson
79
+ - Kendall-$\tau$
80
+ figure:
81
+ title: Training diagnostics
82
+ size : [10, 8]
@@ -0,0 +1,24 @@
1
+ saving:
2
+ plt_dir : plotting/pluggins/fwhm
3
+ plots:
4
+ x :
5
+ binning : [-5.0, 8.0, 40]
6
+ title : x distribution
7
+ y :
8
+ binning : [-5.0, 8.0, 40]
9
+ title : y distribution
10
+ plugin:
11
+ fwhm:
12
+ x :
13
+ plot : true
14
+ obs : [-2, 4]
15
+ plot : true
16
+ format : FWHM={:.3f}
17
+ add_std: True
18
+ y :
19
+ plot : true
20
+ obs : [-4, 8]
21
+ plot : true
22
+ format : FWHM={:.3f}
23
+ add_std: True
24
+
@@ -0,0 +1,19 @@
1
+ saving:
2
+ plt_dir : plotting/pluggins/stats
3
+ plots:
4
+ x :
5
+ binning : [-5.0, 8.0, 40]
6
+ title : x distribution
7
+ styling:
8
+ linestyle : '-'
9
+ y :
10
+ binning : [-5.0, 8.0, 40]
11
+ title : y distribution
12
+ styling:
13
+ linestyle : '-'
14
+ plugin:
15
+ stats:
16
+ x :
17
+ mean : $\mu$={:.2f}
18
+ rms : $\sigma$={:.2f}
19
+ sum : $\Sigma$={:.0f}
@@ -1,8 +1,9 @@
1
1
  saving:
2
2
  plt_dir : tests/plotting/simple
3
-
4
3
  plots:
5
4
  x :
6
- binning : [-5.0, 8.0, 40]
5
+ binning : [-5.0, 8.0, 40]
6
+ title : x distribution
7
7
  y :
8
- binning : [-5.0, 8.0, 40]
8
+ binning : [-5.0, 8.0, 40]
9
+ title : y distribution
@@ -0,0 +1,11 @@
1
+ saving:
2
+ plt_dir : tests/plotting/styling
3
+ plots:
4
+ x :
5
+ binning : [-5.0, 8.0, 40]
6
+ title : x distribution
7
+ styling :
8
+ histtype : step
9
+ y :
10
+ binning : [-5.0, 8.0, 40]
11
+ title : y distribution