data-manipulation-utilities 0.2.6__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/METADATA +800 -34
  2. data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD +93 -0
  3. {data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/WHEEL +1 -1
  4. {data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/entry_points.txt +1 -0
  5. dmu/__init__.py +0 -0
  6. dmu/generic/hashing.py +70 -0
  7. dmu/generic/utilities.py +175 -9
  8. dmu/generic/version_management.py +3 -5
  9. dmu/logging/log_store.py +34 -2
  10. dmu/logging/messages.py +96 -0
  11. dmu/ml/cv_classifier.py +3 -3
  12. dmu/ml/cv_diagnostics.py +224 -0
  13. dmu/ml/cv_performance.py +58 -0
  14. dmu/ml/cv_predict.py +149 -46
  15. dmu/ml/train_mva.py +587 -112
  16. dmu/ml/utilities.py +29 -10
  17. dmu/pdataframe/utilities.py +61 -3
  18. dmu/plotting/fwhm.py +64 -0
  19. dmu/plotting/matrix.py +1 -1
  20. dmu/plotting/plotter.py +25 -3
  21. dmu/plotting/plotter_1d.py +159 -14
  22. dmu/plotting/plotter_2d.py +5 -0
  23. dmu/rdataframe/utilities.py +54 -3
  24. dmu/rfile/ddfgetter.py +102 -0
  25. dmu/stats/fit_stats.py +129 -0
  26. dmu/stats/fitter.py +56 -23
  27. dmu/stats/gof_calculator.py +7 -0
  28. dmu/stats/model_factory.py +305 -50
  29. dmu/stats/parameters.py +100 -0
  30. dmu/stats/utilities.py +443 -12
  31. dmu/stats/wdata.py +187 -0
  32. dmu/stats/zfit.py +17 -0
  33. dmu/stats/zfit_models.py +68 -0
  34. dmu/stats/zfit_plotter.py +175 -56
  35. dmu/testing/utilities.py +120 -15
  36. dmu/workflow/__init__.py +0 -0
  37. dmu/workflow/cache.py +266 -0
  38. dmu_data/ml/tests/diagnostics_from_file.yaml +13 -0
  39. dmu_data/ml/tests/diagnostics_from_model.yaml +10 -0
  40. dmu_data/ml/tests/diagnostics_multiple_methods.yaml +10 -0
  41. dmu_data/ml/tests/diagnostics_overlay.yaml +33 -0
  42. dmu_data/ml/tests/train_mva.yaml +20 -12
  43. dmu_data/ml/tests/train_mva_def.yaml +75 -0
  44. dmu_data/ml/tests/train_mva_with_diagnostics.yaml +87 -0
  45. dmu_data/ml/tests/train_mva_with_preffix.yaml +58 -0
  46. dmu_data/plotting/tests/2d.yaml +5 -5
  47. dmu_data/plotting/tests/line.yaml +15 -0
  48. dmu_data/plotting/tests/plug_fwhm.yaml +24 -0
  49. dmu_data/plotting/tests/plug_stats.yaml +19 -0
  50. dmu_data/plotting/tests/simple.yaml +4 -3
  51. dmu_data/plotting/tests/styling.yaml +18 -0
  52. dmu_data/rfile/friends.yaml +13 -0
  53. dmu_data/stats/fitter/test_simple.yaml +28 -0
  54. dmu_data/stats/kde_optimizer/control.json +1 -0
  55. dmu_data/stats/kde_optimizer/signal.json +1 -0
  56. dmu_data/stats/parameters/data.yaml +178 -0
  57. dmu_data/tests/config.json +6 -0
  58. dmu_data/tests/config.yaml +4 -0
  59. dmu_data/tests/pdf_to_tex.txt +34 -0
  60. dmu_scripts/kerberos/check_expiration +21 -0
  61. dmu_scripts/kerberos/convert_certificate +22 -0
  62. dmu_scripts/ml/compare_classifiers.py +85 -0
  63. data_manipulation_utilities-0.2.6.dist-info/RECORD +0 -57
  64. {data_manipulation_utilities-0.2.6.data → data_manipulation_utilities-0.2.8.dev714.data}/scripts/publish +0 -0
  65. {data_manipulation_utilities-0.2.6.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/top_level.txt +0 -0
@@ -1,22 +1,10 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: data_manipulation_utilities
- Version: 0.2.6
+ Version: 0.2.8.dev714
+ Summary: Project storing utilities needed to reduce boilerplate code when analyzing data
  Description-Content-Type: text/markdown
- Requires-Dist: logzero
- Requires-Dist: PyYAML
- Requires-Dist: scipy
- Requires-Dist: awkward
- Requires-Dist: tqdm
- Requires-Dist: joblib
- Requires-Dist: scikit-learn
- Requires-Dist: toml
- Requires-Dist: numpy
- Requires-Dist: matplotlib
- Requires-Dist: mplhep
- Requires-Dist: hist[plot]
- Requires-Dist: pandas
- Provides-Extra: dev
- Requires-Dist: pytest; extra == "dev"
+
+ [TOC]

  # D(ata) M(anipulation) U(tilities)

@@ -51,6 +39,190 @@ Then, for each remote it pushes the tags and the commits.

  This section describes generic tools that could not be put in a specific category, but tend to be useful.

+ ## Caching data
+
+ In order to reuse data that is expensive to calculate, one would need:
+
+ - Serializable data, i.e. strings, floats, lists, etc
+ - A way to get a unique identifier of that data, e.g. a hashable object
+
+ If both are available, one can:
+
+ ```python
+ import dmu.generic.utilities as gut
+
+ def _get_something(arg1, arg2) -> float:
+     # This loads the data, if found
+     hashable = arg1, arg2
+
+     ret = gut.load_cached(hash_obj=hashable, on_fail=-999)
+     if ret != -999:
+         return ret
+
+     obj = very_expensive_function(arg1, arg2)
+
+     # This saves the data
+     gut.cache_data(obj, hash_obj=hashable)
+
+     return obj
+ ```
+
+ the cached data will go to JSON files in `/tmp/dmu/cache`.
+
+ ## Caching with a base class
+
+ Caching functionality can be added to a class through a base class, as in:
+
+ ```python
+ import dmu.generic.utilities as gut
+
+ from dmu.workflow.cache import Cache as Wcache
+
+ # `log` is assumed here to be an already configured logger
+
+ class Tester(Wcache):
+     '''
+     Testing class, produces outputs from simple inputs
+     '''
+     # -----------------------------------
+     def __init__(self, nval : int):
+         '''
+         nval, some integer used to produce output data
+         '''
+         super().__init__(
+             out_path='Tester',
+             nval    =nval)
+
+         self._nval = nval
+     # -----------------------------------
+     def run(self) -> list[int]:
+         '''
+         Returns a list of 1's
+         '''
+         # _out_path belongs to the base class
+         obj_path = f'{self._out_path}/values.json'
+
+         if self._copy_from_cache():
+             log.warning('Output cached, not running')
+             return gut.load_json(obj_path)
+
+         log.info('Data not cached, running')
+         res = [1] * self._nval
+
+         gut.dump_json(res, obj_path)
+         self._cache()
+
+         return res
+
+ # This will set the root directory where cached data goes
+ # The data will go to `/some/directory/Tester`
+ # This has to be done ONCE and only ONCE.
+ Wcache.set_cache_root(root='/some/directory')
+
+ obj = Tester(nval=3)
+ ...
+ ```
+
+ where the `Tester` class gains access to extra functionality to:
+
+ - Cache outputs to a hashed directory
+ - On the next run, check if that directory exists; if so, pick up
+   the outputs and put them in the output directory
+ - If not, rerun the process
+
+ Several hashed directories might exist, like in the diagram:
+
+ ![](doc/images/cache_hash.png)
+
+ **Important**: This class will also use the hash of the module where the `Tester`
+ class is defined. Thus, changes in the code or in the input data will invalidate the hash.
+
+ ### Turning caching off
+
+ This can be done temporarily with:
+
+ ```python
+ with Wcache.turn_off_cache(val=['Tester']):
+     obj = Tester(nval=4)
+     out = obj.run()
+ ```
+
+ for any list of classes that inherit from `Cache`, by passing the list of class names.
+ If `val=None` is passed, caching is turned off for ALL such classes.
+
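+ For example, relying on the `val=None` behaviour described above, caching can be switched off for every class at once:
+
+ ```python
+ # Disables caching for ALL classes that inherit from Cache
+ with Wcache.turn_off_cache(val=None):
+     obj = Tester(nval=4)
+     out = obj.run()   # Runs the full computation, ignoring any cached output
+ ```
+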
+ ## Silencing import messages
+
+ To silence messages emitted by modules outside the user's control, do:
+
+ ```python
+ import dmu.generic.utilities as gut
+
+ with gut.silent_import():
+     import tensorflow
+ ```
+
+ ## Silencing messages sent to `stderr` from deep inside C++ code
+
+ This is an issue with frameworks like `Tensorflow`. Some messages are impossible
+ to kill, which interferes with the debugging process. To selectively hide
+ those messages, do:
+
+ ```python
+ import os
+
+ from dmu.logging import messages as mes
+
+ l_msg = ['ONE', 'TWO']
+ with mes.filter_stderr(banned_substrings=l_msg):
+     os.write(2, b'MSG ONE\n')
+     os.write(2, b'MSG TWO\n')
+     os.write(2, b'MSG THREE\n')
+ ```
+
+ The context manager above will only allow `THREE` into the error stream.
+
+ ## YAML
+
+ When dumping data to YAML files, do it like:
+
+ ```python
+ import yaml
+ import dmu.generic.utilities as gut
+
+ data = {'a' : 1, 'b' : [1, 2]}
+ yaml.dump(data, Dumper=gut.BlockStyleDumper)
+ ```
+
+ to make sure the indentation is correct.
+
+ ## Hashing
+
+ ### Hashing python objects
+
+ The snippet below:
+
+ ```python
+ from dmu.generic import hashing
+
+ obj = [1, 'name', [1, 'sub', 'list'], {'x' : 1}]
+ val = hashing.hash_object(obj)
+ ```
+
+ will:
+
+ - Make the input object into a JSON string
+ - Encode it to utf-8
+ - Build a 64-character hash out of it
+
+ in two lines, thus keeping the user's code clean.
+
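+ The exact implementation lives in `dmu/generic/hashing.py`; the snippet below is only a sketch of the three steps listed above (JSON string, utf-8 encoding, 64-character digest), assuming a sha256 digest, and is not necessarily what the module does internally:
+
+ ```python
+ import json
+ import hashlib
+
+ def illustrate_hash(obj) -> str:
+     '''
+     Illustration only: serialize to JSON, encode to utf-8 and
+     take a sha256 digest, whose hex form is 64 characters long
+     '''
+     string  = json.dumps(obj, sort_keys=True)
+     encoded = string.encode('utf-8')
+
+     return hashlib.sha256(encoded).hexdigest()
+
+ print(illustrate_hash([1, 'name', [1, 'sub', 'list'], {'x' : 1}]))
+ ```
+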
+ ### Hashing files
+
+ The following snippet:
+
+ ```python
+ from dmu.generic import hashing
+
+ path = '/some/file/path.txt'
+ val  = hashing.hash_file(path=path)
+ ```
+
+ should provide the hash of a file, given its path.
+
  ## Timer

  In order to benchmark functions do:
@@ -67,9 +239,9 @@ def fun():
  fun()
  ```

- ## JSON dumper
+ ## JSON/YAML dumper and loader

- The following lines will dump data (dictionaries, lists, etc) to a JSON file:
+ The following lines will dump data (dictionaries, lists, etc) to a JSON/YAML file and load it back:

  ```python
  import dmu.generic.utilities as gut
@@ -77,8 +249,48 @@ import dmu.generic.utilities as gut
  data = [1,2,3,4]

  gut.dump_json(data, '/tmp/list.json')
+ data = gut.load_json('/tmp/list.json')
+ ```
+
+ this will dump to either JSON or YAML files, depending on the extension; the allowed extensions are:
+
+ ```
+ .json
+ .yaml
+ .yml
+ ```
+
+ and it's meant to allow the user to bypass all the boilerplate and keep their code brief.
+
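+ As an example of the extension-based behaviour described above, the same two calls with a `.yaml` path write and read YAML instead; the file name is only illustrative:
+
+ ```python
+ import dmu.generic.utilities as gut
+
+ data = [1,2,3,4]
+
+ # Same functions; the .yaml extension makes them write and read YAML
+ gut.dump_json(data, '/tmp/list.yaml')
+ data = gut.load_json('/tmp/list.yaml')
+ ```
+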
+ ## PKL dumper and loader
+
+ In the same way one can do:
+
+ ```python
+ import dmu.generic.utilities as gut
+
+ data = [1,2,3,4]
+
+ gut.dump_pickle(data, '/tmp/list.pkl')
+ data = gut.load_pickle('/tmp/list.pkl')
+ ```
+
+ ## Loader of files and configurations from data packages
+
+ YAML and JSON files can be loaded from data packages with:
+
+ ```python
+ import dmu.generic.utilities as gut
+
+ data = gut.load_data(package='dmu_data', fpath='tests/data.json')
+ conf = gut.load_conf(package='dmu_data', fpath='tests/config.json')
  ```

+ The former will return a python dictionary, list, etc.
+ The latter will return a `DictConfig` object from the `omegaconf` project.
+ Check [this](https://omegaconf.readthedocs.io/en/2.3_branch/index.html)
+ for more information.
+
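+ As a small usage sketch, the returned object supports attribute-style access; the `max_entries` key below is hypothetical and only meant to illustrate this:
+
+ ```python
+ import dmu.generic.utilities as gut
+
+ conf = gut.load_conf(package='dmu_data', fpath='tests/config.json')
+
+ # Attribute-style access, assuming the file defines a `max_entries` key
+ nmax = conf.max_entries
+ ```
+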
  # Physics

  ## Truth matching
@@ -119,8 +331,72 @@ samples:

  # Math

+ ## Weighted data
+
+ `Wdata` is a small class representing weighted data, with some extra functionality. It can
+ be used as:
+
+ ```python
+ import numpy
+ import zfit
+
+ from dmu.stats.wdata import Wdata
+
+ nentries = 1_000 # Some number of entries
+
+ arr_mass = numpy.random.normal(loc=0, scale=1.0, size=nentries)
+ arr_wgt  = numpy.random.normal(loc=1, scale=0.1, size=nentries)
+
+ # Make an instance
+ wdata = Wdata(data=arr_mass, weights=arr_wgt)
+
+ # Create a zfit dataset, if needed
+ obs   = zfit.Space('obs', limits=(-3, +3))
+ zdata = wdata.to_zfit(obs=obs)
+
+ # Add datasets
+ wdata_1 = Wdata(data=arr_mass, weights=arr_wgt)
+ wdata_2 = Wdata(data=arr_mass, weights=arr_wgt)
+ wdata_3 = wdata_1 + wdata_2
+
+ # Extract information from the dataset
+ wdata.sumw() # Sum of weights
+ wdata.size() # Number of entries
+
+ # Update weights, creating a new Wdata instance
+ arr_wgt_new = numpy.random.normal(loc=1, scale=0.2, size=nentries)
+
+ # New weights
+ wdata_2 = wdata.update_weights(weights=arr_wgt_new, replace=True)
+
+ # Multiply old weights by new ones and update
+ wdata_3 = wdata.update_weights(weights=arr_wgt_new, replace=False)
+ ```
+
  ## PDFs

+ ### Suppressing tensorflow messages from zfit import
+
+ If you work with zfit, you will see messages from tensorflow. By importing zfit through:
+
+ ```python
+ from dmu.stats.zfit import zfit
+ ```
+
+ these messages should be hidden. If `ROOT` is installed, the wrapper will import it before
+ importing tensorflow. That will prevent crashes which usually happen when `tensorflow`
+ is imported before `ROOT`.
+
+ ### Toy models
+
+ For quick tests, one can retrieve simple models with:
+
+ ```python
+ from dmu.stats import utilities as sut
+
+ # For a Gaussian plus Exponential, extended
+ pdf = sut.get_model(kind='s+b')
+
+ # For a Gaussian signal, non extended
+ pdf = sut.get_model(kind='signal')
+ ```
+
  ### Model building

  In order to do complex fits, one often needs PDFs with many parameters, which need to be added.
@@ -132,7 +408,27 @@ from dmu.stats.model_factory import ModelFactory

  l_pdf = ['cbr'] + 2 * ['cbl']
  l_shr = ['mu', 'sg']
- mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr)
+ l_flt = ['mu', 'sg'] # Will mark these parameters as floating for the fit done afterwards
+ d_rep = {'mu' : 'scale', 'sg' : 'reso'} # Optional, will reparametrize for scale and resolution
+ d_fix = {'al_cbl' : 3, 'nr_cbr' : 1}    # Optional, will fix the parameters whose names start with these keys
+
+ # If mu and sg are meant to be shared among all the models,
+ # the parameters can be passed here.
+ # In this case, they are also meant to be floating.
+ mu = zfit.param.Parameter('mu_flt', 5280, 5000, 5500)
+ sg = zfit.param.Parameter('sg_flt',   80,   20,  100)
+ l_reuse = [mu, sg]
+
+ mod = ModelFactory(
+     preffix = 'pref',   # Prefix used for parameter naming
+     obs     = Data.obs,
+     l_pdf   = l_pdf,
+     l_shared= l_shr,
+     l_float = l_flt,
+     l_reuse = l_reuse,  # Optional
+     d_rep   = d_rep,    # Optional
+     d_fix   = d_fix)    # Optional
+
  pdf = mod.get_pdf()
  ```

@@ -145,10 +441,63 @@ pol1: Polynomial of degree 1
  pol2: Polynomial of degree 2
  cbr : Crystal Ball with right tail
  cbl : Crystal Ball with left tail
- gauss : Gaussian
+ gauss : Gaussian
  dscb : Double sided Crystal Ball
  ```

+ ### Model building with reparametrizations
+
+ In order to introduce reparametrizations for the means and the resolutions, such that:
+
+ $\mu\to\mu+\Delta\mu$
+ $\sigma\to\sigma\cdot s_{\sigma}$
+
+ where the reparametrized $\mu$ and $\sigma$ are constant, while the scale and resolution are floating, do:
+
+ ```python
+ import zfit
+ from dmu.stats.model_factory import ModelFactory
+
+ l_name = ['cbr']   # Example list of PDF names, as in the listing above
+ name   = 'example' # Example prefix used for parameter naming
+
+ l_shr = ['mu', 'sg']
+ l_flt = []
+ d_rep = {'mu' : 'scale', 'sg' : 'reso'}
+ obs   = zfit.Space('mass', limits=(5080, 5680))
+
+ mod = ModelFactory(
+     preffix = name,
+     obs     = obs,
+     l_pdf   = l_name,
+     d_rep   = d_rep,
+     l_shared= l_shr,
+     l_float = l_flt)
+ pdf = mod.get_pdf()
+ ```
+
+ Here, the floating parameters **should not** be the same as the reparametrized ones.
+
+ ### Overriding parameters
+
+ The models above have their parameter ranges chosen for fits to B meson distributions,
+ e.g. the mean of the distributions is around 5 GeV. To make these models usable for other
+ resonances do:
+
+ ```python
+ from dmu.stats.parameters import ParameterLibrary as PL
+
+ # This will override the ranges and starting value
+ PL.set_values(kind='cbr', parameter='mu', val=3000, low=2500, high=3500)
+
+ # This will fix a parameter; the three arguments need to be equal
+ PL.set_values(kind='cbr', parameter='sg', val= 30, low= 30, high= 30)
+ ```
+
+ before using the `ModelFactory` class.
+ For a summary of all the parameters and values available do:
+
+ ```python
+ PL.print_parameters(kind='cbr')
+ ```
+
  ### Printing PDFs

  One can print a zfit PDF by doing:
@@ -210,6 +559,25 @@ print_pdf(pdf,
  txt_path = 'tests/stats/utilities/print_pdf/pdf_const.txt')
  ```

+
+ ### Storing PDF as latex
+
+ The file above can be transformed into a `tex` file by running:
+
+ ```python
+ from dmu.stats.utilities import pdf_to_tex
+
+ d_par = {
+     'ar_dscb_Signal_002_1_reso_flt' : r'$\alpha_{DSCB}^{1}$',
+     'ar_dscb_Signal_002_2_reso_flt' : r'$\alpha_{DSCB}^{2}$',
+ }
+
+ # It will skip fixed parameters by default
+ pdf_to_tex(path='/path/to/pdf.txt', d_par=d_par, skip_fixed=True)
+ ```
+
+ where `d_par` renames the entries of the `Parameters` column, such that they are in LaTeX.
+
  ## Fits

  The `Fitter` class is a wrapper to zfit, used to make fitting easier.
@@ -273,8 +641,8 @@ strategy :
  # The lines below will split the range of the data [0-10] into two subranges, such that the NLL is built
  # only in those ranges. The ranges need to be tuples
  ranges :
- - !!python/tuple [0, 3]
- - !!python/tuple [6, 9]
+ - [0, 3]
+ - [6, 9]
  # The lines below will allow using constraints for each parameter, where the first element is the mean and the second
  # the width of a Gaussian constraint. No correlations are implemented, yet.
  constraints :
@@ -356,6 +724,10 @@ obj = ZFitPlotter(data=sam, model=pdf)
  d_leg = {'gauss': 'New Gauss'}
  obj.plot(nbins=50, d_leg=d_leg, stacked=True, plot_range=(0, 10), ext_text='Extra text here')

+ # Alternatively one can do the following, which plots only the chosen ranges;
+ # this is useful e.g. to show only sidebands in a blinded fit
+ obj.plot(nbins=50, d_leg=d_leg, stacked=True, ranges=[[0,3], [3,10]])
+
  # add a line to pull hist
  obj.axs[1].plot([0, 10], [0, 0], linestyle='--', color='black')
  ```
@@ -367,6 +739,71 @@ this class supports:
  - Stacking and overlaying of PDFs.
  - Blinding.

+ ## Fit saving
+
+ To save everything regarding your fit in one go, do:
+
+ ```python
+ from dmu.stats import utilities as sut
+ from dmu.stats.zfit_plotter import ZFitPlotter
+
+ ptr = ZFitPlotter(data=data, model=pdf)
+ ptr.plot()
+
+ sut.save_fit(data=data, model=pdf, res=fit_result, fit_dir='/some/directory', d_const=constraints)
+ ```
+
+ and the function will save everything that you would normally need from a fit.
+ If the lines with `ZFitPlotter` were called before `save_fit`, the fit plot will also be saved.
+
+ ### Transforming fit results to DictConfig
+
+ The `OmegaConf` library offers `DictConfig` objects, which are easier to handle
+ when reading nested data. To transform a zfit result object into one of these
+ objects do:
+
+ ```python
+ from dmu.stats import utilities as sut
+
+ cres = sut.zres_to_cres(res=res)
+ ```
+
+ and then one would access the information like:
+
+ ```python
+ error = cres.mu.error
+ value = cres.mu.value
+ ```
+
+ and these objects can be saved to YAML with:
+
+ ```python
+ from omegaconf import OmegaConf
+
+ OmegaConf.save(config=cres, f='results.yaml')
+ ```
+
+ ## Placeholder fits
+
+ In order to create a _fake_ fit on top of which one could develop other tools, do:
+
+ ```python
+ from dmu.stats import utilities
+
+ utilities.placeholder_fit(kind='s+b', fit_dir='/some/directory')
+ ```
+
+ ## Retrieving information on fits
+
+ Once the fit has been done and the results are saved to a given directory, one can do:
+
+ ```python
+ from dmu.stats.fit_stats import FitStats
+
+ obj = FitStats(fit_dir='/directory/with/fit')
+ val = obj.get_value(name='var_name', kind='value') # kind can be 'value' or 'error'
+ ```
+
+ and the tool will retrieve the value. This is useful when the values are needed elsewhere
+ in the code, i.e. it would connect the fitting part with other parts.
+
  ## Arrays

  ### Scaling by non-integer
@@ -413,6 +850,24 @@ xval = numpy.lispace(0, 5, num=100)
  yval = fun(xval)
  ```

+ ## Other utilities
+
+ These are here to decrease boilerplate code:
+
+ ```python
+ from dmu.stats import utilities as sut
+
+ # Retrieves the name of the observable from the observable
+ name = sut.name_from_obs(obs=obs)
+
+ # Retrieves the range of the observable from the observable
+ minx, maxx = sut.range_from_obs(obs=obs)
+
+ # Checks that the PDF can be evaluated. This is needed because a KDE built with
+ # too little data cannot be evaluated; when trying, tensorflow raises an exception.
+ sut.is_pdf_usable(pdf)
+ ```
+
  # Machine learning

  ## Classification
@@ -427,16 +882,31 @@ rdf_bkg = _get_rdf(kind='bkg')
  cfg = _get_config()

  obj = TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
- obj.run(skip_fit=False) # by default it will be false, if true, it will only make plots of features
+ obj.run(
+     skip_fit     = False, # By default false; if true, it will only make plots of the features
+     opt_ntrial   = 20,    # By default zero; if a larger number is chosen, a hyperparameter optimization with optuna will run with this number of trials
+     load_trained = False, # If true, it will not train the models but just load them; only makes sense if the models already exist. Useful for postprocessing, like the diagnostics section.
+ )
  ```

  where the settings for the training go in a config dictionary, which when written to YAML looks like:

  ```yaml
  dataset:
+     # This section is optional. It can be used to redefine
+     # columns in different ways for different samples
+     #
+     # When evaluating the model, the same definitions will be used
+     # but they will be taken from the `sig` section.
+     samples:
+         sig:
+             definitions:
+                 x : v + w
+         bkg:
+             definitions:
+                 x : v - w
      # Before training, new features can be defined as below
      define :
-         x : v + w
          y : v - w
      # If the key is found to be NaN, replace its value with the number provided
      # This will be used in the training.
@@ -455,8 +925,8 @@ training :
      learning_rate     : 0.1
      min_samples_split : 2
  saving:
-     # The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
-     path : 'tests/ml/train_mva/model.pkl'
+     # The model names are model_001.pkl, model_002.pkl, etc, one for each fold
+     path : 'tests/ml/train_mva'
  plotting:
      roc :
          min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
@@ -474,10 +944,7 @@ plotting:
      title : 'Correlation matrix'
      size : [10, 10]
      mask_value : 0 # Where correlation is zero, the bin will appear white
-     val_dir : 'tests/ml/train_mva'
  features:
-     saving:
-         plt_dir : 'tests/ml/train_mva/features'
      plots:
          w :
              binning : [-4, 4, 100]
@@ -499,6 +966,20 @@ plotting:

  the `TrainMva` is just a wrapper to `scikit-learn` that enables cross-validation (and therefore that explains the `nfolds` setting).
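+
+ The snippet below is not the `TrainMva` implementation; it is only a minimal sketch of what cross-validated training with `scikit-learn` looks like (plain `KFold` plus `GradientBoostingClassifier`, mirroring the `nfolds` and `hyper` settings of the configuration), with toy data standing in for the signal and background samples:
+
+ ```python
+ import numpy
+ from sklearn.model_selection import KFold
+ from sklearn.ensemble import GradientBoostingClassifier
+
+ # Toy feature matrix and labels standing in for the samples
+ X = numpy.random.normal(size=(1000, 4))
+ y = numpy.random.randint(0, 2, size=1000)
+
+ models = []
+ kfold  = KFold(n_splits=10, shuffle=True, random_state=42)
+ for i_trn, _i_tst in kfold.split(X):
+     # Hyperparameters are the standard GradientBoostingClassifier ones,
+     # as they would appear in the `hyper` block of the YAML config
+     clf = GradientBoostingClassifier(
+         max_depth         = 3,
+         n_estimators      = 100,
+         learning_rate     = 0.1,
+         min_samples_split = 2)
+     clf.fit(X[i_trn], y[i_trn])
+     models.append(clf) # One model per fold, like model_001.pkl, model_002.pkl, ...
+ ```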

+ #### Outputs
+
+ The trainer will produce the following outputs:
+
+ - Models in the form of `pkl` files
+ - Plots of the features
+ - For each fold:
+   1. Covariance plot
+   1. ROC curve plot
+   1. Feature importance table in latex
+   1. JSON file with data to build the ROC curve
+ - For the full dataset: the ROC curve, the scores distribution and a JSON file with the `x`, `y` coordinates of the ROC curve.
+ - A latex table with the hyperparameters and NaN replacements.
+
  ### Caveats

  When training on real data, several things might go wrong and the code will try to deal with them in the following ways:
@@ -538,6 +1019,18 @@ If a sample exists, that was used in the training of _every_ model, no model can
  During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
  from that configuration and the user does not need to define extra columns.

+ ### Further optimization
+
+ If not all the entries of the ROOT dataframe are needed for the prediction (e.g. some entries won't be used anyway), define
+ a column as:
+
+ ```python
+ rdf = rdf.Define('skip_mva_prediction', 'mass < 3000')
+ ```
+
+ and the predictor will assign scores of `-1` to all the entries with `mass < 3000`.
+ This should speed up the prediction and reduce resource consumption.
+
  ### Caveats

  When evaluating the model with real data, problems might occur; we deal with them as follows:
@@ -552,12 +1045,158 @@ When evaluating the model with real data, problems might occur, we deal with the
  - For whatever features are still NaN, they will be _patched_ with zeros when evaluated. However, the returned probabilities will be
    saved as -1, i.e. entries with NaNs will have probabilities of -1.
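+
+ Since entries with NaNs come back with a probability of -1 (as stated above), downstream code can simply mask them out; a small, generic `numpy` sketch, where the array name is hypothetical:
+
+ ```python
+ import numpy
+
+ # Scores as returned by the prediction step; -1 marks entries that had NaN features
+ arr_prb = numpy.array([0.93, -1.0, 0.08, 0.71, -1.0])
+
+ mask   = arr_prb >= 0   # Keep only entries with a valid probability
+ arr_ok = arr_prb[mask]
+ ```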

+ ## Diagnostics
+
+ To run diagnostics on the trained model do:
+
+ ```python
+ from dmu.ml.cv_diagnostics import CVDiagnostics
+
+ # Where l_model is the list of models and cfg is a dictionary with the config
+ cvd = CVDiagnostics(models=l_model, rdf=rdf, cfg=cfg)
+ cvd.run()
+ ```
+
+ the configuration can be loaded from a YAML file and would look like:
+
+ ```yaml
+ # Directory where plots will go
+ output : /tmp/tests/dmu/ml/cv_diagnostics/overlay
+ # Optional, will assume that the target is already in the input dataframe
+ # and will use it, instead of evaluating models
+ score_from_rdf : mva
+ correlations:
+     # Variables with respect to which the correlations with the features will be measured
+     target :
+         name : mass
+         overlay :
+             # These are the working points at which the "mass" variable will be plotted
+             # If there is a correlation the shape should change
+             wp :
+                 - 0.2
+                 - 0.5
+                 - 0.7
+                 - 0.9
+             general:
+                 size : [20, 10]
+             saving:
+                 plt_dir : /tmp/tests/dmu/ml/cv_diagnostics/from_rdf
+             plots:
+                 z :
+                     binning : [1000, 4000, 30]
+                     yscale : 'linear'
+                     labels : ['mass', 'Entries']
+                     normalized : true
+     methods:
+         - Pearson
+         - Kendall-$\tau$
+     figure:
+         title: Scores from file
+         size : [10, 8]
+         xlabelsize: 18 # Controls size of x axis labels. By default 30
+         rotate : 60 # Will rotate xlabels by 60 degrees
+ ```
+
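+ The `Pearson` and `Kendall-$\tau$` entries above are standard correlation coefficients; independently of `CVDiagnostics`, they can be computed with `scipy` as in the sketch below, where the arrays are hypothetical stand-ins for the score and the target variable:
+
+ ```python
+ import numpy
+ from scipy import stats
+
+ arr_score = numpy.random.normal(0.5 , 0.1, size=1000) # Stand-in for the MVA score
+ arr_mass  = numpy.random.normal(3000, 100, size=1000) # Stand-in for the target, e.g. the mass
+
+ pearson, _pval_p = stats.pearsonr(arr_score, arr_mass)
+ kendall, _pval_k = stats.kendalltau(arr_score, arr_mass)
+
+ print(f'Pearson: {pearson:.3f}, Kendall-tau: {kendall:.3f}')
+ ```
+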
+ ## Comparing classifiers
+
+ ### Simple approach
+
+ To do so, run:
+
+ ```bash
+ compare_classifiers -c /path/to/config.yaml
+ ```
+
+ where the config looks like:
+
+ ```yaml
+ out_dir : /path/to/plots
+ classifiers:
+     label for model 1 : /path/to/directory/with/model1
+     label for model 2 : /path/to/directory/with/model2
+ ```
+
+ However, this will only compare the classifiers' ROC curves with respect to the
+ samples that were used to train them.
+
+ ### With custom samples
+
+ However, the models' performances can also be compared by _plugging_ in any
+ signal and background proxy for any model, like:
+
+ ```python
+ import matplotlib.pyplot as plt
+ from dmu.ml.cv_performance import CVPerformance
+
+ cvp = CVPerformance()
+ cvp.plot_roc(
+     sig  =rdf_sig_1, bkg=rdf_bkg_1,
+     model=l_model_1, name='def', color='red')
+ cvp.plot_roc(
+     sig  =rdf_sig_1, bkg=rdf_bkg_2,
+     model=l_model_2, name='alt', color='blue')
+
+ plt.legend()
+ plt.grid()
+ plt.show()
+ ```
+
+ This should show an overlay of different ROC curves made for a specific combination
+ of signal and background proxies with a given model.
+
+ # Dask dataframes
+
+ In order to process large amounts of data, a `Dask` dataframe is more suitable.
+ A set of `ROOT` files can be loaded into one of these with:
+
+ ```python
+ from dmu.rfile.ddfgetter import DDFGetter
+
+ # Can also pass directly the configuration dictionary with the `cfg` argument
+ # If no columns argument is passed, will take all the columns
+ ddfg = DDFGetter(config_path='config.yaml', columns=['a', 'b'])
+ ddf  = ddfg.get_dataframe()
+
+ # This will provide the pandas dataframe
+ df = ddf.compute()
+ ...
+ ```
+
+ where `config.yaml` would look like:
+
+ ```yaml
+ tree : tree_name
+ primary_keys:
+     - index
+ files :
+     - file_001.root
+     - file_002.root
+     - file_003.root
+ samples:
+     - /tmp/tests/dmu/rfile/main
+     - /tmp/tests/dmu/rfile/frnd
+ ```
+
  # Pandas dataframes

  ## Utilities

  These are thin layers of code that take pandas dataframes and carry out specific tasks

+ ### NaN filter
+
+ The following snippet will remove NaNs from the dataframe
+ if up to 2% of the rows have NaNs. Beyond that, an exception will be raised.
+
+ ```python
+ import dmu.pdataframe.utilities as put
+
+ # Default is 0.02
+ df = put.dropna(df, nan_frac=0.02)
+ ```
+
+ The use case is automatically cleaning up data that is not expected to be perfect.
+
  ### Dataframe to latex

  One can save a dataframe to latex with:
@@ -582,10 +1221,41 @@ put.df_to_tex(df,
  caption = 'some caption')
  ```

+ ### Dataframe to and from YAML
+
+ This extends the existing JSON functionality:
+
+ ```python
+ import dmu.pdataframe.utilities as put
+
+ yml_path = '/tmp/dataframe.yaml'
+
+ df_1 = _get_df() # Placeholder for whatever provides the pandas dataframe
+ put.to_yaml(df_1, yml_path)
+ df_2 = put.from_yaml(yml_path)
+ ```
+
+ and is meant to be less verbose than doing it through the YAML module.
+
  # Rdataframes

  These are utility functions meant to be used with ROOT dataframes.

+ ## Cutflows from RDataFrames
+
+ When using the `Filter` method on a ROOT dataframe, one can:
+
+ ```python
+ rep = rdf.Report()
+ rep.Print()
+ ```
+
+ however, this `rep` object is not python friendly, even though it is basically a table that could be
+ put in a pandas dataframe. That can be done with:
+
+ ```python
+ from dmu.rdataframe import utilities as ut
+
+ df = ut.rdf_report_to_df(rep)
+ ```
+
  ## Adding a column from a numpy array

  ### With numba
@@ -649,6 +1319,18 @@ obj = AtrMgr(rdf)
  obj.to_json('/path/to/file.json')
  ```

+ ## Filtering for a random number of entries
+
+ The built-in method `Range` can only be used to select contiguous ranges. Use
+
+ ```python
+ import dmu.rdataframe.utilities as ut
+
+ rdf = ut.random_filter(rdf, entries=val)
+ ```
+
+ to randomly select **approximately** `entries` entries from the dataframe.
+
  # Logging

  The `LogStore` class is an interface to the `logging` module. It is aimed at making it easier to include
@@ -668,6 +1350,25 @@ log.error('error')
  log.critical('critical')
  ```

+ In order to get a specific logger do:
+
+ ```python
+ logger = LogStore.get_logger(name='my_logger_name')
+ ```
+
+ In order to get the logging level from the logger do:
+
+ ```python
+ level = log.getEffectiveLevel()
+ ```
+
+ A context manager is also available, which can be used with:
+
+ ```python
+ with LogStore.level('logger_name', 10):
+     log.debug('Debug message')
+ ```
+
  # Plotting from ROOT dataframes

  ## 1D plots
@@ -703,10 +1404,34 @@ definitions:
  plots:
      x :
          binning : [0.98, 0.98, 40] # Here bounds agree => tool will calculate bounds making sure that they are the 2% and 98% quantile
-         yscale : 'linear' # Optional, if not passed, will do linear, can be log
+         yscale : linear # Optional, if not passed, will do linear, can be log
          labels : ['x', 'Entries'] # Labels are optional, will use varname and Entries as labels if not present
-         title : 'some title can be added for different variable plots'
-         name : 'plot_of_x' # This will ensure that one gets plot_of_x.png as a result, if missing x.png would be saved
+         title : some title can be added for different variable plots
+         name : plot_of_x # This will ensure that one gets plot_of_x.png as a result, if missing x.png would be saved
+         weights : my_weights # Optional, this is the column in the dataframe with the weights
+         # Styling can be added to specific plots; these are the arguments of
+         # hist.plot(...)
+         styling :
+             # This section will update the styling of each category
+             # The categories (class A, etc) are the keys of the dictionary of dataframes
+             class A:
+                 # These are the arguments of plt.hist(...)
+                 histtype : fill
+                 color    : gray
+                 alpha    : 0.3
+             class B:
+                 color    : red
+                 histtype : step
+                 linestyle: '-' # Linestyle is by default 'none',
+                                # it needs to be overridden to see _steps_
+         # This will add vertical lines to plots, the arguments are the same
+         # as the ones passed to axvline
+         vline :
+             x     : 0
+             label : label
+             ls    : --
+             c     : blue
+             lw    : 1
      y :
          binning : [-5.0, 8.0, 40]
          yscale : 'linear'
@@ -725,11 +1450,52 @@ style:
  # The line below would place the legend outside the figure to avoid overlaps with the histogram
  bbox_to_anchor : [1.2, 1]
  stats:
-     nentries : '{:.2e}' # This will add number of entries in legend box
+     sumw : '{:.2f}' # This will add the sum of weights to the label. If there are no weights, it will be the nentries value
  ```

  it's up to the user to build this dictionary and load it.

+ ### Plugins
+
+ Extra functionality can be _plugged_ into the code by using the `plugin` section, like:
+
+ #### FWHM
+
+ ```yaml
+ plugin:
+     fwhm:
+         # Can control each variable fit separately
+         x :
+             plot   : true
+             obs    : [-2, 4]
+             format : FWHM={:.3f}
+             add_std: True
+         y :
+             plot   : true
+             obs    : [-4, 8]
+             format : FWHM={:.3f}
+             add_std: True
+ ```
+
+ where the section will:
+
+ - Use a KDE to fit the distribution and plot it on top of the histogram
+ - Add the value of the Full Width at Half Maximum to the title, for each distribution, with a specific formatting (see the sketch below).
+
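+ The plugin's internals are not shown here; the snippet below is only a generic sketch of how a FWHM can be read off a KDE built with `scipy`, assuming a simple unimodal distribution:
+
+ ```python
+ import numpy
+ from scipy.stats import gaussian_kde
+
+ arr_val = numpy.random.normal(0, 1, size=5000)  # Toy data
+
+ kde  = gaussian_kde(arr_val)
+ xval = numpy.linspace(-2, 4, 500)               # Similar to the `obs` range above
+ yval = kde(xval)
+
+ half  = yval.max() / 2
+ above = xval[yval >= half]                      # Region where the KDE is above half maximum
+ fwhm  = above.max() - above.min()
+
+ print(f'FWHM={fwhm:.3f}')
+ ```
+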
+ #### stats
+
+ ```yaml
+ plugin:
+     stats:
+         x :
+             mean : $\mu$={:.2f}
+             rms  : $\sigma$={:.2f}
+             sum  : $\Sigma$={:.0f}
+ ```
+
+ This can be used to print statistics (mean, rms and weighted sum of entries) for each distribution.
+
  ## 2D plots

  For the 2D case it would look like: