data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/METADATA +641 -44
  2. data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD +93 -0
  3. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/WHEEL +1 -1
  4. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/entry_points.txt +1 -0
  5. dmu/__init__.py +0 -0
  6. dmu/generic/hashing.py +34 -8
  7. dmu/generic/utilities.py +164 -11
  8. dmu/logging/log_store.py +34 -2
  9. dmu/logging/messages.py +96 -0
  10. dmu/ml/cv_classifier.py +3 -3
  11. dmu/ml/cv_diagnostics.py +3 -0
  12. dmu/ml/cv_performance.py +58 -0
  13. dmu/ml/cv_predict.py +149 -46
  14. dmu/ml/train_mva.py +482 -100
  15. dmu/ml/utilities.py +29 -10
  16. dmu/pdataframe/utilities.py +28 -3
  17. dmu/plotting/fwhm.py +2 -2
  18. dmu/plotting/matrix.py +1 -1
  19. dmu/plotting/plotter.py +23 -3
  20. dmu/plotting/plotter_1d.py +96 -32
  21. dmu/plotting/plotter_2d.py +5 -0
  22. dmu/rdataframe/utilities.py +54 -3
  23. dmu/rfile/ddfgetter.py +102 -0
  24. dmu/stats/fit_stats.py +129 -0
  25. dmu/stats/fitter.py +55 -22
  26. dmu/stats/gof_calculator.py +7 -0
  27. dmu/stats/model_factory.py +153 -62
  28. dmu/stats/parameters.py +100 -0
  29. dmu/stats/utilities.py +443 -12
  30. dmu/stats/wdata.py +187 -0
  31. dmu/stats/zfit.py +17 -0
  32. dmu/stats/zfit_plotter.py +147 -36
  33. dmu/testing/utilities.py +102 -24
  34. dmu/workflow/__init__.py +0 -0
  35. dmu/workflow/cache.py +266 -0
  36. dmu_data/ml/tests/train_mva.yaml +9 -7
  37. dmu_data/ml/tests/train_mva_def.yaml +75 -0
  38. dmu_data/ml/tests/train_mva_with_diagnostics.yaml +10 -5
  39. dmu_data/ml/tests/train_mva_with_preffix.yaml +58 -0
  40. dmu_data/plotting/tests/2d.yaml +5 -5
  41. dmu_data/plotting/tests/line.yaml +15 -0
  42. dmu_data/plotting/tests/styling.yaml +8 -1
  43. dmu_data/rfile/friends.yaml +13 -0
  44. dmu_data/stats/fitter/test_simple.yaml +28 -0
  45. dmu_data/stats/kde_optimizer/control.json +1 -0
  46. dmu_data/stats/kde_optimizer/signal.json +1 -0
  47. dmu_data/stats/parameters/data.yaml +178 -0
  48. dmu_data/tests/config.json +6 -0
  49. dmu_data/tests/config.yaml +4 -0
  50. dmu_data/tests/pdf_to_tex.txt +34 -0
  51. dmu_scripts/kerberos/check_expiration +21 -0
  52. dmu_scripts/kerberos/convert_certificate +22 -0
  53. dmu_scripts/ml/compare_classifiers.py +85 -0
  54. data_manipulation_utilities-0.2.7.dist-info/RECORD +0 -69
  55. {data_manipulation_utilities-0.2.7.data → data_manipulation_utilities-0.2.8.dev714.data}/scripts/publish +0 -0
  56. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/top_level.txt +0 -0
@@ -1,27 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data_manipulation_utilities
3
- Version: 0.2.7
3
+ Version: 0.2.8.dev714
4
+ Summary: Project storing utilities needed to reduce boilerplate code when analyzing data
4
5
  Description-Content-Type: text/markdown
5
- Requires-Dist: logzero
6
- Requires-Dist: PyYAML
7
- Requires-Dist: scipy
8
- Requires-Dist: awkward
9
- Requires-Dist: tqdm
10
- Requires-Dist: numpy
11
- Requires-Dist: toml
12
- Requires-Dist: numpy
13
- Requires-Dist: matplotlib
14
- Requires-Dist: mplhep
15
- Requires-Dist: hist[plot]
16
- Requires-Dist: pandas
17
- Provides-Extra: fit
18
- Requires-Dist: zfit; extra == "fit"
19
- Requires-Dist: tensorflow==2.18.0; extra == "fit"
20
- Provides-Extra: ml
21
- Requires-Dist: scikit-learn; extra == "ml"
22
- Requires-Dist: joblib; extra == "ml"
23
- Provides-Extra: dev
24
- Requires-Dist: pytest; extra == "dev"
6
+
7
+ [TOC]
25
8
 
26
9
  # D(ata) M(anipulation) U(tilities)
27
10
 
@@ -56,8 +39,160 @@ Then, for each remote it pushes the tags and the commits.
56
39
 
57
40
  This section describes generic tools that could not be put in a specific category, but tend to be useful.
58
41
 
42
+ ## Caching data
43
+
44
+ In order to reuse data that is hard to calculate one would need:
45
+
46
+ - Serializable data, e.g. strings, floats, lists, etc.
47
+ - A way to get a unique identifier of that data, e.g. a hashable object
48
+
49
+ If both are available, one can:
50
+
51
+ ```python
+ import dmu.generic.utilities as gut
+
+ def _get_something(arg1, arg2) -> float:
+     # This loads the data, if found
+     hashable = arg1, arg2
+
+     ret = gut.load_cached(hash_obj=hashable, on_fail=-999)
+     if ret != -999:
+         return ret
+
+     obj = very_expensive_function(arg1, arg2)
+
+     # This saves the data
+     ret = gut.cache_data(obj, hash_obj=hashable)
+
+     return ret
+ ```
69
+
70
+ the cached data will go to JSON files in `/tmp/dmu/cache`.
71
+
72
+ ## Caching with a base class
73
+
74
+ Caching functionalities can be added to a class through a base class as in:
75
+
76
+ ```python
+ import dmu.generic.utilities as gut
+
+ from dmu.workflow.cache import Cache as Wcache
+
+ class Tester(Wcache):
+     '''
+     Testing class, produces outputs from simple inputs
+     '''
+     # -----------------------------------
+     def __init__(
+         self,
+         nval : int):
+         '''
+         nval, some integer used to produce output data
+         '''
+         super().__init__(
+             out_path='Tester',
+             nval    =nval)
+
+         self._nval = nval
+     # -----------------------------------
+     def run(self) -> list:
+         '''
+         Returns a list of 1's
+         '''
+         # _out_path belongs to the base class
+         obj_path = f'{self._out_path}/values.json'
+
+         if self._copy_from_cache():
+             log.warning('Output cached, not running')
+             return gut.load_json(obj_path)
+
+         log.info('Data not cached, running')
+         res = [1] * self._nval
+
+         gut.dump_json(res, obj_path)
+         self._cache()
+
+         return res
+
+ # This will set the root directory where cached data goes
+ # The data will go to `/some/directory/Tester`
+ # This has to be done ONCE and only ONCE.
+ Wcache.set_cache_root(root='/some/directory')
+
+ obj = Tester(nval=3)
+ ...
+ ```
123
+
124
+ where the tester class has access to extra functionalities to:
125
+
126
+ - Cache outputs to a hashed directory
127
+ - For the next run, check if the directory exists, if so pick
128
+ the outputs and put them in the output directory
129
+ - If not, rerun the process
130
+
131
+ Several hashed directories might exist, like in the diagram:
132
+
133
+ ![](doc/images/cache_hash.png)
134
+
135
+ **Important**: This class will also use the hash of the module where the `Tester`
136
+ class is defined. Thus, changes in the code or in the input data will invalidate the hash.
137
+
138
+ ### Turning caching off
139
+
140
+ This can be done temporarily with:
141
+
142
+ ```python
+ with Wcache.turn_off_cache(val=['Tester']):
+     obj = Tester(nval=4)
+     out = obj.run()
+ ```
147
+
148
+ for any list of classes that inherit from `Cache`, by passing their class names.
149
+ If `val=None` is passed, caching is turned off for ALL such classes.
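+
+ As a minimal sketch, reusing the `Tester` class defined above, caching can be switched off for every class at once:
+
+ ```python
+ # val=None disables caching for ALL classes that inherit from Cache
+ with Wcache.turn_off_cache(val=None):
+     obj = Tester(nval=4)
+     out = obj.run()
+ ```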
150
+
151
+ ## Silencing import messages
152
+
153
+ To silence messages emitted by modules outside the user's control, do:
154
+
155
+ ```python
+ import dmu.generic.utilities as gut
+
+ with gut.silent_import():
+     import tensorflow
+ ```
161
+
162
+ ## Silencing messages going to `stderr` that originate deep inside C++ code
163
+
164
+ This is an issue with frameworks like `Tensorflow`. Some messages are impossible
165
+ to kill, which interferes with the debugging process. In order to selectively hide
166
+ those messages, do:
167
+
168
+ ```python
+ import os
+
+ from dmu.logging import messages as mes
+
+ l_msg = ['ONE', 'TWO']
+ with mes.filter_stderr(banned_substrings=l_msg):
+     os.write(2, b'MSG ONE\n')
+     os.write(2, b'MSG TWO\n')
+     os.write(2, b'MSG THREE\n')
+ ```
177
+
178
+ The context manager above will only let `MSG THREE` through to the error stream.
179
+
180
+ ## YAML
181
+
182
+ When dumping data to YAML files, do it like:
183
+
184
+ ```python
+ import yaml
+
+ import dmu.generic.utilities as gut
+
+ data = {'a' : [1, 2, 3]}
+
+ yaml.dump(data, Dumper=gut.BlockStyleDumper)
+ ```
189
+
190
+ to make sure the indentation is correct.
191
+
59
192
  ## Hashing
60
193
 
194
+ ### Hashing python objects
195
+
61
196
  The snippet below:
62
197
 
63
198
  ```python
@@ -73,7 +208,20 @@ will:
73
208
  - Encode it to utf-8
74
209
  - Make a 64 characters hash out of it
75
210
 
76
- in two lines, thus keeping the user's code clean.
211
+ in two lines, thus keeping the user's code clean.
212
+
213
+ ### Hashing files
214
+
215
+ The following snippet:
216
+
217
+ ```python
218
+ from dmu.generic import hashing
219
+
220
+ path = '/some/file/path.txt'
221
+ val  = hashing.hash_file(path=path)
222
+ ```
223
+
224
+ should provide a hash of the file, given its path.
77
225
 
78
226
  ## Timer
79
227
 
@@ -91,9 +239,9 @@ def fun():
91
239
  fun()
92
240
  ```
93
241
 
94
- ## JSON dumper and loader
242
+ ## JSON/YAML dumper and loader
95
243
 
96
- The following lines will dump data (dictionaries, lists, etc) to a JSON file and load it back:
244
+ The following lines will dump data (dictionaries, lists, etc) to a JSON/YAML file and load it back:
97
245
 
98
246
  ```python
99
247
  import dmu.generic.utilities as gut
@@ -104,8 +252,45 @@ gut.dump_json(data, '/tmp/list.json')
104
252
  data = gut.load_json('/tmp/list.json')
105
253
  ```
106
254
 
255
+ this will dump to either JSON or YAML files, depending on the extension; the allowed extensions are:
256
+
257
+ ```
258
+ .json
259
+ .yaml
260
+ .yml
261
+ ```
262
+
107
263
  and it's meant to allow the user to bypass all the boilerplate and keep their code brief.
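+
+ For instance, a minimal sketch of the same calls writing YAML instead of JSON (only the extension changes):
+
+ ```python
+ import dmu.generic.utilities as gut
+
+ data = [1, 2, 3, 4]
+
+ # The extension decides the format, here YAML
+ gut.dump_json(data, '/tmp/list.yaml')
+ data = gut.load_json('/tmp/list.yaml')
+ ```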
108
264
 
265
+ ## PKL dumper and loader
266
+
267
+ In the same way one can do:
268
+
269
+ ```python
270
+ import dmu.generic.utilities as gut
271
+
272
+ data = [1,2,3,4]
273
+
274
+ gut.dump_pickle(data, '/tmp/list.pkl')
275
+ data = gut.load_pickle('/tmp/list.pkl')
276
+ ```
277
+
278
+ ## Loader of files and configurations from data packages
279
+
280
+ YAML and JSON files can be loaded from data packages with:
281
+
282
+ ```python
283
+ import dmu.generic.utilities as gut
284
+
285
+ data = gut.load_data(package='dmu_data', fpath='tests/data.json')
286
+ conf = gut.load_conf(package='dmu_data', fpath='tests/config.json')
287
+ ```
288
+
289
+ the former will return a python dictionary, list, etc.
290
+ The latter will return a `DictConfig` object from the `omegaconf` project.
291
+ Check [this](https://omegaconf.readthedocs.io/en/2.3_branch/index.html)
292
+ for more information.
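+
+ As a small illustration, assuming the configuration file contains a hypothetical key `name`, the returned `DictConfig` supports attribute-style access (standard `omegaconf` behaviour):
+
+ ```python
+ conf = gut.load_conf(package='dmu_data', fpath='tests/config.json')
+
+ # `name` is a hypothetical key, used only for illustration
+ print(conf.name)
+ ```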
293
+
109
294
  # Physics
110
295
 
111
296
  ## Truth matching
@@ -146,8 +331,72 @@ samples:
146
331
 
147
332
  # Math
148
333
 
334
+ ## Weighted data
335
+
336
+ `Wdata` is a small class representing weighted data, with some extra functionality. It can
337
+ be used as:
338
+
339
+ ```python
340
+ import numpy
+
+ from dmu.stats.wdata import Wdata
+ from dmu.stats.zfit  import zfit
+
+ nentries = 1_000
341
+
342
+ arr_mass = numpy.random.normal(loc=0, scale=1.0, size=nentries)
343
+ arr_wgt  = numpy.random.normal(loc=1, scale=0.1, size=nentries)
344
+
345
+ # Make an instance
346
+ wdata = Wdata(data=arr_mass, weights=arr_wgt)
347
+
348
+ # create a zfit dataset, if needed
349
+ obs = zfit.Space('obs', limits=(-3, +3))
350
+ zdata = wdata.to_zfit(obs=obs)
351
+
352
+ # Add datasets
353
+ wdata_1 = Wdata(data=arr_mass, weights=arr_wgt)
354
+ wdata_2 = Wdata(data=arr_mass, weights=arr_wgt)
355
+ wdata_3 = wdata_1 + wdata_2
356
+
357
+ # Extract information from dataset
358
+
359
+ wdata.sumw() # sum of weights
360
+ wdata.size() # Number of entries
361
+
362
+ # Update weights creating a new Wdata instance
363
+ arr_wgt_new = numpy.random.normal(loc=1, scale=0.2, size=nentries)
364
+
365
+ # New weights
366
+ wdata_2 = wdata.update_weights(weights=arr_wgt_new, replace=True)
367
+
368
+ # Multiply old weights by new ones and update
369
+ wdata_3 = wdata.update_weights(weights=arr_wgt_new, replace=False)
370
+ ```
371
+
149
372
  ## PDFs
150
373
 
374
+ ### Suppressing tensorflow messages from zfit import
375
+
376
+ If you work with zfit, you will see messages from tensorflow. By importing zfit through:
377
+
378
+ ```python
379
+ from dmu.stats.zfit import zfit
380
+ ```
381
+
382
+ these messages should be hidden. If `ROOT` is installed, the wrapper will import it before
383
+ importing tensorflow. That will prevent crashes which usually happen when `tensorflow`
384
+ is imported before `ROOT`.
385
+
386
+ ### Toy models
387
+
388
+ For quick tests, one can retrieve simple models with:
389
+
390
+ ```python
391
+ from dmu.stats import utilities as sut
392
+
393
+ # For a Gaussian plus Exponential, extended
394
+ pdf = sut.get_model(kind='s+b')
395
+
396
+ # For a Gaussian signal, non extended
397
+ pdf = sut.get_model(kind='signal')
398
+ ```
399
+
151
400
  ### Model building
152
401
 
153
402
  In order to do complex fits, one often needs PDFs with many parameters, which need to be added.
@@ -159,8 +408,27 @@ from dmu.stats.model_factory import ModelFactory
159
408
 
160
409
  l_pdf = ['cbr'] + 2 * ['cbl']
161
410
  l_shr = ['mu', 'sg']
162
- d_fix = {'al_cbl' : 3, 'nr_cbr' : 1} # This is optional and will fix two parameters whose names start with the keys
163
- mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr, d_fix=d_fix)
411
+ l_flt = ['mu', 'sg'] # Will mark these parameters as floating for the fit done afterwards
412
+ d_rep = {'mu' : 'scale', 'sg' : 'reso'} # Optional, will reparametrize for scale and resolution
413
+ d_fix = {'al_cbl' : 3, 'nr_cbr' : 1} # Optional, will fix two parameters whose names start with the keys
414
+
415
+ # If mu and sg are meant to be shared among all the models
416
+ # The parameters can be passed here.
417
+ # In this case, they are also meant to be floating
418
+ mu = zfit.param.Parameter('mu_flt', 5280, 5000, 5500)
419
+ sg = zfit.param.Parameter('sg_flt', 80, 20, 100)
420
+ l_reuse = [mu, sg]
421
+
422
+ mod = ModelFactory(
423
+ preffix = 'pref', # Prefix used in the parameter names
424
+ obs = Data.obs,
425
+ l_pdf = l_pdf,
426
+ l_shared= l_shr,
427
+ l_float = l_flt,
428
+ l_reuse = l_reuse, # Optional
429
+ d_rep = d_rep, # Optional
430
+ d_fix = d_fix) # Optional
431
+
164
432
  pdf = mod.get_pdf()
165
433
  ```
166
434
 
@@ -181,7 +449,7 @@ dscb : Double sided CrystallBall
181
449
 
182
450
  In order to introduce reparametrizations for the means and the resolutions, such that:
183
451
 
184
- $\mu\to\mu+\Delta\mu$
452
+ $\mu\to\mu+\Delta\mu$
185
453
  $\sigma\to\sigma\cdot s_{\sigma}$
186
454
 
187
455
  where the reparametrized $\mu$ and $\sigma$ are constant, while the scale and resolution is floating, do:
@@ -207,6 +475,29 @@ pdf = mod.get_pdf()
207
475
 
208
476
  Here, the floating parameters **should not** be the same as the reparametrized ones.
209
477
 
478
+ ### Overriding parameters
479
+
480
+ The models above have their parameter ranges chosen for fits to B meson distributions
481
+ e.g. the mean of the distributions is around 5GeV. To make these models extensible for other
482
+ resonances do:
483
+
484
+ ```python
485
+ from dmu.stats.parameters import ParameterLibrary as PL
486
+
487
+ # This will override the ranges and starting value
488
+ PL.set_values(kind='cbr', parameter='mu', val=3000, low=2500, high=3500)
489
+
490
+ # This will fix a parameter, the three arguments need to be equal
491
+ PL.set_values(kind='cbr', parameter='sg', val= 30, low= 30, high= 30)
492
+ ```
493
+
494
+ before using the `ModelFactory` class.
495
+ For a summary of all the parameters and values available do:
496
+
497
+ ```python
498
+ PL.print_parameters(kind='cbr')
499
+ ```
500
+
210
501
  ### Printing PDFs
211
502
 
212
503
  One can print a zfit PDF by doing:
@@ -268,6 +559,25 @@ print_pdf(pdf,
268
559
  txt_path = 'tests/stats/utilities/print_pdf/pdf_const.txt')
269
560
  ```
270
561
 
562
+
563
+ ### Storing PDF as latex
564
+
565
+ The file above can be transformed into a `tex` file by running:
566
+
567
+ ```python
568
+ from dmu.stats.utilities import pdf_to_tex
569
+
570
+ d_par = {
571
+ 'ar_dscb_Signal_002_1_reso_flt' : r'$\alpha_{DSCB}^{1}$',
572
+ 'ar_dscb_Signal_002_2_reso_flt' : r'$\alpha_{DSCB}^{2}$',
573
+ }
574
+
575
+ # It will skip fixed parameters by default
576
+ pdf_to_tex(path='/path/to/pdf.txt', d_par=d_par, skip_fixed=True)
577
+ ```
578
+
579
+ where `d_par` will rename the `Parameters` column, such that it's in latex.
580
+
271
581
  ## Fits
272
582
 
273
583
  The `Fitter` class is a wrapper to zfit, use to make fitting easier.
@@ -331,8 +641,8 @@ strategy :
331
641
  # The lines below will split the range of the data [0-10] into two subranges, such that the NLL is built
332
642
  # only in those ranges. The ranges need to be tuples
333
643
  ranges :
334
- - !!python/tuple [0, 3]
335
- - !!python/tuple [6, 9]
644
+ - [0, 3]
645
+ - [6, 9]
336
646
  #The lines below will allow using contraints for each parameter, where the first element is the mean and the second
337
647
  #the width of a Gaussian constraint. No correlations are implemented, yet.
338
648
  constraints :
@@ -414,6 +724,10 @@ obj = ZFitPlotter(data=sam, model=pdf)
414
724
  d_leg = {'gauss': 'New Gauss'}
415
725
  obj.plot(nbins=50, d_leg=d_leg, stacked=True, plot_range=(0, 10), ext_text='Extra text here')
416
726
 
727
+ #Alternatively one can do:
728
+ obj.plot(nbins=50, d_leg=d_leg, stacked=True, ranges=[[0,3], [3,10]])
729
+ # For plotting only sidebands, useful if one has a blinded fit
730
+
417
731
  # add a line to pull hist
418
732
  obj.axs[1].plot([0, 10], [0, 0], linestyle='--', color='black')
419
733
  ```
@@ -425,6 +739,71 @@ this class supports:
425
739
  - Stacking and overlaying of PDFs.
426
740
  - Blinding.
427
741
 
742
+ ## Fit saving
743
+
744
+ To save in one go everything regarding your fit do:
745
+
746
+ ```python
747
+ from dmu.stats import utilities as sut
748
+ from dmu.stats.zfit_plotter import ZFitPlotter
749
+
750
+ ptr = ZFitPlotter(data=dat, model=pdf)
751
+ ptr.plot()
752
+
753
+ sut.save_fit(data=data, model=pdf, res=fit_result, fit_dir='/some/directory', d_const=constraints)
754
+ ```
755
+
756
+ and the function will save everything that you would normally need from a fit.
757
+ If the lines with `ZFitPlotter` were called before `save_fit` the fit plot will also be saved.
758
+
759
+ ### Transforming fit results to DictConfig
760
+
761
+ The `OmegaConf` library offers `DictConfig` objects, which are easier to handle
762
+ when reading nested data. To transform a zfit result object into one of these
763
+ objects do:
764
+
765
+ ```python
766
+ from dmu.stats import utilities as sut
767
+
768
+ cres = sut.zres_to_cres(res=res)
769
+ ```
770
+
771
+ and then one would access the information like:
772
+
773
+ ```python
774
+ error = cres.mu.error
775
+ value = cres.mu.value
776
+ ```
777
+
778
+ and these objects can be saved to JSON with:
779
+
780
+ ```python
781
+ OmegaConf.save(config=cres, f='results.yaml')
782
+ ```
783
+
784
+ ## Placeholder fits
785
+
786
+ In order to create a _fake_ fit on top of which one could develop other tools, do:
787
+
788
+ ```python
789
+ from dmu.stats import utilities
790
+
791
+ utilities.placeholder_fit(kind='s+b', fit_dir='/some/directory')
792
+ ```
793
+
794
+ ## Retrieving information on fits
795
+
796
+ Once the fit has been done and the results are saved to a given directory, one can do:
797
+
798
+ ```python
799
+ from dmu.stats.fit_stats import FitStats
800
+
801
+ obj = FitStats(fit_dir='/directory/with/fit')
802
+ val = obj.get_value(name='var_name', kind='value') # kind can be 'value' or 'error'
803
+ ```
804
+
805
+ and the tool will retrieve the value. This is useful when the values are needed elsewhere
806
+ in the code, i.e. it connects the fitting code with the parts that consume its results.
428
807
  ## Arrays
429
808
 
430
809
  ### Scaling by non-integer
@@ -471,6 +850,24 @@ xval = numpy.lispace(0, 5, num=100)
471
850
  yval = fun(xval)
472
851
  ```
473
852
 
853
+ ## Other utilities
854
+
855
+ These are small helpers meant to decrease boilerplate code:
856
+
857
+ ```python
858
+ from dmu.stats import utilities as sut
859
+
860
+ # Retrieves name of observable from observable
861
+ name = sut.name_from_obs(obs=obs)
862
+
863
+ # Retrieves range of observable from observable
864
+ minx, maxx = sut.range_from_obs(obs=obs)
865
+
866
+ # This is needed because when building a KDE with too little data, that KDE cannot be evaluated
867
+ # and when trying it, tensorflow emits an exception.
868
+ sut.is_pdf_usable(pdf)
869
+ ```
870
+
474
871
  # Machine learning
475
872
 
476
873
  ## Classification
@@ -485,16 +882,31 @@ rdf_bkg = _get_rdf(kind='bkg')
485
882
  cfg = _get_config()
486
883
 
487
884
  obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
488
- obj.run(skip_fit=False) # by default it will be false, if true, it will only make plots of features
885
+ obj.run(
886
+ skip_fit=False, # by default it will be false, if true, it will only make plots of features
887
+ opt_ntrial=20, # By default this is zero, if a larger number is chosen, a hyperparameter optimization with optuna will run with this number of trials
888
+ load_trained=False, # If true, it will not train the models but will just load them, only makes sense if models already exist. Useful to add postprocessing code, like the diagnostics section.
889
+ )
489
890
  ```
490
891
 
491
892
  where the settings for the training go in a config dictionary, which when written to YAML looks like:
492
893
 
493
894
  ```yaml
494
895
  dataset:
896
+ # This section is optional. It can be used to redefine
897
+ # columns in different ways for different samples
898
+ #
899
+ # When evaluating the model, the same definitions will be used
900
+ # but they will be taken from the `sig` section.
901
+ samples:
902
+ sig:
903
+ definitions:
904
+ x : v + w
905
+ bkg:
906
+ definitions:
907
+ x : v - w
495
908
  # Before training, new features can be defined as below
496
909
  define :
497
- x : v + w
498
910
  y : v - w
499
911
  # If the key is found to be NaN, replace its value with the number provided
500
912
  # This will be used in the training.
@@ -513,8 +925,8 @@ training :
513
925
  learning_rate : 0.1
514
926
  min_samples_split : 2
515
927
  saving:
516
- # The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
517
- path : 'tests/ml/train_mva/model.pkl'
928
+ # The model names are model_001.pkl, model_002.pkl, etc, one for each fold
929
+ path : 'tests/ml/train_mva'
518
930
  plotting:
519
931
  roc :
520
932
  min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
@@ -532,10 +944,7 @@ plotting:
532
944
  title : 'Correlation matrix'
533
945
  size : [10, 10]
534
946
  mask_value : 0 # Where correlation is zero, the bin will appear white
535
- val_dir : 'tests/ml/train_mva'
536
947
  features:
537
- saving:
538
- plt_dir : 'tests/ml/train_mva/features'
539
948
  plots:
540
949
  w :
541
950
  binning : [-4, 4, 100]
@@ -557,6 +966,20 @@ plotting:
557
966
 
558
967
  the `TrainMva` is just a wrapper to `scikit-learn` that enables cross-validation (and therefore that explains the `nfolds` setting).
559
968
 
969
+ #### Outputs
970
+
971
+ The trainer will produce the following outputs:
972
+
973
+ - Models in form of `pkl` files
974
+ - Plots of the features
975
+ - For each fold:
976
+ 1. Covariance plot
977
+ 1. ROC curve plot
978
+ 1. Feature importance table in latex
979
+ 1. JSON file with data to build the ROC curve
980
+ - For the full dataset it will provide the ROC curve, scores distribution and JSON file with `x`, `y` coordinates for ROC curve.
981
+ - Latex table with hyperparameters and NaN replacements.
982
+
560
983
  ### Caveats
561
984
 
562
985
  When training on real data, several things might go wrong and the code will try to deal with them in the following ways:
@@ -596,6 +1019,18 @@ If a sample exists, that was used in the training of _every_ model, no model can
596
1019
  During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
597
1020
  from that configuration and the user does not need to define extra columns.
598
1021
 
1022
+ ### Further optimization
1023
+
1024
+ If not all the entries of the ROOT dataframe are needed for the prediction (e.g. some entries won't be used anyway) define
1025
+ a column as:
1026
+
1027
+ ```python
1028
+ rdf = rdf.Define('skip_mva_prediction', 'mass < 3000')
1029
+ ```
1030
+
1031
+ and the predictor will assign scores of `-1` to all the entries with `mass < 3000`.
1032
+ This should speed up the prediction and reduce resource consumption.
1033
+
599
1034
  ### Caveats
600
1035
 
601
1036
  When evaluating the model with real data, problems might occur, we deal with them as follows:
@@ -635,6 +1070,8 @@ correlations:
635
1070
  target :
636
1071
  name : mass
637
1072
  overlay :
1073
+ # These are the working points at which the "mass" variable will be plotted
1074
+ # If there is a correlation the shape should change
638
1075
  wp :
639
1076
  - 0.2
640
1077
  - 0.5
@@ -650,8 +1087,6 @@ correlations:
650
1087
  yscale : 'linear'
651
1088
  labels : ['mass', 'Entries']
652
1089
  normalized : true
653
- styling :
654
- linestyle: '-' # By default there is no line, just pointer
655
1090
  methods:
656
1091
  - Pearson
657
1092
  - Kendall-$\tau$
@@ -662,12 +1097,106 @@ correlations:
662
1097
  rotate : 60 # Will rotate xlabels by 60 degrees
663
1098
  ```
664
1099
 
1100
+ ## Comparing classifiers
1101
+
1102
+ ### Simple approach
1103
+ To compare several trained classifiers, run:
1104
+
1105
+ ```bash
1106
+ compare_classifiers -c /path/to/config.yaml
1107
+ ```
1108
+
1109
+ where the config looks like:
1110
+
1111
+ ```yaml
1112
+ out_dir : /path/to/plots
1113
+ classifiers:
1114
+   label for model 1 : /path/to/directory/with/model1
1115
+   label for model 2 : /path/to/directory/with/model2
1116
+ ```
1117
+
1118
+ However, this will only compare the classifiers' ROC curves with respect to the
1119
+ samples that were used to train them.
1120
+
1121
+ ### With custom samples
1122
+
1123
+ However, the models' performances can also be compared by _plugging in_ any
1124
+ signal and background proxy for any model, like:
1125
+
1126
+ ```python
1127
+ import matplotlib.pyplot as plt
1128
+ from dmu.ml.cv_performance import CVPerformance
1129
+
1130
+ cvp = CVPerformance()
1131
+ cvp.plot_roc(
1132
+ sig =rdf_sig_1, bkg=rdf_bkg_1,
1133
+ model=l_model_1, name='def', color='red')
1134
+ cvp.plot_roc(
1135
+ sig =rdf_sig_1, bkg=rdf_bkg_2,
1136
+ model=l_model_2, name='alt', color='blue')
1137
+
1138
+ plt.legend()
1139
+ plt.grid()
1140
+ plt.show()
1141
+ ```
1142
+
1143
+ This should show an overlay of different ROC curves made for a specific combination
1144
+ of signal and background proxies with a given model.
1145
+
1146
+ # Dask dataframes
1147
+
1148
+ In order to process large amounts of data, a `Dask` dataframe is more suitable.
1149
+ A set of `ROOT` files can be loaded into one of these with:
1150
+
1151
+
1152
+ ```python
1153
+ from dmu.rfile.ddfgetter import DDFGetter
1154
+
1155
+ # Can also pass directly the configuration dictionary with the `cfg` argument
1156
+ # If no columns argument is passed, will take all the columns
1157
+
1158
+ ddfg = DDFGetter(config_path='config.yaml', columns=['a', 'b'])
1159
+ ddf = ddfg.get_dataframe()
1160
+
1161
+ # This will provide the pandas dataframe
1162
+ df = ddf.compute()
1163
+ ...
1164
+ ```
1165
+ where `config.yaml` would look like:
1166
+
1167
+ ```yaml
1168
+ tree : tree_name
1169
+ primary_keys:
1170
+ - index
1171
+ files :
1172
+ - file_001.root
1173
+ - file_002.root
1174
+ - file_003.root
1175
+ samples:
1176
+ - /tmp/tests/dmu/rfile/main
1177
+ - /tmp/tests/dmu/rfile/frnd
1178
+ ```
1179
+
665
1180
  # Pandas dataframes
666
1181
 
667
1182
  ## Utilities
668
1183
 
669
1184
  These are thin layers of code that take pandas dataframes and carry out specific tasks
670
1185
 
1186
+ ### NaN filter
1187
+
1188
+ The following snippet will remove NaNs from the dataframe
1189
+ if up to 2% of the rows have NaNs. Beyond that, an exception will be raised.
1190
+
1191
+ ```python
1192
+ import dmu.pdataframe.utilities as put
1193
+
1194
+ # Default is 0.02
1195
+ df = put.dropna(df, nan_frac=0.02)
1196
+ ```
1197
+
1198
+ The use case is automatically cleaning up data that is not expected to be perfect.
1199
+
671
1200
  ### Dataframe to latex
672
1201
 
673
1202
  One can save a dataframe to latex with:
@@ -709,6 +1238,24 @@ and is meant to be less verbose than doing it through the YAML module.
709
1238
 
710
1239
  These are utility functions meant to be used with ROOT dataframes.
711
1240
 
1241
+ ## Cutflows from RDataFrames
1242
+
1243
+ When using the `Filter` method on a ROOT dataframe, one can:
1244
+
1245
+ ```python
1246
+ rep = rdf.Report()
1247
+ rep.Print()
1248
+ ```
1249
+
1250
+ however, this `rep` object is not python friendly, despite being basically a table that could be
1251
+ put in a pandas dataframe. That conversion can be done with:
1252
+
1253
+ ```python
1254
+ from dmu.rdataframe import utilities as ut
1255
+
1256
+ df = ut.rdf_report_to_df(rep)
1257
+ ```
1258
+
712
1259
  ## Adding a column from a numpy array
713
1260
 
714
1261
  ### With numba
@@ -772,6 +1319,18 @@ obj = AtrMgr(rdf)
772
1319
  obj.to_json('/path/to/file.json')
773
1320
  ```
774
1321
 
1322
+ ## Filtering for a random number of entries
1323
+
1324
+ The built-in method `Range` can only be used to select contiguous ranges of entries. Use
1325
+
1326
+ ```python
1327
+ import dmu.rdataframe.utilities as ut
1328
+
1329
+ rdf = ut.random_filter(rdf, entries=val)
1330
+ ```
1331
+
1332
+ to randomly select **approximately** `entries` entries from the dataframe.
1333
+
775
1334
  # Logging
776
1335
 
777
1336
  The `LogStore` class is an interface to the `logging` module. It is aimed at making it easier to include
@@ -791,6 +1350,25 @@ log.error('error')
791
1350
  log.critical('critical')
792
1351
  ```
793
1352
 
1353
+ In order to get a specific logger do:
1354
+
1355
+ ```python
1356
+ logger = LogStore.get_logger(name='my_logger_name')
1357
+ ```
1358
+
1359
+ In order to get the logging level from the logger do:
1360
+
1361
+ ```python
1362
+ level = log.getEffectiveLevel()
1363
+ ```
1364
+
1365
+ A context manager is also available, to change the logging level temporarily:
1366
+
1367
+ ```python
1368
+ with LogStore.level('logger_name', 10):
1369
+     log.debug('Debug message')
1370
+ ```
1371
+
794
1372
  # Plotting from ROOT dataframes
795
1373
 
796
1374
  ## 1D plots
@@ -826,15 +1404,34 @@ definitions:
826
1404
  plots:
827
1405
  x :
828
1406
  binning : [0.98, 0.98, 40] # Here bounds agree => tool will calculate bounds making sure that they are the 2% and 98% quantile
829
- yscale : 'linear' # Optional, if not passed, will do linear, can be log
1407
+ yscale : linear # Optional, if not passed, will do linear, can be log
830
1408
  labels : ['x', 'Entries'] # Labels are optional, will use varname and Entries as labels if not present
831
- title : 'some title can be added for different variable plots'
832
- name : 'plot_of_x' # This will ensure that one gets plot_of_x.png as a result, if missing x.png would be saved
1409
+ title : some title can be added for different variable plots
1410
+ name : plot_of_x # This will ensure that one gets plot_of_x.png as a result, if missing x.png would be saved
1411
+ weights : my_weights # Optional, this is the column in the dataframe with the weights
833
1412
  # Can add styling to specific plots, this should be the argument of
834
1413
  # hist.plot(...)
835
1414
  styling :
836
- label : x
837
- linestyle: '-'
1415
+ # This section will update the styling of each category
1416
+ # The categories (class A, etc) are the keys of the dictionary of dataframes
1417
+ class A:
1418
+ # These are the arguments of plt.hist(...)
1419
+ histtype : fill
1420
+ color : gray
1421
+ alpha : 0.3
1422
+ class B:
1423
+ color : red
1424
+ histtype : step
1425
+ linestyle: '-' # Linestyle is by default 'none',
1426
+ # needs to be overridden to see _steps_
1427
+ # This will add vertical lines to plots, the arguments are the same
1428
+ # as the ones passed to axvline
1429
+ vline :
1430
+ x : 0
1431
+ label : label
1432
+ ls : --
1433
+ c : blue
1434
+ lw : 1
838
1435
  y :
839
1436
  binning : [-5.0, 8.0, 40]
840
1437
  yscale : 'linear'
@@ -853,7 +1450,7 @@ style:
853
1450
  # The line below would place the legend outside the figure to avoid ovelaps with the histogram
854
1451
  bbox_to_anchor : [1.2, 1]
855
1452
  stats:
856
- nentries : '{:.2e}' # This will add number of entries in legend box
1453
+ sumw : '{:.2f}' # This will add sum of weights to label. If no weights, then it will be the nentries value
857
1454
  ```
858
1455
 
859
1456
  it's up to the user to build this dictionary and load it.