data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev720__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev720.dist-info}/METADATA +669 -42
  2. data_manipulation_utilities-0.2.8.dev720.dist-info/RECORD +45 -0
  3. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev720.dist-info}/WHEEL +1 -2
  4. data_manipulation_utilities-0.2.8.dev720.dist-info/entry_points.txt +8 -0
  5. dmu/generic/hashing.py +34 -8
  6. dmu/generic/utilities.py +164 -11
  7. dmu/logging/log_store.py +34 -2
  8. dmu/logging/messages.py +96 -0
  9. dmu/ml/cv_classifier.py +3 -3
  10. dmu/ml/cv_diagnostics.py +3 -0
  11. dmu/ml/cv_performance.py +58 -0
  12. dmu/ml/cv_predict.py +149 -46
  13. dmu/ml/train_mva.py +482 -100
  14. dmu/ml/utilities.py +29 -10
  15. dmu/pdataframe/utilities.py +28 -3
  16. dmu/plotting/fwhm.py +2 -2
  17. dmu/plotting/matrix.py +1 -1
  18. dmu/plotting/plotter.py +23 -3
  19. dmu/plotting/plotter_1d.py +96 -32
  20. dmu/plotting/plotter_2d.py +5 -0
  21. dmu/rdataframe/utilities.py +54 -3
  22. dmu/rfile/ddfgetter.py +102 -0
  23. dmu/stats/fit_stats.py +129 -0
  24. dmu/stats/fitter.py +55 -22
  25. dmu/stats/gof_calculator.py +7 -0
  26. dmu/stats/model_factory.py +153 -62
  27. dmu/stats/parameters.py +100 -0
  28. dmu/stats/utilities.py +443 -12
  29. dmu/stats/wdata.py +187 -0
  30. dmu/stats/zfit.py +17 -0
  31. dmu/stats/zfit_plotter.py +147 -36
  32. dmu/testing/utilities.py +102 -24
  33. dmu/workflow/__init__.py +0 -0
  34. dmu/workflow/cache.py +266 -0
  35. data_manipulation_utilities-0.2.7.data/scripts/publish +0 -89
  36. data_manipulation_utilities-0.2.7.dist-info/RECORD +0 -69
  37. data_manipulation_utilities-0.2.7.dist-info/entry_points.txt +0 -6
  38. data_manipulation_utilities-0.2.7.dist-info/top_level.txt +0 -3
  39. dmu_data/ml/tests/diagnostics_from_file.yaml +0 -13
  40. dmu_data/ml/tests/diagnostics_from_model.yaml +0 -10
  41. dmu_data/ml/tests/diagnostics_multiple_methods.yaml +0 -10
  42. dmu_data/ml/tests/diagnostics_overlay.yaml +0 -33
  43. dmu_data/ml/tests/train_mva.yaml +0 -58
  44. dmu_data/ml/tests/train_mva_with_diagnostics.yaml +0 -82
  45. dmu_data/plotting/tests/2d.yaml +0 -24
  46. dmu_data/plotting/tests/fig_size.yaml +0 -13
  47. dmu_data/plotting/tests/high_stat.yaml +0 -22
  48. dmu_data/plotting/tests/legend.yaml +0 -12
  49. dmu_data/plotting/tests/name.yaml +0 -14
  50. dmu_data/plotting/tests/no_bounds.yaml +0 -12
  51. dmu_data/plotting/tests/normalized.yaml +0 -9
  52. dmu_data/plotting/tests/plug_fwhm.yaml +0 -24
  53. dmu_data/plotting/tests/plug_stats.yaml +0 -19
  54. dmu_data/plotting/tests/simple.yaml +0 -9
  55. dmu_data/plotting/tests/stats.yaml +0 -9
  56. dmu_data/plotting/tests/styling.yaml +0 -11
  57. dmu_data/plotting/tests/title.yaml +0 -14
  58. dmu_data/plotting/tests/weights.yaml +0 -13
  59. dmu_data/text/transform.toml +0 -4
  60. dmu_data/text/transform.txt +0 -6
  61. dmu_data/text/transform_set.toml +0 -8
  62. dmu_data/text/transform_set.txt +0 -6
  63. dmu_data/text/transform_trf.txt +0 -12
  64. dmu_scripts/git/publish +0 -89
  65. dmu_scripts/physics/check_truth.py +0 -121
  66. dmu_scripts/rfile/compare_root_files.py +0 -299
  67. dmu_scripts/rfile/print_trees.py +0 -35
  68. dmu_scripts/ssh/coned.py +0 -168
  69. dmu_scripts/text/transform_text.py +0 -46
  70. {dmu_data → dmu}/__init__.py +0 -0
@@ -1,27 +1,39 @@
1
- Metadata-Version: 2.4
2
- Name: data_manipulation_utilities
3
- Version: 0.2.7
4
- Description-Content-Type: text/markdown
5
- Requires-Dist: logzero
1
+ Metadata-Version: 2.3
2
+ Name: data-manipulation-utilities
3
+ Version: 0.2.8.dev720
4
+ Summary: Project storing utilities needed to reduce boilerplate code when analyzing data
5
+ Requires-Python: >=3.10,<3.13
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: Programming Language :: Python :: 3.10
8
+ Classifier: Programming Language :: Python :: 3.11
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Provides-Extra: dev
11
+ Provides-Extra: fit
12
+ Provides-Extra: ml
6
13
  Requires-Dist: PyYAML
7
- Requires-Dist: scipy
8
14
  Requires-Dist: awkward
9
- Requires-Dist: tqdm
10
- Requires-Dist: numpy
11
- Requires-Dist: toml
12
- Requires-Dist: numpy
15
+ Requires-Dist: awkward-pandas
16
+ Requires-Dist: dask[dataframe,distributed]
17
+ Requires-Dist: hist[plot]
18
+ Requires-Dist: joblib ; extra == "ml"
19
+ Requires-Dist: logzero
13
20
  Requires-Dist: matplotlib
14
21
  Requires-Dist: mplhep
15
- Requires-Dist: hist[plot]
22
+ Requires-Dist: numpy
23
+ Requires-Dist: omegaconf
24
+ Requires-Dist: optuna ; extra == "ml"
16
25
  Requires-Dist: pandas
17
- Provides-Extra: fit
18
- Requires-Dist: zfit; extra == "fit"
19
- Requires-Dist: tensorflow==2.18.0; extra == "fit"
20
- Provides-Extra: ml
21
- Requires-Dist: scikit-learn; extra == "ml"
22
- Requires-Dist: joblib; extra == "ml"
23
- Provides-Extra: dev
24
- Requires-Dist: pytest; extra == "dev"
26
+ Requires-Dist: pytest ; extra == "dev"
27
+ Requires-Dist: scikit-learn ; extra == "ml"
28
+ Requires-Dist: scipy
29
+ Requires-Dist: tensorflow
30
+ Requires-Dist: toml
31
+ Requires-Dist: tqdm
32
+ Requires-Dist: uproot
33
+ Requires-Dist: zfit (==0.26.0) ; extra == "fit"
34
+ Description-Content-Type: text/markdown
35
+
36
+ [TOC]
25
37
 
26
38
  # D(ata) M(anipulation) U(tilities)
27
39
 
@@ -56,8 +68,160 @@ Then, for each remote it pushes the tags and the commits.
56
68
 
57
69
  This section describes generic tools that could not be put in a specific category, but tend to be useful.
58
70
 
71
+ ## Caching data
72
+
73
+ In order to reuse data that is expensive to calculate, one needs:
74
+
75
+ - Serializable data, e.g. strings, floats, lists, etc.
76
+ - A way to get a unique identifier of that data, e.g. a hashable object
77
+
78
+ If both are available, one can do:
79
+
80
+ ```python
81
+ import dmu.generic.utilities as gut
82
+
83
+ def _get_something(arg1 : float, arg2 : float) -> float:
84
+ # This loads the data, if found
85
+ hashable = arg1, arg2
86
+
87
+ ret = gut.load_cached(hash_obj=hashable, on_fail=-999)
88
+ if ret != -999:
89
+ return ret
90
+
91
+ obj = very_expensive_function(arg1, arg2)
92
+
93
+ # This saves the data
94
+ ret = gut.cache_data(obj, hash_obj=hashable)
95
+
96
+ return ret
97
+ ```
98
+
99
+ the cached data will go to JSON files in `/tmp/dmu/cache`.
100
+
101
+ ## Caching with a base class
102
+
103
+ Caching functionalities can be added to a class through a base class as in:
104
+
105
+ ```python
106
+ import dmu.generic.utilities as gut
+
+ from dmu.workflow.cache import Cache as Wcache
+
+ # A logger instance is assumed to be available as `log`, e.g. from dmu.logging.log_store
107
+
108
+ class Tester(Wcache):
109
+ '''
110
+ Testing class, produces outputs from simple inputs
111
+ '''
112
+ # -----------------------------------
113
+ def __init__(
114
+ self,
115
+ nval : int):
116
+ '''
117
+ nval, some integer used to produce output data
118
+ '''
119
+ super().__init__(
120
+ out_path='Tester',
121
+ nval =nval)
122
+
123
+ self._nval = nval
124
+ # -----------------------------------
125
+ def run(self) -> list:
126
+ '''
127
+ Returns a list of 1's
128
+ '''
129
+ # _out_path belongs to the base class
130
+ obj_path = f'{self._out_path}/values.json'
131
+
132
+ if self._copy_from_cache():
133
+ log.warning('Output cached, not running')
134
+ return gut.load_json(obj_path)
135
+
136
+ log.info('Data not cached, running')
137
+ res = [1] * self._nval
138
+
139
+ gut.dump_json(res, obj_path)
140
+ self._cache()
141
+
142
+ return res
143
+
144
+ # This will set the root directory where cached data goes
145
+ # The data will go to `/some/directory/Tester`
146
+ # This has to be done ONCE and only ONCE.
147
+ Wcache.set_cache_root(root='/some/directory')
148
+
149
+ obj = Tester(nval=3)
150
+ ...
151
+ ```
152
+
153
+ where the `Tester` class has access to extra functionality to:
154
+
155
+ - Cache outputs to a hashed directory
156
+ - On the next run, check whether that directory exists; if so, pick
157
+ the outputs and put them in the output directory
158
+ - If not, rerun the process
159
+
160
+ Several hashed directories might exist, like in the diagram:
161
+
162
+ ![](doc/images/cache_hash.png)
163
+
164
+ **Important**: This class will also use the hash of the module where the `Tester`
165
+ class is defined. Thus, changes in the code or in the input data will invalidate the hash.
166
+
167
+ ### Turning caching off
168
+
169
+ This can be done temporarily with:
170
+
171
+ ```python
172
+ with Wcache.turn_off_cache(val=['Tester']):
173
+ obj = Tester(nval=4)
174
+ out = obj.run()
175
+ ```
176
+
177
+ where the list contains the names of classes that inherit from `Cache`.
178
+ If `val=None` is passed, caching is turned off for ALL classes.
179
+
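+ For example, to disable caching for every class that inherits from `Cache`,
+ a minimal sketch based on the snippet above:
+
+ ```python
+ # val=None disables caching for ALL classes inheriting from Cache
+ with Wcache.turn_off_cache(val=None):
+     obj = Tester(nval=4)
+     out = obj.run()
+ ```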
180
+ ## Silencing import messages
181
+
182
+ To silence messages emitted by modules not under the user's control, do:
183
+
184
+ ```python
185
+ import dmu.generic.utilities as gut
186
+
187
+ with gut.silent_import():
188
+ import tensorflow
189
+ ```
190
+
191
+ ## Silencing messages going to `stderr` that originate deep inside C++ code
192
+
193
+ This is an issue with frameworks like `Tensorflow`. Some messages are impossible
194
+ to kill, which interferes with the debugging process. In order to selectively hide
195
+ those messages, do:
196
+
197
+ ```python
198
+ import os
+
+ from dmu.logging import messages as mes
199
+
200
+ l_msg = ['ONE', 'TWO']
201
+ with mes.filter_stderr(banned_substrings=l_msg):
202
+ os.write(2, b'MSG ONE\n')
203
+ os.write(2, b'MSG TWO\n')
204
+ os.write(2, b'MSG THREE\n')
205
+ ```
206
+
207
+ The context manager above will only allow `THREE` into the error stream.
208
+
209
+ ## YAML
210
+
211
+ When dumping data to YAML files, do it like:
212
+
213
+ ```python
214
+ import yaml
+
+ import dmu.generic.utilities as gut
215
+
216
+ yaml.dump(data, Dumper=gut.BlockStyleDumper)
217
+ ```
218
+
219
+ to make sure the indentation is correct.
220
+
59
221
  ## Hashing
60
222
 
223
+ ### Hashing python objects
224
+
61
225
  The snippet below:
62
226
 
63
227
  ```python
@@ -73,7 +237,20 @@ will:
73
237
  - Encode it to utf-8
74
238
  - Make a 64-character hash out of it
75
239
 
76
- in two lines, thus keeping the user's code clean.
240
+ in two lines, thus keeping the user's code clean.
241
+
242
+ ### Hashing files
243
+
244
+ The following snippet:
245
+
246
+ ```python
247
+ from dmu.generic import hashing
248
+
249
+ path = '/some/file/path.txt'
250
+ val  = hashing.hash_file(path=path)
251
+ ```
252
+
253
+ should provide a hash for the file, given its path.
77
254
 
78
255
  ## Timer
79
256
 
@@ -91,9 +268,9 @@ def fun():
91
268
  fun()
92
269
  ```
93
270
 
94
- ## JSON dumper and loader
271
+ ## JSON/YAML dumper and loader
95
272
 
96
- The following lines will dump data (dictionaries, lists, etc) to a JSON file and load it back:
273
+ The following lines will dump data (dictionaries, lists, etc) to a JSON/YAML file and load it back:
97
274
 
98
275
  ```python
99
276
  import dmu.generic.utilities as gut
@@ -104,8 +281,45 @@ gut.dump_json(data, '/tmp/list.json')
104
281
  data = gut.load_json('/tmp/list.json')
105
282
  ```
106
283
 
284
+ this will dump to either JSON or YAML files, depending on the extension; the allowed extensions are:
285
+
286
+ ```
287
+ .json
288
+ .yaml
289
+ .yml
290
+ ```
291
+
107
292
  and it's meant to allow the user to bypass all the boilerplate and keep their code brief.
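+
+ For example, the same pair of calls writes and reads YAML when given a `.yaml` path;
+ a minimal sketch relying only on the extension handling described above:
+
+ ```python
+ import dmu.generic.utilities as gut
+
+ data = [1, 2, 3, 4]
+
+ # The .yaml extension makes these calls use YAML instead of JSON
+ gut.dump_json(data, '/tmp/list.yaml')
+ data = gut.load_json('/tmp/list.yaml')
+ ```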
108
293
 
294
+ ## PKL dumper and loader
295
+
296
+ In the same way one can do:
297
+
298
+ ```python
299
+ import dmu.generic.utilities as gut
300
+
301
+ data = [1,2,3,4]
302
+
303
+ gut.dump_pickle(data, '/tmp/list.pkl')
304
+ data = gut.load_pickle('/tmp/list.pkl')
305
+ ```
306
+
307
+ ## Loader of files and configurations from data packages
308
+
309
+ YAML and JSON files can be loaded from data packages with:
310
+
311
+ ```python
312
+ import dmu.generic.utilities as gut
313
+
314
+ data = gut.load_data(package='dmu_data', fpath='tests/data.json')
315
+ conf = gut.load_conf(package='dmu_data', fpath='tests/config.json')
316
+ ```
317
+
318
+ the former will return a python dictionary, list, etc.
319
+ The latter will return a `DictConfig` object from the `omegaconf` project.
320
+ Check [this](https://omegaconf.readthedocs.io/en/2.3_branch/index.html)
321
+ for more information.
322
+
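+ Since the returned configuration is an `omegaconf` `DictConfig`, nested fields can be
+ read with attribute access. A minimal sketch, where `max_entries` is a hypothetical
+ field used only for illustration:
+
+ ```python
+ conf = gut.load_conf(package='dmu_data', fpath='tests/config.json')
+
+ # 'max_entries' is a hypothetical field name, not part of the shipped config
+ nentries = conf.max_entries
+ ```
+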
109
323
  # Physics
110
324
 
111
325
  ## Truth matching
@@ -146,8 +360,72 @@ samples:
146
360
 
147
361
  # Math
148
362
 
363
+ ## Weighted data
364
+
365
+ `Wdata` is a small class representing weighted data, with some extra functionality. It can
366
+ be used as:
367
+
368
+ ```python
369
+ import numpy
+ import zfit
+
+ from dmu.stats.wdata import Wdata
+
+ # Number of entries used for this illustration
+ nentries = 1_000
370
+
371
+ arr_mass = numpy.random.normal(loc=0, scale=1.0, size=nentries)
372
+ arr_wgt  = numpy.random.normal(loc=1, scale=0.1, size=nentries)
373
+
374
+ # Make an instance
375
+ wdata = Wdata(data=arr_mass, weights=arr_wgt)
376
+
377
+ # create a zfit dataset, if needed
378
+ obs = zfit.Space('obs', limits=(-3, +3))
379
+ zdata = wdata.to_zfit(obs=obs)
380
+
381
+ # Add datasets
382
+ wdata_1 = Wdata(data=arr_mass, weights=arr_wgt)
383
+ wdata_2 = Wdata(data=arr_mass, weights=arr_wgt)
384
+ wdata_3 = wdata_1 + wdata_2
385
+
386
+ # Extract information from dataset
387
+
388
+ wdata.sumw() # sum of weights
389
+ wdata.size() # Number of entries
390
+
391
+ # Update weights creating a new Wdata instance
392
+ arr_wgt_new = numpy.random.normal(loc=1, scale=0.2, size=nentries)
393
+
394
+ # New weights
395
+ wdata_2 = wdata.update_weights(weights=arr_wgt_new, replace=True)
396
+
397
+ # Multiply old weights by new ones and update
398
+ wdata_3 = wdata.update_weights(weights=arr_wgt_new, replace=False)
399
+ ```
400
+
149
401
  ## PDFs
150
402
 
403
+ ### Suppressing tensorflow messages from zfit import
404
+
405
+ If you work with zfit, you will see messages from tensorflow. By importing zfit through:
406
+
407
+ ```python
408
+ from dmu.stats.zfit import zfit
409
+ ```
410
+
411
+ these messages should be hidden. If `ROOT` is installed, the wrapper will import it before
412
+ importing tensorflow. That will prevent crashes which usually happen when `tensorflow`
413
+ is imported before `ROOT`.
414
+
415
+ ### Toy models
416
+
417
+ For quick tests, one can retrieve simple models with:
418
+
419
+ ```python
420
+ from dmu.stats import utilities as sut
421
+
422
+ # For a Gaussian plus Exponential, extended
423
+ pdf = sut.get_model(kind='s+b')
424
+
425
+ # For a Gaussian signal, non extended
426
+ pdf = sut.get_model(kind='signal')
427
+ ```
428
+
151
429
  ### Model building
152
430
 
153
431
  In order to do complex fits, one often needs PDFs with many parameters, which need to be added.
@@ -159,8 +437,27 @@ from dmu.stats.model_factory import ModelFactory
159
437
 
160
438
  l_pdf = ['cbr'] + 2 * ['cbl']
161
439
  l_shr = ['mu', 'sg']
162
- d_fix = {'al_cbl' : 3, 'nr_cbr' : 1} # This is optional and will fix two parameters whose names start with the keys
163
- mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr, d_fix=d_fix)
440
+ l_flt = ['mu', 'sg'] # Will mark these parameters as floating for the fit done afterwards
441
+ d_rep = {'mu' : 'scale', 'sg' : 'reso'} # Optional, will reparametrize for scale and resolution
442
+ d_fix = {'al_cbl' : 3, 'nr_cbr' : 1} # Optional, will fix two parameters whose names start with the keys
443
+
444
+ # If mu and sg are meant to be shared among all the models
445
+ # The parameters can be passed here.
446
+ # In this case, they are also meant to be floating
447
+ mu = zfit.param.Parameter('mu_flt', 5280, 5000, 5500)
448
+ sg = zfit.param.Parameter('sg_flt', 80, 20, 100)
449
+ l_reuse = [mu, sg]
450
+
451
+ mod = ModelFactory(
452
+ preffix = 'pref', # Prefix for parameter naming
453
+ obs = Data.obs,
454
+ l_pdf = l_pdf,
455
+ l_shared= l_shr,
456
+ l_float = l_flt,
457
+ l_reuse = l_reuse, # Optional
458
+ d_rep = d_rep, # Optional
459
+ d_fix = d_fix) # Optional
460
+
164
461
  pdf = mod.get_pdf()
165
462
  ```
166
463
 
@@ -181,7 +478,7 @@ dscb : Double sided CrystallBall
181
478
 
182
479
  In order to introduce reparametrizations for the means and the resolutions, such that:
183
480
 
184
- $\mu\to\mu+\Delta\mu$
481
+ $\mu\to\mu+\Delta\mu$
185
482
  $\sigma\to\sigma\cdot s_{\sigma}$
186
483
 
187
484
  where the reparametrized $\mu$ and $\sigma$ are constant, while the scale and resolution are floating, do:
@@ -207,6 +504,29 @@ pdf = mod.get_pdf()
207
504
 
208
505
  Here, the floating parameters **should not** be the same as the reparametrized ones.
209
506
 
507
+ ### Overriding parameters
508
+
509
+ The models above have their parameter ranges chosen for fits to B meson distributions
510
+ e.g. the mean of the distributions is around 5 GeV. To make these models usable for other
511
+ resonances, do:
512
+
513
+ ```python
514
+ from dmu.stats.parameters import ParameterLibrary as PL
515
+
516
+ # This will override the ranges and starting value
517
+ PL.set_values(kind='cbr', parameter='mu', val=3000, low=2500, high=3500)
518
+
519
+ # This will fix a parameter, the three arguments need to be equal
520
+ PL.set_values(kind='cbr', parameter='sg', val= 30, low= 30, high= 30)
521
+ ```
522
+
523
+ before using the `ModelFactory` class.
524
+ For a summary of all the parameters and values available do:
525
+
526
+ ```python
527
+ PL.print_parameters(kind='cbr')
528
+ ```
529
+
210
530
  ### Printing PDFs
211
531
 
212
532
  One can print a zfit PDF by doing:
@@ -268,6 +588,25 @@ print_pdf(pdf,
268
588
  txt_path = 'tests/stats/utilities/print_pdf/pdf_const.txt')
269
589
  ```
270
590
 
591
+
592
+ ### Storing PDF as latex
593
+
594
+ The file above can be transformed into a `tex` file by running:
595
+
596
+ ```python
597
+ from dmu.stats.utilities import pdf_to_tex
598
+
599
+ d_par = {
600
+ 'ar_dscb_Signal_002_1_reso_flt' : r'$\alpha_{DSCB}^{1}$',
601
+ 'ar_dscb_Signal_002_2_reso_flt' : r'$\alpha_{DSCB}^{2}$',
602
+ }
603
+
604
+ # It will skip fixed parameters by default
605
+ pdf_to_tex(path='/path/to/pdf.txt', d_par=d_par, skip_fixed=True)
606
+ ```
607
+
608
+ where `d_par` maps the entries of the `Parameters` column to their LaTeX names.
609
+
271
610
  ## Fits
272
611
 
273
612
  The `Fitter` class is a wrapper to zfit, used to make fitting easier.
@@ -331,8 +670,8 @@ strategy :
331
670
  # The lines below will split the range of the data [0-10] into two subranges, such that the NLL is built
332
671
  # only in those ranges. The ranges need to be tuples
333
672
  ranges :
334
- - !!python/tuple [0, 3]
335
- - !!python/tuple [6, 9]
673
+ - [0, 3]
674
+ - [6, 9]
336
675
  #The lines below will allow using constraints for each parameter, where the first element is the mean and the second
337
676
  #the width of a Gaussian constraint. No correlations are implemented, yet.
338
677
  constraints :
@@ -414,6 +753,10 @@ obj = ZFitPlotter(data=sam, model=pdf)
414
753
  d_leg = {'gauss': 'New Gauss'}
415
754
  obj.plot(nbins=50, d_leg=d_leg, stacked=True, plot_range=(0, 10), ext_text='Extra text here')
416
755
 
756
+ #Alternatively one can do:
757
+ obj.plot(nbins=50, d_leg=d_leg, stacked=True, ranges=[[0,3], [3,10]])
758
+ # For plotting only sidebands, useful if one has a blinded fit
759
+
417
760
  # add a line to pull hist
418
761
  obj.axs[1].plot([0, 10], [0, 0], linestyle='--', color='black')
419
762
  ```
@@ -425,6 +768,71 @@ this class supports:
425
768
  - Stacking and overlaying of PDFs.
426
769
  - Blinding.
427
770
 
771
+ ## Fit saving
772
+
773
+ To save everything related to your fit in one go, do:
774
+
775
+ ```python
776
+ from dmu.stats import utilities as sut
777
+ from dmu.stats.zfit_plotter import ZFitPlotter
778
+
779
+ ptr = ZFitPlotter(data=data, model=pdf)
780
+ ptr.plot()
781
+
782
+ sut.save_fit(data=data, model=pdf, res=fit_result, fit_dir='/some/directory', d_const=constraints)
783
+ ```
784
+
785
+ and the function will save everything that you would normally need from a fit.
786
+ If the lines with `ZFitPlotter` were called before `save_fit`, the fit plot will also be saved.
787
+
788
+ ### Transforming fit results to DictConfig
789
+
790
+ The `OmegaConf` library offers `DictConfig` objects, which are easier to handle
791
+ when reading nested data. To transform a zfit result object into one of these
792
+ objects do:
793
+
794
+ ```python
795
+ from dmu.stats import utilities as sut
796
+
797
+ cres = sut.zres_to_cres(res=res)
798
+ ```
799
+
800
+ and then one would access the information like:
801
+
802
+ ```python
803
+ error = cres.mu.error
804
+ value = cres.mu.value
805
+ ```
806
+
807
+ and these objects can be saved to YAML with:
808
+
809
+ ```python
810
+ from omegaconf import OmegaConf
+
+ OmegaConf.save(config=cres, f='results.yaml')
811
+ ```
812
+
813
+ ## Placeholder fits
814
+
815
+ In order to create a _fake_ fit on top of which one could develop other tools, do:
816
+
817
+ ```python
818
+ from dmu.stats import utilities
819
+
820
+ utilities.placeholder_fit(kind='s+b', fit_dir='/some/directory')
821
+ ```
822
+
823
+ ## Retrieving information on fits
824
+
825
+ Once the fit has been done and the results are saved to a given directory, one can do:
826
+
827
+ ```python
828
+ from dmu.stats.fit_stats import FitStats
829
+
830
+ obj = FitStats(fit_dir='/directory/with/fit')
831
+ val = obj.get_value(name='var_name', kind='value or error')
832
+ ```
833
+
834
+ and the tool will retrieve the value. This is useful when the values are needed elsewhere
835
+ in the code, e.g. to connect the fitting stage with downstream parts of an analysis.
428
836
  ## Arrays
429
837
 
430
838
  ### Scaling by non-integer
@@ -471,6 +879,24 @@ xval = numpy.lispace(0, 5, num=100)
471
879
  yval = fun(xval)
472
880
  ```
473
881
 
882
+ ## Other utilities
883
+
884
+ These are here to decrease boilerplate code
885
+
886
+ ```python
887
+ from dmu.stats import utilities as sut
888
+
889
+ # Retrieves name of observable from observable
890
+ name = sut.name_from_obs(obs=obs)
891
+
892
+ # Retrieves range of observable from observable
893
+ minx, maxx = sut.range_from_obs(obs=obs)
894
+
895
+ # This is needed because when building a KDE with too little data, that KDE cannot be evaluated
896
+ # and when trying it, tensorflow emits an exception.
897
+ is_usable = sut.is_pdf_usable(pdf)
898
+ ```
899
+
474
900
  # Machine learning
475
901
 
476
902
  ## Classification
@@ -485,16 +911,31 @@ rdf_bkg = _get_rdf(kind='bkg')
485
911
  cfg = _get_config()
486
912
 
487
913
  obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
488
- obj.run(skip_fit=False) # by default it will be false, if true, it will only make plots of features
914
+ obj.run(
915
+ skip_fit=False, # by default it will be false, if true, it will only make plots of features
916
+ opt_ntrial=20, # By default this is zero, if a larger number is chosen, a hyperparameter optimization with optuna will run with this number of trials
917
+ load_trained=False, # If true, it will not train the models but will just load them, only makes sense if models already exist. Useful to add postprocessing code, like the diagnostics section.
918
+ )
489
919
  ```
490
920
 
491
921
  where the settings for the training go in a config dictionary, which when written to YAML looks like:
492
922
 
493
923
  ```yaml
494
924
  dataset:
925
+ # This section is optional. It can be used to redefine
926
+ # columns in different ways for different samples
927
+ #
928
+ # When evaluating the model, the same definitions will be used
929
+ # but they will be taken from the `sig` section.
930
+ samples:
931
+ sig:
932
+ definitions:
933
+ x : v + w
934
+ bkg:
935
+ definitions:
936
+ x : v - w
495
937
  # Before training, new features can be defined as below
496
938
  define :
497
- x : v + w
498
939
  y : v - w
499
940
  # If the key is found to be NaN, replace its value with the number provided
500
941
  # This will be used in the training.
@@ -513,8 +954,8 @@ training :
513
954
  learning_rate : 0.1
514
955
  min_samples_split : 2
515
956
  saving:
516
- # The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
517
- path : 'tests/ml/train_mva/model.pkl'
957
+ # The model names are model_001.pkl, model_002.pkl, etc, one for each fold
958
+ path : 'tests/ml/train_mva'
518
959
  plotting:
519
960
  roc :
520
961
  min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
@@ -532,10 +973,7 @@ plotting:
532
973
  title : 'Correlation matrix'
533
974
  size : [10, 10]
534
975
  mask_value : 0 # Where correlation is zero, the bin will appear white
535
- val_dir : 'tests/ml/train_mva'
536
976
  features:
537
- saving:
538
- plt_dir : 'tests/ml/train_mva/features'
539
977
  plots:
540
978
  w :
541
979
  binning : [-4, 4, 100]
@@ -557,6 +995,20 @@ plotting:
557
995
 
558
996
  the `TrainMva` is just a wrapper to `scikit-learn` that enables cross-validation (and therefore that explains the `nfolds` setting).
559
997
 
998
+ #### Outputs
999
+
1000
+ The trainer will produce the following outputs:
1001
+
1002
+ - Models in form of `pkl` files
1003
+ - Plots of the features
1004
+ - For each fold:
1005
+ 1. Covariance plot
1006
+ 1. ROC curve plot
1007
+ 1. Feature importance table in latex
1008
+ 1. JSON file with data to build the ROC curve
1009
+ - For the full dataset: the ROC curve, the score distributions and a JSON file with the `x`, `y` coordinates of the ROC curve (see the sketch below).
1010
+ - Latex table with hyperparameters and NaN replacements.
1011
+
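+ The ROC JSON files can be read back with standard tools. A minimal sketch, assuming
+ the file stores the coordinates under `x` and `y` keys (the file name and keys are
+ illustrative, not a documented layout):
+
+ ```python
+ import json
+
+ import matplotlib.pyplot as plt
+
+ # Hypothetical path, following the saving.path setting shown above
+ with open('tests/ml/train_mva/roc.json', encoding='utf-8') as ifile:
+     data = json.load(ifile)
+
+ plt.plot(data['x'], data['y'])
+ plt.xlabel('x')
+ plt.ylabel('y')
+ plt.savefig('roc_check.png')
+ ```
+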
560
1012
  ### Caveats
561
1013
 
562
1014
  When training on real data, several things might go wrong and the code will try to deal with them in the following ways:
@@ -596,6 +1048,18 @@ If a sample exists, that was used in the training of _every_ model, no model can
596
1048
  During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
597
1049
  from that configuration and the user does not need to define extra columns.
598
1050
 
1051
+ ### Further optimization
1052
+
1053
+ If not all the entries of the ROOT dataframe are needed for the prediction (e.g. some entries won't be used anyway) define
1054
+ a column as:
1055
+
1056
+ ```python
1057
+ rdf = rdf.Define('skip_mva_prediction', 'mass < 3000')
1058
+ ```
1059
+
1060
+ and the predictor will assign scores of `-1` to all the entries with `mass < 3000`.
1061
+ This should speed up the prediction and reduce resource consumption.
1062
+
599
1063
  ### Caveats
600
1064
 
601
1065
  When evaluating the model with real data, problems might occur; we deal with them as follows:
@@ -635,6 +1099,8 @@ correlations:
635
1099
  target :
636
1100
  name : mass
637
1101
  overlay :
1102
+ # These are the working points at which the "mass" variable will be plotted
1103
+ # If there is a correlation the shape should change
638
1104
  wp :
639
1105
  - 0.2
640
1106
  - 0.5
@@ -650,8 +1116,6 @@ correlations:
650
1116
  yscale : 'linear'
651
1117
  labels : ['mass', 'Entries']
652
1118
  normalized : true
653
- styling :
654
- linestyle: '-' # By default there is no line, just pointer
655
1119
  methods:
656
1120
  - Pearson
657
1121
  - Kendall-$\tau$
@@ -662,12 +1126,106 @@ correlations:
662
1126
  rotate : 60 # Will rotate xlabels by 60 degrees
663
1127
  ```
664
1128
 
1129
+ ## Comparing classifiers
1130
+
1131
+ ### Simple approach
1132
+ To compare classifiers, run:
1133
+
1134
+ ```bash
1135
+ compare_classifiers -c /path/to/config.yaml
1136
+ ```
1137
+
1138
+ where the config looks like:
1139
+
1140
+ ```yaml
1141
+ out_dir : /path/to/plots
1142
+ classifiers:
1143
+ label for model 1 : /path/to/directory/with/model1
1144
+ label for model 2 : /path/to/directory/with/model2
1145
+ ```
1146
+
1147
+ However, this will only compare the classifiers' ROC curves with respect to the
1148
+ samples that were used to train them.
1149
+
1150
+ ### With custom samples
1151
+
1152
+ However, the models' performances can also be compared by _plugging in_ any
1153
+ signal and background proxy for any model, like:
1154
+
1155
+ ```python
1156
+ import matplotlib.pyplot as plt
1157
+ from dmu.ml.cv_performance import CVPerformance
1158
+
1159
+ cvp = CVPerformance()
1160
+ cvp.plot_roc(
1161
+ sig =rdf_sig_1, bkg=rdf_bkg_1,
1162
+ model=l_model_1, name='def', color='red')
1163
+ cvp.plot_roc(
1164
+ sig =rdf_sig_1, bkg=rdf_bkg_2,
1165
+ model=l_model_2, name='alt', color='blue')
1166
+
1167
+ plt.legend()
1168
+ plt.grid()
1169
+ plt.show()
1170
+ ```
1171
+
1172
+ This should show an overlay of different ROC curves made for a specific combination
1173
+ of signal and background proxies with a given model.
1174
+
1175
+ # Dask dataframes
1176
+
1177
+ In order to process large amounts of data, a `Dask` dataframe is more suitable.
1178
+ A set of `ROOT` files can be loaded into one of these with:
1179
+
1180
+
1181
+ ```python
1182
+ from dmu.rfile.ddfgetter import DDFGetter
1183
+
1184
+ # Can also pass directly the configuration dictionary with the `cfg` argument
1185
+ # If no columns argument is passed, will take all the columns
1186
+
1187
+ ddfg = DDFGetter(config_path='config.yaml', columns=['a', 'b'])
1188
+ ddf = ddfg.get_dataframe()
1189
+
1190
+ # This will provide the pandas dataframe
1191
+ df = ddf.compute()
1192
+ ...
1193
+ ```
1194
+ where `config.yaml` would look like:
1195
+
1196
+ ```yaml
1197
+ tree : tree_name
1198
+ primary_keys:
1199
+ - index
1200
+ files :
1201
+ - file_001.root
1202
+ - file_002.root
1203
+ - file_003.root
1204
+ samples:
1205
+ - /tmp/tests/dmu/rfile/main
1206
+ - /tmp/tests/dmu/rfile/frnd
1207
+ ```
1208
+
665
1209
  # Pandas dataframes
666
1210
 
667
1211
  ## Utilities
668
1212
 
669
1213
  These are thin layers of code that take pandas dataframes and carry out specific tasks
670
1214
 
1215
+ ### NaN filter
1216
+
1217
+ The following snippet will remove NaNs from the dataframe
1218
+ if up to 2% of the rows have NaNs. Beyond that, an exception will be raised.
1219
+
1220
+ ```python
1221
+ import dmu.pdataframe.utilities as put
1222
+
1223
+ # Default is 0.02
1224
+ df = put.dropna(df, nan_frac=0.02)
1225
+ ```
1226
+
1227
+ The use case is automatically cleaning up data that is not expected to be perfect.
1228
+
671
1229
  ### Dataframe to latex
672
1230
 
673
1231
  One can save a dataframe to latex with:
@@ -709,6 +1267,24 @@ and is meant to be less verbose than doing it through the YAML module.
709
1267
 
710
1268
  These are utility functions meant to be used with ROOT dataframes.
711
1269
 
1270
+ ## Cutflows from RDataFrames
1271
+
1272
+ When using the `Filter` method on a ROOT dataframe, one can:
1273
+
1274
+ ```python
1275
+ rep = rdf.Report()
1276
+ rep.Print()
1277
+ ```
1278
+
1279
+ however, this `rep` object is not Python friendly, even though it is basically a table that can be
1280
+ put in a pandas dataframe. This can be done with:
1281
+
1282
+ ```python
1283
+ from dmu.rdataframe import utilities as ut
1284
+
1285
+ df = ut.rdf_report_to_df(rep)
1286
+ ```
1287
+
712
1288
  ## Adding a column from a numpy array
713
1289
 
714
1290
  ### With numba
@@ -772,6 +1348,18 @@ obj = AtrMgr(rdf)
772
1348
  obj.to_json('/path/to/file.json')
773
1349
  ```
774
1350
 
1351
+ ## Filtering for a random number of entries
1352
+
1353
+ The built-in method `Range` can only be used to select ranges. Use
1354
+
1355
+ ```python
1356
+ import dmu.rdataframe.utilities as ut
1357
+
1358
+ rdf = ut.random_filter(rdf, entries=val)
1359
+ ```
1360
+
1361
+ to randomly select **approximately** `entries` entries from the dataframe.
1362
+
775
1363
  # Logging
776
1364
 
777
1365
  The `LogStore` class is an interface to the `logging` module. It is aimed at making it easier to include
@@ -791,6 +1379,25 @@ log.error('error')
791
1379
  log.critical('critical')
792
1380
  ```
793
1381
 
1382
+ In order to get a specific logger do:
1383
+
1384
+ ```python
1385
+ logger = LogStore.get_logger(name='my_logger_name')
1386
+ ```
1387
+
1388
+ In order to get the logging level from the logger, do:
1389
+
1390
+ ```python
1391
+ level = log.getEffectiveLevel()
1392
+ ```
1393
+
1394
+ And a context manager is available, which can be used with:
1395
+
1396
+ ```python
1397
+ with LogStore.level('logger_name', 10):
1398
+ log.debug('Debug message')
1399
+ ```
1400
+
794
1401
  # Plotting from ROOT dataframes
795
1402
 
796
1403
  ## 1D plots
@@ -826,15 +1433,34 @@ definitions:
826
1433
  plots:
827
1434
  x :
828
1435
  binning : [0.98, 0.98, 40] # Here bounds agree => tool will calculate bounds making sure that they are the 2% and 98% quantile
829
- yscale : 'linear' # Optional, if not passed, will do linear, can be log
1436
+ yscale : linear # Optional, if not passed, will do linear, can be log
830
1437
  labels : ['x', 'Entries'] # Labels are optional, will use varname and Entries as labels if not present
831
- title : 'some title can be added for different variable plots'
832
- name : 'plot_of_x' # This will ensure that one gets plot_of_x.png as a result, if missing x.png would be saved
1438
+ title : some title can be added for different variable plots
1439
+ name : plot_of_x # This will ensure that one gets plot_of_x.png as a result, if missing x.png would be saved
1440
+ weights : my_weights # Optional, this is the column in the dataframe with the weights
833
1441
  # Can add styling to specific plots, this should be the argument of
834
1442
  # hist.plot(...)
835
1443
  styling :
836
- label : x
837
- linestyle: '-'
1444
+ # This section will update the styling of each category
1445
+ # The categories (class A, etc) are the keys of the dictionary of dataframes
1446
+ class A:
1447
+ # These are the arguments of plt.hist(...)
1448
+ histtype : fill
1449
+ color : gray
1450
+ alpha : 0.3
1451
+ class B:
1452
+ color : red
1453
+ histtype : step
1454
+ linestyle: '-' # Linestyle is by default 'none',
1455
+ # needs to be overridden to see _steps_
1456
+ # This will add vertical lines to plots, the arguments are the same
1457
+ # as the ones passed to axvline
1458
+ vline :
1459
+ x : 0
1460
+ label : label
1461
+ ls : --
1462
+ c : blue
1463
+ lw : 1
838
1464
  y :
839
1465
  binning : [-5.0, 8.0, 40]
840
1466
  yscale : 'linear'
@@ -853,7 +1479,7 @@ style:
853
1479
  # The line below would place the legend outside the figure to avoid overlaps with the histogram
854
1480
  bbox_to_anchor : [1.2, 1]
855
1481
  stats:
856
- nentries : '{:.2e}' # This will add number of entries in legend box
1482
+ sumw : '{:.2f}' # This will add sum of weights to label. If no weights, then it will be the nentries value
857
1483
  ```
858
1484
 
859
1485
  it's up to the user to build this dictionary and load it.
@@ -1196,3 +1822,4 @@ lxplus:
1196
1822
  ```
1197
1823
 
1198
1824
  and should be placed in `$HOME/.config/dmu/ssh/servers.yaml`
1825
+