data-manipulation-utilities 0.2.4__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/PKG-INFO +45 -17
  2. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/README.md +44 -16
  3. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/pyproject.toml +1 -1
  4. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/data_manipulation_utilities.egg-info/PKG-INFO +45 -17
  5. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/data_manipulation_utilities.egg-info/SOURCES.txt +2 -0
  6. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/ml/cv_classifier.py +16 -2
  7. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/ml/cv_predict.py +5 -5
  8. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/ml/train_mva.py +48 -30
  9. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/ml/utilities.py +11 -5
  10. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/plotting/plotter.py +6 -2
  11. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/plotting/plotter_1d.py +22 -4
  12. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/plotting/plotter_2d.py +10 -9
  13. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/stats/minimizers.py +40 -11
  14. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/stats/model_factory.py +77 -31
  15. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/testing/utilities.py +36 -27
  16. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/ml/tests/train_mva.yaml +2 -2
  17. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/plotting/tests/2d.yaml +8 -4
  18. data_manipulation_utilities-0.2.6/src/dmu_data/plotting/tests/legend.yaml +12 -0
  19. data_manipulation_utilities-0.2.6/src/dmu_data/plotting/tests/stats.yaml +9 -0
  20. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/setup.cfg +0 -0
  21. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/data_manipulation_utilities.egg-info/dependency_links.txt +0 -0
  22. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/data_manipulation_utilities.egg-info/entry_points.txt +0 -0
  23. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/data_manipulation_utilities.egg-info/requires.txt +0 -0
  24. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/data_manipulation_utilities.egg-info/top_level.txt +0 -0
  25. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/arrays/utilities.py +0 -0
  26. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/generic/utilities.py +0 -0
  27. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/generic/version_management.py +0 -0
  28. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/logging/log_store.py +0 -0
  29. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/pdataframe/utilities.py +0 -0
  30. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/plotting/matrix.py +0 -0
  31. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/plotting/utilities.py +0 -0
  32. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/rdataframe/atr_mgr.py +0 -0
  33. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/rdataframe/utilities.py +0 -0
  34. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/rfile/rfprinter.py +0 -0
  35. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/rfile/utilities.py +0 -0
  36. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/stats/fitter.py +0 -0
  37. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/stats/function.py +0 -0
  38. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/stats/gof_calculator.py +0 -0
  39. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/stats/utilities.py +0 -0
  40. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/stats/zfit_plotter.py +0 -0
  41. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu/text/transformer.py +0 -0
  42. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/__init__.py +0 -0
  43. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/plotting/tests/fig_size.yaml +0 -0
  44. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/plotting/tests/high_stat.yaml +0 -0
  45. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/plotting/tests/name.yaml +0 -0
  46. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/plotting/tests/no_bounds.yaml +0 -0
  47. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/plotting/tests/normalized.yaml +0 -0
  48. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/plotting/tests/simple.yaml +0 -0
  49. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/plotting/tests/title.yaml +0 -0
  50. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/plotting/tests/weights.yaml +0 -0
  51. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/text/transform.toml +0 -0
  52. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/text/transform.txt +0 -0
  53. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/text/transform_set.toml +0 -0
  54. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/text/transform_set.txt +0 -0
  55. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_data/text/transform_trf.txt +0 -0
  56. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_scripts/git/publish +0 -0
  57. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_scripts/physics/check_truth.py +0 -0
  58. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_scripts/rfile/compare_root_files.py +0 -0
  59. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_scripts/rfile/print_trees.py +0 -0
  60. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_scripts/ssh/coned.py +0 -0
  61. {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.6}/src/dmu_scripts/text/transform_text.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: data_manipulation_utilities
3
- Version: 0.2.4
3
+ Version: 0.2.6
4
4
  Description-Content-Type: text/markdown
5
5
  Requires-Dist: logzero
6
6
  Requires-Dist: PyYAML
@@ -26,7 +26,7 @@ These are tools that can be used for different data analysis tasks.
26
26
 
27
27
  ## Pushing
28
28
 
29
- From the root directory of a version controlled project (i.e. a directory with the `.git` subdirectory)
29
+ From the root directory of a version controlled project (i.e. a directory with the `.git` subdirectory)
30
30
  using a `pyproject.toml` file, run:
31
31
 
32
32
  ```bash
@@ -36,10 +36,10 @@ publish
36
36
  such that:
37
37
 
38
38
  1. The `pyproject.toml` file is checked and the version of the project is extracted.
39
- 1. If a tag named as the version exists move to the steps below.
39
+ 1. If a tag named as the version exists move to the steps below.
40
40
  1. If it does not, make a new tag with the name as the version
41
41
 
42
- Then, for each remote it pushes the tags and the commits.
42
+ Then, for each remote it pushes the tags and the commits.
43
43
 
44
44
  *Why?*
45
45
 
@@ -137,7 +137,17 @@ pdf = mod.get_pdf()
137
137
  ```
138
138
 
139
139
  where the model is a sum of three `CrystallBall` PDFs, one with a right tail and two with a left tail.
140
- The `mu` and `sg` parameters are shared.
140
+ The `mu` and `sg` parameters are shared. The elementary components that can be plugged are:
141
+
142
+ ```
143
+ exp: Exponential
144
+ pol1: Polynomial of degree 1
145
+ pol2: Polynomial of degree 2
146
+ cbr : CrystallBall with right tail
147
+ cbl : CrystallBall with left tail
148
+ gauss : Gaussian
149
+ dscb : Double sided CrystallBall
150
+ ```
141
151
 
142
152
  ### Printing PDFs
143
153
 
@@ -299,7 +309,7 @@ this will:
299
309
  - Try fitting at most 10 times
300
310
  - After each fit, calculate the goodness of fit (in this case the p-value)
301
311
  - Stop when the number of tries has been exhausted or the p-value reached is higher than `0.05`
302
- - If the fit has not succeeded because of convergence, validity or goodness of fit issues,
312
+ - If the fit has not succeeded because of convergence, validity or goodness of fit issues,
303
313
  randomize the parameters and try again.
304
314
  - If the desired goodness of fit has not been achieved, pick the best result.
305
315
  - Return the `FitResult` object and set the PDF to the final fit result.
@@ -337,11 +347,11 @@ bkg = zfit.pdf.Exponential(obs=obs, lam=lm)
337
347
  nbk = zfit.Parameter('nbk', 1000, 0, 10000)
338
348
  ebkg= bkg.create_extended(nbk, name='expo')
339
349
 
340
- # Add them
350
+ # Add them
341
351
  pdf = zfit.pdf.SumPDF([ebkg, esig])
342
352
  sam = pdf.create_sampler()
343
353
 
344
- # Plot them
354
+ # Plot them
345
355
  obj = ZFitPlotter(data=sam, model=pdf)
346
356
  d_leg = {'gauss': 'New Gauss'}
347
357
  obj.plot(nbins=50, d_leg=d_leg, stacked=True, plot_range=(0, 10), ext_text='Extra text here')
@@ -353,7 +363,7 @@ obj.axs[1].plot([0, 10], [0, 0], linestyle='--', color='black')
353
363
  this class supports:
354
364
 
355
365
  - Handling title, legend, plots size.
356
- - Adding pulls.
366
+ - Adding pulls.
357
367
  - Stacking and overlaying of PDFs.
358
368
  - Blinding.
359
369
 
@@ -417,7 +427,7 @@ rdf_bkg = _get_rdf(kind='bkg')
417
427
  cfg = _get_config()
418
428
 
419
429
  obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
420
- obj.run()
430
+ obj.run(skip_fit=False) # by default it will be false, if true, it will only make plots of features
421
431
  ```
422
432
 
423
433
  where the settings for the training go in a config dictionary, which when written to YAML looks like:
@@ -434,7 +444,7 @@ dataset:
434
444
  nan:
435
445
  x : 0
436
446
  y : 0
437
- z : -999
447
+ z : -999
438
448
  training :
439
449
  nfold : 10
440
450
  features : [x, y, z]
@@ -497,7 +507,7 @@ When training on real data, several things might go wrong and the code will try
497
507
  will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
498
508
  entries will be removed before training.
499
509
 
500
- - **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
510
+ - **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostingClassifier` base class. Thus, we:
501
511
  - Can use the `nan` section shown above to replace `NaN` values with something else
502
512
  - For whatever remains we remove the entries from the training.
503
513
 
@@ -539,7 +549,7 @@ When evaluating the model with real data, problems might occur, we deal with the
539
549
  ```python
540
550
  model.cfg
541
551
  ```
542
- - For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
552
+ - For whatever features that are still NaN, they will be _patched_ with zeros when evaluated. However, the returned probabilities will be
543
553
  saved as -1. I.e. entries with NaNs will have probabilities of -1.
544
554
 
545
555
  # Pandas dataframes
@@ -674,6 +684,9 @@ ptr.run()
674
684
  where the config dictionary `cfg_dat` in YAML would look like:
675
685
 
676
686
  ```yaml
687
+ general:
688
+ # This will set the figure size
689
+ size : [20, 10]
677
690
  selection:
678
691
  #Will do at most 50K random entries. Will only happen if the dataset has more than 50K entries
679
692
  max_ran_entries : 50000
@@ -703,6 +716,16 @@ plots:
703
716
  yscale : 'linear'
704
717
  labels : ['x + y', 'Entries']
705
718
  normalized : true #This should normalize to the area
719
+ # Some vertical dashed lines are drawn by default
720
+ # If you see them, you can turn them off with this
721
+ style:
722
+ skip_lines : true
723
+ # This can pass arguments to legend making function `plt.legend()` in matplotlib
724
+ legend:
725
+ # The line below would place the legend outside the figure to avoid overlaps with the histogram
726
+ bbox_to_anchor : [1.2, 1]
727
+ stats:
728
+ nentries : '{:.2e}' # This will add number of entries in legend box
706
729
  ```
707
730
 
708
731
  it's up to the user to build this dictionary and load it.
@@ -724,14 +747,19 @@ The config would look like:
724
747
  ```yaml
725
748
  saving:
726
749
  plt_dir : tests/plotting/2d
750
+ selection:
751
+ cuts:
752
+ xlow : x > -1.5
727
753
  general:
728
754
  size : [20, 10]
729
755
  plots_2d:
730
756
  # Column x and y
731
757
  # Name of column where weights are, null for no weights
732
758
  # Name of output plot, e.g. xy_x.png
733
- - [x, y, weights, 'xy_w']
734
- - [x, y, null, 'xy_r']
759
+ # Bool signaling to use log scale for z axis
760
+ - [x, y, weights, 'xy_w', false]
761
+ - [x, y, null, 'xy_r', false]
762
+ - [x, y, null, 'xy_l', true]
735
763
  axes:
736
764
  x :
737
765
  binning : [-5.0, 8.0, 40]
@@ -823,7 +851,7 @@ Directory/Treename
823
851
  B_ENDVERTEX_CHI2DOF Double_t
824
852
  ```
825
853
 
826
- ## Comparing ROOT files
854
+ ## Comparing ROOT files
827
855
 
828
856
  Given two ROOT files the command below:
829
857
 
@@ -885,7 +913,7 @@ last_file = get_latest_file(dir_path = file_dir, wc='name_*.txt')
885
913
  # of directories in `dir_path`, e.g.:
886
914
 
887
915
  oversion=get_last_version(dir_path=dir_path, version_only=True) # This will return only the version, e.g. v3.2
888
- oversion=get_last_version(dir_path=dir_path, version_only=False) # This will return full path, e.g. /a/b/c/v3.2
916
+ oversion=get_last_version(dir_path=dir_path, version_only=False) # This will return full path, e.g. /a/b/c/v3.2
889
917
  ```
890
918
 
891
919
  The function above should work for numeric (e.g. `v1.2`) and non-numeric (e.g. `va`, `vb`) versions.
@@ -6,7 +6,7 @@ These are tools that can be used for different data analysis tasks.
6
6
 
7
7
  ## Pushing
8
8
 
9
- From the root directory of a version controlled project (i.e. a directory with the `.git` subdirectory)
9
+ From the root directory of a version controlled project (i.e. a directory with the `.git` subdirectory)
10
10
  using a `pyproject.toml` file, run:
11
11
 
12
12
  ```bash
@@ -16,10 +16,10 @@ publish
16
16
  such that:
17
17
 
18
18
  1. The `pyproject.toml` file is checked and the version of the project is extracted.
19
- 1. If a tag named as the version exists move to the steps below.
19
+ 1. If a tag named as the version exists move to the steps below.
20
20
  1. If it does not, make a new tag with the name as the version
21
21
 
22
- Then, for each remote it pushes the tags and the commits.
22
+ Then, for each remote it pushes the tags and the commits.
23
23
 
24
24
  *Why?*
25
25
 
@@ -117,7 +117,17 @@ pdf = mod.get_pdf()
117
117
  ```
118
118
 
119
119
  where the model is a sum of three `CrystallBall` PDFs, one with a right tail and two with a left tail.
120
- The `mu` and `sg` parameters are shared.
120
+ The `mu` and `sg` parameters are shared. The elementary components that can be plugged are:
121
+
122
+ ```
123
+ exp: Exponential
124
+ pol1: Polynomial of degree 1
125
+ pol2: Polynomial of degree 2
126
+ cbr : CrystallBall with right tail
127
+ cbl : CrystallBall with left tail
128
+ gauss : Gaussian
129
+ dscb : Double sided CrystallBall
130
+ ```
121
131
 
122
132
  ### Printing PDFs
123
133
 
@@ -279,7 +289,7 @@ this will:
279
289
  - Try fitting at most 10 times
280
290
  - After each fit, calculate the goodness of fit (in this case the p-value)
281
291
  - Stop when the number of tries has been exhausted or the p-value reached is higher than `0.05`
282
- - If the fit has not succeeded because of convergence, validity or goodness of fit issues,
292
+ - If the fit has not succeeded because of convergence, validity or goodness of fit issues,
283
293
  randomize the parameters and try again.
284
294
  - If the desired goodness of fit has not been achieved, pick the best result.
285
295
  - Return the `FitResult` object and set the PDF to the final fit result.
@@ -317,11 +327,11 @@ bkg = zfit.pdf.Exponential(obs=obs, lam=lm)
317
327
  nbk = zfit.Parameter('nbk', 1000, 0, 10000)
318
328
  ebkg= bkg.create_extended(nbk, name='expo')
319
329
 
320
- # Add them
330
+ # Add them
321
331
  pdf = zfit.pdf.SumPDF([ebkg, esig])
322
332
  sam = pdf.create_sampler()
323
333
 
324
- # Plot them
334
+ # Plot them
325
335
  obj = ZFitPlotter(data=sam, model=pdf)
326
336
  d_leg = {'gauss': 'New Gauss'}
327
337
  obj.plot(nbins=50, d_leg=d_leg, stacked=True, plot_range=(0, 10), ext_text='Extra text here')
@@ -333,7 +343,7 @@ obj.axs[1].plot([0, 10], [0, 0], linestyle='--', color='black')
333
343
  this class supports:
334
344
 
335
345
  - Handling title, legend, plots size.
336
- - Adding pulls.
346
+ - Adding pulls.
337
347
  - Stacking and overlaying of PDFs.
338
348
  - Blinding.
339
349
 
@@ -397,7 +407,7 @@ rdf_bkg = _get_rdf(kind='bkg')
397
407
  cfg = _get_config()
398
408
 
399
409
  obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
400
- obj.run()
410
+ obj.run(skip_fit=False) # by default it will be false, if true, it will only make plots of features
401
411
  ```
402
412
 
403
413
  where the settings for the training go in a config dictionary, which when written to YAML looks like:
@@ -414,7 +424,7 @@ dataset:
414
424
  nan:
415
425
  x : 0
416
426
  y : 0
417
- z : -999
427
+ z : -999
418
428
  training :
419
429
  nfold : 10
420
430
  features : [x, y, z]
@@ -477,7 +487,7 @@ When training on real data, several things might go wrong and the code will try
477
487
  will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
478
488
  entries will be removed before training.
479
489
 
480
- - **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
490
+ - **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostingClassifier` base class. Thus, we:
481
491
  - Can use the `nan` section shown above to replace `NaN` values with something else
482
492
  - For whatever remains we remove the entries from the training.
483
493
 
@@ -519,7 +529,7 @@ When evaluating the model with real data, problems might occur, we deal with the
519
529
  ```python
520
530
  model.cfg
521
531
  ```
522
- - For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
532
+ - For whatever features that are still NaN, they will be _patched_ with zeros when evaluated. However, the returned probabilities will be
523
533
  saved as -1. I.e. entries with NaNs will have probabilities of -1.
524
534
 
525
535
  # Pandas dataframes
@@ -654,6 +664,9 @@ ptr.run()
654
664
  where the config dictionary `cfg_dat` in YAML would look like:
655
665
 
656
666
  ```yaml
667
+ general:
668
+ # This will set the figure size
669
+ size : [20, 10]
657
670
  selection:
658
671
  #Will do at most 50K random entries. Will only happen if the dataset has more than 50K entries
659
672
  max_ran_entries : 50000
@@ -683,6 +696,16 @@ plots:
683
696
  yscale : 'linear'
684
697
  labels : ['x + y', 'Entries']
685
698
  normalized : true #This should normalize to the area
699
+ # Some vertical dashed lines are drawn by default
700
+ # If you see them, you can turn them off with this
701
+ style:
702
+ skip_lines : true
703
+ # This can pass arguments to legend making function `plt.legend()` in matplotlib
704
+ legend:
705
+ # The line below would place the legend outside the figure to avoid overlaps with the histogram
706
+ bbox_to_anchor : [1.2, 1]
707
+ stats:
708
+ nentries : '{:.2e}' # This will add number of entries in legend box
686
709
  ```
687
710
 
688
711
  it's up to the user to build this dictionary and load it.
@@ -704,14 +727,19 @@ The config would look like:
704
727
  ```yaml
705
728
  saving:
706
729
  plt_dir : tests/plotting/2d
730
+ selection:
731
+ cuts:
732
+ xlow : x > -1.5
707
733
  general:
708
734
  size : [20, 10]
709
735
  plots_2d:
710
736
  # Column x and y
711
737
  # Name of column where weights are, null for no weights
712
738
  # Name of output plot, e.g. xy_x.png
713
- - [x, y, weights, 'xy_w']
714
- - [x, y, null, 'xy_r']
739
+ # Bool signaling to use log scale for z axis
740
+ - [x, y, weights, 'xy_w', false]
741
+ - [x, y, null, 'xy_r', false]
742
+ - [x, y, null, 'xy_l', true]
715
743
  axes:
716
744
  x :
717
745
  binning : [-5.0, 8.0, 40]
@@ -803,7 +831,7 @@ Directory/Treename
803
831
  B_ENDVERTEX_CHI2DOF Double_t
804
832
  ```
805
833
 
806
- ## Comparing ROOT files
834
+ ## Comparing ROOT files
807
835
 
808
836
  Given two ROOT files the command below:
809
837
 
@@ -865,7 +893,7 @@ last_file = get_latest_file(dir_path = file_dir, wc='name_*.txt')
865
893
  # of directories in `dir_path`, e.g.:
866
894
 
867
895
  oversion=get_last_version(dir_path=dir_path, version_only=True) # This will return only the version, e.g. v3.2
868
- oversion=get_last_version(dir_path=dir_path, version_only=False) # This will return full path, e.g. /a/b/c/v3.2
896
+ oversion=get_last_version(dir_path=dir_path, version_only=False) # This will return full path, e.g. /a/b/c/v3.2
869
897
  ```
870
898
 
871
899
  The function above should work for numeric (e.g. `v1.2`) and non-numeric (e.g. `va`, `vb`) versions.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = 'data_manipulation_utilities'
3
- version = '0.2.4'
3
+ version = '0.2.6'
4
4
  readme = 'README.md'
5
5
  dependencies= [
6
6
  'logzero',
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: data_manipulation_utilities
3
- Version: 0.2.4
3
+ Version: 0.2.6
4
4
  Description-Content-Type: text/markdown
5
5
  Requires-Dist: logzero
6
6
  Requires-Dist: PyYAML
@@ -26,7 +26,7 @@ These are tools that can be used for different data analysis tasks.
26
26
 
27
27
  ## Pushing
28
28
 
29
- From the root directory of a version controlled project (i.e. a directory with the `.git` subdirectory)
29
+ From the root directory of a version controlled project (i.e. a directory with the `.git` subdirectory)
30
30
  using a `pyproject.toml` file, run:
31
31
 
32
32
  ```bash
@@ -36,10 +36,10 @@ publish
36
36
  such that:
37
37
 
38
38
  1. The `pyproject.toml` file is checked and the version of the project is extracted.
39
- 1. If a tag named as the version exists move to the steps below.
39
+ 1. If a tag named as the version exists move to the steps below.
40
40
  1. If it does not, make a new tag with the name as the version
41
41
 
42
- Then, for each remote it pushes the tags and the commits.
42
+ Then, for each remote it pushes the tags and the commits.
43
43
 
44
44
  *Why?*
45
45
 
@@ -137,7 +137,17 @@ pdf = mod.get_pdf()
137
137
  ```
138
138
 
139
139
  where the model is a sum of three `CrystallBall` PDFs, one with a right tail and two with a left tail.
140
- The `mu` and `sg` parameters are shared.
140
+ The `mu` and `sg` parameters are shared. The elementary components that can be plugged are:
141
+
142
+ ```
143
+ exp: Exponential
144
+ pol1: Polynomial of degree 1
145
+ pol2: Polynomial of degree 2
146
+ cbr : CrystallBall with right tail
147
+ cbl : CrystallBall with left tail
148
+ gauss : Gaussian
149
+ dscb : Double sided CrystallBall
150
+ ```
141
151
 
142
152
  ### Printing PDFs
143
153
 
@@ -299,7 +309,7 @@ this will:
299
309
  - Try fitting at most 10 times
300
310
  - After each fit, calculate the goodness of fit (in this case the p-value)
301
311
  - Stop when the number of tries has been exhausted or the p-value reached is higher than `0.05`
302
- - If the fit has not succeeded because of convergence, validity or goodness of fit issues,
312
+ - If the fit has not succeeded because of convergence, validity or goodness of fit issues,
303
313
  randomize the parameters and try again.
304
314
  - If the desired goodness of fit has not been achieved, pick the best result.
305
315
  - Return the `FitResult` object and set the PDF to the final fit result.
@@ -337,11 +347,11 @@ bkg = zfit.pdf.Exponential(obs=obs, lam=lm)
337
347
  nbk = zfit.Parameter('nbk', 1000, 0, 10000)
338
348
  ebkg= bkg.create_extended(nbk, name='expo')
339
349
 
340
- # Add them
350
+ # Add them
341
351
  pdf = zfit.pdf.SumPDF([ebkg, esig])
342
352
  sam = pdf.create_sampler()
343
353
 
344
- # Plot them
354
+ # Plot them
345
355
  obj = ZFitPlotter(data=sam, model=pdf)
346
356
  d_leg = {'gauss': 'New Gauss'}
347
357
  obj.plot(nbins=50, d_leg=d_leg, stacked=True, plot_range=(0, 10), ext_text='Extra text here')
@@ -353,7 +363,7 @@ obj.axs[1].plot([0, 10], [0, 0], linestyle='--', color='black')
353
363
  this class supports:
354
364
 
355
365
  - Handling title, legend, plots size.
356
- - Adding pulls.
366
+ - Adding pulls.
357
367
  - Stacking and overlaying of PDFs.
358
368
  - Blinding.
359
369
 
@@ -417,7 +427,7 @@ rdf_bkg = _get_rdf(kind='bkg')
417
427
  cfg = _get_config()
418
428
 
419
429
  obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
420
- obj.run()
430
+ obj.run(skip_fit=False) # by default it will be false, if true, it will only make plots of features
421
431
  ```
422
432
 
423
433
  where the settings for the training go in a config dictionary, which when written to YAML looks like:
@@ -434,7 +444,7 @@ dataset:
434
444
  nan:
435
445
  x : 0
436
446
  y : 0
437
- z : -999
447
+ z : -999
438
448
  training :
439
449
  nfold : 10
440
450
  features : [x, y, z]
@@ -497,7 +507,7 @@ When training on real data, several things might go wrong and the code will try
497
507
  will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
498
508
  entries will be removed before training.
499
509
 
500
- - **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
510
+ - **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostingClassifier` base class. Thus, we:
501
511
  - Can use the `nan` section shown above to replace `NaN` values with something else
502
512
  - For whatever remains we remove the entries from the training.
503
513
 
@@ -539,7 +549,7 @@ When evaluating the model with real data, problems might occur, we deal with the
539
549
  ```python
540
550
  model.cfg
541
551
  ```
542
- - For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
552
+ - For whatever features that are still NaN, they will be _patched_ with zeros when evaluated. However, the returned probabilities will be
543
553
  saved as -1. I.e. entries with NaNs will have probabilities of -1.
544
554
 
545
555
  # Pandas dataframes
@@ -674,6 +684,9 @@ ptr.run()
674
684
  where the config dictionary `cfg_dat` in YAML would look like:
675
685
 
676
686
  ```yaml
687
+ general:
688
+ # This will set the figure size
689
+ size : [20, 10]
677
690
  selection:
678
691
  #Will do at most 50K random entries. Will only happen if the dataset has more than 50K entries
679
692
  max_ran_entries : 50000
@@ -703,6 +716,16 @@ plots:
703
716
  yscale : 'linear'
704
717
  labels : ['x + y', 'Entries']
705
718
  normalized : true #This should normalize to the area
719
+ # Some vertical dashed lines are drawn by default
720
+ # If you see them, you can turn them off with this
721
+ style:
722
+ skip_lines : true
723
+ # This can pass arguments to legend making function `plt.legend()` in matplotlib
724
+ legend:
725
+ # The line below would place the legend outside the figure to avoid overlaps with the histogram
726
+ bbox_to_anchor : [1.2, 1]
727
+ stats:
728
+ nentries : '{:.2e}' # This will add number of entries in legend box
706
729
  ```
707
730
 
708
731
  it's up to the user to build this dictionary and load it.
@@ -724,14 +747,19 @@ The config would look like:
724
747
  ```yaml
725
748
  saving:
726
749
  plt_dir : tests/plotting/2d
750
+ selection:
751
+ cuts:
752
+ xlow : x > -1.5
727
753
  general:
728
754
  size : [20, 10]
729
755
  plots_2d:
730
756
  # Column x and y
731
757
  # Name of column where weights are, null for no weights
732
758
  # Name of output plot, e.g. xy_x.png
733
- - [x, y, weights, 'xy_w']
734
- - [x, y, null, 'xy_r']
759
+ # Bool signaling to use log scale for z axis
760
+ - [x, y, weights, 'xy_w', false]
761
+ - [x, y, null, 'xy_r', false]
762
+ - [x, y, null, 'xy_l', true]
735
763
  axes:
736
764
  x :
737
765
  binning : [-5.0, 8.0, 40]
@@ -823,7 +851,7 @@ Directory/Treename
823
851
  B_ENDVERTEX_CHI2DOF Double_t
824
852
  ```
825
853
 
826
- ## Comparing ROOT files
854
+ ## Comparing ROOT files
827
855
 
828
856
  Given two ROOT files the command below:
829
857
 
@@ -885,7 +913,7 @@ last_file = get_latest_file(dir_path = file_dir, wc='name_*.txt')
885
913
  # of directories in `dir_path`, e.g.:
886
914
 
887
915
  oversion=get_last_version(dir_path=dir_path, version_only=True) # This will return only the version, e.g. v3.2
888
- oversion=get_last_version(dir_path=dir_path, version_only=False) # This will return full path, e.g. /a/b/c/v3.2
916
+ oversion=get_last_version(dir_path=dir_path, version_only=False) # This will return full path, e.g. /a/b/c/v3.2
889
917
  ```
890
918
 
891
919
  The function above should work for numeric (e.g. `v1.2`) and non-numeric (e.g. `va`, `vb`) versions.
@@ -38,10 +38,12 @@ src/dmu_data/ml/tests/train_mva.yaml
38
38
  src/dmu_data/plotting/tests/2d.yaml
39
39
  src/dmu_data/plotting/tests/fig_size.yaml
40
40
  src/dmu_data/plotting/tests/high_stat.yaml
41
+ src/dmu_data/plotting/tests/legend.yaml
41
42
  src/dmu_data/plotting/tests/name.yaml
42
43
  src/dmu_data/plotting/tests/no_bounds.yaml
43
44
  src/dmu_data/plotting/tests/normalized.yaml
44
45
  src/dmu_data/plotting/tests/simple.yaml
46
+ src/dmu_data/plotting/tests/stats.yaml
45
47
  src/dmu_data/plotting/tests/title.yaml
46
48
  src/dmu_data/plotting/tests/weights.yaml
47
49
  src/dmu_data/text/transform.toml
@@ -1,15 +1,15 @@
1
1
  '''
2
2
  Module holding cv_classifier class
3
3
  '''
4
-
4
+ import os
5
5
  from typing import Union
6
6
  from sklearn.ensemble import GradientBoostingClassifier
7
7
 
8
+ import yaml
8
9
  from dmu.logging.log_store import LogStore
9
10
  import dmu.ml.utilities as ut
10
11
 
11
12
  log = LogStore.add_logger('dmu:ml:CVClassifier')
12
-
13
13
  # ---------------------------------------
14
14
  class CVSameData(Exception):
15
15
  '''
@@ -61,6 +61,20 @@ class CVClassifier(GradientBoostingClassifier):
61
61
 
62
62
  return self._cfg
63
63
  # ----------------------------------
64
+ def save_cfg(self, path : str):
65
+ '''
66
+ Will save configuration used to train this classifier to YAML
67
+
68
+ path: Path to YAML file
69
+ '''
70
+ dir_name = os.path.dirname(path)
71
+ os.makedirs(dir_name, exist_ok=True)
72
+
73
+ with open(path, 'w', encoding='utf-8') as ofile:
74
+ yaml.safe_dump(self._cfg, ofile, indent=2)
75
+
76
+ log.info(f'Saved config to: {path}')
77
+ # ----------------------------------
64
78
  def __str__(self):
65
79
  nhash = len(self._s_hash)
66
80
 
@@ -73,11 +73,11 @@ class CVPredict:
73
73
  log.debug('Not doing any NaN replacement')
74
74
  return df
75
75
 
76
- log.debug(60 * '-')
76
+ log.info(60 * '-')
77
77
  log.info('Doing NaN replacements')
78
- log.debug(60 * '-')
78
+ log.info(60 * '-')
79
79
  for var, val in self._d_nan_rep.items():
80
- log.debug(f'{var:<20}{"--->":20}{val:<20.3f}')
80
+ log.info(f'{var:<20}{"--->":20}{val:<20.3f}')
81
81
  df[var] = df[var].fillna(val)
82
82
 
83
83
  return df
@@ -155,7 +155,7 @@ class CVPredict:
155
155
  ndif = len(s_dif_hash)
156
156
  ndat = len(s_dat_hash)
157
157
  nmod = len(s_mod_hash)
158
- log.debug(f'{ndif:<20}{"=":10}{ndat:<20}{"-":10}{nmod:<20}')
158
+ log.debug(f'{ndif:<10}{"=":5}{ndat:<10}{"-":5}{nmod:<10}')
159
159
 
160
160
  df_ft_group= df_ft.loc[df_ft.index.isin(s_dif_hash)]
161
161
 
@@ -173,7 +173,7 @@ class CVPredict:
173
173
  return arr_prb
174
174
 
175
175
  nentries = len(self._arr_patch)
176
- log.warning(f'Patching {nentries} probabilities')
176
+ log.warning(f'Patching {nentries} probabilities with -1')
177
177
  arr_prb[self._arr_patch] = -1
178
178
 
179
179
  return arr_prb