data-manipulation-utilities 0.1.9__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/PKG-INFO +121 -2
  2. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/README.md +119 -0
  3. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/pyproject.toml +2 -2
  4. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/PKG-INFO +121 -2
  5. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/SOURCES.txt +4 -0
  6. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/requires.txt +1 -1
  7. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/ml/cv_classifier.py +2 -1
  8. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/ml/cv_predict.py +2 -1
  9. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/ml/train_mva.py +204 -25
  10. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/ml/utilities.py +8 -0
  11. data_manipulation_utilities-0.2.1/src/dmu/pdataframe/utilities.py +36 -0
  12. data_manipulation_utilities-0.2.1/src/dmu/plotting/matrix.py +157 -0
  13. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/plotting/plotter_1d.py +18 -1
  14. data_manipulation_utilities-0.2.1/src/dmu/plotting/utilities.py +33 -0
  15. data_manipulation_utilities-0.2.1/src/dmu_data/ml/tests/train_mva.yaml +52 -0
  16. data_manipulation_utilities-0.2.1/src/dmu_data/plotting/tests/normalized.yaml +9 -0
  17. data_manipulation_utilities-0.1.9/src/dmu_data/ml/tests/train_mva.yaml +0 -37
  18. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/setup.cfg +0 -0
  19. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/dependency_links.txt +0 -0
  20. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/entry_points.txt +0 -0
  21. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/top_level.txt +0 -0
  22. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/arrays/utilities.py +0 -0
  23. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/generic/utilities.py +0 -0
  24. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/logging/log_store.py +0 -0
  25. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/plotting/plotter.py +0 -0
  26. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/plotting/plotter_2d.py +0 -0
  27. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/rdataframe/atr_mgr.py +0 -0
  28. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/rdataframe/utilities.py +0 -0
  29. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/rfile/rfprinter.py +0 -0
  30. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/rfile/utilities.py +0 -0
  31. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/fitter.py +0 -0
  32. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/function.py +0 -0
  33. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/gof_calculator.py +0 -0
  34. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/minimizers.py +0 -0
  35. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/model_factory.py +0 -0
  36. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/utilities.py +0 -0
  37. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/zfit_plotter.py +0 -0
  38. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/testing/utilities.py +0 -0
  39. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/text/transformer.py +0 -0
  40. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/__init__.py +0 -0
  41. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/2d.yaml +0 -0
  42. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/fig_size.yaml +0 -0
  43. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/high_stat.yaml +0 -0
  44. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/name.yaml +0 -0
  45. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/no_bounds.yaml +0 -0
  46. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/simple.yaml +0 -0
  47. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/title.yaml +0 -0
  48. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/weights.yaml +0 -0
  49. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/text/transform.toml +0 -0
  50. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/text/transform.txt +0 -0
  51. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/text/transform_set.toml +0 -0
  52. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/text/transform_set.txt +0 -0
  53. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/text/transform_trf.txt +0 -0
  54. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_scripts/git/publish +0 -0
  55. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_scripts/physics/check_truth.py +0 -0
  56. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_scripts/rfile/compare_root_files.py +0 -0
  57. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_scripts/rfile/print_trees.py +0 -0
  58. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_scripts/ssh/coned.py +0 -0
  59. {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_scripts/text/transform_text.py +0 -0
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: data_manipulation_utilities
3
- Version: 0.1.9
3
+ Version: 0.2.1
4
4
  Description-Content-Type: text/markdown
5
5
  Requires-Dist: logzero
6
6
  Requires-Dist: PyYAML
7
7
  Requires-Dist: scipy
8
- Requires-Dist: awkward
8
+ Requires-Dist: awkward==2.4.6
9
9
  Requires-Dist: tqdm
10
10
  Requires-Dist: joblib
11
11
  Requires-Dist: scikit-learn
@@ -204,6 +204,33 @@ print_pdf(pdf,
204
204
 
205
205
  The `Fitter` class is a wrapper to zfit, used to make fitting easier.
206
206
 
207
+ ### Goodness of fits
208
+
209
+ Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
210
+ This is done by:
211
+
212
+ - Binning the data and PDF.
213
+ - Calculating the reduced $\chi^2$.
214
+ - Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
215
+
216
+ This class is used as shown below:
217
+
218
+ ```python
219
+ from dmu.stats.gof_calculator import GofCalculator
220
+
221
+ nll = _get_nll()
222
+ res = Data.minimizer.minimize(nll)
223
+
224
+ gcl = GofCalculator(nll, ndof=10)
225
+ gof = gcl.get_gof(kind='pvalue')
226
+ ```
227
+
228
+ where:
229
+
230
+ - `ndof` is the number of degrees of freedom used in the reduced $\chi^2$ calculation.
231
+ It is needed to know how many bins to use to make the histogram. The recommended value is 10.
232
+ - `kind` The argument can be `pvalue` or `chi2/ndof`.
233
+
207
234
  ### Simplest fit
208
235
 
209
236
  ```python
@@ -396,6 +423,14 @@ obj.run()
396
423
  where the settings for the training go in a config dictionary, which when written to YAML looks like:
397
424
 
398
425
  ```yaml
426
+ dataset:
427
+ # If the key is found to be NaN, replace its value with the number provided
428
+ # This will be used in the training.
429
+ # Otherwise the entries with NaNs will be dropped
430
+ nan:
431
+ x : 0
432
+ y : 0
433
+ z : -999
399
434
  training :
400
435
  nfold : 10
401
436
  features : [w, x, y, z]
@@ -406,8 +441,25 @@ training :
406
441
  learning_rate : 0.1
407
442
  min_samples_split : 2
408
443
  saving:
444
+ # The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
409
445
  path : 'tests/ml/train_mva/model.pkl'
410
446
  plotting:
447
+ roc :
448
+ min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
449
+ max : [1.2, 1.2] # By default it goes from 0 to 1 in both axes
450
+ # The section below is optional and will annotate the ROC curve with
451
+ # values for the score at different signal efficiencies
452
+ annotate:
453
+ sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9] # Values of signal efficiency at which to show the scores
454
+ form : '{:.2f}' # Use two decimals for scores
455
+ color : 'green' # Color for text and marker
456
+ xoff : -15 # Offsets in X and Y
457
+ yoff : -15
458
+ size : 10 # Size of text
459
+ correlation: # Adds correlation matrix for training datasets
460
+ title : 'Correlation matrix'
461
+ size : [10, 10]
462
+ mask_value : 0 # Where correlation is zero, the bin will appear white
411
463
  val_dir : 'tests/ml/train_mva'
412
464
  features:
413
465
  saving:
@@ -475,6 +527,36 @@ When evaluating the model with real data, problems might occur, we deal with the
475
527
  - **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
476
528
  saved as -1. I.e. entries with NaNs will have probabilities of -1.
477
529
 
530
+ # Pandas dataframes
531
+
532
+ ## Utilities
533
+
534
+ These are thin layers of code that take pandas dataframes and carry out specific tasks
535
+
536
+ ### Dataframe to latex
537
+
538
+ One can save a dataframe to latex with:
539
+
540
+ ```python
541
+ import pandas as pnd
542
+ import dmu.pdataframe.utilities as put
543
+
544
+ d_data = {}
545
+ d_data['a'] = [1,2,3]
546
+ d_data['b'] = [4,5,6]
547
+ df = pnd.DataFrame(d_data)
548
+
549
+ d_format = {
550
+ 'a' : '{:.0f}',
551
+ 'b' : '{:.3f}'}
552
+
553
+ df = _get_df()
554
+ put.df_to_tex(df,
555
+ './table.tex',
556
+ d_format = d_format,
557
+ caption = 'some caption')
558
+ ```
559
+
478
560
  # Rdataframes
479
561
 
480
562
  These are utility functions meant to be used with ROOT dataframes.
@@ -626,6 +708,43 @@ axes:
626
708
  label : 'y'
627
709
  ```
628
710
 
711
+ # Other plots
712
+
713
+ ## Matrices
714
+
715
+ This can be done with `MatrixPlotter`, whose usage is illustrated below:
716
+
717
+ ```python
718
+ import numpy
719
+ import matplotlib.pyplot as plt
720
+
721
+ from dmu.plotting.matrix import MatrixPlotter
722
+
723
+ cfg = {
724
+ 'labels' : ['x', 'y', 'z'], # Used to label the matrix axes
725
+ 'title' : 'Some title', # Optional, title of plot
726
+ 'label_angle': 45, # Labels will be rotated by 45 degrees
727
+ 'upper' : True, # Useful in case this is a symmetric matrix
728
+ 'zrange' : [0, 10], # Controls the z axis range
729
+ 'size' : [7, 7], # Plot size
730
+ 'format' : '{:.3f}', # Optional, if used will add numerical values to the contents, otherwise a color bar is used
731
+ 'fontsize' : 12, # Font size associated to `format`
732
+ 'mask_value' : 0, # These values will appear white in the plot
733
+ }
734
+
735
+ mat = [
736
+ [1, 2, 3],
737
+ [2, 0, 4],
738
+ [3, 4, numpy.nan]
739
+ ]
740
+
741
+ mat = numpy.array(mat)
742
+
743
+ obj = MatrixPlotter(mat=mat, cfg=cfg)
744
+ obj.plot()
745
+ plt.show()
746
+ ```
747
+
629
748
  # Manipulating ROOT files
630
749
 
631
750
  ## Getting trees from file
@@ -184,6 +184,33 @@ print_pdf(pdf,
184
184
 
185
185
  The `Fitter` class is a wrapper to zfit, used to make fitting easier.
186
186
 
187
+ ### Goodness of fits
188
+
189
+ Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
190
+ This is done by:
191
+
192
+ - Binning the data and PDF.
193
+ - Calculating the reduced $\chi^2$.
194
+ - Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
195
+
196
+ This class is used as shown below:
197
+
198
+ ```python
199
+ from dmu.stats.gof_calculator import GofCalculator
200
+
201
+ nll = _get_nll()
202
+ res = Data.minimizer.minimize(nll)
203
+
204
+ gcl = GofCalculator(nll, ndof=10)
205
+ gof = gcl.get_gof(kind='pvalue')
206
+ ```
207
+
208
+ where:
209
+
210
+ - `ndof` is the number of degrees of freedom used in the reduced $\chi^2$ calculation.
211
+ It is needed to know how many bins to use to make the histogram. The recommended value is 10.
212
+ - `kind` The argument can be `pvalue` or `chi2/ndof`.
213
+
187
214
  ### Simplest fit
188
215
 
189
216
  ```python
@@ -376,6 +403,14 @@ obj.run()
376
403
  where the settings for the training go in a config dictionary, which when written to YAML looks like:
377
404
 
378
405
  ```yaml
406
+ dataset:
407
+ # If the key is found to be NaN, replace its value with the number provided
408
+ # This will be used in the training.
409
+ # Otherwise the entries with NaNs will be dropped
410
+ nan:
411
+ x : 0
412
+ y : 0
413
+ z : -999
379
414
  training :
380
415
  nfold : 10
381
416
  features : [w, x, y, z]
@@ -386,8 +421,25 @@ training :
386
421
  learning_rate : 0.1
387
422
  min_samples_split : 2
388
423
  saving:
424
+ # The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
389
425
  path : 'tests/ml/train_mva/model.pkl'
390
426
  plotting:
427
+ roc :
428
+ min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
429
+ max : [1.2, 1.2] # By default it goes from 0 to 1 in both axes
430
+ # The section below is optional and will annotate the ROC curve with
431
+ # values for the score at different signal efficiencies
432
+ annotate:
433
+ sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9] # Values of signal efficiency at which to show the scores
434
+ form : '{:.2f}' # Use two decimals for scores
435
+ color : 'green' # Color for text and marker
436
+ xoff : -15 # Offsets in X and Y
437
+ yoff : -15
438
+ size : 10 # Size of text
439
+ correlation: # Adds correlation matrix for training datasets
440
+ title : 'Correlation matrix'
441
+ size : [10, 10]
442
+ mask_value : 0 # Where correlation is zero, the bin will appear white
391
443
  val_dir : 'tests/ml/train_mva'
392
444
  features:
393
445
  saving:
@@ -455,6 +507,36 @@ When evaluating the model with real data, problems might occur, we deal with the
455
507
  - **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
456
508
  saved as -1. I.e. entries with NaNs will have probabilities of -1.
457
509
 
510
+ # Pandas dataframes
511
+
512
+ ## Utilities
513
+
514
+ These are thin layers of code that take pandas dataframes and carry out specific tasks
515
+
516
+ ### Dataframe to latex
517
+
518
+ One can save a dataframe to latex with:
519
+
520
+ ```python
521
+ import pandas as pnd
522
+ import dmu.pdataframe.utilities as put
523
+
524
+ d_data = {}
525
+ d_data['a'] = [1,2,3]
526
+ d_data['b'] = [4,5,6]
527
+ df = pnd.DataFrame(d_data)
528
+
529
+ d_format = {
530
+ 'a' : '{:.0f}',
531
+ 'b' : '{:.3f}'}
532
+
533
+ df = _get_df()
534
+ put.df_to_tex(df,
535
+ './table.tex',
536
+ d_format = d_format,
537
+ caption = 'some caption')
538
+ ```
539
+
458
540
  # Rdataframes
459
541
 
460
542
  These are utility functions meant to be used with ROOT dataframes.
@@ -606,6 +688,43 @@ axes:
606
688
  label : 'y'
607
689
  ```
608
690
 
691
+ # Other plots
692
+
693
+ ## Matrices
694
+
695
+ This can be done with `MatrixPlotter`, whose usage is illustrated below:
696
+
697
+ ```python
698
+ import numpy
699
+ import matplotlib.pyplot as plt
700
+
701
+ from dmu.plotting.matrix import MatrixPlotter
702
+
703
+ cfg = {
704
+ 'labels' : ['x', 'y', 'z'], # Used to label the matrix axes
705
+ 'title' : 'Some title', # Optional, title of plot
706
+ 'label_angle': 45, # Labels will be rotated by 45 degrees
707
+ 'upper' : True, # Useful in case this is a symmetric matrix
708
+ 'zrange' : [0, 10], # Controls the z axis range
709
+ 'size' : [7, 7], # Plot size
710
+ 'format' : '{:.3f}', # Optional, if used will add numerical values to the contents, otherwise a color bar is used
711
+ 'fontsize' : 12, # Font size associated to `format`
712
+ 'mask_value' : 0, # These values will appear white in the plot
713
+ }
714
+
715
+ mat = [
716
+ [1, 2, 3],
717
+ [2, 0, 4],
718
+ [3, 4, numpy.nan]
719
+ ]
720
+
721
+ mat = numpy.array(mat)
722
+
723
+ obj = MatrixPlotter(mat=mat, cfg=cfg)
724
+ obj.plot()
725
+ plt.show()
726
+ ```
727
+
609
728
  # Manipulating ROOT files
610
729
 
611
730
  ## Getting trees from file
@@ -1,12 +1,12 @@
1
1
  [project]
2
2
  name = 'data_manipulation_utilities'
3
- version = '0.1.9'
3
+ version = '0.2.1'
4
4
  readme = 'README.md'
5
5
  dependencies= [
6
6
  'logzero',
7
7
  'PyYAML',
8
8
  'scipy',
9
- 'awkward',
9
+ 'awkward==2.4.6',
10
10
  'tqdm',
11
11
  'joblib',
12
12
  'scikit-learn',
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: data_manipulation_utilities
3
- Version: 0.1.9
3
+ Version: 0.2.1
4
4
  Description-Content-Type: text/markdown
5
5
  Requires-Dist: logzero
6
6
  Requires-Dist: PyYAML
7
7
  Requires-Dist: scipy
8
- Requires-Dist: awkward
8
+ Requires-Dist: awkward==2.4.6
9
9
  Requires-Dist: tqdm
10
10
  Requires-Dist: joblib
11
11
  Requires-Dist: scikit-learn
@@ -204,6 +204,33 @@ print_pdf(pdf,
204
204
 
205
205
  The `Fitter` class is a wrapper to zfit, used to make fitting easier.
206
206
 
207
+ ### Goodness of fits
208
+
209
+ Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
210
+ This is done by:
211
+
212
+ - Binning the data and PDF.
213
+ - Calculating the reduced $\chi^2$.
214
+ - Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
215
+
216
+ This class is used as shown below:
217
+
218
+ ```python
219
+ from dmu.stats.gof_calculator import GofCalculator
220
+
221
+ nll = _get_nll()
222
+ res = Data.minimizer.minimize(nll)
223
+
224
+ gcl = GofCalculator(nll, ndof=10)
225
+ gof = gcl.get_gof(kind='pvalue')
226
+ ```
227
+
228
+ where:
229
+
230
+ - `ndof` is the number of degrees of freedom used in the reduced $\chi^2$ calculation.
231
+ It is needed to know how many bins to use to make the histogram. The recommended value is 10.
232
+ - `kind` The argument can be `pvalue` or `chi2/ndof`.
233
+
207
234
  ### Simplest fit
208
235
 
209
236
  ```python
@@ -396,6 +423,14 @@ obj.run()
396
423
  where the settings for the training go in a config dictionary, which when written to YAML looks like:
397
424
 
398
425
  ```yaml
426
+ dataset:
427
+ # If the key is found to be NaN, replace its value with the number provided
428
+ # This will be used in the training.
429
+ # Otherwise the entries with NaNs will be dropped
430
+ nan:
431
+ x : 0
432
+ y : 0
433
+ z : -999
399
434
  training :
400
435
  nfold : 10
401
436
  features : [w, x, y, z]
@@ -406,8 +441,25 @@ training :
406
441
  learning_rate : 0.1
407
442
  min_samples_split : 2
408
443
  saving:
444
+ # The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
409
445
  path : 'tests/ml/train_mva/model.pkl'
410
446
  plotting:
447
+ roc :
448
+ min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
449
+ max : [1.2, 1.2] # By default it goes from 0 to 1 in both axes
450
+ # The section below is optional and will annotate the ROC curve with
451
+ # values for the score at different signal efficiencies
452
+ annotate:
453
+ sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9] # Values of signal efficiency at which to show the scores
454
+ form : '{:.2f}' # Use two decimals for scores
455
+ color : 'green' # Color for text and marker
456
+ xoff : -15 # Offsets in X and Y
457
+ yoff : -15
458
+ size : 10 # Size of text
459
+ correlation: # Adds correlation matrix for training datasets
460
+ title : 'Correlation matrix'
461
+ size : [10, 10]
462
+ mask_value : 0 # Where correlation is zero, the bin will appear white
411
463
  val_dir : 'tests/ml/train_mva'
412
464
  features:
413
465
  saving:
@@ -475,6 +527,36 @@ When evaluating the model with real data, problems might occur, we deal with the
475
527
  - **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
476
528
  saved as -1. I.e. entries with NaNs will have probabilities of -1.
477
529
 
530
+ # Pandas dataframes
531
+
532
+ ## Utilities
533
+
534
+ These are thin layers of code that take pandas dataframes and carry out specific tasks
535
+
536
+ ### Dataframe to latex
537
+
538
+ One can save a dataframe to latex with:
539
+
540
+ ```python
541
+ import pandas as pnd
542
+ import dmu.pdataframe.utilities as put
543
+
544
+ d_data = {}
545
+ d_data['a'] = [1,2,3]
546
+ d_data['b'] = [4,5,6]
547
+ df = pnd.DataFrame(d_data)
548
+
549
+ d_format = {
550
+ 'a' : '{:.0f}',
551
+ 'b' : '{:.3f}'}
552
+
553
+ df = _get_df()
554
+ put.df_to_tex(df,
555
+ './table.tex',
556
+ d_format = d_format,
557
+ caption = 'some caption')
558
+ ```
559
+
478
560
  # Rdataframes
479
561
 
480
562
  These are utility functions meant to be used with ROOT dataframes.
@@ -626,6 +708,43 @@ axes:
626
708
  label : 'y'
627
709
  ```
628
710
 
711
+ # Other plots
712
+
713
+ ## Matrices
714
+
715
+ This can be done with `MatrixPlotter`, whose usage is illustrated below:
716
+
717
+ ```python
718
+ import numpy
719
+ import matplotlib.pyplot as plt
720
+
721
+ from dmu.plotting.matrix import MatrixPlotter
722
+
723
+ cfg = {
724
+ 'labels' : ['x', 'y', 'z'], # Used to label the matrix axes
725
+ 'title' : 'Some title', # Optional, title of plot
726
+ 'label_angle': 45, # Labels will be rotated by 45 degrees
727
+ 'upper' : True, # Useful in case this is a symmetric matrix
728
+ 'zrange' : [0, 10], # Controls the z axis range
729
+ 'size' : [7, 7], # Plot size
730
+ 'format' : '{:.3f}', # Optional, if used will add numerical values to the contents, otherwise a color bar is used
731
+ 'fontsize' : 12, # Font size associated to `format`
732
+ 'mask_value' : 0, # These values will appear white in the plot
733
+ }
734
+
735
+ mat = [
736
+ [1, 2, 3],
737
+ [2, 0, 4],
738
+ [3, 4, numpy.nan]
739
+ ]
740
+
741
+ mat = numpy.array(mat)
742
+
743
+ obj = MatrixPlotter(mat=mat, cfg=cfg)
744
+ obj.plot()
745
+ plt.show()
746
+ ```
747
+
629
748
  # Manipulating ROOT files
630
749
 
631
750
  ## Getting trees from file
@@ -13,9 +13,12 @@ src/dmu/ml/cv_classifier.py
13
13
  src/dmu/ml/cv_predict.py
14
14
  src/dmu/ml/train_mva.py
15
15
  src/dmu/ml/utilities.py
16
+ src/dmu/pdataframe/utilities.py
17
+ src/dmu/plotting/matrix.py
16
18
  src/dmu/plotting/plotter.py
17
19
  src/dmu/plotting/plotter_1d.py
18
20
  src/dmu/plotting/plotter_2d.py
21
+ src/dmu/plotting/utilities.py
19
22
  src/dmu/rdataframe/atr_mgr.py
20
23
  src/dmu/rdataframe/utilities.py
21
24
  src/dmu/rfile/rfprinter.py
@@ -36,6 +39,7 @@ src/dmu_data/plotting/tests/fig_size.yaml
36
39
  src/dmu_data/plotting/tests/high_stat.yaml
37
40
  src/dmu_data/plotting/tests/name.yaml
38
41
  src/dmu_data/plotting/tests/no_bounds.yaml
42
+ src/dmu_data/plotting/tests/normalized.yaml
39
43
  src/dmu_data/plotting/tests/simple.yaml
40
44
  src/dmu_data/plotting/tests/title.yaml
41
45
  src/dmu_data/plotting/tests/weights.yaml
@@ -1,7 +1,7 @@
1
1
  logzero
2
2
  PyYAML
3
3
  scipy
4
- awkward
4
+ awkward==2.4.6
5
5
  tqdm
6
6
  joblib
7
7
  scikit-learn
@@ -2,6 +2,7 @@
2
2
  Module holding cv_classifier class
3
3
  '''
4
4
 
5
+ from typing import Union
5
6
  from sklearn.ensemble import GradientBoostingClassifier
6
7
 
7
8
  from dmu.logging.log_store import LogStore
@@ -22,7 +23,7 @@ class CVClassifier(GradientBoostingClassifier):
22
23
  '''
23
24
  # pylint: disable = too-many-ancestors, abstract-method
24
25
  # ----------------------------------
25
- def __init__(self, cfg : dict | None = None):
26
+ def __init__(self, cfg : Union[dict,None] = None):
26
27
  '''
27
28
  cfg (dict) : Dictionary with configuration, specially the hyperparameters set in the `hyper` field
28
29
  '''
@@ -10,8 +10,8 @@ import tqdm
10
10
  from ROOT import RDataFrame
11
11
 
12
12
  import dmu.ml.utilities as ut
13
- import dmu.ml.cv_classifier as CVClassifier
14
13
 
14
+ from dmu.ml.cv_classifier import CVClassifier
15
15
  from dmu.logging.log_store import LogStore
16
16
 
17
17
  log = LogStore.add_logger('dmu:ml:cv_predict')
@@ -147,6 +147,7 @@ class CVPredict:
147
147
  arr_prb = self._predict_with_overlap(df_ft)
148
148
 
149
149
  arr_prb = self._patch_probabilities(arr_prb)
150
+ arr_prb = arr_prb.T[1]
150
151
 
151
152
  return arr_prb
152
153
  # ---------------------------------------