data-manipulation-utilities 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/PKG-INFO +113 -6
  2. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/README.md +111 -4
  3. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/pyproject.toml +2 -2
  4. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/PKG-INFO +113 -6
  5. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/SOURCES.txt +3 -0
  6. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/requires.txt +1 -1
  7. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/ml/cv_classifier.py +2 -1
  8. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/ml/cv_predict.py +50 -2
  9. data_manipulation_utilities-0.2.2/src/dmu/ml/train_mva.py +447 -0
  10. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/ml/utilities.py +8 -0
  11. data_manipulation_utilities-0.2.2/src/dmu/pdataframe/utilities.py +36 -0
  12. data_manipulation_utilities-0.2.2/src/dmu/plotting/matrix.py +157 -0
  13. data_manipulation_utilities-0.2.2/src/dmu/plotting/utilities.py +33 -0
  14. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/rdataframe/utilities.py +1 -1
  15. data_manipulation_utilities-0.2.2/src/dmu_data/ml/tests/train_mva.yaml +52 -0
  16. data_manipulation_utilities-0.2.0/src/dmu/ml/train_mva.py +0 -257
  17. data_manipulation_utilities-0.2.0/src/dmu_data/ml/tests/train_mva.yaml +0 -37
  18. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/setup.cfg +0 -0
  19. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/dependency_links.txt +0 -0
  20. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/entry_points.txt +0 -0
  21. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/top_level.txt +0 -0
  22. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/arrays/utilities.py +0 -0
  23. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/generic/utilities.py +0 -0
  24. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/logging/log_store.py +0 -0
  25. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/plotting/plotter.py +0 -0
  26. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/plotting/plotter_1d.py +0 -0
  27. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/plotting/plotter_2d.py +0 -0
  28. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/rdataframe/atr_mgr.py +0 -0
  29. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/rfile/rfprinter.py +0 -0
  30. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/rfile/utilities.py +0 -0
  31. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/fitter.py +0 -0
  32. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/function.py +0 -0
  33. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/gof_calculator.py +0 -0
  34. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/minimizers.py +0 -0
  35. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/model_factory.py +0 -0
  36. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/utilities.py +0 -0
  37. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/zfit_plotter.py +0 -0
  38. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/testing/utilities.py +0 -0
  39. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/text/transformer.py +0 -0
  40. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/__init__.py +0 -0
  41. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/2d.yaml +0 -0
  42. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/fig_size.yaml +0 -0
  43. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/high_stat.yaml +0 -0
  44. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/name.yaml +0 -0
  45. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/no_bounds.yaml +0 -0
  46. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/normalized.yaml +0 -0
  47. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/simple.yaml +0 -0
  48. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/title.yaml +0 -0
  49. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/weights.yaml +0 -0
  50. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/text/transform.toml +0 -0
  51. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/text/transform.txt +0 -0
  52. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/text/transform_set.toml +0 -0
  53. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/text/transform_set.txt +0 -0
  54. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/text/transform_trf.txt +0 -0
  55. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_scripts/git/publish +0 -0
  56. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_scripts/physics/check_truth.py +0 -0
  57. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_scripts/rfile/compare_root_files.py +0 -0
  58. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_scripts/rfile/print_trees.py +0 -0
  59. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_scripts/ssh/coned.py +0 -0
  60. {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_scripts/text/transform_text.py +0 -0
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/PKG-INFO

@@ -1,11 +1,11 @@
  Metadata-Version: 2.2
  Name: data_manipulation_utilities
- Version: 0.2.0
+ Version: 0.2.2
  Description-Content-Type: text/markdown
  Requires-Dist: logzero
  Requires-Dist: PyYAML
  Requires-Dist: scipy
- Requires-Dist: awkward==2.4.6
+ Requires-Dist: awkward
  Requires-Dist: tqdm
  Requires-Dist: joblib
  Requires-Dist: scikit-learn
@@ -423,9 +423,21 @@ obj.run()
  where the settings for the training go in a config dictionary, which when written to YAML looks like:

  ```yaml
+ dataset:
+     # Before training, new features can be defined as below
+     define :
+         x : v + w
+         y : v - w
+     # If a feature is NaN, replace its value with the number provided below,
+     # which will then be used in the training.
+     # Otherwise the entries with NaNs will be dropped.
+     nan:
+         x : 0
+         y : 0
+         z : -999
  training :
      nfold    : 10
-     features : [w, x, y, z]
+     features : [x, y, z]
      hyper    :
          loss          : log_loss
          n_estimators  : 100
@@ -433,8 +445,25 @@ training :
          learning_rate     : 0.1
          min_samples_split : 2
  saving:
+     # The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
      path : 'tests/ml/train_mva/model.pkl'
  plotting:
+     roc :
+         min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
+         max : [1.2, 1.2] # By default it goes from 0 to 1 in both axes
+         # The section below is optional and will annotate the ROC curve with
+         # values for the score at different signal efficiencies
+         annotate:
+             sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9] # Values of signal efficiency at which to show the scores
+             form    : '{:.2f}' # Use two decimals for scores
+             color   : 'green'  # Color for text and marker
+             xoff    : -15      # Offsets in X and Y
+             yoff    : -15
+             size    : 10       # Size of text
+     correlation: # Adds correlation matrix for training datasets
+         title      : 'Correlation matrix'
+         size       : [10, 10]
+         mask_value : 0 # Where correlation is zero, the bin will appear white
      val_dir : 'tests/ml/train_mva'
      features:
          saving:
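
For intuition, the `dataset` block in the hunk above amounts to the following preprocessing. This is a minimal pandas sketch of the documented semantics, not code from the package: the `preprocess` helper and the use of `DataFrame.eval` are illustrative assumptions, and the tool itself works on ROOT dataframes.

```python
import numpy as np
import pandas as pnd

def preprocess(df : pnd.DataFrame, cfg : dict) -> pnd.DataFrame:
    '''Hypothetical helper mimicking the dataset.define / dataset.nan semantics'''
    dataset = cfg.get('dataset', {})

    # 'define': build new features from expressions over existing columns
    for name, expr in dataset.get('define', {}).items():
        df[name] = df.eval(expr)           # e.g. x = v + w

    # 'nan': replace NaNs in the listed columns with the given value
    for name, value in dataset.get('nan', {}).items():
        df[name] = df[name].fillna(value)  # e.g. z = -999 wherever z was NaN

    # Anything still NaN is dropped before training
    return df.dropna()

df  = pnd.DataFrame({'v' : [1.0, 2.0], 'w' : [3.0, np.nan], 'z' : [np.nan, 5.0]})
cfg = {'dataset' : {'define' : {'x' : 'v + w', 'y' : 'v - w'}, 'nan' : {'z' : -999}}}
df  = preprocess(df, cfg)  # second row, with NaN in w (hence NaN x and y), is dropped
```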
@@ -468,7 +497,9 @@ When training on real data, several things might go wrong and the code will try
  will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
  entries will be removed before training.

- - **NaNs**: Entries with NaNs will break the training with the scikit GradientBoostingClassifier base class. Thus, we also remove them from the training.
+ - **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostingClassifier` base class. Thus, we:
+     - Can use the `nan` section shown above to replace `NaN` values with something else
+     - For whatever remains, we remove the entries from the training.

  ## Application

@@ -491,17 +522,55 @@ The picking process happens through the comparison of hashes between the samples
  The hashes of the training samples are stored in the pickled model itself, which is therefore a reimplementation of
  `GradientBoostingClassifier`, here called `CVClassifier`.

- If a sample exist, that was used in the training of _every_ model, no model can be chosen for the prediction and an
+ If a sample exists that was used in the training of _every_ model, no model can be chosen for the prediction and a
  `CVSameData` exception will be raised.

+ During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
+ from that configuration and the user does not need to define extra columns.
+
  ### Caveats

  When evaluating the model with real data, problems might occur; we deal with them as follows:

  - **Repeated entries**: When there are repeated features in the dataset to be evaluated we assign the same probabilities; no filtering is used.
- - **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
+ - **NaNs**: Entries with NaNs will break the evaluation. These entries will be:
+     - Replaced by other values before evaluation IF a replacement was specified during training. The training configuration is stored in the model
+       and can be accessed through:
+       ```python
+       model.cfg
+       ```
+     - Entries that are still NaN will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
  saved as -1. I.e. entries with NaNs will have probabilities of -1.

+ # Pandas dataframes
+
+ ## Utilities
+
+ These are thin layers of code that take pandas dataframes and carry out specific tasks.
+
+ ### Dataframe to LaTeX
+
+ One can save a dataframe to LaTeX with:
+
+ ```python
+ import pandas as pnd
+ import dmu.pdataframe.utilities as put
+
+ d_data = {}
+ d_data['a'] = [1,2,3]
+ d_data['b'] = [4,5,6]
+ df = pnd.DataFrame(d_data)
+
+ d_format = {
+     'a' : '{:.0f}',
+     'b' : '{:.3f}'}
+
+ put.df_to_tex(df,
+               './table.tex',
+               d_format = d_format,
+               caption  = 'some caption')
+ ```
+
  # Rdataframes

  These are utility functions meant to be used with ROOT dataframes.
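
The hash-based fold picking described above can be pictured roughly as follows. This is an illustrative sketch only; the hashing scheme and the names `row_hashes` and `pick_model` are assumptions, not the package's actual implementation.

```python
import hashlib
import pandas as pnd

def row_hashes(df : pnd.DataFrame) -> set[str]:
    '''One hash per row of features, standing in for the package's sample hashes'''
    return {hashlib.sha256(row.tobytes()).hexdigest() for row in df.to_numpy()}

def pick_model(l_train_hashes : list[set[str]], entry_hash : str) -> int:
    '''Return the index of the first fold whose training sample did not contain the entry'''
    for ifold, st_hash in enumerate(l_train_hashes):
        if entry_hash not in st_hash:
            return ifold

    # The entry was used to train every model: this is the CVSameData situation
    raise ValueError('Entry was used in the training of every model')
```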
@@ -653,6 +723,43 @@ axes:
      label : 'y'
  ```

+ # Other plots
+
+ ## Matrices
+
+ This can be done with `MatrixPlotter`, whose usage is illustrated below:
+
+ ```python
+ import numpy
+ import matplotlib.pyplot as plt
+
+ from dmu.plotting.matrix import MatrixPlotter
+
+ cfg = {
+     'labels'     : ['x', 'y', 'z'], # Used to label the matrix axes
+     'title'      : 'Some title',    # Optional, title of plot
+     'label_angle': 45,              # Labels will be rotated by 45 degrees
+     'upper'      : True,            # Useful in case this is a symmetric matrix
+     'zrange'     : [0, 10],         # Controls the z axis range
+     'size'       : [7, 7],          # Plot size
+     'format'     : '{:.3f}',        # Optional, if used will add numerical values to the contents, otherwise a color bar is used
+     'fontsize'   : 12,              # Font size associated to `format`
+     'mask_value' : 0,               # These values will appear white in the plot
+ }
+
+ mat = [
+     [1, 2, 3],
+     [2, 0, 4],
+     [3, 4, numpy.nan]
+ ]
+
+ mat = numpy.array(mat)
+
+ obj = MatrixPlotter(mat=mat, cfg=cfg)
+ obj.plot()
+ plt.show()
+ ```
+
  # Manipulating ROOT files

  ## Getting trees from file
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/README.md

Same content changes as the PKG-INFO hunks above; only the hunk offsets differ, shifted up by the 20-line metadata header (e.g. @@ -403,9 +403,21 @@ instead of @@ -423,9 +423,21 @@).
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/pyproject.toml

@@ -1,12 +1,12 @@
  [project]
  name         = 'data_manipulation_utilities'
- version      = '0.2.0'
+ version      = '0.2.2'
  readme       = 'README.md'
  dependencies = [
      'logzero',
      'PyYAML',
      'scipy',
-     'awkward==2.4.6',
+     'awkward',
      'tqdm',
      'joblib',
      'scikit-learn',
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/PKG-INFO

Identical to the PKG-INFO hunks shown at the top of this diff.
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/SOURCES.txt

@@ -13,9 +13,12 @@ src/dmu/ml/cv_classifier.py
  src/dmu/ml/cv_predict.py
  src/dmu/ml/train_mva.py
  src/dmu/ml/utilities.py
+ src/dmu/pdataframe/utilities.py
+ src/dmu/plotting/matrix.py
  src/dmu/plotting/plotter.py
  src/dmu/plotting/plotter_1d.py
  src/dmu/plotting/plotter_2d.py
+ src/dmu/plotting/utilities.py
  src/dmu/rdataframe/atr_mgr.py
  src/dmu/rdataframe/utilities.py
  src/dmu/rfile/rfprinter.py
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/requires.txt

@@ -1,7 +1,7 @@
  logzero
  PyYAML
  scipy
- awkward==2.4.6
+ awkward
  tqdm
  joblib
  scikit-learn
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/ml/cv_classifier.py

@@ -2,6 +2,7 @@
  Module holding cv_classifier class
  '''

+ from typing import Union
  from sklearn.ensemble import GradientBoostingClassifier

  from dmu.logging.log_store import LogStore

@@ -22,7 +23,7 @@ class CVClassifier(GradientBoostingClassifier):
      '''
      # pylint: disable = too-many-ancestors, abstract-method
      # ----------------------------------
-     def __init__(self, cfg : dict | None = None):
+     def __init__(self, cfg : Union[dict,None] = None):
          '''
          cfg (dict) : Dictionary with configuration, especially the hyperparameters set in the `hyper` field
          '''
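
The switch from `dict | None` to `Union[dict,None]` reads as a Python-version compatibility fix: the PEP 604 `X | Y` annotation is only supported from Python 3.10 onward, while `typing.Union` also works on older Python 3 interpreters. A minimal illustration, assuming that motivation:

```python
from typing import Union

# Works on older Python 3 versions as well:
def configure(cfg : Union[dict, None] = None) -> dict:
    return cfg if cfg is not None else {}

# The equivalent PEP 604 spelling below raises
# "TypeError: unsupported operand type(s) for |" at definition time
# on Python < 3.10 (unless `from __future__ import annotations` is active):
#
# def configure(cfg : dict | None = None) -> dict:
#     ...
```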
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/ml/cv_predict.py

@@ -32,11 +32,56 @@ class CVPredict:
          if rdf is None:
              raise ValueError('No ROOT dataframe passed')

-         self._l_model = models
-         self._rdf     = rdf
+         self._l_model   = models
+         self._rdf       = rdf
+         self._d_nan_rep : dict[str,float]

          self._arr_patch : numpy.ndarray
      # --------------------------------------------
+     def _initialize(self):
+         self._rdf       = self._define_columns(self._rdf)
+         self._d_nan_rep = self._get_nan_replacements()
+     # --------------------------------------------
+     def _define_columns(self, rdf : RDataFrame) -> RDataFrame:
+         cfg = self._l_model[0].cfg
+
+         if 'define' not in cfg['dataset']:
+             log.debug('No define section found in config, will not define extra columns')
+             return self._rdf
+
+         d_def = cfg['dataset']['define']
+         log.debug(60 * '-')
+         log.info('Defining columns in RDF before evaluating classifier')
+         log.debug(60 * '-')
+         for name, expr in d_def.items():
+             log.debug(f'{name:<20}{"<---":20}{expr:<100}')
+             rdf = rdf.Define(name, expr)
+
+         return rdf
+     # --------------------------------------------
+     def _get_nan_replacements(self) -> dict[str,float]:
+         cfg = self._l_model[0].cfg
+
+         if 'nan' not in cfg['dataset']:
+             log.debug('No nan section found in config, will not replace NaNs')
+             return {}
+
+         return cfg['dataset']['nan']
+     # --------------------------------------------
+     def _replace_nans(self, df : pnd.DataFrame) -> pnd.DataFrame:
+         if len(self._d_nan_rep) == 0:
+             log.debug('Not doing any NaN replacement')
+             return df
+
+         log.debug(60 * '-')
+         log.info('Doing NaN replacements')
+         log.debug(60 * '-')
+         for var, val in self._d_nan_rep.items():
+             log.debug(f'{var:<20}{"--->":20}{val:<20.3f}')
+             df[var] = df[var].fillna(val)
+
+         return df
+     # --------------------------------------------
      def _get_df(self):
          '''
          Will make ROOT rdf into dataframe and return it
@@ -45,6 +90,7 @@ class CVPredict:
          l_ft  = model.features
          d_data= self._rdf.AsNumpy(l_ft)
          df_ft = pnd.DataFrame(d_data)
+         df_ft = self._replace_nans(df_ft)
          df_ft = ut.patch_and_tag(df_ft)

          if 'patched_indices' in df_ft.attrs:
@@ -136,6 +182,8 @@ class CVPredict:
          '''
          Will return array of prediction probabilities for the signal category
          '''
+         self._initialize()
+
          df_ft = self._get_df()
          model = self._l_model[0]
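
Putting the cv_predict.py changes together, evaluation would look roughly like the sketch below. The constructor keywords follow the attributes shown in the hunks above, but the fold count, the file names, and the `predict` method name are illustrative assumptions.

```python
import joblib
from ROOT import RDataFrame

from dmu.ml.cv_predict import CVPredict

# One pickled CVClassifier per fold, named model_001.pkl ... model_010.pkl
# as described in the saving section of the training config
l_model = [ joblib.load(f'tests/ml/train_mva/model_{i:03d}.pkl') for i in range(1, 11) ]

rdf = RDataFrame('tree', 'data.root')

# Columns from cfg['dataset']['define'] are defined on the dataframe and the
# cfg['dataset']['nan'] replacements are applied before evaluation; entries
# that are still NaN are patched and come back with probability -1
obj     = CVPredict(models=l_model, rdf=rdf)
arr_prb = obj.predict()  # hypothetical method name, returning signal probabilities
```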