data-manipulation-utilities 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_manipulation_utilities-0.2.0.dist-info → data_manipulation_utilities-0.2.2.dist-info}/METADATA +113 -6
- {data_manipulation_utilities-0.2.0.dist-info → data_manipulation_utilities-0.2.2.dist-info}/RECORD +15 -12
- dmu/ml/cv_classifier.py +2 -1
- dmu/ml/cv_predict.py +50 -2
- dmu/ml/train_mva.py +216 -26
- dmu/ml/utilities.py +8 -0
- dmu/pdataframe/utilities.py +36 -0
- dmu/plotting/matrix.py +157 -0
- dmu/plotting/utilities.py +33 -0
- dmu/rdataframe/utilities.py +1 -1
- dmu_data/ml/tests/train_mva.yaml +30 -15
- {data_manipulation_utilities-0.2.0.data → data_manipulation_utilities-0.2.2.data}/scripts/publish +0 -0
- {data_manipulation_utilities-0.2.0.dist-info → data_manipulation_utilities-0.2.2.dist-info}/WHEEL +0 -0
- {data_manipulation_utilities-0.2.0.dist-info → data_manipulation_utilities-0.2.2.dist-info}/entry_points.txt +0 -0
- {data_manipulation_utilities-0.2.0.dist-info → data_manipulation_utilities-0.2.2.dist-info}/top_level.txt +0 -0
{data_manipulation_utilities-0.2.0.dist-info → data_manipulation_utilities-0.2.2.dist-info}/METADATA
RENAMED
@@ -1,11 +1,11 @@
 Metadata-Version: 2.2
 Name: data_manipulation_utilities
-Version: 0.2.0
+Version: 0.2.2
 Description-Content-Type: text/markdown
 Requires-Dist: logzero
 Requires-Dist: PyYAML
 Requires-Dist: scipy
-Requires-Dist: awkward
+Requires-Dist: awkward
 Requires-Dist: tqdm
 Requires-Dist: joblib
 Requires-Dist: scikit-learn
@@ -423,9 +423,21 @@ obj.run()
 where the settings for the training go in a config dictionary, which when written to YAML looks like:
 
 ```yaml
+dataset:
+    # Before training, new features can be defined as below
+    define :
+        x : v + w
+        y : v - w
+    # If the key is found to be NaN, replace its value with the number provided
+    # This will be used in the training.
+    # Otherwise the entries with NaNs will be dropped
+    nan:
+        x : 0
+        y : 0
+        z : -999
 training :
     nfold    : 10
-    features : [
+    features : [x, y, z]
     hyper    :
       loss              : log_loss
      n_estimators       : 100
@@ -433,8 +445,25 @@ training :
       learning_rate     : 0.1
       min_samples_split : 2
     saving:
+        # The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
         path : 'tests/ml/train_mva/model.pkl'
 plotting:
+    roc  :
+        min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
+        max : [1.2, 1.2] # By default it goes from 0 to 1 in both axes
+        # The section below is optional and will annotate the ROC curve with
+        # values for the score at different signal efficiencies
+        annotate:
+            sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9] # Values of signal efficiency at which to show the scores
+            form    : '{:.2f}' # Use two decimals for scores
+            color   : 'green'  # Color for text and marker
+            xoff    : -15      # Offsets in X and Y
+            yoff    : -15
+            size    : 10       # Size of text
+    correlation: # Adds correlation matrix for training datasets
+        title      : 'Correlation matrix'
+        size       : [10, 10]
+        mask_value : 0 # Where correlation is zero, the bin will appear white
     val_dir : 'tests/ml/train_mva'
     features:
         saving:
@@ -468,7 +497,9 @@ When training on real data, several things might go wrong and the code will try
 will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
 entries will be removed before training.
 
-- **NaNs**: Entries with NaNs will break the training with the scikit GradientBoostClassifier base class. Thus, we
+- **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
+    - Can use the `nan` section shown above to replace `NaN` values with something else
+    - For whatever remains we remove the entries from the training.
 
 ## Application
 
@@ -491,17 +522,56 @@ The picking process happens through the comparison of hashes between the samples
 The hashes of the training samples are stored in the pickled model itself; which therefore is a reimplementation of
 `GradientBoostClassifier`, here called `CVClassifier`.
 
-If a sample
+If a sample exists that was used in the training of _every_ model, no model can be chosen for the prediction and a
 `CVSameData` exception will be raised.
 
+During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
+from that configuration and the user does not need to define extra columns.
+
 ### Caveats
 
 When evaluating the model with real data, problems might occur; we deal with them as follows:
 
 - **Repeated entries**: When there are repeated features in the dataset to be evaluated we assign the same probabilities, no filtering is used.
-- **NaNs**: Entries with NaNs will break the evaluation. These entries will be
+- **NaNs**: Entries with NaNs will break the evaluation. These entries will be:
+    - Replaced by other values before evaluation IF a replacement was specified during training. The training configuration is stored in the model
+      and can be accessed through:
+      ```python
+      model.cfg
+      ```
+    - For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
      saved as -1. I.e. entries with NaNs will have probabilities of -1.
 
+# Pandas dataframes
+
+## Utilities
+
+These are thin layers of code that take pandas dataframes and carry out specific tasks.
+
+### Dataframe to latex
+
+One can save a dataframe to latex with:
+
+```python
+import pandas as pnd
+import dmu.pdataframe.utilities as put
+
+d_data = {}
+d_data['a'] = [1,2,3]
+d_data['b'] = [4,5,6]
+df = pnd.DataFrame(d_data)
+
+d_format = {
+    'a' : '{:.0f}',
+    'b' : '{:.3f}'}
+
+put.df_to_tex(df,
+              './table.tex',
+              d_format = d_format,
+              caption  = 'some caption')
+```
+
 # Rdataframes
 
 These are utility functions meant to be used with ROOT dataframes.
@@ -653,6 +723,43 @@ axes:
       label : 'y'
 ```
 
+# Other plots
+
+## Matrices
+
+This can be done with `MatrixPlotter`, whose usage is illustrated below:
+
+```python
+import numpy
+import matplotlib.pyplot as plt
+
+from dmu.plotting.matrix import MatrixPlotter
+
+cfg = {
+        'labels'     : ['x', 'y', 'z'], # Used to label the matrix axes
+        'title'      : 'Some title',    # Optional, title of plot
+        'label_angle': 45,              # Labels will be rotated by 45 degrees
+        'upper'      : True,            # Useful in case this is a symmetric matrix
+        'zrange'     : [0, 10],         # Controls the z axis range
+        'size'       : [7, 7],          # Plot size
+        'format'     : '{:.3f}',        # Optional, if used will add numerical values to the contents, otherwise a color bar is used
+        'fontsize'   : 12,              # Font size associated to `format`
+        'mask_value' : 0,               # These values will appear white in the plot
+        }
+
+mat = [
+        [1, 2, 3],
+        [2, 0, 4],
+        [3, 4, numpy.nan]
+        ]
+
+mat = numpy.array(mat)
+
+obj = MatrixPlotter(mat=mat, cfg=cfg)
+obj.plot()
+plt.show()
+```
+
 # Manipulating ROOT files
 
 ## Getting trees from file
{data_manipulation_utilities-0.2.0.dist-info → data_manipulation_utilities-0.2.2.dist-info}/RECORD
RENAMED
@@ -1,16 +1,19 @@
-data_manipulation_utilities-0.2.
+data_manipulation_utilities-0.2.2.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
 dmu/arrays/utilities.py,sha256=PKoYyybPptA2aU-V3KLnJXBudWxTXu4x1uGdIMQ49HY,1722
 dmu/generic/utilities.py,sha256=0Xnq9t35wuebAqKxbyAiMk1ISB7IcXK4cFH25MT1fgw,1741
 dmu/logging/log_store.py,sha256=umdvjNDuV3LdezbG26b0AiyTglbvkxST19CQu9QATbA,4184
-dmu/ml/cv_classifier.py,sha256=
-dmu/ml/cv_predict.py,sha256=
-dmu/ml/train_mva.py,sha256=
-dmu/ml/utilities.py,sha256=
+dmu/ml/cv_classifier.py,sha256=8Jwx6xMhJaRLktlRdq0tFl32v6t8i63KmpxrlnXlomU,3759
+dmu/ml/cv_predict.py,sha256=4G7F_1yOvnLftsDC6zUpdvkxuHXGkPemhj0RsYySYDM,6708
+dmu/ml/train_mva.py,sha256=SZ5cQHl7HBxn0c5Hh4HlN1aqMZaJUAlNmsfjnUSQrTg,16894
+dmu/ml/utilities.py,sha256=l348bufD95CuSYdIrHScQThIy2nKwGKXZn-FQg3CEwg,3930
+dmu/pdataframe/utilities.py,sha256=ypvLiFfJ82ga94qlW3t5dXnvEFwYOXnbtJb2zHwsbqk,987
+dmu/plotting/matrix.py,sha256=pXuUJn-LgOvrI9qGkZQw16BzLjOjeikYQ_ll2VIcIXU,4978
 dmu/plotting/plotter.py,sha256=ytMxtzHEY8ZFU0ZKEBE-ROjMszXl5kHTMnQnWe173nU,7208
 dmu/plotting/plotter_1d.py,sha256=g6H2xAgsL9a6vRkpbqHICb3qwV_qMiQPZxxw_oOSf9M,5115
 dmu/plotting/plotter_2d.py,sha256=J-gKnagoHGfJFU7HBrhDFpGYH5Rxy0_zF5l8eE_7ZHE,2944
+dmu/plotting/utilities.py,sha256=SI9dvtZq2gr-PXVz71KE4o0i09rZOKgqJKD1jzf6KXk,1167
 dmu/rdataframe/atr_mgr.py,sha256=FdhaQWVpsm4OOe1IRbm7rfrq8VenTNdORyI-lZ2Bs1M,2386
-dmu/rdataframe/utilities.py,sha256=
+dmu/rdataframe/utilities.py,sha256=MDY3u_y0s-ANvHAWRzGyeuuZUKoaqilfmb8mqlgfrVc,2771
 dmu/rfile/rfprinter.py,sha256=mp5jd-oCJAnuokbdmGyL9i6tK2lY72jEfROuBIZ_ums,3941
 dmu/rfile/utilities.py,sha256=XuYY7HuSBj46iSu3c60UYBHtI6KIPoJU_oofuhb-be0,945
 dmu/stats/fitter.py,sha256=vHNZ16U3apoQyeyM8evq-if49doF48sKB3q9wmA96Fw,18387
@@ -23,7 +26,7 @@ dmu/stats/zfit_plotter.py,sha256=Xs6kisNEmNQXhYRCcjowxO6xHuyAyrfyQIFhGAR61U4,197
 dmu/testing/utilities.py,sha256=WbMM4e9Cn3-B-12Vr64mB5qTKkV32joStlRkD-48lG0,3460
 dmu/text/transformer.py,sha256=4lrGknbAWRm0-rxbvgzOO-eR1-9bkYk61boJUEV3cQ0,6100
 dmu_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-dmu_data/ml/tests/train_mva.yaml,sha256=
+dmu_data/ml/tests/train_mva.yaml,sha256=k5H4Gu9Gj57B9iqabhcTQEFN674Cv_uJ2Xcumb02zF4,1279
 dmu_data/plotting/tests/2d.yaml,sha256=VApcAfJFbjNcjMCTBSRm2P37MQlGavMZv6msbZwLSgw,402
 dmu_data/plotting/tests/fig_size.yaml,sha256=7ROq49nwZ1A2EbPiySmu6n3G-Jq6YAOkc3d2X3YNZv0,294
 dmu_data/plotting/tests/high_stat.yaml,sha256=bLglBLCZK6ft0xMhQ5OltxE76cWsBMPMjO6GG0OkDr8,522
@@ -44,8 +47,8 @@ dmu_scripts/rfile/compare_root_files.py,sha256=T8lDnQxsRNMr37x1Y7YvWD8ySHrJOWZki
 dmu_scripts/rfile/print_trees.py,sha256=Ze4Ccl_iUldl4eVEDVnYBoe4amqBT1fSBR1zN5WSztk,941
 dmu_scripts/ssh/coned.py,sha256=lhilYNHWRCGxC-jtyJ3LQ4oUgWW33B2l1tYCcyHHsR0,4858
 dmu_scripts/text/transform_text.py,sha256=9akj1LB0HAyopOvkLjNOJiptZw5XoOQLe17SlcrGMD0,1456
-data_manipulation_utilities-0.2.
-data_manipulation_utilities-0.2.
-data_manipulation_utilities-0.2.
-data_manipulation_utilities-0.2.
-data_manipulation_utilities-0.2.
+data_manipulation_utilities-0.2.2.dist-info/METADATA,sha256=0QwhQmQML65qk2kaXf1znMZOVNuvaY3l35E7cXLRCZ8,27359
+data_manipulation_utilities-0.2.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+data_manipulation_utilities-0.2.2.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
+data_manipulation_utilities-0.2.2.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
+data_manipulation_utilities-0.2.2.dist-info/RECORD,,
dmu/ml/cv_classifier.py
CHANGED
@@ -2,6 +2,7 @@
 Module holding cv_classifier class
 '''
 
+from typing import Union
 from sklearn.ensemble import GradientBoostingClassifier
 
 from dmu.logging.log_store import LogStore
@@ -22,7 +23,7 @@ class CVClassifier(GradientBoostingClassifier):
     '''
     # pylint: disable = too-many-ancestors, abstract-method
     # ----------------------------------
-    def __init__(self, cfg : dict
+    def __init__(self, cfg : Union[dict,None] = None):
         '''
         cfg (dict) : Dictionary with configuration, specially the hyperparameters set in the `hyper` field
         '''
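The change above makes the configuration argument optional. A minimal sketch of what that allows; the dictionary layout is only illustrative, since the exact keys `CVClassifier` reads from `cfg` (beyond the `hyper` field named in its docstring) are not visible in this diff:

```python
from dmu.ml.cv_classifier import CVClassifier

# TrainMva passes its full configuration dictionary, so the same is done here;
# which keys CVClassifier actually reads is an assumption
cfg   = {'training' : {'hyper' : {'loss' : 'log_loss', 'n_estimators' : 100}}}
model = CVClassifier(cfg=cfg)

# New in 0.2.2: cfg defaults to None, so a bare instantiation is now possible
model_default = CVClassifier()
```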
dmu/ml/cv_predict.py
CHANGED
@@ -32,11 +32,56 @@ class CVPredict:
         if rdf is None:
             raise ValueError('No ROOT dataframe passed')
 
-        self._l_model
-        self._rdf
+        self._l_model = models
+        self._rdf     = rdf
+        self._d_nan_rep : dict[str,str]
 
         self._arr_patch : numpy.ndarray
     # --------------------------------------------
+    def _initialize(self):
+        self._rdf       = self._define_columns(self._rdf)
+        self._d_nan_rep = self._get_nan_replacements()
+    # --------------------------------------------
+    def _define_columns(self, rdf : RDataFrame) -> RDataFrame:
+        cfg = self._l_model[0].cfg
+
+        if 'define' not in cfg['dataset']:
+            log.debug('No define section found in config, will not define extra columns')
+            return self._rdf
+
+        d_def = cfg['dataset']['define']
+        log.debug(60 * '-')
+        log.info('Defining columns in RDF before evaluating classifier')
+        log.debug(60 * '-')
+        for name, expr in d_def.items():
+            log.debug(f'{name:<20}{"<---":20}{expr:<100}')
+            rdf = rdf.Define(name, expr)
+
+        return rdf
+    # --------------------------------------------
+    def _get_nan_replacements(self) -> dict[str,str]:
+        cfg = self._l_model[0].cfg
+
+        if 'nan' not in cfg['dataset']:
+            log.debug('No define section found in config, will not define extra columns')
+            return {}
+
+        return cfg['dataset']['nan']
+    # --------------------------------------------
+    def _replace_nans(self, df : pnd.DataFrame) -> pnd.DataFrame:
+        if len(self._d_nan_rep) == 0:
+            log.debug('Not doing any NaN replacement')
+            return df
+
+        log.debug(60 * '-')
+        log.info('Doing NaN replacements')
+        log.debug(60 * '-')
+        for var, val in self._d_nan_rep.items():
+            log.debug(f'{var:<20}{"--->":20}{val:<20.3f}')
+            df[var] = df[var].fillna(val)
+
+        return df
+    # --------------------------------------------
     def _get_df(self):
         '''
         Will make ROOT rdf into dataframe and return it
@@ -45,6 +90,7 @@ class CVPredict:
         l_ft  = model.features
         d_data= self._rdf.AsNumpy(l_ft)
         df_ft = pnd.DataFrame(d_data)
+        df_ft = self._replace_nans(df_ft)
         df_ft = ut.patch_and_tag(df_ft)
 
         if 'patched_indices' in df_ft.attrs:
@@ -136,6 +182,8 @@ class CVPredict:
         '''
         Will return array of prediction probabilities for the signal category
         '''
+        self._initialize()
+
         df_ft = self._get_df()
         model = self._l_model[0]
 
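The new `_initialize`, `_define_columns` and `_replace_nans` steps are driven entirely by the configuration stored in the trained models. A hedged sketch of how this looks from the caller's side; the model file names follow the `model_001.pkl` convention documented in the METADATA above, and the name of the public prediction method (`predict` here) is an assumption, not something shown in this diff:

```python
import joblib
from ROOT import RDataFrame

from dmu.ml.cv_predict import CVPredict

# Load one model per fold; fold numbering starting at 001 is assumed
l_model = [ joblib.load(f'tests/ml/train_mva/model_{ifold:03}.pkl') for ifold in range(1, 11) ]
rdf     = RDataFrame('tree', 'data.root')   # hypothetical input sample

cvp = CVPredict(models=l_model, rdf=rdf)

# Columns listed under dataset/define in l_model[0].cfg are added to the RDF,
# values under dataset/nan are used to fill NaNs, and anything still NaN is
# patched with zeros and returned with probability -1
arr_prb = cvp.predict()   # assumed method name
```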
dmu/ml/train_mva.py
CHANGED
@@ -1,8 +1,10 @@
 '''
 Module with TrainMva class
 '''
+# pylint: disable = too-many-locals
+# pylint: disable = too-many-arguments, too-many-positional-arguments
+
 import os
-from typing import Union
 
 import joblib
 import pandas as pnd
@@ -14,12 +16,17 @@ from sklearn.model_selection import StratifiedKFold
 
 from ROOT import RDataFrame
 
-import dmu.ml.utilities
+import dmu.ml.utilities         as ut
+import dmu.pdataframe.utilities as put
+import dmu.plotting.utilities   as plu
+
 from dmu.ml.cv_classifier import CVClassifier as cls
 from dmu.plotting.plotter_1d import Plotter1D as Plotter
+from dmu.plotting.matrix import MatrixPlotter
 from dmu.logging.log_store import LogStore
 
-
+npa = numpy.ndarray
+log = LogStore.add_logger('dmu:ml:train_mva')
 # ---------------------------------------------
 class TrainMva:
     '''
@@ -43,15 +50,13 @@ class TrainMva:
 
         self._rdf_bkg = bkg
         self._rdf_sig = sig
-        self._cfg = cfg
-
-        self._l_model : cls
+        self._cfg     = cfg
 
         self._l_ft_name = self._cfg['training']['features']
 
         self._df_ft, self._l_lab = self._get_inputs()
     # ---------------------------------------------
-    def _get_inputs(self) -> tuple[pnd.DataFrame,
+    def _get_inputs(self) -> tuple[pnd.DataFrame, npa]:
         log.info('Getting signal')
         df_sig, arr_lab_sig = self._get_sample_inputs(self._rdf_sig, label = 1)
 
@@ -63,15 +68,28 @@ class TrainMva:
 
         return df, arr_lab
     # ---------------------------------------------
-    def
+    def _pre_process_nans(self, df : pnd.DataFrame) -> pnd.DataFrame:
+        if 'nan' not in self._cfg['dataset']:
+            log.debug('dataset/nan section not found, not pre-processing NaNs')
+            return df
+
+        d_name_val = self._cfg['dataset']['nan']
+        for name, val in d_name_val.items():
+            log.debug(f'{val:<20}{"<---":<10}{name:<100}')
+            df[name] = df[name].fillna(val)
+
+        return df
+    # ---------------------------------------------
+    def _get_sample_inputs(self, rdf : RDataFrame, label : int) -> tuple[pnd.DataFrame, npa]:
         d_ft = rdf.AsNumpy(self._l_ft_name)
         df   = pnd.DataFrame(d_ft)
+        df   = self._pre_process_nans(df)
         df   = ut.cleanup(df)
         l_lab= len(df) * [label]
 
         return df, numpy.array(l_lab)
     # ---------------------------------------------
-    def _get_model(self, arr_index :
+    def _get_model(self, arr_index : npa) -> cls:
         model = cls(cfg = self._cfg)
         df_ft = self._df_ft.iloc[arr_index]
         l_lab = self._l_lab[arr_index]
@@ -84,7 +102,6 @@ class TrainMva:
         return model
     # ---------------------------------------------
     def _get_models(self):
-        # pylint: disable = too-many-locals
         '''
         Will create models, train them and return them
         '''
@@ -105,15 +122,55 @@ class TrainMva:
             arr_sig_sig_tr, arr_sig_bkg_tr, arr_sig_all_tr, arr_lab_tr = self._get_scores(model, arr_itr, on_training_ok= True)
             arr_sig_sig_ts, arr_sig_bkg_ts, arr_sig_all_ts, arr_lab_ts = self._get_scores(model, arr_its, on_training_ok=False)
 
+            self._save_feature_importance(model, ifold)
+            self._plot_correlation(arr_itr, ifold)
             self._plot_scores(arr_sig_sig_tr, arr_sig_sig_ts, arr_sig_bkg_tr, arr_sig_bkg_ts, ifold)
-
             self._plot_roc(arr_lab_ts, arr_sig_all_ts, arr_lab_tr, arr_sig_all_tr, ifold)
 
             ifold+=1
 
         return l_model
     # ---------------------------------------------
-    def
+    def _labels_from_varnames(self, l_var_name : list[str]) -> list[str]:
+        try:
+            d_plot = self._cfg['plotting']['features']['plots']
+        except ValueError:
+            log.warning('Cannot find plotting/features/plots section in config, using dataframe names')
+            return l_var_name
+
+        l_label = []
+        for var_name in l_var_name:
+            if var_name not in d_plot:
+                log.warning(f'No plot found for: {var_name}')
+                l_label.append(var_name)
+                continue
+
+            d_setting = d_plot[var_name]
+            [xlab, _ ]= d_setting['labels']
+
+            l_label.append(xlab)
+
+        return l_label
+    # ---------------------------------------------
+    def _save_feature_importance(self, model : cls, ifold : int) -> None:
+        l_var_name = self._df_ft.columns.tolist()
+
+        d_data = {}
+        d_data['Variable'  ] = self._labels_from_varnames(l_var_name)
+        d_data['Importance'] = 100 * model.feature_importances_
+
+        val_dir = self._cfg['plotting']['val_dir']
+        val_dir = f'{val_dir}/fold_{ifold:03}'
+        os.makedirs(val_dir, exist_ok=True)
+
+        df = pnd.DataFrame(d_data)
+        df = df.sort_values(by='Importance', ascending=False)
+
+        table_path = f'{val_dir}/importance.tex'
+        d_form     = {'Variable' : '{}', 'Importance' : '{:.1f}'}
+        put.df_to_tex(df, table_path, d_format = d_form)
+    # ---------------------------------------------
+    def _get_scores(self, model : cls, arr_index : npa, on_training_ok : bool) -> tuple[npa, npa, npa, npa]:
         '''
         Returns a tuple of four arrays
 
@@ -136,7 +193,7 @@ class TrainMva:
 
         return arr_sig, arr_bkg, arr_all, arr_lab
     # ---------------------------------------------
-    def _split_scores(self, arr_prob :
+    def _split_scores(self, arr_prob : npa, arr_label : npa) -> tuple[npa, npa]:
         '''
         Will split the testing scores (predictions) based on the training scores
 
@@ -151,7 +208,7 @@ class TrainMva:
 
         return arr_sig, arr_bkg
     # ---------------------------------------------
-    def _save_model(self, model, ifold):
+    def _save_model(self, model : cls, ifold : int) -> None:
         '''
         Saves a model, associated to a specific fold
         '''
@@ -168,6 +225,53 @@ class TrainMva:
         log.info(f'Saving model to: {model_path}')
         joblib.dump(model, model_path)
     # ---------------------------------------------
+    def _get_correlation_cfg(self, df : pnd.DataFrame, ifold : int) -> dict:
+        l_var_name = df.columns.tolist()
+        l_label    = self._labels_from_varnames(l_var_name)
+        cfg = {
+                'labels'     : l_label,
+                'title'      : f'Fold {ifold}',
+                'label_angle': 45,
+                'upper'      : True,
+                'zrange'     : [-1, +1],
+                'size'       : [7, 7],
+                'format'     : '{:.3f}',
+                'fontsize'   : 12,
+                }
+
+        if 'correlation' not in self._cfg['plotting']:
+            log.info('Using default correlation plotting configuration')
+            return cfg
+
+        log.debug('Updating correlation plotting configuration')
+        custom = self._cfg['plotting']['correlation']
+        cfg.update(custom)
+
+        return cfg
+    # ---------------------------------------------
+    def _plot_correlation(self, arr_index : npa, ifold : int) -> None:
+        df_ft = self._df_ft.iloc[arr_index]
+        cfg   = self._get_correlation_cfg(df_ft, ifold)
+        cov   = df_ft.corr()
+        mat   = cov.to_numpy()
+
+        log.debug(f'Plotting correlation for {ifold} fold')
+
+        val_dir = self._cfg['plotting']['val_dir']
+        val_dir = f'{val_dir}/fold_{ifold:03}'
+        os.makedirs(val_dir, exist_ok=True)
+
+        obj = MatrixPlotter(mat=mat, cfg=cfg)
+        obj.plot()
+        plt.savefig(f'{val_dir}/covariance.png')
+        plt.close()
+    # ---------------------------------------------
+    def _get_nentries(self, arr_val : npa) -> str:
+        size = len(arr_val)
+        size = size / 1000.
+
+        return f'{size:.2f}K'
+    # ---------------------------------------------
     def _plot_scores(self, arr_sig_trn, arr_sig_tst, arr_bkg_trn, arr_bkg_tst, ifold):
         # pylint: disable = too-many-arguments, too-many-positional-arguments
         '''
@@ -183,11 +287,11 @@ class TrainMva:
         val_dir = f'{val_dir}/fold_{ifold:03}'
         os.makedirs(val_dir, exist_ok=True)
 
-        plt.hist(arr_sig_trn, alpha = 0.3, bins=50, range=(0,1), color='b', density=True, label='Signal Train')
-        plt.hist(arr_sig_tst, histtype='step', bins=50, range=(0,1), color='b', density=True, label='Signal Test')
+        plt.hist(arr_sig_trn, alpha = 0.3, bins=50, range=(0,1), color='b', density=True, label='Signal Train: ' + self._get_nentries(arr_sig_trn))
+        plt.hist(arr_sig_tst, histtype='step', bins=50, range=(0,1), color='b', density=True, label='Signal Test: ' + self._get_nentries(arr_sig_tst))
 
-        plt.hist(arr_bkg_trn, alpha = 0.3, bins=50, range=(0,1), color='r', density=True, label='Background Train')
-        plt.hist(arr_bkg_tst, histtype='step', bins=50, range=(0,1), color='r', density=True, label='Background Test')
+        plt.hist(arr_bkg_trn, alpha = 0.3, bins=50, range=(0,1), color='r', density=True, label='Background Train: '+ self._get_nentries(arr_bkg_trn))
+        plt.hist(arr_bkg_tst, histtype='step', bins=50, range=(0,1), color='r', density=True, label='Background Test: ' + self._get_nentries(arr_bkg_tst))
 
         plt.legend()
         plt.title(f'Fold: {ifold}')
@@ -197,16 +301,15 @@ class TrainMva:
         plt.close()
     # ---------------------------------------------
     def _plot_roc(self,
-                  l_lab_ts :
-                  l_prb_ts :
-                  l_lab_tr :
-                  l_prb_tr :
+                  l_lab_ts : npa,
+                  l_prb_ts : npa,
+                  l_lab_tr : npa,
+                  l_prb_tr : npa,
                   ifold : int):
         '''
         Takes the labels and the probabilities and plots ROC
         curve for given fold
         '''
-        # pylint: disable = too-many-arguments, too-many-positional-arguments
         log.debug(f'Plotting ROC curve for {ifold} fold')
 
         val_dir = self._cfg['plotting']['val_dir']
@@ -226,17 +329,70 @@ class TrainMva:
         if 'min' in self._cfg['plotting']['roc']:
             [min_x, min_y] = self._cfg['plotting']['roc']['min']
 
+        max_x = 1
+        max_y = 1
+        if 'max' in self._cfg['plotting']['roc']:
+            [max_x, max_y] = self._cfg['plotting']['roc']['max']
+
         plt.plot(xval_ts, yval_ts, color='b', label=f'Test: {area_ts:.3f}')
         plt.plot(xval_tr, yval_tr, color='r', label=f'Train: {area_tr:.3f}')
+        self._plot_probabilities(xval_ts, yval_ts, l_prb_ts, l_lab_ts)
+
         plt.xlabel('Signal efficiency')
-        plt.ylabel('Background
+        plt.ylabel('Background rejection')
         plt.title(f'Fold: {ifold}')
-        plt.xlim(min_x,
-        plt.ylim(min_y,
+        plt.xlim(min_x, max_x)
+        plt.ylim(min_y, max_y)
+        plt.grid()
         plt.legend()
         plt.savefig(f'{val_dir}/roc.png')
         plt.close()
     # ---------------------------------------------
+    def _plot_probabilities(self,
+                            arr_seff: npa,
+                            arr_brej: npa,
+                            arr_sprb: npa,
+                            arr_labl: npa) -> None:
+
+        roc_cfg = self._cfg['plotting']['roc']
+        if 'annotate' not in roc_cfg:
+            log.debug('Annotation section in the ROC curve config not found, skipping annotation')
+            return
+
+        l_sprb   = [ sprb for sprb, labl in zip(arr_sprb, arr_labl) if labl == 1 ]
+        arr_sprb = numpy.array(l_sprb)
+
+        plt_cfg = roc_cfg['annotate']
+        if 'sig_eff' not in plt_cfg:
+            l_seff_target = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
+        else:
+            l_seff_target = plt_cfg['sig_eff']
+            del plt_cfg['sig_eff']
+
+        arr_seff_target = numpy.array(l_seff_target)
+        arr_quantile    = 1 - arr_seff_target
+
+        l_score = numpy.quantile(arr_sprb, arr_quantile)
+        l_seff  = []
+        l_brej  = []
+
+        log.debug(60 * '-')
+        log.debug(f'{"SigEff":20}{"BkgRej":20}{"Score":20}')
+        log.debug(60 * '-')
+        for seff_target, score in zip(arr_seff_target, l_score):
+            arr_diff = numpy.abs(arr_seff - seff_target)
+            ind      = numpy.argmin(arr_diff)
+
+            seff = arr_seff[ind]
+            brej = arr_brej[ind]
+
+            log.debug(f'{seff:<20.3f}{brej:<20.3f}{score:<20.2f}')
+
+            l_seff.append(seff)
+            l_brej.append(brej)
+
+        plu.annotate(l_x=l_seff, l_y=l_brej, l_v=l_score, **plt_cfg)
+    # ---------------------------------------------
     def _plot_features(self):
         '''
         Will plot the features, based on the settings in the config
@@ -245,10 +401,44 @@ class TrainMva:
         ptr = Plotter(d_rdf = {'Signal' : self._rdf_sig, 'Background' : self._rdf_bkg}, cfg=d_cfg)
         ptr.run()
     # ---------------------------------------------
+    def _save_settings_to_tex(self) -> None:
+        self._save_nan_conversion()
+        self._save_hyperparameters_to_tex()
+    # ---------------------------------------------
+    def _save_nan_conversion(self) -> None:
+        if 'nan' not in self._cfg['dataset']:
+            log.debug('NaN section not found, not saving it')
+            return
+
+        d_nan = self._cfg['dataset']['nan']
+        l_var = list(d_nan)
+        l_lab = self._labels_from_varnames(l_var)
+        l_val = list(d_nan.values())
+
+        d_tex   = {'Variable' : l_lab, 'Replacement' : l_val}
+        df      = pnd.DataFrame(d_tex)
+        val_dir = self._cfg['plotting']['val_dir']
+        os.makedirs(val_dir, exist_ok=True)
+        put.df_to_tex(df, f'{val_dir}/nan_replacement.tex')
+    # ---------------------------------------------
+    def _save_hyperparameters_to_tex(self) -> None:
+        if 'hyper' not in self._cfg['training']:
+            raise ValueError('Cannot find hyper parameters in configuration')
+
+        d_hyper = self._cfg['training']['hyper']
+        d_form  = { f'\\verb|{key}|' : f'\\verb|{val}|' for key, val in d_hyper.items() }
+        d_latex = { 'Hyperparameter' : list(d_form.keys()), 'Value' : list(d_form.values())}
+
+        df      = pnd.DataFrame(d_latex)
+        val_dir = self._cfg['plotting']['val_dir']
+        os.makedirs(val_dir, exist_ok=True)
+        put.df_to_tex(df, f'{val_dir}/hyperparameters.tex')
+    # ---------------------------------------------
     def run(self):
         '''
         Will do the training
         '''
+        self._save_settings_to_tex()
         self._plot_features()
 
         l_mod = self._get_models()
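For orientation, a sketch of how the new `dataset` and `plotting` sections are consumed end to end. The keyword names `sig`, `bkg` and `cfg` mirror the attributes set in the constructor above, but their order and exact spelling in the public signature are assumptions:

```python
import yaml
from ROOT import RDataFrame

from dmu.ml.train_mva import TrainMva

# The test configuration shipped with the package; any config with the same layout works
with open('dmu_data/ml/tests/train_mva.yaml', encoding='utf-8') as ifile:
    cfg = yaml.safe_load(ifile)

rdf_sig = RDataFrame('tree', 'signal.root')       # hypothetical inputs
rdf_bkg = RDataFrame('tree', 'background.root')

obj = TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
obj.run()
# Per fold, run() now also writes importance.tex and covariance.png (the correlation matrix),
# plus nan_replacement.tex and hyperparameters.tex in the validation directory
```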
dmu/ml/utilities.py
CHANGED
@@ -51,6 +51,14 @@ def _remove_nans(df : pnd.DataFrame) -> pnd.DataFrame:
         log.debug('No NaNs found in dataframe')
         return df
 
+    sr_is_nan = df.isna().any()
+    l_na_name = sr_is_nan[sr_is_nan].index.tolist()
+
+    log.info('Found columns with NaNs')
+    for name in l_na_name:
+        nan_count = df[name].isna().sum()
+        log.info(f'{nan_count:<10}{name:<100}')
+
     ninit = len(df)
     df    = df.dropna()
     nfinl = len(df)
dmu/pdataframe/utilities.py
ADDED
@@ -0,0 +1,36 @@
+'''
+Module containing utilities for pandas dataframes
+'''
+import os
+import pandas as pnd
+
+from dmu.logging.log_store import LogStore
+
+log=LogStore.add_logger('dmu:pdataframe:utilities')
+
+# -------------------------------------
+def df_to_tex(df : pnd.DataFrame, path : str, hide_index : bool = True, d_format : dict[str,str]=None, caption : str =None) -> None:
+    '''
+    Saves pandas dataframe to latex
+
+    Parameters
+    -------------
+    d_format (dict) : Dictionary specifying the formatting of the table, e.g. `{'col1': '{}', 'col2': '{:.3f}', 'col3' : '{:.3f}'}`
+    '''
+
+    if path is not None:
+        dir_name = os.path.dirname(path)
+        os.makedirs(dir_name, exist_ok=True)
+
+    st = df.style
+    if hide_index:
+        st=st.hide(axis='index')
+
+    if d_format is not None:
+        st=st.format(formatter=d_format)
+
+    log.info(f'Saving to: {path}')
+    buf = st.to_latex(buf=path, caption=caption, hrules=True)
+
+    return buf
+# -------------------------------------
dmu/plotting/matrix.py
ADDED
@@ -0,0 +1,157 @@
+'''
+Module holding the MatrixPlotter class
+'''
+from typing import Annotated
+import numpy
+import numpy.typing as npt
+import matplotlib.pyplot as plt
+
+from dmu.logging.log_store import LogStore
+
+Array2D = Annotated[npt.NDArray[numpy.float64], '(n,n)']
+log     = LogStore.add_logger('dmu:plotting:matrix')
+#-------------------------------------------------------
+class MatrixPlotter:
+    '''
+    Class used to plot matrices
+    '''
+    # -----------------------------------------------
+    def __init__(self, mat : Array2D, cfg : dict):
+        self._mat = mat
+        self._cfg = cfg
+
+        self._size    : int
+        self._l_label : list[str]
+    # -----------------------------------------------
+    def _initialize(self) -> None:
+        self._check_matrix()
+        self._reformat_matrix()
+        self._set_labels()
+        self._mask_matrix()
+    # -----------------------------------------------
+    def _mask_matrix(self) -> None:
+        if 'mask_value' not in self._cfg:
+            return
+
+        mask_val = self._cfg['mask_value']
+        log.debug(f'Masking value: {mask_val}')
+
+        self._mat = numpy.ma.masked_where(self._mat == mask_val, self._mat)
+    # -----------------------------------------------
+    def _check_matrix(self) -> None:
+        a, b = self._mat.shape
+
+        if a != b:
+            raise ValueError(f'Matrix is not square, but with shape: {a}x{b}')
+
+        self._size = a
+    # -----------------------------------------------
+    def _set_labels(self) -> None:
+        if 'labels' not in self._cfg:
+            raise ValueError('Labels entry missing')
+
+        l_lab = self._cfg['labels']
+        nlab  = len(l_lab)
+
+        if nlab != self._size:
+            raise ValueError(f'Number of labels is not equal to its size: {nlab}!={self._size}')
+
+        self._l_label = l_lab
+    # -----------------------------------------------
+    def _reformat_matrix(self) -> None:
+        if 'upper' not in self._cfg:
+            log.debug('Drawing full matrix')
+            return
+
+        upper = self._cfg['upper']
+        if upper not in [True, False]:
+            raise ValueError(f'Invalid value for upper setting: {upper}')
+
+        if upper:
+            log.debug('Drawing upper matrix')
+            self._mat = numpy.triu(self._mat, 0)
+            return
+
+        if not upper:
+            log.debug('Drawing lower matrix')
+            self._mat = numpy.triu(self._mat, 0)
+            return
+    # -----------------------------------------------
+    def _set_axes(self, ax) -> None:
+        ax.set_xticks(numpy.arange(self._size))
+        ax.set_yticks(numpy.arange(self._size))
+
+        ax.set_xticklabels(self._l_label)
+        ax.set_yticklabels(self._l_label)
+
+        rotation = 45
+        if 'label_angle' in self._cfg:
+            rotation = self._cfg['label_angle']
+
+        plt.setp(ax.get_xticklabels(), rotation=rotation, ha="right", rotation_mode="anchor")
+    # -----------------------------------------------
+    def _draw_matrix(self) -> None:
+        fsize = None
+        if 'size' in self._cfg:
+            fsize = self._cfg['size']
+
+        if 'zrange' not in self._cfg:
+            raise ValueError('z range not found in configuration')
+
+        [zmin, zmax] = self._cfg['zrange']
+
+        fig, ax = plt.subplots() if fsize is None else plt.subplots(figsize=fsize)
+
+        palette = plt.cm.viridis
+        im      = ax.imshow(self._mat, cmap=palette, vmin=zmin, vmax=zmax)
+        self._set_axes(ax)
+
+        if 'format' in self._cfg:
+            self._add_text(ax)
+        else:
+            log.debug('Not adding values to matrix but bar')
+            fig.colorbar(im)
+
+        if 'title' not in self._cfg:
+            return
+
+        title = self._cfg['title']
+        ax.set_title(title)
+        fig.tight_layout()
+    # -----------------------------------------------
+    def _add_text(self, ax):
+        fontsize = 12
+        if 'fontsize' in self._cfg:
+            fontsize = self._cfg['fontsize']
+
+        form = self._cfg['format']
+        log.debug(f'Adding values with format {form}')
+
+        for i_x, _ in enumerate(self._l_label):
+            for i_y, _ in enumerate(self._l_label):
+                try:
+                    val = self._mat[i_y, i_x]
+                except:
+                    log.error(f'Cannot access ({i_x}, {i_y}) in:')
+                    print(self._mat)
+                    raise
+
+                if numpy.ma.is_masked(val):
+                    text = ''
+                else:
+                    text = form.format(val)
+
+                _ = ax.text(i_x, i_y, text, ha="center", va="center", fontsize=fontsize, color="k")
+    # -----------------------------------------------
+    def plot(self):
+        '''
+        Runs plotting, plot can be accessed through:
+
+        ```python
+        plt.show()
+        plt.savefig(...)
+        ```
+        '''
+        self._initialize()
+        self._draw_matrix()
+#-------------------------------------------------------
dmu/plotting/utilities.py
ADDED
@@ -0,0 +1,33 @@
+'''
+Module with plotting utilities
+'''
+# pylint: disable=too-many-positional-arguments, too-many-arguments
+
+import matplotlib.pyplot as plt
+
+# ---------------------------------------------------------------------------
+def annotate(
+        l_x   : list[float],
+        l_y   : list[float],
+        l_v   : list[float],
+        form  : str = '{}',
+        xoff  : int = 0,
+        yoff  : int =-20,
+        size  : int = 20,
+        color : str = 'black') -> None:
+    '''
+    Function used to annotate plots
+
+    l_x(y)  : List of x(y) coordinates for markers
+    l_v     : List of numerical values to annotate markers
+    form    : Formatting, e.g. {:.3f}
+    color   : String with color for markers and annotation, e.g. black
+    size    : Font size, default 20
+    x(y)off : Offset in x(y).
+    '''
+    for x, y, v in zip(l_x, l_y, l_v):
+        label = form.format(v)
+
+        plt.plot(x, y, marker='o', markersize= 5, markeredgecolor=color, markerfacecolor=color)
+        plt.annotate(label, (x,y), fontsize=size, textcoords="offset points", xytext=(xoff, yoff), color=color, ha='center')
+# ---------------------------------------------------------------------------
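Since the README does not show `annotate` being called directly, here is a small usage sketch; the coordinates and values are made up, while the keyword arguments match the signature above:

```python
import matplotlib.pyplot as plt

import dmu.plotting.utilities as plu

# Hypothetical working points: x/y positions of the markers and the value written next to each
l_x = [0.5, 0.7, 0.9]
l_y = [0.98, 0.93, 0.80]
l_v = [0.86, 0.71, 0.42]

plt.plot([0.0, 1.0], [1.0, 0.0], color='gray')   # some curve to annotate
plu.annotate(l_x=l_x, l_y=l_y, l_v=l_v, form='{:.2f}', color='green', xoff=-15, yoff=-15, size=10)
plt.savefig('annotated.png')
```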
dmu/rdataframe/utilities.py
CHANGED
dmu_data/ml/tests/train_mva.yaml
CHANGED
@@ -1,5 +1,8 @@
+dataset:
+    nan :
+        x : 0
 training :
-    nfold    : 3
+    nfold    : 3
     features : [x, y, z]
     rdm_stat : 1
     hyper    :
@@ -7,31 +10,43 @@ training :
       n_estimators  : 100
      max_depth      : 3
      learning_rate  : 0.1
-      min_samples_split : 2
+      min_samples_split : 2
     saving:
-        path : '
+        path : '/tmp/dmu/ml/tests/train_mva/model.pkl'
 plotting:
     roc  :
-        min : [0, 0]
-
+        min : [0.0, 0.0]
+        max : [1.2, 1.2]
+        annotate:
+          sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9]
+          form : '{:.2f}'
+          color: 'green'
+          xoff : -15
+          yoff : -15
+          size : 10
+    correlation:
+        title      : 'Correlation matrix'
+        size       : [10, 10]
+        mask_value : 0
+    val_dir : '/tmp/dmu/ml/tests/train_mva'
     features:
         saving:
-            plt_dir : '
+            plt_dir : '/tmp/dmu/ml/tests/train_mva/features'
        plots:
-            w :
+            w :
                binning : [-4, 4, 100]
-                yscale  : 'linear'
+                yscale  : 'linear'
                labels  : ['w', '']
-            x :
+            x :
                binning : [-4, 4, 100]
-                yscale  : 'linear'
+                yscale  : 'linear'
                labels  : ['x', '']
-            y :
+            y :
                binning : [-4, 4, 100]
-                yscale  : 'linear'
+                yscale  : 'linear'
                labels  : ['y', '']
-            z :
+            z :
                binning : [-4, 4, 100]
-                yscale  : 'linear'
+                yscale  : 'linear'
                labels  : ['z', '']
-
+
{data_manipulation_utilities-0.2.0.data → data_manipulation_utilities-0.2.2.data}/scripts/publish
RENAMED
File without changes
{data_manipulation_utilities-0.2.0.dist-info → data_manipulation_utilities-0.2.2.dist-info}/WHEEL
RENAMED
File without changes
{data_manipulation_utilities-0.2.0.dist-info → data_manipulation_utilities-0.2.2.dist-info}/entry_points.txt
RENAMED
File without changes
{data_manipulation_utilities-0.2.0.dist-info → data_manipulation_utilities-0.2.2.dist-info}/top_level.txt
RENAMED
File without changes