data-manipulation-utilities 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/PKG-INFO +113 -6
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/README.md +111 -4
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/pyproject.toml +2 -2
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/PKG-INFO +113 -6
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/SOURCES.txt +3 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/requires.txt +1 -1
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/ml/cv_classifier.py +2 -1
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/ml/cv_predict.py +50 -2
- data_manipulation_utilities-0.2.2/src/dmu/ml/train_mva.py +447 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/ml/utilities.py +8 -0
- data_manipulation_utilities-0.2.2/src/dmu/pdataframe/utilities.py +36 -0
- data_manipulation_utilities-0.2.2/src/dmu/plotting/matrix.py +157 -0
- data_manipulation_utilities-0.2.2/src/dmu/plotting/utilities.py +33 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/rdataframe/utilities.py +1 -1
- data_manipulation_utilities-0.2.2/src/dmu_data/ml/tests/train_mva.yaml +52 -0
- data_manipulation_utilities-0.2.0/src/dmu/ml/train_mva.py +0 -257
- data_manipulation_utilities-0.2.0/src/dmu_data/ml/tests/train_mva.yaml +0 -37
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/setup.cfg +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/dependency_links.txt +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/entry_points.txt +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/top_level.txt +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/arrays/utilities.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/generic/utilities.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/logging/log_store.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/plotting/plotter.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/plotting/plotter_1d.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/plotting/plotter_2d.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/rdataframe/atr_mgr.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/rfile/rfprinter.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/rfile/utilities.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/fitter.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/function.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/gof_calculator.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/minimizers.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/model_factory.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/utilities.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/stats/zfit_plotter.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/testing/utilities.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/text/transformer.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/__init__.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/2d.yaml +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/fig_size.yaml +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/high_stat.yaml +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/name.yaml +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/no_bounds.yaml +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/normalized.yaml +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/simple.yaml +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/title.yaml +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/plotting/tests/weights.yaml +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/text/transform.toml +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/text/transform.txt +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/text/transform_set.toml +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/text/transform_set.txt +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_data/text/transform_trf.txt +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_scripts/git/publish +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_scripts/physics/check_truth.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_scripts/rfile/compare_root_files.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_scripts/rfile/print_trees.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_scripts/ssh/coned.py +0 -0
- {data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu_scripts/text/transform_text.py +0 -0
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/PKG-INFO
RENAMED
@@ -1,11 +1,11 @@
 Metadata-Version: 2.2
 Name: data_manipulation_utilities
-Version: 0.2.0
+Version: 0.2.2
 Description-Content-Type: text/markdown
 Requires-Dist: logzero
 Requires-Dist: PyYAML
 Requires-Dist: scipy
-Requires-Dist: awkward
+Requires-Dist: awkward
 Requires-Dist: tqdm
 Requires-Dist: joblib
 Requires-Dist: scikit-learn
@@ -423,9 +423,21 @@ obj.run()
 where the settings for the training go in a config dictionary, which when written to YAML looks like:
 
 ```yaml
+dataset:
+    # Before training, new features can be defined as below
+    define :
+        x : v + w
+        y : v - w
+    # If the key is found to be NaN, replace its value with the number provided
+    # This will be used in the training.
+    # Otherwise the entries with NaNs will be dropped
+    nan:
+        x : 0
+        y : 0
+        z : -999
 training :
     nfold    : 10
-    features : [
+    features : [x, y, z]
     hyper    :
         loss              : log_loss
         n_estimators      : 100
@@ -433,8 +445,25 @@ training :
         learning_rate     : 0.1
         min_samples_split : 2
 saving:
+    # The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
     path : 'tests/ml/train_mva/model.pkl'
 plotting:
+    roc :
+        min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
+        max : [1.2, 1.2] # By default it goes from 0 to 1 in both axes
+        # The section below is optional and will annotate the ROC curve with
+        # values for the score at different signal efficiencies
+        annotate:
+            sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9] # Values of signal efficiency at which to show the scores
+            form    : '{:.2f}' # Use two decimals for scores
+            color   : 'green'  # Color for text and marker
+            xoff    : -15      # Offsets in X and Y
+            yoff    : -15
+            size    : 10       # Size of text
+    correlation: # Adds correlation matrix for training datasets
+        title      : 'Correlation matrix'
+        size       : [10, 10]
+        mask_value : 0 # Where correlation is zero, the bin will appear white
     val_dir : 'tests/ml/train_mva'
     features:
         saving:
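The `dataset` and `training` sections above drive the trainer added in `src/dmu/ml/train_mva.py`. A minimal sketch of how such a config might be loaded and used follows; the `TrainMva` class name and the `sig`/`bkg`/`cfg` keywords are assumptions inferred from the module name and the `obj.run()` context line, not a verbatim API.

```python
# Hypothetical driver for the YAML config above; names marked as assumptions.
import yaml
from ROOT import RDataFrame

from dmu.ml.train_mva import TrainMva  # module added in 0.2.2; class name assumed

with open('train_mva.yaml', encoding='utf-8') as ifile:
    cfg = yaml.safe_load(ifile)

rdf_sig = RDataFrame('tree', 'signal.root')      # signal sample
rdf_bkg = RDataFrame('tree', 'background.root')  # background sample

obj = TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)  # keyword names assumed
obj.run()
```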
@@ -468,7 +497,9 @@ When training on real data, several things might go wrong and the code will try
 will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
 entries will be removed before training.
 
-- **NaNs**: Entries with NaNs will break the training with the scikit GradientBoostClassifier base class. Thus, we
+- **NaNs**: Entries with NaNs will break the training with the scikit-learn `GradientBoostingClassifier` base class. Thus, we:
+    - Can use the `nan` section shown above to replace `NaN` values with something else
+    - For whatever remains, we remove the entries from the training.
 
 ## Application
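The two-step policy just described (replace NaNs in configured columns, drop entries that still contain NaNs) amounts to the following pandas sketch; it illustrates the behaviour only and is not the package's code.

```python
# Behavioural sketch of the training-time NaN policy described above.
import numpy
import pandas as pnd

df = pnd.DataFrame({'x': [1.0, numpy.nan], 'y': [numpy.nan, 2.0], 'z': [3.0, numpy.nan]})

d_nan = {'x': 0, 'y': 0}            # the dataset.nan section of the config
for var, val in d_nan.items():
    df[var] = df[var].fillna(val)   # replace NaNs only in configured columns

df = df.dropna()                    # whatever still has NaNs is dropped
print(df)                           # only the first row survives (z is NaN in the second)
```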
@@ -491,17 +522,55 @@ The picking process happens through the comparison of hashes between the samples
 The hashes of the training samples are stored in the pickled model itself; the model is therefore a reimplementation of
 `GradientBoostingClassifier`, here called `CVClassifier`.
 
-If a sample
+If a sample exists that was used in the training of _every_ model, no model can be chosen for the prediction and a
 `CVSameData` exception will be raised.
 
+During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
+from that configuration and the user does not need to define extra columns.
+
 ### Caveats
 
 When evaluating the model with real data, problems might occur; we deal with them as follows:
 
 - **Repeated entries**: When there are repeated features in the dataset to be evaluated, we assign the same probabilities; no filtering is used.
-- **NaNs**: Entries with NaNs will break the evaluation. These entries will be
+- **NaNs**: Entries with NaNs will break the evaluation. These entries will be:
+    - Replaced by other values before evaluation IF a replacement was specified during training. The training configuration is stored in the model
+    and can be accessed through:
+    ```python
+    model.cfg
+    ```
+    - For whatever entries are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
 saved as -1. I.e. entries with NaNs will have probabilities of -1.
 
+# Pandas dataframes
+
+## Utilities
+
+These are thin layers of code that take pandas dataframes and carry out specific tasks.
+
+### Dataframe to latex
+
+One can save a dataframe to latex with:
+
+```python
+import pandas as pnd
+import dmu.pdataframe.utilities as put
+
+d_data = {}
+d_data['a'] = [1,2,3]
+d_data['b'] = [4,5,6]
+df = pnd.DataFrame(d_data)
+
+d_format = {
+    'a' : '{:.0f}',
+    'b' : '{:.3f}'}
+
+put.df_to_tex(df,
+              './table.tex',
+              d_format = d_format,
+              caption  = 'some caption')
+```
+
 # Rdataframes
 
 These are utility functions meant to be used with ROOT dataframes.
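The hash comparison described in this section can be pictured with pandas' row hashing. The sketch below is only an illustration of the idea; the package stores the real hashes inside the pickled `CVClassifier` objects.

```python
# Toy illustration of hash-based fold picking; not the package's implementation.
import pandas as pnd
from pandas.util import hash_pandas_object

df_train = pnd.DataFrame({'x': [1, 2], 'y': [3, 4]})   # entries one model was trained on
df_eval  = pnd.DataFrame({'x': [2, 5], 'y': [4, 6]})   # entries to be evaluated

s_train = set(hash_pandas_object(df_train, index=False))

for hsh in hash_pandas_object(df_eval, index=False):
    if hsh in s_train:
        print(f"{hsh}: seen in training, evaluate with a different fold's model")
    else:
        print(f"{hsh}: unseen, safe to evaluate with this model")
```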
@@ -653,6 +723,43 @@ axes:
     label : 'y'
 ```
 
+# Other plots
+
+## Matrices
+
+This can be done with `MatrixPlotter`, whose usage is illustrated below:
+
+```python
+import numpy
+import matplotlib.pyplot as plt
+
+from dmu.plotting.matrix import MatrixPlotter
+
+cfg = {
+    'labels'     : ['x', 'y', 'z'], # Used to label the matrix axes
+    'title'      : 'Some title',    # Optional, title of plot
+    'label_angle': 45,              # Labels will be rotated by 45 degrees
+    'upper'      : True,            # Useful in case this is a symmetric matrix
+    'zrange'     : [0, 10],         # Controls the z axis range
+    'size'       : [7, 7],          # Plot size
+    'format'     : '{:.3f}',        # Optional, if used will add numerical values to the contents, otherwise a color bar is used
+    'fontsize'   : 12,              # Font size associated to `format`
+    'mask_value' : 0,               # These values will appear white in the plot
+}
+
+mat = [
+    [1, 2, 3],
+    [2, 0, 4],
+    [3, 4, numpy.nan]
+]
+
+mat = numpy.array(mat)
+
+obj = MatrixPlotter(mat=mat, cfg=cfg)
+obj.plot()
+plt.show()
+```
+
 # Manipulating ROOT files
 
 ## Getting trees from file
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/README.md
RENAMED
The README.md hunks are the same five shown above for PKG-INFO (the README is embedded in the package metadata as the long description); only the line offsets differ: @@ -403,9 +403,21 @@, @@ -413,8 +425,25 @@, @@ -448,7 +477,9 @@, @@ -471,17 +502,56 @@ and @@ -633,6 +703,43 @@.
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/PKG-INFO
RENAMED
This file is a setuptools copy of PKG-INFO, and its diff is identical to the PKG-INFO diff shown at the top, hunk for hunk.
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/data_manipulation_utilities.egg-info/SOURCES.txt
RENAMED
@@ -13,9 +13,12 @@ src/dmu/ml/cv_classifier.py
 src/dmu/ml/cv_predict.py
 src/dmu/ml/train_mva.py
 src/dmu/ml/utilities.py
+src/dmu/pdataframe/utilities.py
+src/dmu/plotting/matrix.py
 src/dmu/plotting/plotter.py
 src/dmu/plotting/plotter_1d.py
 src/dmu/plotting/plotter_2d.py
+src/dmu/plotting/utilities.py
 src/dmu/rdataframe/atr_mgr.py
 src/dmu/rdataframe/utilities.py
 src/dmu/rfile/rfprinter.py
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/ml/cv_classifier.py
RENAMED
@@ -2,6 +2,7 @@
 Module holding cv_classifier class
 '''
 
+from typing import Union
 from sklearn.ensemble import GradientBoostingClassifier
 
 from dmu.logging.log_store import LogStore
@@ -22,7 +23,7 @@ class CVClassifier(GradientBoostingClassifier):
     '''
     # pylint: disable = too-many-ancestors, abstract-method
     # ----------------------------------
-    def __init__(self, cfg : dict
+    def __init__(self, cfg : Union[dict,None] = None):
         '''
         cfg (dict) : Dictionary with configuration, especially the hyperparameters set in the `hyper` field
         '''
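A plausible reading of this signature change (an assumption; the diff itself gives no rationale): scikit-learn's `get_params`/`clone` protocol re-creates estimators from their `__init__` arguments, and its convention is that every argument has a default and is stored verbatim as an attribute. A toy subclass showing the pattern:

```python
# Sketch of the scikit-learn estimator convention the new default fits into.
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier

class Toy(GradientBoostingClassifier):
    def __init__(self, cfg=None):   # default keeps Toy() constructible
        self.cfg = cfg              # stored verbatim, so get_params() can read it back
        super().__init__()

clf = Toy(cfg={'some': 'setting'})
cpy = clone(clf)                    # rebuilds Toy from get_params(), i.e. Toy(cfg=...)
print(cpy.cfg)                      # {'some': 'setting'}
```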
{data_manipulation_utilities-0.2.0 → data_manipulation_utilities-0.2.2}/src/dmu/ml/cv_predict.py
RENAMED
@@ -32,11 +32,56 @@ class CVPredict:
         if rdf is None:
             raise ValueError('No ROOT dataframe passed')
 
-        self._l_model
-        self._rdf
+        self._l_model   = models
+        self._rdf       = rdf
+        self._d_nan_rep : dict[str,str]
 
         self._arr_patch : numpy.ndarray
     # --------------------------------------------
+    def _initialize(self):
+        self._rdf       = self._define_columns(self._rdf)
+        self._d_nan_rep = self._get_nan_replacements()
+    # --------------------------------------------
+    def _define_columns(self, rdf : RDataFrame) -> RDataFrame:
+        cfg = self._l_model[0].cfg
+
+        if 'define' not in cfg['dataset']:
+            log.debug('No define section found in config, will not define extra columns')
+            return self._rdf
+
+        d_def = cfg['dataset']['define']
+        log.debug(60 * '-')
+        log.info('Defining columns in RDF before evaluating classifier')
+        log.debug(60 * '-')
+        for name, expr in d_def.items():
+            log.debug(f'{name:<20}{"<---":20}{expr:<100}')
+            rdf = rdf.Define(name, expr)
+
+        return rdf
+    # --------------------------------------------
+    def _get_nan_replacements(self) -> dict[str,str]:
+        cfg = self._l_model[0].cfg
+
+        if 'nan' not in cfg['dataset']:
+            log.debug('No nan section found in config, will not replace NaNs')
+            return {}
+
+        return cfg['dataset']['nan']
+    # --------------------------------------------
+    def _replace_nans(self, df : pnd.DataFrame) -> pnd.DataFrame:
+        if len(self._d_nan_rep) == 0:
+            log.debug('Not doing any NaN replacement')
+            return df
+
+        log.debug(60 * '-')
+        log.info('Doing NaN replacements')
+        log.debug(60 * '-')
+        for var, val in self._d_nan_rep.items():
+            log.debug(f'{var:<20}{"--->":20}{val:<20.3f}')
+            df[var] = df[var].fillna(val)
+
+        return df
+    # --------------------------------------------
     def _get_df(self):
         '''
         Will make ROOT rdf into dataframe and return it
@@ -45,6 +90,7 @@ class CVPredict:
         l_ft  = model.features
         d_data= self._rdf.AsNumpy(l_ft)
         df_ft = pnd.DataFrame(d_data)
+        df_ft = self._replace_nans(df_ft)
         df_ft = ut.patch_and_tag(df_ft)
 
         if 'patched_indices' in df_ft.attrs:
@@ -136,6 +182,8 @@ class CVPredict:
         '''
         Will return array of prediction probabilities for the signal category
         '''
+        self._initialize()
+
         df_ft = self._get_df()
         model = self._l_model[0]
 
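Putting the new hooks together, evaluating a set of per-fold models might look like the sketch below. The `models`/`rdf` keywords follow the constructor assignments visible in the first hunk; loading with `joblib` and the `predict` method name are assumptions, since the diff shows only the method body.

```python
# Hypothetical end-to-end use of CVPredict with the 0.2.2 NaN/define handling.
import glob
import joblib
from ROOT import RDataFrame

from dmu.ml.cv_predict import CVPredict

# One pickled model per fold, model_001.pkl, model_002.pkl, ... (see saving.path above)
l_model = [joblib.load(path) for path in sorted(glob.glob('tests/ml/train_mva/model_*.pkl'))]
rdf     = RDataFrame('tree', 'data.root')

cvp     = CVPredict(models=l_model, rdf=rdf)
arr_prb = cvp.predict()  # entries whose NaNs were not replaced come back with probability -1
```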