data-manipulation-utilities 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_manipulation_utilities-0.2.1.dist-info → data_manipulation_utilities-0.2.3.dist-info}/METADATA +39 -6
- {data_manipulation_utilities-0.2.1.dist-info → data_manipulation_utilities-0.2.3.dist-info}/RECORD +9 -9
- dmu/ml/cv_predict.py +50 -2
- dmu/ml/train_mva.py +17 -6
- dmu/rdataframe/utilities.py +28 -2
- {data_manipulation_utilities-0.2.1.data → data_manipulation_utilities-0.2.3.data}/scripts/publish +0 -0
- {data_manipulation_utilities-0.2.1.dist-info → data_manipulation_utilities-0.2.3.dist-info}/WHEEL +0 -0
- {data_manipulation_utilities-0.2.1.dist-info → data_manipulation_utilities-0.2.3.dist-info}/entry_points.txt +0 -0
- {data_manipulation_utilities-0.2.1.dist-info → data_manipulation_utilities-0.2.3.dist-info}/top_level.txt +0 -0
{data_manipulation_utilities-0.2.1.dist-info → data_manipulation_utilities-0.2.3.dist-info}/METADATA
RENAMED
@@ -1,11 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: data_manipulation_utilities
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.3
|
4
4
|
Description-Content-Type: text/markdown
|
5
5
|
Requires-Dist: logzero
|
6
6
|
Requires-Dist: PyYAML
|
7
7
|
Requires-Dist: scipy
|
8
|
-
Requires-Dist: awkward
|
8
|
+
Requires-Dist: awkward
|
9
9
|
Requires-Dist: tqdm
|
10
10
|
Requires-Dist: joblib
|
11
11
|
Requires-Dist: scikit-learn
|
@@ -424,6 +424,10 @@ where the settings for the training go in a config dictionary, which when writte
|
|
424
424
|
|
425
425
|
```yaml
|
426
426
|
dataset:
|
427
|
+
# Before training, new features can be defined as below
|
428
|
+
define :
|
429
|
+
x : v + w
|
430
|
+
y : v - w
|
427
431
|
# If the key is found to be NaN, replace its value with the number provided
|
428
432
|
# This will be used in the training.
|
429
433
|
# Otherwise the entries with NaNs will be dropped
|
@@ -433,7 +437,7 @@ dataset:
|
|
433
437
|
z : -999
|
434
438
|
training :
|
435
439
|
nfold : 10
|
436
|
-
features : [
|
440
|
+
features : [x, y, z]
|
437
441
|
hyper :
|
438
442
|
loss : log_loss
|
439
443
|
n_estimators : 100
|
@@ -493,7 +497,9 @@ When training on real data, several things might go wrong and the code will try
|
|
493
497
|
will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
|
494
498
|
entries will be removed before training.
|
495
499
|
|
496
|
-
- **NaNs**: Entries with NaNs will break the training with the scikit GradientBoostClassifier base class. Thus, we
|
500
|
+
- **NaNs**: Entries with NaNs will break the training with the scikit-learn `GradientBoostingClassifier` base class. Thus, we:
|
501
|
+
- Can use the `nan` section shown above to replace `NaN` values with something else
|
502
|
+
- For whatever remains we remove the entries from the training.
|
497
503
|
|
498
504
|
## Application
|
499
505
|
|
@@ -516,15 +522,24 @@ The picking process happens through the comparison of hashes between the samples
|
|
516
522
|
The hashes of the training samples are stored in the pickled model itself; which therefore is a reimplementation of
|
517
523
|
`GradientBoostingClassifier`, here called `CVClassifier`.
|
518
524
|
|
519
|
-
If a sample
|
525
|
+
If a sample exists that was used in the training of _every_ model, no model can be chosen for the prediction and a
|
520
526
|
`CVSameData` exception will be raised.
|
521
527
|
|
528
|
+
During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
|
529
|
+
from that configuration and the user does not need to define extra columns.
|
530
|
+
|
522
531
|
### Caveats
|
523
532
|
|
524
533
|
When evaluating the model with real data, problems might occur; we deal with them as follows:
|
525
534
|
|
526
535
|
- **Repeated entries**: When there are repeated features in the dataset to be evaluated we assign the same probabilities, no filtering is used.
|
527
|
-
- **NaNs**: Entries with NaNs will break the evaluation. These entries will be
|
536
|
+
- **NaNs**: Entries with NaNs will break the evaluation. These entries will be:
|
537
|
+
- Replaced by other values before evaluation IF a replacement was specified during training. The training configuration will be stored in the model
|
538
|
+
and can be accessed through:
|
539
|
+
```python
|
540
|
+
model.cfg
|
541
|
+
```
|
542
|
+
- Any entries that are still NaN will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
|
528
543
|
saved as -1. I.e. entries with NaNs will have probabilities of -1.
|
529
544
|
|
530
545
|
# Pandas dataframes
|
@@ -563,6 +578,24 @@ These are utility functions meant to be used with ROOT dataframes.
|
|
563
578
|
|
564
579
|
## Adding a column from a numpy array
|
565
580
|
|
581
|
+
### With numba
|
582
|
+
|
583
|
+
For this do:
|
584
|
+
|
585
|
+
```python
|
586
|
+
import dmu.rdataframe.utilities as ut
|
587
|
+
|
588
|
+
arr_val = numpy.array([10, 20, 30])
|
589
|
+
rdf = ut.add_column_with_numba(rdf, arr_val, 'values', identifier='some_name')
|
590
|
+
```
|
591
|
+
|
592
|
+
where the identifier needs to be unique every time the function is called.
|
593
|
+
This is the case because the addition is done internally by declaring a numba function whose name
|
594
|
+
cannot be repeated as mentioned
|
595
|
+
[here](https://root-forum.cern.ch/t/ways-to-work-around-the-redefinition-of-compiled-functions-in-one-single-notebook-session/41442/1)
|
596
|
+
|
597
|
+
### With awkward
|
598
|
+
|
566
599
|
For this do:
|
567
600
|
|
568
601
|
```python
|
{data_manipulation_utilities-0.2.1.dist-info → data_manipulation_utilities-0.2.3.dist-info}/RECORD
RENAMED
@@ -1,10 +1,10 @@
|
|
1
|
-
data_manipulation_utilities-0.2.
|
1
|
+
data_manipulation_utilities-0.2.3.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
|
2
2
|
dmu/arrays/utilities.py,sha256=PKoYyybPptA2aU-V3KLnJXBudWxTXu4x1uGdIMQ49HY,1722
|
3
3
|
dmu/generic/utilities.py,sha256=0Xnq9t35wuebAqKxbyAiMk1ISB7IcXK4cFH25MT1fgw,1741
|
4
4
|
dmu/logging/log_store.py,sha256=umdvjNDuV3LdezbG26b0AiyTglbvkxST19CQu9QATbA,4184
|
5
5
|
dmu/ml/cv_classifier.py,sha256=8Jwx6xMhJaRLktlRdq0tFl32v6t8i63KmpxrlnXlomU,3759
|
6
|
-
dmu/ml/cv_predict.py,sha256=
|
7
|
-
dmu/ml/train_mva.py,sha256=
|
6
|
+
dmu/ml/cv_predict.py,sha256=4G7F_1yOvnLftsDC6zUpdvkxuHXGkPemhj0RsYySYDM,6708
|
7
|
+
dmu/ml/train_mva.py,sha256=SZ5cQHl7HBxn0c5Hh4HlN1aqMZaJUAlNmsfjnUSQrTg,16894
|
8
8
|
dmu/ml/utilities.py,sha256=l348bufD95CuSYdIrHScQThIy2nKwGKXZn-FQg3CEwg,3930
|
9
9
|
dmu/pdataframe/utilities.py,sha256=ypvLiFfJ82ga94qlW3t5dXnvEFwYOXnbtJb2zHwsbqk,987
|
10
10
|
dmu/plotting/matrix.py,sha256=pXuUJn-LgOvrI9qGkZQw16BzLjOjeikYQ_ll2VIcIXU,4978
|
@@ -13,7 +13,7 @@ dmu/plotting/plotter_1d.py,sha256=g6H2xAgsL9a6vRkpbqHICb3qwV_qMiQPZxxw_oOSf9M,51
|
|
13
13
|
dmu/plotting/plotter_2d.py,sha256=J-gKnagoHGfJFU7HBrhDFpGYH5Rxy0_zF5l8eE_7ZHE,2944
|
14
14
|
dmu/plotting/utilities.py,sha256=SI9dvtZq2gr-PXVz71KE4o0i09rZOKgqJKD1jzf6KXk,1167
|
15
15
|
dmu/rdataframe/atr_mgr.py,sha256=FdhaQWVpsm4OOe1IRbm7rfrq8VenTNdORyI-lZ2Bs1M,2386
|
16
|
-
dmu/rdataframe/utilities.py,sha256=
|
16
|
+
dmu/rdataframe/utilities.py,sha256=pNcQARMP7txMhy6k27UnDcYf0buNy5U2fshaJDl_h8o,3661
|
17
17
|
dmu/rfile/rfprinter.py,sha256=mp5jd-oCJAnuokbdmGyL9i6tK2lY72jEfROuBIZ_ums,3941
|
18
18
|
dmu/rfile/utilities.py,sha256=XuYY7HuSBj46iSu3c60UYBHtI6KIPoJU_oofuhb-be0,945
|
19
19
|
dmu/stats/fitter.py,sha256=vHNZ16U3apoQyeyM8evq-if49doF48sKB3q9wmA96Fw,18387
|
@@ -47,8 +47,8 @@ dmu_scripts/rfile/compare_root_files.py,sha256=T8lDnQxsRNMr37x1Y7YvWD8ySHrJOWZki
|
|
47
47
|
dmu_scripts/rfile/print_trees.py,sha256=Ze4Ccl_iUldl4eVEDVnYBoe4amqBT1fSBR1zN5WSztk,941
|
48
48
|
dmu_scripts/ssh/coned.py,sha256=lhilYNHWRCGxC-jtyJ3LQ4oUgWW33B2l1tYCcyHHsR0,4858
|
49
49
|
dmu_scripts/text/transform_text.py,sha256=9akj1LB0HAyopOvkLjNOJiptZw5XoOQLe17SlcrGMD0,1456
|
50
|
-
data_manipulation_utilities-0.2.
|
51
|
-
data_manipulation_utilities-0.2.
|
52
|
-
data_manipulation_utilities-0.2.
|
53
|
-
data_manipulation_utilities-0.2.
|
54
|
-
data_manipulation_utilities-0.2.
|
50
|
+
data_manipulation_utilities-0.2.3.dist-info/METADATA,sha256=STJ7vYfcSIM9dtMRzywGLwDzH1sUBE5DL9FqvskMcxo,27923
|
51
|
+
data_manipulation_utilities-0.2.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
52
|
+
data_manipulation_utilities-0.2.3.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
|
53
|
+
data_manipulation_utilities-0.2.3.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
|
54
|
+
data_manipulation_utilities-0.2.3.dist-info/RECORD,,
|
dmu/ml/cv_predict.py
CHANGED
@@ -32,11 +32,56 @@ class CVPredict:
|
|
32
32
|
if rdf is None:
|
33
33
|
raise ValueError('No ROOT dataframe passed')
|
34
34
|
|
35
|
-
self._l_model
|
36
|
-
self._rdf
|
35
|
+
self._l_model = models
|
36
|
+
self._rdf = rdf
|
37
|
+
self._d_nan_rep : dict[str,str]
|
37
38
|
|
38
39
|
self._arr_patch : numpy.ndarray
|
39
40
|
# --------------------------------------------
|
41
|
+
def _initialize(self):
|
42
|
+
self._rdf = self._define_columns(self._rdf)
|
43
|
+
self._d_nan_rep = self._get_nan_replacements()
|
44
|
+
# --------------------------------------------
|
45
|
+
def _define_columns(self, rdf : RDataFrame) -> RDataFrame:
|
46
|
+
cfg = self._l_model[0].cfg
|
47
|
+
|
48
|
+
if 'define' not in cfg['dataset']:
|
49
|
+
log.debug('No define section found in config, will not define extra columns')
|
50
|
+
return self._rdf
|
51
|
+
|
52
|
+
d_def = cfg['dataset']['define']
|
53
|
+
log.debug(60 * '-')
|
54
|
+
log.info('Defining columns in RDF before evaluating classifier')
|
55
|
+
log.debug(60 * '-')
|
56
|
+
for name, expr in d_def.items():
|
57
|
+
log.debug(f'{name:<20}{"<---":20}{expr:<100}')
|
58
|
+
rdf = rdf.Define(name, expr)
|
59
|
+
|
60
|
+
return rdf
|
61
|
+
# --------------------------------------------
|
62
|
+
def _get_nan_replacements(self) -> dict[str,str]:
|
63
|
+
cfg = self._l_model[0].cfg
|
64
|
+
|
65
|
+
if 'nan' not in cfg['dataset']:
|
66
|
+
log.debug('No define section found in config, will not define extra columns')
|
67
|
+
return {}
|
68
|
+
|
69
|
+
return cfg['dataset']['nan']
|
70
|
+
# --------------------------------------------
|
71
|
+
def _replace_nans(self, df : pnd.DataFrame) -> pnd.DataFrame:
|
72
|
+
if len(self._d_nan_rep) == 0:
|
73
|
+
log.debug('Not doing any NaN replacement')
|
74
|
+
return df
|
75
|
+
|
76
|
+
log.debug(60 * '-')
|
77
|
+
log.info('Doing NaN replacements')
|
78
|
+
log.debug(60 * '-')
|
79
|
+
for var, val in self._d_nan_rep.items():
|
80
|
+
log.debug(f'{var:<20}{"--->":20}{val:<20.3f}')
|
81
|
+
df[var] = df[var].fillna(val)
|
82
|
+
|
83
|
+
return df
|
84
|
+
# --------------------------------------------
|
40
85
|
def _get_df(self):
|
41
86
|
'''
|
42
87
|
Will make ROOT rdf into dataframe and return it
|
@@ -45,6 +90,7 @@ class CVPredict:
|
|
45
90
|
l_ft = model.features
|
46
91
|
d_data= self._rdf.AsNumpy(l_ft)
|
47
92
|
df_ft = pnd.DataFrame(d_data)
|
93
|
+
df_ft = self._replace_nans(df_ft)
|
48
94
|
df_ft = ut.patch_and_tag(df_ft)
|
49
95
|
|
50
96
|
if 'patched_indices' in df_ft.attrs:
|
@@ -136,6 +182,8 @@ class CVPredict:
|
|
136
182
|
'''
|
137
183
|
Will return array of prediction probabilities for the signal category
|
138
184
|
'''
|
185
|
+
self._initialize()
|
186
|
+
|
139
187
|
df_ft = self._get_df()
|
140
188
|
model = self._l_model[0]
|
141
189
|
|
dmu/ml/train_mva.py
CHANGED
@@ -26,7 +26,7 @@ from dmu.plotting.matrix import MatrixPlotter
|
|
26
26
|
from dmu.logging.log_store import LogStore
|
27
27
|
|
28
28
|
npa = numpy.ndarray
|
29
|
-
log = LogStore.add_logger('
|
29
|
+
log = LogStore.add_logger('dmu:ml:train_mva')
|
30
30
|
# ---------------------------------------------
|
31
31
|
class TrainMva:
|
32
32
|
'''
|
@@ -334,10 +334,10 @@ class TrainMva:
|
|
334
334
|
if 'max' in self._cfg['plotting']['roc']:
|
335
335
|
[max_x, max_y] = self._cfg['plotting']['roc']['max']
|
336
336
|
|
337
|
-
self._plot_probabilities(xval_ts, yval_ts, l_prb_ts)
|
338
|
-
|
339
337
|
plt.plot(xval_ts, yval_ts, color='b', label=f'Test: {area_ts:.3f}')
|
340
338
|
plt.plot(xval_tr, yval_tr, color='r', label=f'Train: {area_tr:.3f}')
|
339
|
+
self._plot_probabilities(xval_ts, yval_ts, l_prb_ts, l_lab_ts)
|
340
|
+
|
341
341
|
plt.xlabel('Signal efficiency')
|
342
342
|
plt.ylabel('Background rejection')
|
343
343
|
plt.title(f'Fold: {ifold}')
|
@@ -351,13 +351,17 @@ class TrainMva:
|
|
351
351
|
def _plot_probabilities(self,
|
352
352
|
arr_seff: npa,
|
353
353
|
arr_brej: npa,
|
354
|
-
arr_sprb: npa
|
354
|
+
arr_sprb: npa,
|
355
|
+
arr_labl: npa) -> None:
|
355
356
|
|
356
357
|
roc_cfg = self._cfg['plotting']['roc']
|
357
358
|
if 'annotate' not in roc_cfg:
|
358
359
|
log.debug('Annotation section in the ROC curve config not found, skipping annotation')
|
359
360
|
return
|
360
361
|
|
362
|
+
l_sprb = [ sprb for sprb, labl in zip(arr_sprb, arr_labl) if labl == 1 ]
|
363
|
+
arr_sprb = numpy.array(l_sprb)
|
364
|
+
|
361
365
|
plt_cfg = roc_cfg['annotate']
|
362
366
|
if 'sig_eff' not in plt_cfg:
|
363
367
|
l_seff_target = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
|
@@ -366,17 +370,24 @@ class TrainMva:
|
|
366
370
|
del plt_cfg['sig_eff']
|
367
371
|
|
368
372
|
arr_seff_target = numpy.array(l_seff_target)
|
373
|
+
arr_quantile = 1 - arr_seff_target
|
369
374
|
|
370
|
-
l_score = numpy.quantile(arr_sprb,
|
375
|
+
l_score = numpy.quantile(arr_sprb, arr_quantile)
|
371
376
|
l_seff = []
|
372
377
|
l_brej = []
|
373
|
-
|
378
|
+
|
379
|
+
log.debug(60 * '-')
|
380
|
+
log.debug(f'{"SigEff":20}{"BkgRej":20}{"Score":20}')
|
381
|
+
log.debug(60 * '-')
|
382
|
+
for seff_target, score in zip(arr_seff_target, l_score):
|
374
383
|
arr_diff = numpy.abs(arr_seff - seff_target)
|
375
384
|
ind = numpy.argmin(arr_diff)
|
376
385
|
|
377
386
|
seff = arr_seff[ind]
|
378
387
|
brej = arr_brej[ind]
|
379
388
|
|
389
|
+
log.debug(f'{seff:<20.3f}{brej:<20.3f}{score:<20.2f}')
|
390
|
+
|
380
391
|
l_seff.append(seff)
|
381
392
|
l_brej.append(brej)
|
382
393
|
|
dmu/rdataframe/utilities.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
'''
|
2
2
|
Module containing utility functions to be used with ROOT dataframes
|
3
3
|
'''
|
4
|
+
# pylint: disable=no-name-in-module
|
4
5
|
|
5
6
|
import re
|
6
7
|
from dataclasses import dataclass
|
@@ -10,7 +11,7 @@ import pandas as pnd
|
|
10
11
|
import awkward as ak
|
11
12
|
import numpy
|
12
13
|
|
13
|
-
from ROOT import RDataFrame, RDF
|
14
|
+
from ROOT import RDataFrame, RDF, Numba
|
14
15
|
|
15
16
|
from dmu.logging.log_store import LogStore
|
16
17
|
|
@@ -34,6 +35,8 @@ def add_column(rdf : RDataFrame, arr_val : Union[numpy.ndarray,None], name : str
|
|
34
35
|
exclude_re : Regex with patter of column names that we won't pick
|
35
36
|
'''
|
36
37
|
|
38
|
+
log.warning(f'Adding column {name} with awkward')
|
39
|
+
|
37
40
|
d_opt = {} if d_opt is None else d_opt
|
38
41
|
if arr_val is None:
|
39
42
|
raise ValueError('Array of values not introduced')
|
@@ -66,12 +69,35 @@ def add_column(rdf : RDataFrame, arr_val : Union[numpy.ndarray,None], name : str
|
|
66
69
|
if arr_val.dtype == 'object':
|
67
70
|
arr_val = arr_val.astype(float)
|
68
71
|
|
69
|
-
d_data[name] = arr_val
|
72
|
+
d_data[name] = ak.from_numpy(arr_val)
|
70
73
|
|
71
74
|
rdf = ak.to_rdataframe(d_data)
|
72
75
|
|
73
76
|
return rdf
|
74
77
|
# ---------------------------------------------------------------------
|
78
|
+
def add_column_with_numba(
|
79
|
+
rdf : RDataFrame,
|
80
|
+
arr_val : Union[numpy.ndarray,None],
|
81
|
+
name : str,
|
82
|
+
identifier : str) -> RDataFrame:
|
83
|
+
'''
|
84
|
+
Will take a dataframe, an array of numbers and a string
|
85
|
+
Will add the array as a colunm to the dataframe
|
86
|
+
|
87
|
+
The `identifier` argument is a string need in order to avoid collisions
|
88
|
+
when using Numba to define a function to get the value from.
|
89
|
+
'''
|
90
|
+
identifier=f'fun_{identifier}'
|
91
|
+
|
92
|
+
@Numba.Declare(['int'], 'float', name=identifier)
|
93
|
+
def get_value(index):
|
94
|
+
return arr_val[index]
|
95
|
+
|
96
|
+
log.debug(f'Adding column {name} with numba')
|
97
|
+
rdf = rdf.Define(name, f'Numba::{identifier}(rdfentry_)')
|
98
|
+
|
99
|
+
return rdf
|
100
|
+
# ---------------------------------------------------------------------
|
75
101
|
def rdf_report_to_df(rep : RDF.RCutFlowReport) -> pnd.DataFrame:
|
76
102
|
'''
|
77
103
|
Takes the output of rdf.Report(), i.e. an RDataFrame cutflow report.
|
{data_manipulation_utilities-0.2.1.data → data_manipulation_utilities-0.2.3.data}/scripts/publish
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.1.dist-info → data_manipulation_utilities-0.2.3.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|