data-manipulation-utilities 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_manipulation_utilities-0.2.1/src/data_manipulation_utilities.egg-info → data_manipulation_utilities-0.2.3}/PKG-INFO +39 -6
- data_manipulation_utilities-0.2.1/PKG-INFO → data_manipulation_utilities-0.2.3/README.md +37 -24
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/pyproject.toml +2 -2
- data_manipulation_utilities-0.2.1/README.md → data_manipulation_utilities-0.2.3/src/data_manipulation_utilities.egg-info/PKG-INFO +57 -4
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/data_manipulation_utilities.egg-info/requires.txt +1 -1
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/cv_predict.py +50 -2
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/train_mva.py +17 -6
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/rdataframe/utilities.py +28 -2
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/setup.cfg +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/data_manipulation_utilities.egg-info/SOURCES.txt +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/data_manipulation_utilities.egg-info/dependency_links.txt +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/data_manipulation_utilities.egg-info/entry_points.txt +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/data_manipulation_utilities.egg-info/top_level.txt +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/arrays/utilities.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/generic/utilities.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/logging/log_store.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/cv_classifier.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/utilities.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/pdataframe/utilities.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/plotting/matrix.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/plotting/plotter.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/plotting/plotter_1d.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/plotting/plotter_2d.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/plotting/utilities.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/rdataframe/atr_mgr.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/rfile/rfprinter.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/rfile/utilities.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/fitter.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/function.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/gof_calculator.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/minimizers.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/model_factory.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/utilities.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/zfit_plotter.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/testing/utilities.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/text/transformer.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/__init__.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/ml/tests/train_mva.yaml +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/2d.yaml +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/fig_size.yaml +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/high_stat.yaml +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/name.yaml +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/no_bounds.yaml +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/normalized.yaml +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/simple.yaml +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/title.yaml +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/weights.yaml +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/text/transform.toml +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/text/transform.txt +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/text/transform_set.toml +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/text/transform_set.txt +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/text/transform_trf.txt +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/git/publish +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/physics/check_truth.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/rfile/compare_root_files.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/rfile/print_trees.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/ssh/coned.py +0 -0
- {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/text/transform_text.py +0 -0
@@ -1,11 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: data_manipulation_utilities
|
3
|
-
Version: 0.2.1
|
3
|
+
Version: 0.2.3
|
4
4
|
Description-Content-Type: text/markdown
|
5
5
|
Requires-Dist: logzero
|
6
6
|
Requires-Dist: PyYAML
|
7
7
|
Requires-Dist: scipy
|
8
|
-
Requires-Dist: awkward==2.4.6
|
8
|
+
Requires-Dist: awkward
|
9
9
|
Requires-Dist: tqdm
|
10
10
|
Requires-Dist: joblib
|
11
11
|
Requires-Dist: scikit-learn
|
@@ -424,6 +424,10 @@ where the settings for the training go in a config dictionary, which when writte
|
|
424
424
|
|
425
425
|
```yaml
|
426
426
|
dataset:
|
427
|
+
# Before training, new features can be defined as below
|
428
|
+
define :
|
429
|
+
x : v + w
|
430
|
+
y : v - w
|
427
431
|
# If the key is found to be NaN, replace its value with the number provided
|
428
432
|
# This will be used in the training.
|
429
433
|
# Otherwise the entries with NaNs will be dropped
|
@@ -433,7 +437,7 @@ dataset:
|
|
433
437
|
z : -999
|
434
438
|
training :
|
435
439
|
nfold : 10
|
436
|
-
features : [
|
440
|
+
features : [x, y, z]
|
437
441
|
hyper :
|
438
442
|
loss : log_loss
|
439
443
|
n_estimators : 100
|
@@ -493,7 +497,9 @@ When training on real data, several things might go wrong and the code will try
|
|
493
497
|
will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
|
494
498
|
entries will be removed before training.
|
495
499
|
|
496
|
-
- **NaNs**: Entries with NaNs will break the training with the scikit GradientBoostClassifier base class. Thus, we
|
500
|
+
- **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
|
501
|
+
- Can use the `nan` section shown above to replace `NaN` values with something else
|
502
|
+
- For whatever remains we remove the entries from the training.
|
497
503
|
|
498
504
|
## Application
|
499
505
|
|
@@ -516,15 +522,24 @@ The picking process happens through the comparison of hashes between the samples
|
|
516
522
|
The hashes of the training samples are stored in the pickled model itself; which therefore is a reimplementation of
|
517
523
|
`GradientBoostClassifier`, here called `CVClassifier`.
|
518
524
|
|
519
|
-
If a sample
|
525
|
+
If a sample exists, that was used in the training of _every_ model, no model can be chosen for the prediction and a
|
520
526
|
`CVSameData` exception will be raised.
|
521
527
|
|
528
|
+
During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
|
529
|
+
from that configuration and the user does not need to define extra columns.
|
530
|
+
|
522
531
|
### Caveats
|
523
532
|
|
524
533
|
When evaluating the model with real data, problems might occur, we deal with them as follows:
|
525
534
|
|
526
535
|
- **Repeated entries**: When there are repeated features in the dataset to be evaluated we assign the same probabilities, no filtering is used.
|
527
|
-
- **NaNs**: Entries with NaNs will break the evaluation. These entries will be
|
536
|
+
- **NaNs**: Entries with NaNs will break the evaluation. These entries will be:
|
537
|
+
- Replaced by other values before evaluation IF a replacement was specified during training. The training configuration will be stored in the model
|
538
|
+
and can be accessed through:
|
539
|
+
```python
|
540
|
+
model.cfg
|
541
|
+
```
|
542
|
+
- For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
|
528
543
|
saved as -1. I.e. entries with NaNs will have probabilities of -1.
|
529
544
|
|
530
545
|
# Pandas dataframes
|
@@ -563,6 +578,24 @@ These are utility functions meant to be used with ROOT dataframes.
|
|
563
578
|
|
564
579
|
## Adding a column from a numpy array
|
565
580
|
|
581
|
+
### With numba
|
582
|
+
|
583
|
+
For this do:
|
584
|
+
|
585
|
+
```python
|
586
|
+
import dmu.rdataframe.utilities as ut
|
587
|
+
|
588
|
+
arr_val = numpy.array([10, 20, 30])
|
589
|
+
rdf = ut.add_column_with_numba(rdf, arr_val, 'values', identifier='some_name')
|
590
|
+
```
|
591
|
+
|
592
|
+
where the identifier needs to be unique, every time the function is called.
|
593
|
+
This is the case, because the addition is done internally by declaring a numba function whose name
|
594
|
+
cannot be repeated as mentioned
|
595
|
+
[here](https://root-forum.cern.ch/t/ways-to-work-around-the-redefinition-of-compiled-functions-in-one-single-notebook-session/41442/1)
|
596
|
+
|
597
|
+
### With awkward
|
598
|
+
|
566
599
|
For this do:
|
567
600
|
|
568
601
|
```python
|
@@ -1,23 +1,3 @@
|
|
1
|
-
Metadata-Version: 2.2
|
2
|
-
Name: data_manipulation_utilities
|
3
|
-
Version: 0.2.1
|
4
|
-
Description-Content-Type: text/markdown
|
5
|
-
Requires-Dist: logzero
|
6
|
-
Requires-Dist: PyYAML
|
7
|
-
Requires-Dist: scipy
|
8
|
-
Requires-Dist: awkward==2.4.6
|
9
|
-
Requires-Dist: tqdm
|
10
|
-
Requires-Dist: joblib
|
11
|
-
Requires-Dist: scikit-learn
|
12
|
-
Requires-Dist: toml
|
13
|
-
Requires-Dist: numpy
|
14
|
-
Requires-Dist: matplotlib
|
15
|
-
Requires-Dist: mplhep
|
16
|
-
Requires-Dist: hist[plot]
|
17
|
-
Requires-Dist: pandas
|
18
|
-
Provides-Extra: dev
|
19
|
-
Requires-Dist: pytest; extra == "dev"
|
20
|
-
|
21
1
|
# D(ata) M(anipulation) U(tilities)
|
22
2
|
|
23
3
|
These are tools that can be used for different data analysis tasks.
|
@@ -424,6 +404,10 @@ where the settings for the training go in a config dictionary, which when writte
|
|
424
404
|
|
425
405
|
```yaml
|
426
406
|
dataset:
|
407
|
+
# Before training, new features can be defined as below
|
408
|
+
define :
|
409
|
+
x : v + w
|
410
|
+
y : v - w
|
427
411
|
# If the key is found to be NaN, replace its value with the number provided
|
428
412
|
# This will be used in the training.
|
429
413
|
# Otherwise the entries with NaNs will be dropped
|
@@ -433,7 +417,7 @@ dataset:
|
|
433
417
|
z : -999
|
434
418
|
training :
|
435
419
|
nfold : 10
|
436
|
-
features : [
|
420
|
+
features : [x, y, z]
|
437
421
|
hyper :
|
438
422
|
loss : log_loss
|
439
423
|
n_estimators : 100
|
@@ -493,7 +477,9 @@ When training on real data, several things might go wrong and the code will try
|
|
493
477
|
will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
|
494
478
|
entries will be removed before training.
|
495
479
|
|
496
|
-
- **NaNs**: Entries with NaNs will break the training with the scikit GradientBoostClassifier base class. Thus, we
|
480
|
+
- **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
|
481
|
+
- Can use the `nan` section shown above to replace `NaN` values with something else
|
482
|
+
- For whatever remains we remove the entries from the training.
|
497
483
|
|
498
484
|
## Application
|
499
485
|
|
@@ -516,15 +502,24 @@ The picking process happens through the comparison of hashes between the samples
|
|
516
502
|
The hashes of the training samples are stored in the pickled model itself; which therefore is a reimplementation of
|
517
503
|
`GradientBoostClassifier`, here called `CVClassifier`.
|
518
504
|
|
519
|
-
If a sample
|
505
|
+
If a sample exists, that was used in the training of _every_ model, no model can be chosen for the prediction and a
|
520
506
|
`CVSameData` exception will be raised.
|
521
507
|
|
508
|
+
During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
|
509
|
+
from that configuration and the user does not need to define extra columns.
|
510
|
+
|
522
511
|
### Caveats
|
523
512
|
|
524
513
|
When evaluating the model with real data, problems might occur, we deal with them as follows:
|
525
514
|
|
526
515
|
- **Repeated entries**: When there are repeated features in the dataset to be evaluated we assign the same probabilities, no filtering is used.
|
527
|
-
- **NaNs**: Entries with NaNs will break the evaluation. These entries will be
|
516
|
+
- **NaNs**: Entries with NaNs will break the evaluation. These entries will be:
|
517
|
+
- Replaced by other values before evaluation IF a replacement was specified during training. The training configuration will be stored in the model
|
518
|
+
and can be accessed through:
|
519
|
+
```python
|
520
|
+
model.cfg
|
521
|
+
```
|
522
|
+
- For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
|
528
523
|
saved as -1. I.e. entries with NaNs will have probabilities of -1.
|
529
524
|
|
530
525
|
# Pandas dataframes
|
@@ -563,6 +558,24 @@ These are utility functions meant to be used with ROOT dataframes.
|
|
563
558
|
|
564
559
|
## Adding a column from a numpy array
|
565
560
|
|
561
|
+
### With numba
|
562
|
+
|
563
|
+
For this do:
|
564
|
+
|
565
|
+
```python
|
566
|
+
import dmu.rdataframe.utilities as ut
|
567
|
+
|
568
|
+
arr_val = numpy.array([10, 20, 30])
|
569
|
+
rdf = ut.add_column_with_numba(rdf, arr_val, 'values', identifier='some_name')
|
570
|
+
```
|
571
|
+
|
572
|
+
where the identifier needs to be unique, every time the function is called.
|
573
|
+
This is the case, because the addition is done internally by declaring a numba function whose name
|
574
|
+
cannot be repeated as mentioned
|
575
|
+
[here](https://root-forum.cern.ch/t/ways-to-work-around-the-redefinition-of-compiled-functions-in-one-single-notebook-session/41442/1)
|
576
|
+
|
577
|
+
### With awkward
|
578
|
+
|
566
579
|
For this do:
|
567
580
|
|
568
581
|
```python
|
@@ -1,3 +1,23 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: data_manipulation_utilities
|
3
|
+
Version: 0.2.3
|
4
|
+
Description-Content-Type: text/markdown
|
5
|
+
Requires-Dist: logzero
|
6
|
+
Requires-Dist: PyYAML
|
7
|
+
Requires-Dist: scipy
|
8
|
+
Requires-Dist: awkward
|
9
|
+
Requires-Dist: tqdm
|
10
|
+
Requires-Dist: joblib
|
11
|
+
Requires-Dist: scikit-learn
|
12
|
+
Requires-Dist: toml
|
13
|
+
Requires-Dist: numpy
|
14
|
+
Requires-Dist: matplotlib
|
15
|
+
Requires-Dist: mplhep
|
16
|
+
Requires-Dist: hist[plot]
|
17
|
+
Requires-Dist: pandas
|
18
|
+
Provides-Extra: dev
|
19
|
+
Requires-Dist: pytest; extra == "dev"
|
20
|
+
|
1
21
|
# D(ata) M(anipulation) U(tilities)
|
2
22
|
|
3
23
|
These are tools that can be used for different data analysis tasks.
|
@@ -404,6 +424,10 @@ where the settings for the training go in a config dictionary, which when writte
|
|
404
424
|
|
405
425
|
```yaml
|
406
426
|
dataset:
|
427
|
+
# Before training, new features can be defined as below
|
428
|
+
define :
|
429
|
+
x : v + w
|
430
|
+
y : v - w
|
407
431
|
# If the key is found to be NaN, replace its value with the number provided
|
408
432
|
# This will be used in the training.
|
409
433
|
# Otherwise the entries with NaNs will be dropped
|
@@ -413,7 +437,7 @@ dataset:
|
|
413
437
|
z : -999
|
414
438
|
training :
|
415
439
|
nfold : 10
|
416
|
-
features : [
|
440
|
+
features : [x, y, z]
|
417
441
|
hyper :
|
418
442
|
loss : log_loss
|
419
443
|
n_estimators : 100
|
@@ -473,7 +497,9 @@ When training on real data, several things might go wrong and the code will try
|
|
473
497
|
will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
|
474
498
|
entries will be removed before training.
|
475
499
|
|
476
|
-
- **NaNs**: Entries with NaNs will break the training with the scikit GradientBoostClassifier base class. Thus, we
|
500
|
+
- **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
|
501
|
+
- Can use the `nan` section shown above to replace `NaN` values with something else
|
502
|
+
- For whatever remains we remove the entries from the training.
|
477
503
|
|
478
504
|
## Application
|
479
505
|
|
@@ -496,15 +522,24 @@ The picking process happens through the comparison of hashes between the samples
|
|
496
522
|
The hashes of the training samples are stored in the pickled model itself; which therefore is a reimplementation of
|
497
523
|
`GradientBoostClassifier`, here called `CVClassifier`.
|
498
524
|
|
499
|
-
If a sample
|
525
|
+
If a sample exists, that was used in the training of _every_ model, no model can be chosen for the prediction and a
|
500
526
|
`CVSameData` exception will be raised.
|
501
527
|
|
528
|
+
During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
|
529
|
+
from that configuration and the user does not need to define extra columns.
|
530
|
+
|
502
531
|
### Caveats
|
503
532
|
|
504
533
|
When evaluating the model with real data, problems might occur, we deal with them as follows:
|
505
534
|
|
506
535
|
- **Repeated entries**: When there are repeated features in the dataset to be evaluated we assign the same probabilities, no filtering is used.
|
507
|
-
- **NaNs**: Entries with NaNs will break the evaluation. These entries will be
|
536
|
+
- **NaNs**: Entries with NaNs will break the evaluation. These entries will be:
|
537
|
+
- Replaced by other values before evaluation IF a replacement was specified during training. The training configuration will be stored in the model
|
538
|
+
and can be accessed through:
|
539
|
+
```python
|
540
|
+
model.cfg
|
541
|
+
```
|
542
|
+
- For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
|
508
543
|
saved as -1. I.e. entries with NaNs will have probabilities of -1.
|
509
544
|
|
510
545
|
# Pandas dataframes
|
@@ -543,6 +578,24 @@ These are utility functions meant to be used with ROOT dataframes.
|
|
543
578
|
|
544
579
|
## Adding a column from a numpy array
|
545
580
|
|
581
|
+
### With numba
|
582
|
+
|
583
|
+
For this do:
|
584
|
+
|
585
|
+
```python
|
586
|
+
import dmu.rdataframe.utilities as ut
|
587
|
+
|
588
|
+
arr_val = numpy.array([10, 20, 30])
|
589
|
+
rdf = ut.add_column_with_numba(rdf, arr_val, 'values', identifier='some_name')
|
590
|
+
```
|
591
|
+
|
592
|
+
where the identifier needs to be unique, every time the function is called.
|
593
|
+
This is the case, because the addition is done internally by declaring a numba function whose name
|
594
|
+
cannot be repeated as mentioned
|
595
|
+
[here](https://root-forum.cern.ch/t/ways-to-work-around-the-redefinition-of-compiled-functions-in-one-single-notebook-session/41442/1)
|
596
|
+
|
597
|
+
### With awkward
|
598
|
+
|
546
599
|
For this do:
|
547
600
|
|
548
601
|
```python
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/cv_predict.py
RENAMED
@@ -32,11 +32,56 @@ class CVPredict:
|
|
32
32
|
if rdf is None:
|
33
33
|
raise ValueError('No ROOT dataframe passed')
|
34
34
|
|
35
|
-
self._l_model
|
36
|
-
self._rdf
|
35
|
+
self._l_model = models
|
36
|
+
self._rdf = rdf
|
37
|
+
self._d_nan_rep : dict[str,str]
|
37
38
|
|
38
39
|
self._arr_patch : numpy.ndarray
|
39
40
|
# --------------------------------------------
|
41
|
+
def _initialize(self):
|
42
|
+
self._rdf = self._define_columns(self._rdf)
|
43
|
+
self._d_nan_rep = self._get_nan_replacements()
|
44
|
+
# --------------------------------------------
|
45
|
+
def _define_columns(self, rdf : RDataFrame) -> RDataFrame:
|
46
|
+
cfg = self._l_model[0].cfg
|
47
|
+
|
48
|
+
if 'define' not in cfg['dataset']:
|
49
|
+
log.debug('No define section found in config, will not define extra columns')
|
50
|
+
return self._rdf
|
51
|
+
|
52
|
+
d_def = cfg['dataset']['define']
|
53
|
+
log.debug(60 * '-')
|
54
|
+
log.info('Defining columns in RDF before evaluating classifier')
|
55
|
+
log.debug(60 * '-')
|
56
|
+
for name, expr in d_def.items():
|
57
|
+
log.debug(f'{name:<20}{"<---":20}{expr:<100}')
|
58
|
+
rdf = rdf.Define(name, expr)
|
59
|
+
|
60
|
+
return rdf
|
61
|
+
# --------------------------------------------
|
62
|
+
def _get_nan_replacements(self) -> dict[str,str]:
|
63
|
+
cfg = self._l_model[0].cfg
|
64
|
+
|
65
|
+
if 'nan' not in cfg['dataset']:
|
66
|
+
log.debug('No define section found in config, will not define extra columns')
|
67
|
+
return {}
|
68
|
+
|
69
|
+
return cfg['dataset']['nan']
|
70
|
+
# --------------------------------------------
|
71
|
+
def _replace_nans(self, df : pnd.DataFrame) -> pnd.DataFrame:
|
72
|
+
if len(self._d_nan_rep) == 0:
|
73
|
+
log.debug('Not doing any NaN replacement')
|
74
|
+
return df
|
75
|
+
|
76
|
+
log.debug(60 * '-')
|
77
|
+
log.info('Doing NaN replacements')
|
78
|
+
log.debug(60 * '-')
|
79
|
+
for var, val in self._d_nan_rep.items():
|
80
|
+
log.debug(f'{var:<20}{"--->":20}{val:<20.3f}')
|
81
|
+
df[var] = df[var].fillna(val)
|
82
|
+
|
83
|
+
return df
|
84
|
+
# --------------------------------------------
|
40
85
|
def _get_df(self):
|
41
86
|
'''
|
42
87
|
Will make ROOT rdf into dataframe and return it
|
@@ -45,6 +90,7 @@ class CVPredict:
|
|
45
90
|
l_ft = model.features
|
46
91
|
d_data= self._rdf.AsNumpy(l_ft)
|
47
92
|
df_ft = pnd.DataFrame(d_data)
|
93
|
+
df_ft = self._replace_nans(df_ft)
|
48
94
|
df_ft = ut.patch_and_tag(df_ft)
|
49
95
|
|
50
96
|
if 'patched_indices' in df_ft.attrs:
|
@@ -136,6 +182,8 @@ class CVPredict:
|
|
136
182
|
'''
|
137
183
|
Will return array of prediction probabilities for the signal category
|
138
184
|
'''
|
185
|
+
self._initialize()
|
186
|
+
|
139
187
|
df_ft = self._get_df()
|
140
188
|
model = self._l_model[0]
|
141
189
|
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/train_mva.py
RENAMED
@@ -26,7 +26,7 @@ from dmu.plotting.matrix import MatrixPlotter
|
|
26
26
|
from dmu.logging.log_store import LogStore
|
27
27
|
|
28
28
|
npa = numpy.ndarray
|
29
|
-
log = LogStore.add_logger('
|
29
|
+
log = LogStore.add_logger('dmu:ml:train_mva')
|
30
30
|
# ---------------------------------------------
|
31
31
|
class TrainMva:
|
32
32
|
'''
|
@@ -334,10 +334,10 @@ class TrainMva:
|
|
334
334
|
if 'max' in self._cfg['plotting']['roc']:
|
335
335
|
[max_x, max_y] = self._cfg['plotting']['roc']['max']
|
336
336
|
|
337
|
-
self._plot_probabilities(xval_ts, yval_ts, l_prb_ts)
|
338
|
-
|
339
337
|
plt.plot(xval_ts, yval_ts, color='b', label=f'Test: {area_ts:.3f}')
|
340
338
|
plt.plot(xval_tr, yval_tr, color='r', label=f'Train: {area_tr:.3f}')
|
339
|
+
self._plot_probabilities(xval_ts, yval_ts, l_prb_ts, l_lab_ts)
|
340
|
+
|
341
341
|
plt.xlabel('Signal efficiency')
|
342
342
|
plt.ylabel('Background rejection')
|
343
343
|
plt.title(f'Fold: {ifold}')
|
@@ -351,13 +351,17 @@ class TrainMva:
|
|
351
351
|
def _plot_probabilities(self,
|
352
352
|
arr_seff: npa,
|
353
353
|
arr_brej: npa,
|
354
|
-
arr_sprb: npa
|
354
|
+
arr_sprb: npa,
|
355
|
+
arr_labl: npa) -> None:
|
355
356
|
|
356
357
|
roc_cfg = self._cfg['plotting']['roc']
|
357
358
|
if 'annotate' not in roc_cfg:
|
358
359
|
log.debug('Annotation section in the ROC curve config not found, skipping annotation')
|
359
360
|
return
|
360
361
|
|
362
|
+
l_sprb = [ sprb for sprb, labl in zip(arr_sprb, arr_labl) if labl == 1 ]
|
363
|
+
arr_sprb = numpy.array(l_sprb)
|
364
|
+
|
361
365
|
plt_cfg = roc_cfg['annotate']
|
362
366
|
if 'sig_eff' not in plt_cfg:
|
363
367
|
l_seff_target = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
|
@@ -366,17 +370,24 @@ class TrainMva:
|
|
366
370
|
del plt_cfg['sig_eff']
|
367
371
|
|
368
372
|
arr_seff_target = numpy.array(l_seff_target)
|
373
|
+
arr_quantile = 1 - arr_seff_target
|
369
374
|
|
370
|
-
l_score = numpy.quantile(arr_sprb,
|
375
|
+
l_score = numpy.quantile(arr_sprb, arr_quantile)
|
371
376
|
l_seff = []
|
372
377
|
l_brej = []
|
373
|
-
|
378
|
+
|
379
|
+
log.debug(60 * '-')
|
380
|
+
log.debug(f'{"SigEff":20}{"BkgRej":20}{"Score":20}')
|
381
|
+
log.debug(60 * '-')
|
382
|
+
for seff_target, score in zip(arr_seff_target, l_score):
|
374
383
|
arr_diff = numpy.abs(arr_seff - seff_target)
|
375
384
|
ind = numpy.argmin(arr_diff)
|
376
385
|
|
377
386
|
seff = arr_seff[ind]
|
378
387
|
brej = arr_brej[ind]
|
379
388
|
|
389
|
+
log.debug(f'{seff:<20.3f}{brej:<20.3f}{score:<20.2f}')
|
390
|
+
|
380
391
|
l_seff.append(seff)
|
381
392
|
l_brej.append(brej)
|
382
393
|
|
@@ -1,6 +1,7 @@
|
|
1
1
|
'''
|
2
2
|
Module containing utility functions to be used with ROOT dataframes
|
3
3
|
'''
|
4
|
+
# pylint: disable=no-name-in-module
|
4
5
|
|
5
6
|
import re
|
6
7
|
from dataclasses import dataclass
|
@@ -10,7 +11,7 @@ import pandas as pnd
|
|
10
11
|
import awkward as ak
|
11
12
|
import numpy
|
12
13
|
|
13
|
-
from ROOT import RDataFrame, RDF
|
14
|
+
from ROOT import RDataFrame, RDF, Numba
|
14
15
|
|
15
16
|
from dmu.logging.log_store import LogStore
|
16
17
|
|
@@ -34,6 +35,8 @@ def add_column(rdf : RDataFrame, arr_val : Union[numpy.ndarray,None], name : str
|
|
34
35
|
exclude_re : Regex with pattern of column names that we won't pick
|
35
36
|
'''
|
36
37
|
|
38
|
+
log.warning(f'Adding column {name} with awkward')
|
39
|
+
|
37
40
|
d_opt = {} if d_opt is None else d_opt
|
38
41
|
if arr_val is None:
|
39
42
|
raise ValueError('Array of values not introduced')
|
@@ -66,12 +69,35 @@ def add_column(rdf : RDataFrame, arr_val : Union[numpy.ndarray,None], name : str
|
|
66
69
|
if arr_val.dtype == 'object':
|
67
70
|
arr_val = arr_val.astype(float)
|
68
71
|
|
69
|
-
d_data[name] = arr_val
|
72
|
+
d_data[name] = ak.from_numpy(arr_val)
|
70
73
|
|
71
74
|
rdf = ak.to_rdataframe(d_data)
|
72
75
|
|
73
76
|
return rdf
|
74
77
|
# ---------------------------------------------------------------------
|
78
|
+
def add_column_with_numba(
|
79
|
+
rdf : RDataFrame,
|
80
|
+
arr_val : Union[numpy.ndarray,None],
|
81
|
+
name : str,
|
82
|
+
identifier : str) -> RDataFrame:
|
83
|
+
'''
|
84
|
+
Will take a dataframe, an array of numbers and a string
|
85
|
+
Will add the array as a column to the dataframe
|
86
|
+
|
87
|
+
The `identifier` argument is a string needed in order to avoid collisions
|
88
|
+
when using Numba to define a function to get the value from.
|
89
|
+
'''
|
90
|
+
identifier=f'fun_{identifier}'
|
91
|
+
|
92
|
+
@Numba.Declare(['int'], 'float', name=identifier)
|
93
|
+
def get_value(index):
|
94
|
+
return arr_val[index]
|
95
|
+
|
96
|
+
log.debug(f'Adding column {name} with numba')
|
97
|
+
rdf = rdf.Define(name, f'Numba::{identifier}(rdfentry_)')
|
98
|
+
|
99
|
+
return rdf
|
100
|
+
# ---------------------------------------------------------------------
|
75
101
|
def rdf_report_to_df(rep : RDF.RCutFlowReport) -> pnd.DataFrame:
|
76
102
|
'''
|
77
103
|
Takes the output of rdf.Report(), i.e. an RDataFrame cutflow report.
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/arrays/utilities.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/generic/utilities.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/logging/log_store.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/cv_classifier.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/utilities.py
RENAMED
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/plotting/matrix.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/plotting/plotter.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/rfile/rfprinter.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/rfile/utilities.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/fitter.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/function.py
RENAMED
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/minimizers.py
RENAMED
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/utilities.py
RENAMED
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/testing/utilities.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/text/transformer.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/__init__.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/git/publish
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/ssh/coned.py
RENAMED
File without changes
|
File without changes
|