data-manipulation-utilities 0.2.1.tar.gz → 0.2.3.tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (58)
  1. {data_manipulation_utilities-0.2.1/src/data_manipulation_utilities.egg-info → data_manipulation_utilities-0.2.3}/PKG-INFO +39 -6
  2. data_manipulation_utilities-0.2.1/PKG-INFO → data_manipulation_utilities-0.2.3/README.md +37 -24
  3. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/pyproject.toml +2 -2
  4. data_manipulation_utilities-0.2.1/README.md → data_manipulation_utilities-0.2.3/src/data_manipulation_utilities.egg-info/PKG-INFO +57 -4
  5. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/data_manipulation_utilities.egg-info/requires.txt +1 -1
  6. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/cv_predict.py +50 -2
  7. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/train_mva.py +17 -6
  8. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/rdataframe/utilities.py +28 -2
  9. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/setup.cfg +0 -0
  10. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/data_manipulation_utilities.egg-info/SOURCES.txt +0 -0
  11. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/data_manipulation_utilities.egg-info/dependency_links.txt +0 -0
  12. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/data_manipulation_utilities.egg-info/entry_points.txt +0 -0
  13. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/data_manipulation_utilities.egg-info/top_level.txt +0 -0
  14. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/arrays/utilities.py +0 -0
  15. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/generic/utilities.py +0 -0
  16. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/logging/log_store.py +0 -0
  17. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/cv_classifier.py +0 -0
  18. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/utilities.py +0 -0
  19. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/pdataframe/utilities.py +0 -0
  20. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/plotting/matrix.py +0 -0
  21. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/plotting/plotter.py +0 -0
  22. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/plotting/plotter_1d.py +0 -0
  23. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/plotting/plotter_2d.py +0 -0
  24. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/plotting/utilities.py +0 -0
  25. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/rdataframe/atr_mgr.py +0 -0
  26. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/rfile/rfprinter.py +0 -0
  27. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/rfile/utilities.py +0 -0
  28. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/fitter.py +0 -0
  29. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/function.py +0 -0
  30. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/gof_calculator.py +0 -0
  31. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/minimizers.py +0 -0
  32. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/model_factory.py +0 -0
  33. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/utilities.py +0 -0
  34. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/stats/zfit_plotter.py +0 -0
  35. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/testing/utilities.py +0 -0
  36. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/text/transformer.py +0 -0
  37. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/__init__.py +0 -0
  38. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/ml/tests/train_mva.yaml +0 -0
  39. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/2d.yaml +0 -0
  40. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/fig_size.yaml +0 -0
  41. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/high_stat.yaml +0 -0
  42. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/name.yaml +0 -0
  43. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/no_bounds.yaml +0 -0
  44. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/normalized.yaml +0 -0
  45. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/simple.yaml +0 -0
  46. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/title.yaml +0 -0
  47. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/plotting/tests/weights.yaml +0 -0
  48. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/text/transform.toml +0 -0
  49. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/text/transform.txt +0 -0
  50. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/text/transform_set.toml +0 -0
  51. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/text/transform_set.txt +0 -0
  52. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_data/text/transform_trf.txt +0 -0
  53. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/git/publish +0 -0
  54. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/physics/check_truth.py +0 -0
  55. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/rfile/compare_root_files.py +0 -0
  56. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/rfile/print_trees.py +0 -0
  57. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/ssh/coned.py +0 -0
  58. {data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu_scripts/text/transform_text.py +0 -0
{data_manipulation_utilities-0.2.1/src/data_manipulation_utilities.egg-info → data_manipulation_utilities-0.2.3}/PKG-INFO

@@ -1,11 +1,11 @@
  Metadata-Version: 2.2
  Name: data_manipulation_utilities
- Version: 0.2.1
+ Version: 0.2.3
  Description-Content-Type: text/markdown
  Requires-Dist: logzero
  Requires-Dist: PyYAML
  Requires-Dist: scipy
- Requires-Dist: awkward==2.4.6
+ Requires-Dist: awkward
  Requires-Dist: tqdm
  Requires-Dist: joblib
  Requires-Dist: scikit-learn
@@ -424,6 +424,10 @@ where the settings for the training go in a config dictionary, which when writte

  ```yaml
  dataset:
+     # Before training, new features can be defined as below
+     define :
+         x : v + w
+         y : v - w
      # If the key is found to be NaN, replace its value with the number provided
      # This will be used in the training.
      # Otherwise the entries with NaNs will be dropped
@@ -433,7 +437,7 @@ dataset:
          z : -999
  training :
      nfold : 10
-     features : [w, x, y, z]
+     features : [x, y, z]
      hyper :
          loss : log_loss
          n_estimators : 100
@@ -493,7 +497,9 @@ When training on real data, several things might go wrong and the code will try
  will end up in different folds. The tool checks for wether a model is evaluated for an entry that was used for training and raise an exception. Thus, repeated
  entries will be removed before training.

- - **NaNs**: Entries with NaNs will break the training with the scikit GradientBoostClassifier base class. Thus, we also remove them from the training.
+ - **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
+   - Can use the `nan` section shown above to replace `NaN` values with something else
+   - For whatever remains we remove the entries from the training.

  ## Application

@@ -516,15 +522,24 @@ The picking process happens through the comparison of hashes between the samples
  The hashes of the training samples are stored in the pickled model itself; which therefore is a reimplementation of
  `GradientBoostClassifier`, here called `CVClassifier`.

- If a sample exist, that was used in the training of _every_ model, no model can be chosen for the prediction and an
+ If a sample exists, that was used in the training of _every_ model, no model can be chosen for the prediction and a
  `CVSameData` exception will be risen.

+ During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
+ from that configuration and the user does not need to define extra columns.
+
  ### Caveats

  When evaluating the model with real data, problems might occur, we deal with them as follows:

  - **Repeated entries**: When there are repeated features in the dataset to be evaluated we assign the same probabilities, no filtering is used.
- - **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
+ - **NaNs**: Entries with NaNs will break the evaluation. These entries will be:
+   - Replaced by other values before evaluation IF a replacement was specified during training. The training configuration will be stored in the model
+     and can be accessed through:
+     ```python
+     model.cfg
+     ```
+   - For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
  saved as -1. I.e. entries with NaNs will have probabilities of -1.

  # Pandas dataframes
@@ -563,6 +578,24 @@ These are utility functions meant to be used with ROOT dataframes.

  ## Adding a column from a numpy array

+ ### With numba
+
+ For this do:
+
+ ```python
+ import dmu.rdataframe.utilities as ut
+
+ arr_val = numpy.array([10, 20, 30])
+ rdf = ut.add_column_with_numba(rdf, arr_val, 'values', identifier='some_name')
+ ```
+
+ where the identifier needs to be unique, every time the function is called.
+ This is the case, because the addition is done internally by declaring a numba function whose name
+ cannot be repeated as mentioned
+ [here](https://root-forum.cern.ch/t/ways-to-work-around-the-redefinition-of-compiled-functions-in-one-single-notebook-session/41442/1)
+
+ ### With awkward
+
  For this do:

  ```python
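The `define` block added to the training configuration above creates derived features before the classifier sees the data, and the same expressions are re-applied at evaluation time from the configuration stored in the model (see the `cv_predict.py` diff below, which loops over `cfg['dataset']['define']` and calls `rdf.Define(name, expr)`). The following is a minimal stand-in sketch of those semantics using plain pandas with toy values; the real code operates on a ROOT dataframe:

```python
import pandas as pd

# Toy stand-in for the dataset.define block: each entry maps a new
# feature name to an expression over existing columns, mirroring
# x : v + w and y : v - w from the README example above.
d_def = {'x': 'v + w', 'y': 'v - w'}

df = pd.DataFrame({'v': [1.0, 2.0, 3.0], 'w': [0.5, 1.5, 2.5]})
for name, expr in d_def.items():
    df[name] = df.eval(expr)    # the package uses rdf.Define(name, expr) instead

print(df[['x', 'y']])           # derived features listed under training.features
```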
data_manipulation_utilities-0.2.1/PKG-INFO → data_manipulation_utilities-0.2.3/README.md

@@ -1,23 +1,3 @@
- Metadata-Version: 2.2
- Name: data_manipulation_utilities
- Version: 0.2.1
- Description-Content-Type: text/markdown
- Requires-Dist: logzero
- Requires-Dist: PyYAML
- Requires-Dist: scipy
- Requires-Dist: awkward==2.4.6
- Requires-Dist: tqdm
- Requires-Dist: joblib
- Requires-Dist: scikit-learn
- Requires-Dist: toml
- Requires-Dist: numpy
- Requires-Dist: matplotlib
- Requires-Dist: mplhep
- Requires-Dist: hist[plot]
- Requires-Dist: pandas
- Provides-Extra: dev
- Requires-Dist: pytest; extra == "dev"
-
  # D(ata) M(anipulation) U(tilities)

  These are tools that can be used for different data analysis tasks.
@@ -424,6 +404,10 @@ where the settings for the training go in a config dictionary, which when writte

  ```yaml
  dataset:
+     # Before training, new features can be defined as below
+     define :
+         x : v + w
+         y : v - w
      # If the key is found to be NaN, replace its value with the number provided
      # This will be used in the training.
      # Otherwise the entries with NaNs will be dropped
@@ -433,7 +417,7 @@ dataset:
          z : -999
  training :
      nfold : 10
-     features : [w, x, y, z]
+     features : [x, y, z]
      hyper :
          loss : log_loss
          n_estimators : 100
@@ -493,7 +477,9 @@ When training on real data, several things might go wrong and the code will try
  will end up in different folds. The tool checks for wether a model is evaluated for an entry that was used for training and raise an exception. Thus, repeated
  entries will be removed before training.

- - **NaNs**: Entries with NaNs will break the training with the scikit GradientBoostClassifier base class. Thus, we also remove them from the training.
+ - **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
+   - Can use the `nan` section shown above to replace `NaN` values with something else
+   - For whatever remains we remove the entries from the training.

  ## Application

@@ -516,15 +502,24 @@ The picking process happens through the comparison of hashes between the samples
  The hashes of the training samples are stored in the pickled model itself; which therefore is a reimplementation of
  `GradientBoostClassifier`, here called `CVClassifier`.

- If a sample exist, that was used in the training of _every_ model, no model can be chosen for the prediction and an
+ If a sample exists, that was used in the training of _every_ model, no model can be chosen for the prediction and a
  `CVSameData` exception will be risen.

+ During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
+ from that configuration and the user does not need to define extra columns.
+
  ### Caveats

  When evaluating the model with real data, problems might occur, we deal with them as follows:

  - **Repeated entries**: When there are repeated features in the dataset to be evaluated we assign the same probabilities, no filtering is used.
- - **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
+ - **NaNs**: Entries with NaNs will break the evaluation. These entries will be:
+   - Replaced by other values before evaluation IF a replacement was specified during training. The training configuration will be stored in the model
+     and can be accessed through:
+     ```python
+     model.cfg
+     ```
+   - For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
  saved as -1. I.e. entries with NaNs will have probabilities of -1.

  # Pandas dataframes
@@ -563,6 +558,24 @@ These are utility functions meant to be used with ROOT dataframes.

  ## Adding a column from a numpy array

+ ### With numba
+
+ For this do:
+
+ ```python
+ import dmu.rdataframe.utilities as ut
+
+ arr_val = numpy.array([10, 20, 30])
+ rdf = ut.add_column_with_numba(rdf, arr_val, 'values', identifier='some_name')
+ ```
+
+ where the identifier needs to be unique, every time the function is called.
+ This is the case, because the addition is done internally by declaring a numba function whose name
+ cannot be repeated as mentioned
+ [here](https://root-forum.cern.ch/t/ways-to-work-around-the-redefinition-of-compiled-functions-in-one-single-notebook-session/41442/1)
+
+ ### With awkward
+
  For this do:

  ```python
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/pyproject.toml

@@ -1,12 +1,12 @@
  [project]
  name = 'data_manipulation_utilities'
- version = '0.2.1'
+ version = '0.2.3'
  readme = 'README.md'
  dependencies= [
      'logzero',
      'PyYAML',
      'scipy',
-     'awkward==2.4.6',
+     'awkward',
      'tqdm',
      'joblib',
      'scikit-learn',
data_manipulation_utilities-0.2.1/README.md → data_manipulation_utilities-0.2.3/src/data_manipulation_utilities.egg-info/PKG-INFO

@@ -1,3 +1,23 @@
+ Metadata-Version: 2.2
+ Name: data_manipulation_utilities
+ Version: 0.2.3
+ Description-Content-Type: text/markdown
+ Requires-Dist: logzero
+ Requires-Dist: PyYAML
+ Requires-Dist: scipy
+ Requires-Dist: awkward
+ Requires-Dist: tqdm
+ Requires-Dist: joblib
+ Requires-Dist: scikit-learn
+ Requires-Dist: toml
+ Requires-Dist: numpy
+ Requires-Dist: matplotlib
+ Requires-Dist: mplhep
+ Requires-Dist: hist[plot]
+ Requires-Dist: pandas
+ Provides-Extra: dev
+ Requires-Dist: pytest; extra == "dev"
+
  # D(ata) M(anipulation) U(tilities)

  These are tools that can be used for different data analysis tasks.
@@ -404,6 +424,10 @@ where the settings for the training go in a config dictionary, which when writte

  ```yaml
  dataset:
+     # Before training, new features can be defined as below
+     define :
+         x : v + w
+         y : v - w
      # If the key is found to be NaN, replace its value with the number provided
      # This will be used in the training.
      # Otherwise the entries with NaNs will be dropped
@@ -413,7 +437,7 @@ dataset:
          z : -999
  training :
      nfold : 10
-     features : [w, x, y, z]
+     features : [x, y, z]
      hyper :
          loss : log_loss
          n_estimators : 100
@@ -473,7 +497,9 @@ When training on real data, several things might go wrong and the code will try
  will end up in different folds. The tool checks for wether a model is evaluated for an entry that was used for training and raise an exception. Thus, repeated
  entries will be removed before training.

- - **NaNs**: Entries with NaNs will break the training with the scikit GradientBoostClassifier base class. Thus, we also remove them from the training.
+ - **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
+   - Can use the `nan` section shown above to replace `NaN` values with something else
+   - For whatever remains we remove the entries from the training.

  ## Application

@@ -496,15 +522,24 @@ The picking process happens through the comparison of hashes between the samples
  The hashes of the training samples are stored in the pickled model itself; which therefore is a reimplementation of
  `GradientBoostClassifier`, here called `CVClassifier`.

- If a sample exist, that was used in the training of _every_ model, no model can be chosen for the prediction and an
+ If a sample exists, that was used in the training of _every_ model, no model can be chosen for the prediction and a
  `CVSameData` exception will be risen.

+ During training, the configuration will be stored in the model. Therefore, variable definitions can be picked up for evaluation
+ from that configuration and the user does not need to define extra columns.
+
  ### Caveats

  When evaluating the model with real data, problems might occur, we deal with them as follows:

  - **Repeated entries**: When there are repeated features in the dataset to be evaluated we assign the same probabilities, no filtering is used.
- - **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
+ - **NaNs**: Entries with NaNs will break the evaluation. These entries will be:
+   - Replaced by other values before evaluation IF a replacement was specified during training. The training configuration will be stored in the model
+     and can be accessed through:
+     ```python
+     model.cfg
+     ```
+   - For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
  saved as -1. I.e. entries with NaNs will have probabilities of -1.

  # Pandas dataframes
@@ -543,6 +578,24 @@ These are utility functions meant to be used with ROOT dataframes.

  ## Adding a column from a numpy array

+ ### With numba
+
+ For this do:
+
+ ```python
+ import dmu.rdataframe.utilities as ut
+
+ arr_val = numpy.array([10, 20, 30])
+ rdf = ut.add_column_with_numba(rdf, arr_val, 'values', identifier='some_name')
+ ```
+
+ where the identifier needs to be unique, every time the function is called.
+ This is the case, because the addition is done internally by declaring a numba function whose name
+ cannot be repeated as mentioned
+ [here](https://root-forum.cern.ch/t/ways-to-work-around-the-redefinition-of-compiled-functions-in-one-single-notebook-session/41442/1)
+
+ ### With awkward
+
  For this do:

  ```python
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/data_manipulation_utilities.egg-info/requires.txt

@@ -1,7 +1,7 @@
  logzero
  PyYAML
  scipy
- awkward==2.4.6
+ awkward
  tqdm
  joblib
  scikit-learn
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/cv_predict.py

@@ -32,11 +32,56 @@ class CVPredict:
          if rdf is None:
              raise ValueError('No ROOT dataframe passed')

-         self._l_model = models
-         self._rdf = rdf
+         self._l_model   = models
+         self._rdf       = rdf
+         self._d_nan_rep : dict[str,str]

          self._arr_patch : numpy.ndarray
      # --------------------------------------------
+     def _initialize(self):
+         self._rdf = self._define_columns(self._rdf)
+         self._d_nan_rep = self._get_nan_replacements()
+     # --------------------------------------------
+     def _define_columns(self, rdf : RDataFrame) -> RDataFrame:
+         cfg = self._l_model[0].cfg
+
+         if 'define' not in cfg['dataset']:
+             log.debug('No define section found in config, will not define extra columns')
+             return self._rdf
+
+         d_def = cfg['dataset']['define']
+         log.debug(60 * '-')
+         log.info('Defining columns in RDF before evaluating classifier')
+         log.debug(60 * '-')
+         for name, expr in d_def.items():
+             log.debug(f'{name:<20}{"<---":20}{expr:<100}')
+             rdf = rdf.Define(name, expr)
+
+         return rdf
+     # --------------------------------------------
+     def _get_nan_replacements(self) -> dict[str,str]:
+         cfg = self._l_model[0].cfg
+
+         if 'nan' not in cfg['dataset']:
+             log.debug('No define section found in config, will not define extra columns')
+             return {}
+
+         return cfg['dataset']['nan']
+     # --------------------------------------------
+     def _replace_nans(self, df : pnd.DataFrame) -> pnd.DataFrame:
+         if len(self._d_nan_rep) == 0:
+             log.debug('Not doing any NaN replacement')
+             return df
+
+         log.debug(60 * '-')
+         log.info('Doing NaN replacements')
+         log.debug(60 * '-')
+         for var, val in self._d_nan_rep.items():
+             log.debug(f'{var:<20}{"--->":20}{val:<20.3f}')
+             df[var] = df[var].fillna(val)
+
+         return df
+     # --------------------------------------------
      def _get_df(self):
          '''
          Will make ROOT rdf into dataframe and return it
@@ -45,6 +90,7 @@ class CVPredict:
          l_ft = model.features
          d_data= self._rdf.AsNumpy(l_ft)
          df_ft = pnd.DataFrame(d_data)
+         df_ft = self._replace_nans(df_ft)
          df_ft = ut.patch_and_tag(df_ft)

          if 'patched_indices' in df_ft.attrs:
@@ -136,6 +182,8 @@ class CVPredict:
          '''
          Will return array of prediction probabilities for the signal category
          '''
+         self._initialize()
+
          df_ft = self._get_df()
          model = self._l_model[0]

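Taken together, the new hooks mean `_get_df` now applies the `nan` replacements stored in the model configuration before `patch_and_tag` zero-patches whatever is left. Below is a self-contained sketch of that ordering, with a toy replacement map mirroring the `z : -999` README example; `patch_and_tag` itself is not reproduced, only its observable effect:

```python
import numpy as np
import pandas as pd

# Step 1: configured replacements, as in CVPredict._replace_nans.
d_nan_rep = {'z': -999}                       # toy stand-in for cfg['dataset']['nan']
df = pd.DataFrame({'x': [1.0, np.nan], 'z': [np.nan, 2.0]})
for var, val in d_nan_rep.items():
    df[var] = df[var].fillna(val)

# Step 2: anything still NaN gets zero-patched; those row indices are
# remembered so their probabilities can be overwritten with -1 later.
arr_patch = np.where(df.isna().any(axis=1))[0]
df = df.fillna(0)

print(arr_patch)                              # -> [1]; row 1 still had a NaN in 'x'
```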
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/ml/train_mva.py

@@ -26,7 +26,7 @@ from dmu.plotting.matrix import MatrixPlotter
  from dmu.logging.log_store import LogStore

  npa = numpy.ndarray
- log = LogStore.add_logger('data_checks:train_mva')
+ log = LogStore.add_logger('dmu:ml:train_mva')
  # ---------------------------------------------
  class TrainMva:
      '''
@@ -334,10 +334,10 @@ class TrainMva:
          if 'max' in self._cfg['plotting']['roc']:
              [max_x, max_y] = self._cfg['plotting']['roc']['max']

-         self._plot_probabilities(xval_ts, yval_ts, l_prb_ts)
-
          plt.plot(xval_ts, yval_ts, color='b', label=f'Test: {area_ts:.3f}')
          plt.plot(xval_tr, yval_tr, color='r', label=f'Train: {area_tr:.3f}')
+         self._plot_probabilities(xval_ts, yval_ts, l_prb_ts, l_lab_ts)
+
          plt.xlabel('Signal efficiency')
          plt.ylabel('Background rejection')
          plt.title(f'Fold: {ifold}')
@@ -351,13 +351,17 @@ class TrainMva:
      def _plot_probabilities(self,
                              arr_seff: npa,
                              arr_brej: npa,
-                             arr_sprb: npa) -> None:
+                             arr_sprb: npa,
+                             arr_labl: npa) -> None:

          roc_cfg = self._cfg['plotting']['roc']
          if 'annotate' not in roc_cfg:
              log.debug('Annotation section in the ROC curve config not found, skipping annotation')
              return

+         l_sprb = [ sprb for sprb, labl in zip(arr_sprb, arr_labl) if labl == 1 ]
+         arr_sprb = numpy.array(l_sprb)
+
          plt_cfg = roc_cfg['annotate']
          if 'sig_eff' not in plt_cfg:
              l_seff_target = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
@@ -366,17 +370,24 @@ class TrainMva:
              del plt_cfg['sig_eff']

          arr_seff_target = numpy.array(l_seff_target)
+         arr_quantile = 1 - arr_seff_target

-         l_score = numpy.quantile(arr_sprb, 1 - arr_seff_target)
+         l_score = numpy.quantile(arr_sprb, arr_quantile)
          l_seff = []
          l_brej = []
-         for seff_target in l_seff_target:
+
+         log.debug(60 * '-')
+         log.debug(f'{"SigEff":20}{"BkgRej":20}{"Score":20}')
+         log.debug(60 * '-')
+         for seff_target, score in zip(arr_seff_target, l_score):
              arr_diff = numpy.abs(arr_seff - seff_target)
              ind = numpy.argmin(arr_diff)

              seff = arr_seff[ind]
              brej = arr_brej[ind]

+             log.debug(f'{seff:<20.3f}{brej:<20.3f}{score:<20.2f}')
+
              l_seff.append(seff)
              l_brej.append(brej)

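The reworked annotation logic now derives score working points from the signal entries only (the `labl == 1` filter added above): for a target signal efficiency `e`, the threshold is the `1 - e` quantile of the signal scores, so a cut at that score keeps a fraction `e` of the signal. A small self-contained numpy sketch, with toy scores and labels standing in for `l_prb_ts` and `l_lab_ts`:

```python
import numpy

# Toy classifier scores and truth labels.
rng = numpy.random.default_rng(seed=1)
arr_sprb = rng.uniform(size=1_000)
arr_labl = rng.integers(0, 2, size=1_000)

# Keep signal entries only, then take one quantile per efficiency
# target, as in _plot_probabilities.
arr_sig = arr_sprb[arr_labl == 1]
arr_seff_target = numpy.array([0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95])
l_score = numpy.quantile(arr_sig, 1 - arr_seff_target)

for seff, score in zip(arr_seff_target, l_score):
    print(f'{seff:<10.2f}{score:<10.3f}')   # efficiency -> score cut
```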
{data_manipulation_utilities-0.2.1 → data_manipulation_utilities-0.2.3}/src/dmu/rdataframe/utilities.py

@@ -1,6 +1,7 @@
  '''
  Module containing utility functions to be used with ROOT dataframes
  '''
+ # pylint: disable=no-name-in-module

  import re
  from dataclasses import dataclass
@@ -10,7 +11,7 @@ import pandas as pnd
  import awkward as ak
  import numpy

- from ROOT import RDataFrame, RDF
+ from ROOT import RDataFrame, RDF, Numba

  from dmu.logging.log_store import LogStore

@@ -34,6 +35,8 @@ def add_column(rdf : RDataFrame, arr_val : Union[numpy.ndarray,None], name : str
      exclude_re : Regex with patter of column names that we won't pick
      '''

+     log.warning(f'Adding column {name} with awkward')
+
      d_opt = {} if d_opt is None else d_opt
      if arr_val is None:
          raise ValueError('Array of values not introduced')
@@ -66,12 +69,35 @@ def add_column(rdf : RDataFrame, arr_val : Union[numpy.ndarray,None], name : str
      if arr_val.dtype == 'object':
          arr_val = arr_val.astype(float)

-     d_data[name] = arr_val
+     d_data[name] = ak.from_numpy(arr_val)

      rdf = ak.to_rdataframe(d_data)

      return rdf
  # ---------------------------------------------------------------------
+ def add_column_with_numba(
+         rdf : RDataFrame,
+         arr_val : Union[numpy.ndarray,None],
+         name : str,
+         identifier : str) -> RDataFrame:
+     '''
+     Will take a dataframe, an array of numbers and a string
+     Will add the array as a colunm to the dataframe
+
+     The `identifier` argument is a string need in order to avoid collisions
+     when using Numba to define a function to get the value from.
+     '''
+     identifier=f'fun_{identifier}'
+
+     @Numba.Declare(['int'], 'float', name=identifier)
+     def get_value(index):
+         return arr_val[index]
+
+     log.debug(f'Adding column {name} with numba')
+     rdf = rdf.Define(name, f'Numba::{identifier}(rdfentry_)')
+
+     return rdf
+ # ---------------------------------------------------------------------
  def rdf_report_to_df(rep : RDF.RCutFlowReport) -> pnd.DataFrame:
      '''
      Takes the output of rdf.Report(), i.e. an RDataFrame cutflow report.
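As the README section added above notes, each call to `add_column_with_numba` declares a function named `fun_<identifier>` through `Numba.Declare`, and a declared name cannot be re-declared within a session, so the identifier must be fresh on every call. A usage sketch, assuming a working ROOT installation with numba support; the dataframe and arrays are toy values:

```python
import numpy
from ROOT import RDataFrame

import dmu.rdataframe.utilities as ut

# Toy dataframe with three entries and two arrays to attach as columns.
rdf = RDataFrame(3)
arr_a = numpy.array([10.0, 20.0, 30.0])
arr_b = numpy.array([0.1, 0.2, 0.3])

# A distinct identifier per call avoids redeclaring Numba::fun_<identifier>.
rdf = ut.add_column_with_numba(rdf, arr_a, 'val_a', identifier='add_val_a')
rdf = ut.add_column_with_numba(rdf, arr_b, 'val_b', identifier='add_val_b')
```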