data-manipulation-utilities 0.1.9__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/PKG-INFO +121 -2
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/README.md +119 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/pyproject.toml +2 -2
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/PKG-INFO +121 -2
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/SOURCES.txt +4 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/requires.txt +1 -1
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/ml/cv_classifier.py +2 -1
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/ml/cv_predict.py +2 -1
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/ml/train_mva.py +204 -25
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/ml/utilities.py +8 -0
- data_manipulation_utilities-0.2.1/src/dmu/pdataframe/utilities.py +36 -0
- data_manipulation_utilities-0.2.1/src/dmu/plotting/matrix.py +157 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/plotting/plotter_1d.py +18 -1
- data_manipulation_utilities-0.2.1/src/dmu/plotting/utilities.py +33 -0
- data_manipulation_utilities-0.2.1/src/dmu_data/ml/tests/train_mva.yaml +52 -0
- data_manipulation_utilities-0.2.1/src/dmu_data/plotting/tests/normalized.yaml +9 -0
- data_manipulation_utilities-0.1.9/src/dmu_data/ml/tests/train_mva.yaml +0 -37
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/setup.cfg +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/dependency_links.txt +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/entry_points.txt +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/data_manipulation_utilities.egg-info/top_level.txt +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/arrays/utilities.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/generic/utilities.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/logging/log_store.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/plotting/plotter.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/plotting/plotter_2d.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/rdataframe/atr_mgr.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/rdataframe/utilities.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/rfile/rfprinter.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/rfile/utilities.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/fitter.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/function.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/gof_calculator.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/minimizers.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/model_factory.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/utilities.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/stats/zfit_plotter.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/testing/utilities.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/text/transformer.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/__init__.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/2d.yaml +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/fig_size.yaml +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/high_stat.yaml +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/name.yaml +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/no_bounds.yaml +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/simple.yaml +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/title.yaml +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/plotting/tests/weights.yaml +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/text/transform.toml +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/text/transform.txt +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/text/transform_set.toml +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/text/transform_set.txt +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_data/text/transform_trf.txt +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_scripts/git/publish +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_scripts/physics/check_truth.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_scripts/rfile/compare_root_files.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_scripts/rfile/print_trees.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_scripts/ssh/coned.py +0 -0
- {data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu_scripts/text/transform_text.py +0 -0
@@ -1,11 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: data_manipulation_utilities
|
3
|
-
Version: 0.1.9
|
3
|
+
Version: 0.2.1
|
4
4
|
Description-Content-Type: text/markdown
|
5
5
|
Requires-Dist: logzero
|
6
6
|
Requires-Dist: PyYAML
|
7
7
|
Requires-Dist: scipy
|
8
|
-
Requires-Dist: awkward
|
8
|
+
Requires-Dist: awkward==2.4.6
|
9
9
|
Requires-Dist: tqdm
|
10
10
|
Requires-Dist: joblib
|
11
11
|
Requires-Dist: scikit-learn
|
@@ -204,6 +204,33 @@ print_pdf(pdf,
|
|
204
204
|
|
205
205
|
The `Fitter` class is a wrapper to zfit, used to make fitting easier.
|
206
206
|
|
207
|
+
### Goodness of fits
|
208
|
+
|
209
|
+
Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
|
210
|
+
This is done by:
|
211
|
+
|
212
|
+
- Binning the data and PDF.
|
213
|
+
- Calculating the reduced $\chi^2$.
|
214
|
+
- Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
|
215
|
+
|
216
|
+
This class is used as shown below:
|
217
|
+
|
218
|
+
```python
|
219
|
+
from dmu.stats.gof_calculator import GofCalculator
|
220
|
+
|
221
|
+
nll = _get_nll()
|
222
|
+
res = Data.minimizer.minimize(nll)
|
223
|
+
|
224
|
+
gcl = GofCalculator(nll, ndof=10)
|
225
|
+
gof = gcl.get_gof(kind='pvalue')
|
226
|
+
```
|
227
|
+
|
228
|
+
where:
|
229
|
+
|
230
|
+
- `ndof` is the number of degrees of freedom used in the reduced $\chi^2$ calculation.
|
231
|
+
It is needed to know how many bins to use to make the histogram. The recommended value is 10.
|
232
|
+
- `kind` The argument can be `pvalue` or `chi2/ndof`.
|
233
|
+
|
207
234
|
### Simplest fit
|
208
235
|
|
209
236
|
```python
|
@@ -396,6 +423,14 @@ obj.run()
|
|
396
423
|
where the settings for the training go in a config dictionary, which when written to YAML looks like:
|
397
424
|
|
398
425
|
```yaml
|
426
|
+
dataset:
|
427
|
+
# If the key is found to be NaN, replace its value with the number provided
|
428
|
+
# This will be used in the training.
|
429
|
+
# Otherwise the entries with NaNs will be dropped
|
430
|
+
nan:
|
431
|
+
x : 0
|
432
|
+
y : 0
|
433
|
+
z : -999
|
399
434
|
training :
|
400
435
|
nfold : 10
|
401
436
|
features : [w, x, y, z]
|
@@ -406,8 +441,25 @@ training :
|
|
406
441
|
learning_rate : 0.1
|
407
442
|
min_samples_split : 2
|
408
443
|
saving:
|
444
|
+
# The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
|
409
445
|
path : 'tests/ml/train_mva/model.pkl'
|
410
446
|
plotting:
|
447
|
+
roc :
|
448
|
+
min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
|
449
|
+
max : [1.2, 1.2] # By default it goes from 0 to 1 in both axes
|
450
|
+
# The section below is optional and will annotate the ROC curve with
|
451
|
+
# values for the score at different signal efficiencies
|
452
|
+
annotate:
|
453
|
+
sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9] # Values of signal efficiency at which to show the scores
|
454
|
+
form : '{:.2f}' # Use two decimals for scores
|
455
|
+
color : 'green' # Color for text and marker
|
456
|
+
xoff : -15 # Offsets in X and Y
|
457
|
+
yoff : -15
|
458
|
+
size : 10 # Size of text
|
459
|
+
correlation: # Adds correlation matrix for training datasets
|
460
|
+
title : 'Correlation matrix'
|
461
|
+
size : [10, 10]
|
462
|
+
mask_value : 0 # Where correlation is zero, the bin will appear white
|
411
463
|
val_dir : 'tests/ml/train_mva'
|
412
464
|
features:
|
413
465
|
saving:
|
@@ -475,6 +527,36 @@ When evaluating the model with real data, problems might occur, we deal with the
|
|
475
527
|
- **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
|
476
528
|
saved as -1. I.e. entries with NaNs will have probabilities of -1.
|
477
529
|
|
530
|
+
# Pandas dataframes
|
531
|
+
|
532
|
+
## Utilities
|
533
|
+
|
534
|
+
These are thin layers of code that take pandas dataframes and carry out specific tasks
|
535
|
+
|
536
|
+
### Dataframe to latex
|
537
|
+
|
538
|
+
One can save a dataframe to latex with:
|
539
|
+
|
540
|
+
```python
|
541
|
+
import pandas as pnd
|
542
|
+
import dmu.pdataframe.utilities as put
|
543
|
+
|
544
|
+
d_data = {}
|
545
|
+
d_data['a'] = [1,2,3]
|
546
|
+
d_data['b'] = [4,5,6]
|
547
|
+
df = pnd.DataFrame(d_data)
|
548
|
+
|
549
|
+
d_format = {
|
550
|
+
'a' : '{:.0f}',
|
551
|
+
'b' : '{:.3f}'}
|
552
|
+
|
553
|
+
df = _get_df()
|
554
|
+
put.df_to_tex(df,
|
555
|
+
'./table.tex',
|
556
|
+
d_format = d_format,
|
557
|
+
caption = 'some caption')
|
558
|
+
```
|
559
|
+
|
478
560
|
# Rdataframes
|
479
561
|
|
480
562
|
These are utility functions meant to be used with ROOT dataframes.
|
@@ -626,6 +708,43 @@ axes:
|
|
626
708
|
label : 'y'
|
627
709
|
```
|
628
710
|
|
711
|
+
# Other plots
|
712
|
+
|
713
|
+
## Matrices
|
714
|
+
|
715
|
+
Matrices can be plotted with `MatrixPlotter`, whose usage is illustrated below:
|
716
|
+
|
717
|
+
```python
|
718
|
+
import numpy
|
719
|
+
import matplotlib.pyplot as plt
|
720
|
+
|
721
|
+
from dmu.plotting.matrix import MatrixPlotter
|
722
|
+
|
723
|
+
cfg = {
|
724
|
+
'labels' : ['x', 'y', 'z'], # Used to label the matrix axes
|
725
|
+
'title' : 'Some title', # Optional, title of plot
|
726
|
+
'label_angle': 45, # Labels will be rotated by 45 degrees
|
727
|
+
'upper' : True, # Useful in case this is a symmetric matrix
|
728
|
+
'zrange' : [0, 10], # Controls the z axis range
|
729
|
+
'size' : [7, 7], # Plot size
|
730
|
+
'format' : '{:.3f}', # Optional, if used will add numerical values to the contents, otherwise a color bar is used
|
731
|
+
'fontsize' : 12, # Font size associated to `format`
|
732
|
+
'mask_value' : 0, # These values will appear white in the plot
|
733
|
+
}
|
734
|
+
|
735
|
+
mat = [
|
736
|
+
[1, 2, 3],
|
737
|
+
[2, 0, 4],
|
738
|
+
[3, 4, numpy.nan]
|
739
|
+
]
|
740
|
+
|
741
|
+
mat = numpy.array(mat)
|
742
|
+
|
743
|
+
obj = MatrixPlotter(mat=mat, cfg=cfg)
|
744
|
+
obj.plot()
|
745
|
+
plt.show()
|
746
|
+
```
|
747
|
+
|
629
748
|
# Manipulating ROOT files
|
630
749
|
|
631
750
|
## Getting trees from file
|
@@ -184,6 +184,33 @@ print_pdf(pdf,
|
|
184
184
|
|
185
185
|
The `Fitter` class is a wrapper to zfit, used to make fitting easier.
|
186
186
|
|
187
|
+
### Goodness of fits
|
188
|
+
|
189
|
+
Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
|
190
|
+
This is done by:
|
191
|
+
|
192
|
+
- Binning the data and PDF.
|
193
|
+
- Calculating the reduced $\chi^2$.
|
194
|
+
- Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
|
195
|
+
|
196
|
+
This class is used as shown below:
|
197
|
+
|
198
|
+
```python
|
199
|
+
from dmu.stats.gof_calculator import GofCalculator
|
200
|
+
|
201
|
+
nll = _get_nll()
|
202
|
+
res = Data.minimizer.minimize(nll)
|
203
|
+
|
204
|
+
gcl = GofCalculator(nll, ndof=10)
|
205
|
+
gof = gcl.get_gof(kind='pvalue')
|
206
|
+
```
|
207
|
+
|
208
|
+
where:
|
209
|
+
|
210
|
+
- `ndof` is the number of degrees of freedom used in the reduced $\chi^2$ calculation.
|
211
|
+
It is needed to know how many bins to use to make the histogram. The recommended value is 10.
|
212
|
+
- `kind` The argument can be `pvalue` or `chi2/ndof`.
|
213
|
+
|
187
214
|
### Simplest fit
|
188
215
|
|
189
216
|
```python
|
@@ -376,6 +403,14 @@ obj.run()
|
|
376
403
|
where the settings for the training go in a config dictionary, which when written to YAML looks like:
|
377
404
|
|
378
405
|
```yaml
|
406
|
+
dataset:
|
407
|
+
# If the key is found to be NaN, replace its value with the number provided
|
408
|
+
# This will be used in the training.
|
409
|
+
# Otherwise the entries with NaNs will be dropped
|
410
|
+
nan:
|
411
|
+
x : 0
|
412
|
+
y : 0
|
413
|
+
z : -999
|
379
414
|
training :
|
380
415
|
nfold : 10
|
381
416
|
features : [w, x, y, z]
|
@@ -386,8 +421,25 @@ training :
|
|
386
421
|
learning_rate : 0.1
|
387
422
|
min_samples_split : 2
|
388
423
|
saving:
|
424
|
+
# The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
|
389
425
|
path : 'tests/ml/train_mva/model.pkl'
|
390
426
|
plotting:
|
427
|
+
roc :
|
428
|
+
min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
|
429
|
+
max : [1.2, 1.2] # By default it goes from 0 to 1 in both axes
|
430
|
+
# The section below is optional and will annotate the ROC curve with
|
431
|
+
# values for the score at different signal efficiencies
|
432
|
+
annotate:
|
433
|
+
sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9] # Values of signal efficiency at which to show the scores
|
434
|
+
form : '{:.2f}' # Use two decimals for scores
|
435
|
+
color : 'green' # Color for text and marker
|
436
|
+
xoff : -15 # Offsets in X and Y
|
437
|
+
yoff : -15
|
438
|
+
size : 10 # Size of text
|
439
|
+
correlation: # Adds correlation matrix for training datasets
|
440
|
+
title : 'Correlation matrix'
|
441
|
+
size : [10, 10]
|
442
|
+
mask_value : 0 # Where correlation is zero, the bin will appear white
|
391
443
|
val_dir : 'tests/ml/train_mva'
|
392
444
|
features:
|
393
445
|
saving:
|
@@ -455,6 +507,36 @@ When evaluating the model with real data, problems might occur, we deal with the
|
|
455
507
|
- **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
|
456
508
|
saved as -1. I.e. entries with NaNs will have probabilities of -1.
|
457
509
|
|
510
|
+
# Pandas dataframes
|
511
|
+
|
512
|
+
## Utilities
|
513
|
+
|
514
|
+
These are thin layers of code that take pandas dataframes and carry out specific tasks
|
515
|
+
|
516
|
+
### Dataframe to latex
|
517
|
+
|
518
|
+
One can save a dataframe to latex with:
|
519
|
+
|
520
|
+
```python
|
521
|
+
import pandas as pnd
|
522
|
+
import dmu.pdataframe.utilities as put
|
523
|
+
|
524
|
+
d_data = {}
|
525
|
+
d_data['a'] = [1,2,3]
|
526
|
+
d_data['b'] = [4,5,6]
|
527
|
+
df = pnd.DataFrame(d_data)
|
528
|
+
|
529
|
+
d_format = {
|
530
|
+
'a' : '{:.0f}',
|
531
|
+
'b' : '{:.3f}'}
|
532
|
+
|
533
|
+
df = _get_df()
|
534
|
+
put.df_to_tex(df,
|
535
|
+
'./table.tex',
|
536
|
+
d_format = d_format,
|
537
|
+
caption = 'some caption')
|
538
|
+
```
|
539
|
+
|
458
540
|
# Rdataframes
|
459
541
|
|
460
542
|
These are utility functions meant to be used with ROOT dataframes.
|
@@ -606,6 +688,43 @@ axes:
|
|
606
688
|
label : 'y'
|
607
689
|
```
|
608
690
|
|
691
|
+
# Other plots
|
692
|
+
|
693
|
+
## Matrices
|
694
|
+
|
695
|
+
Matrices can be plotted with `MatrixPlotter`, whose usage is illustrated below:
|
696
|
+
|
697
|
+
```python
|
698
|
+
import numpy
|
699
|
+
import matplotlib.pyplot as plt
|
700
|
+
|
701
|
+
from dmu.plotting.matrix import MatrixPlotter
|
702
|
+
|
703
|
+
cfg = {
|
704
|
+
'labels' : ['x', 'y', 'z'], # Used to label the matrix axes
|
705
|
+
'title' : 'Some title', # Optional, title of plot
|
706
|
+
'label_angle': 45, # Labels will be rotated by 45 degrees
|
707
|
+
'upper' : True, # Useful in case this is a symmetric matrix
|
708
|
+
'zrange' : [0, 10], # Controls the z axis range
|
709
|
+
'size' : [7, 7], # Plot size
|
710
|
+
'format' : '{:.3f}', # Optional, if used will add numerical values to the contents, otherwise a color bar is used
|
711
|
+
'fontsize' : 12, # Font size associated to `format`
|
712
|
+
'mask_value' : 0, # These values will appear white in the plot
|
713
|
+
}
|
714
|
+
|
715
|
+
mat = [
|
716
|
+
[1, 2, 3],
|
717
|
+
[2, 0, 4],
|
718
|
+
[3, 4, numpy.nan]
|
719
|
+
]
|
720
|
+
|
721
|
+
mat = numpy.array(mat)
|
722
|
+
|
723
|
+
obj = MatrixPlotter(mat=mat, cfg=cfg)
|
724
|
+
obj.plot()
|
725
|
+
plt.show()
|
726
|
+
```
|
727
|
+
|
609
728
|
# Manipulating ROOT files
|
610
729
|
|
611
730
|
## Getting trees from file
|
@@ -1,11 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: data_manipulation_utilities
|
3
|
-
Version: 0.1.9
|
3
|
+
Version: 0.2.1
|
4
4
|
Description-Content-Type: text/markdown
|
5
5
|
Requires-Dist: logzero
|
6
6
|
Requires-Dist: PyYAML
|
7
7
|
Requires-Dist: scipy
|
8
|
-
Requires-Dist: awkward
|
8
|
+
Requires-Dist: awkward==2.4.6
|
9
9
|
Requires-Dist: tqdm
|
10
10
|
Requires-Dist: joblib
|
11
11
|
Requires-Dist: scikit-learn
|
@@ -204,6 +204,33 @@ print_pdf(pdf,
|
|
204
204
|
|
205
205
|
The `Fitter` class is a wrapper to zfit, used to make fitting easier.
|
206
206
|
|
207
|
+
### Goodness of fits
|
208
|
+
|
209
|
+
Once a fit has been done, one can use `GofCalculator` to get a rough estimate of the fit quality.
|
210
|
+
This is done by:
|
211
|
+
|
212
|
+
- Binning the data and PDF.
|
213
|
+
- Calculating the reduced $\chi^2$.
|
214
|
+
- Using the $\chi^2$ and the number of degrees of freedom to get the p-value.
|
215
|
+
|
216
|
+
This class is used as shown below:
|
217
|
+
|
218
|
+
```python
|
219
|
+
from dmu.stats.gof_calculator import GofCalculator
|
220
|
+
|
221
|
+
nll = _get_nll()
|
222
|
+
res = Data.minimizer.minimize(nll)
|
223
|
+
|
224
|
+
gcl = GofCalculator(nll, ndof=10)
|
225
|
+
gof = gcl.get_gof(kind='pvalue')
|
226
|
+
```
|
227
|
+
|
228
|
+
where:
|
229
|
+
|
230
|
+
- `ndof` is the number of degrees of freedom used in the reduced $\chi^2$ calculation.
|
231
|
+
It is needed to know how many bins to use to make the histogram. The recommended value is 10.
|
232
|
+
- `kind` The argument can be `pvalue` or `chi2/ndof`.
|
233
|
+
|
207
234
|
### Simplest fit
|
208
235
|
|
209
236
|
```python
|
@@ -396,6 +423,14 @@ obj.run()
|
|
396
423
|
where the settings for the training go in a config dictionary, which when written to YAML looks like:
|
397
424
|
|
398
425
|
```yaml
|
426
|
+
dataset:
|
427
|
+
# If the key is found to be NaN, replace its value with the number provided
|
428
|
+
# This will be used in the training.
|
429
|
+
# Otherwise the entries with NaNs will be dropped
|
430
|
+
nan:
|
431
|
+
x : 0
|
432
|
+
y : 0
|
433
|
+
z : -999
|
399
434
|
training :
|
400
435
|
nfold : 10
|
401
436
|
features : [w, x, y, z]
|
@@ -406,8 +441,25 @@ training :
|
|
406
441
|
learning_rate : 0.1
|
407
442
|
min_samples_split : 2
|
408
443
|
saving:
|
444
|
+
# The actual model names are model_001.pkl, model_002.pkl, etc, one for each fold
|
409
445
|
path : 'tests/ml/train_mva/model.pkl'
|
410
446
|
plotting:
|
447
|
+
roc :
|
448
|
+
min : [0.0, 0.0] # Optional, controls where the ROC curve starts and ends
|
449
|
+
max : [1.2, 1.2] # By default it goes from 0 to 1 in both axes
|
450
|
+
# The section below is optional and will annotate the ROC curve with
|
451
|
+
# values for the score at different signal efficiencies
|
452
|
+
annotate:
|
453
|
+
sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9] # Values of signal efficiency at which to show the scores
|
454
|
+
form : '{:.2f}' # Use two decimals for scores
|
455
|
+
color : 'green' # Color for text and marker
|
456
|
+
xoff : -15 # Offsets in X and Y
|
457
|
+
yoff : -15
|
458
|
+
size : 10 # Size of text
|
459
|
+
correlation: # Adds correlation matrix for training datasets
|
460
|
+
title : 'Correlation matrix'
|
461
|
+
size : [10, 10]
|
462
|
+
mask_value : 0 # Where correlation is zero, the bin will appear white
|
411
463
|
val_dir : 'tests/ml/train_mva'
|
412
464
|
features:
|
413
465
|
saving:
|
@@ -475,6 +527,36 @@ When evaluating the model with real data, problems might occur, we deal with the
|
|
475
527
|
- **NaNs**: Entries with NaNs will break the evaluation. These entries will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
|
476
528
|
saved as -1. I.e. entries with NaNs will have probabilities of -1.
|
477
529
|
|
530
|
+
# Pandas dataframes
|
531
|
+
|
532
|
+
## Utilities
|
533
|
+
|
534
|
+
These are thin layers of code that take pandas dataframes and carry out specific tasks
|
535
|
+
|
536
|
+
### Dataframe to latex
|
537
|
+
|
538
|
+
One can save a dataframe to latex with:
|
539
|
+
|
540
|
+
```python
|
541
|
+
import pandas as pnd
|
542
|
+
import dmu.pdataframe.utilities as put
|
543
|
+
|
544
|
+
d_data = {}
|
545
|
+
d_data['a'] = [1,2,3]
|
546
|
+
d_data['b'] = [4,5,6]
|
547
|
+
df = pnd.DataFrame(d_data)
|
548
|
+
|
549
|
+
d_format = {
|
550
|
+
'a' : '{:.0f}',
|
551
|
+
'b' : '{:.3f}'}
|
552
|
+
|
553
|
+
df = _get_df()
|
554
|
+
put.df_to_tex(df,
|
555
|
+
'./table.tex',
|
556
|
+
d_format = d_format,
|
557
|
+
caption = 'some caption')
|
558
|
+
```
|
559
|
+
|
478
560
|
# Rdataframes
|
479
561
|
|
480
562
|
These are utility functions meant to be used with ROOT dataframes.
|
@@ -626,6 +708,43 @@ axes:
|
|
626
708
|
label : 'y'
|
627
709
|
```
|
628
710
|
|
711
|
+
# Other plots
|
712
|
+
|
713
|
+
## Matrices
|
714
|
+
|
715
|
+
Matrices can be plotted with `MatrixPlotter`, whose usage is illustrated below:
|
716
|
+
|
717
|
+
```python
|
718
|
+
import numpy
|
719
|
+
import matplotlib.pyplot as plt
|
720
|
+
|
721
|
+
from dmu.plotting.matrix import MatrixPlotter
|
722
|
+
|
723
|
+
cfg = {
|
724
|
+
'labels' : ['x', 'y', 'z'], # Used to label the matrix axes
|
725
|
+
'title' : 'Some title', # Optional, title of plot
|
726
|
+
'label_angle': 45, # Labels will be rotated by 45 degrees
|
727
|
+
'upper' : True, # Useful in case this is a symmetric matrix
|
728
|
+
'zrange' : [0, 10], # Controls the z axis range
|
729
|
+
'size' : [7, 7], # Plot size
|
730
|
+
'format' : '{:.3f}', # Optional, if used will add numerical values to the contents, otherwise a color bar is used
|
731
|
+
'fontsize' : 12, # Font size associated to `format`
|
732
|
+
'mask_value' : 0, # These values will appear white in the plot
|
733
|
+
}
|
734
|
+
|
735
|
+
mat = [
|
736
|
+
[1, 2, 3],
|
737
|
+
[2, 0, 4],
|
738
|
+
[3, 4, numpy.nan]
|
739
|
+
]
|
740
|
+
|
741
|
+
mat = numpy.array(mat)
|
742
|
+
|
743
|
+
obj = MatrixPlotter(mat=mat, cfg=cfg)
|
744
|
+
obj.plot()
|
745
|
+
plt.show()
|
746
|
+
```
|
747
|
+
|
629
748
|
# Manipulating ROOT files
|
630
749
|
|
631
750
|
## Getting trees from file
|
@@ -13,9 +13,12 @@ src/dmu/ml/cv_classifier.py
|
|
13
13
|
src/dmu/ml/cv_predict.py
|
14
14
|
src/dmu/ml/train_mva.py
|
15
15
|
src/dmu/ml/utilities.py
|
16
|
+
src/dmu/pdataframe/utilities.py
|
17
|
+
src/dmu/plotting/matrix.py
|
16
18
|
src/dmu/plotting/plotter.py
|
17
19
|
src/dmu/plotting/plotter_1d.py
|
18
20
|
src/dmu/plotting/plotter_2d.py
|
21
|
+
src/dmu/plotting/utilities.py
|
19
22
|
src/dmu/rdataframe/atr_mgr.py
|
20
23
|
src/dmu/rdataframe/utilities.py
|
21
24
|
src/dmu/rfile/rfprinter.py
|
@@ -36,6 +39,7 @@ src/dmu_data/plotting/tests/fig_size.yaml
|
|
36
39
|
src/dmu_data/plotting/tests/high_stat.yaml
|
37
40
|
src/dmu_data/plotting/tests/name.yaml
|
38
41
|
src/dmu_data/plotting/tests/no_bounds.yaml
|
42
|
+
src/dmu_data/plotting/tests/normalized.yaml
|
39
43
|
src/dmu_data/plotting/tests/simple.yaml
|
40
44
|
src/dmu_data/plotting/tests/title.yaml
|
41
45
|
src/dmu_data/plotting/tests/weights.yaml
|
{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/ml/cv_classifier.py
RENAMED
@@ -2,6 +2,7 @@
|
|
2
2
|
Module holding cv_classifier class
|
3
3
|
'''
|
4
4
|
|
5
|
+
from typing import Union
|
5
6
|
from sklearn.ensemble import GradientBoostingClassifier
|
6
7
|
|
7
8
|
from dmu.logging.log_store import LogStore
|
@@ -22,7 +23,7 @@ class CVClassifier(GradientBoostingClassifier):
|
|
22
23
|
'''
|
23
24
|
# pylint: disable = too-many-ancestors, abstract-method
|
24
25
|
# ----------------------------------
|
25
|
-
def __init__(self, cfg : dict
|
26
|
+
def __init__(self, cfg : Union[dict,None] = None):
|
26
27
|
'''
|
27
28
|
cfg (dict) : Dictionary with configuration, especially the hyperparameters set in the `hyper` field
|
28
29
|
'''
|
{data_manipulation_utilities-0.1.9 → data_manipulation_utilities-0.2.1}/src/dmu/ml/cv_predict.py
RENAMED
@@ -10,8 +10,8 @@ import tqdm
|
|
10
10
|
from ROOT import RDataFrame
|
11
11
|
|
12
12
|
import dmu.ml.utilities as ut
|
13
|
-
import dmu.ml.cv_classifier as CVClassifier
|
14
13
|
|
14
|
+
from dmu.ml.cv_classifier import CVClassifier
|
15
15
|
from dmu.logging.log_store import LogStore
|
16
16
|
|
17
17
|
log = LogStore.add_logger('dmu:ml:cv_predict')
|
@@ -147,6 +147,7 @@ class CVPredict:
|
|
147
147
|
arr_prb = self._predict_with_overlap(df_ft)
|
148
148
|
|
149
149
|
arr_prb = self._patch_probabilities(arr_prb)
|
150
|
+
arr_prb = arr_prb.T[1]
|
150
151
|
|
151
152
|
return arr_prb
|
152
153
|
# ---------------------------------------
|