data-manipulation-utilities 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/METADATA +179 -10
- {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/RECORD +31 -19
- {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/WHEEL +1 -1
- dmu/generic/hashing.py +44 -0
- dmu/generic/utilities.py +14 -1
- dmu/generic/version_management.py +3 -5
- dmu/ml/cv_diagnostics.py +221 -0
- dmu/ml/train_mva.py +143 -46
- dmu/pdataframe/utilities.py +36 -3
- dmu/plotting/fwhm.py +64 -0
- dmu/plotting/plotter.py +2 -0
- dmu/plotting/plotter_1d.py +87 -6
- dmu/stats/fitter.py +1 -1
- dmu/stats/minimizers.py +40 -11
- dmu/stats/model_factory.py +248 -44
- dmu/stats/zfit_models.py +68 -0
- dmu/stats/zfit_plotter.py +29 -21
- dmu/testing/utilities.py +31 -4
- dmu_data/ml/tests/diagnostics_from_file.yaml +13 -0
- dmu_data/ml/tests/diagnostics_from_model.yaml +10 -0
- dmu_data/ml/tests/diagnostics_multiple_methods.yaml +10 -0
- dmu_data/ml/tests/diagnostics_overlay.yaml +33 -0
- dmu_data/ml/tests/train_mva.yaml +19 -10
- dmu_data/ml/tests/train_mva_with_diagnostics.yaml +82 -0
- dmu_data/plotting/tests/plug_fwhm.yaml +24 -0
- dmu_data/plotting/tests/plug_stats.yaml +19 -0
- dmu_data/plotting/tests/simple.yaml +4 -3
- dmu_data/plotting/tests/styling.yaml +11 -0
- {data_manipulation_utilities-0.2.5.data → data_manipulation_utilities-0.2.7.data}/scripts/publish +0 -0
- {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/entry_points.txt +0 -0
- {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/top_level.txt +0 -0
{data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/METADATA
RENAMED
@@ -1,20 +1,25 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: data_manipulation_utilities
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.7
|
4
4
|
Description-Content-Type: text/markdown
|
5
5
|
Requires-Dist: logzero
|
6
6
|
Requires-Dist: PyYAML
|
7
7
|
Requires-Dist: scipy
|
8
8
|
Requires-Dist: awkward
|
9
9
|
Requires-Dist: tqdm
|
10
|
-
Requires-Dist:
|
11
|
-
Requires-Dist: scikit-learn
|
10
|
+
Requires-Dist: numpy
|
12
11
|
Requires-Dist: toml
|
13
12
|
Requires-Dist: numpy
|
14
13
|
Requires-Dist: matplotlib
|
15
14
|
Requires-Dist: mplhep
|
16
15
|
Requires-Dist: hist[plot]
|
17
16
|
Requires-Dist: pandas
|
17
|
+
Provides-Extra: fit
|
18
|
+
Requires-Dist: zfit; extra == "fit"
|
19
|
+
Requires-Dist: tensorflow==2.18.0; extra == "fit"
|
20
|
+
Provides-Extra: ml
|
21
|
+
Requires-Dist: scikit-learn; extra == "ml"
|
22
|
+
Requires-Dist: joblib; extra == "ml"
|
18
23
|
Provides-Extra: dev
|
19
24
|
Requires-Dist: pytest; extra == "dev"
|
20
25
|
|
@@ -51,6 +56,25 @@ Then, for each remote it pushes the tags and the commits.
|
|
51
56
|
|
52
57
|
This section describes generic tools that could not be put in a specific category, but tend to be useful.
|
53
58
|
|
59
|
+
## Hashing
|
60
|
+
|
61
|
+
The snippet below:
|
62
|
+
|
63
|
+
```python
|
64
|
+
from dmu.generic import hashing
|
65
|
+
|
66
|
+
obj = [1, 'name', [1, 'sub', 'list'], {'x' : 1}]
|
67
|
+
val = hashing.hash_object(obj)
|
68
|
+
```
|
69
|
+
|
70
|
+
will:
|
71
|
+
|
72
|
+
- Make the input object into a JSON string
|
73
|
+
- Encode it to utf-8
|
74
|
+
- Make a 64 characters hash out of it
|
75
|
+
|
76
|
+
in two lines, thus keeping the user's code clean.
|
77
|
+
|
54
78
|
## Timer
|
55
79
|
|
56
80
|
In order to benchmark functions do:
|
@@ -67,9 +91,9 @@ def fun():
|
|
67
91
|
fun()
|
68
92
|
```
|
69
93
|
|
70
|
-
## JSON dumper
|
94
|
+
## JSON dumper and loader
|
71
95
|
|
72
|
-
The following lines will dump data (dictionaries, lists, etc) to a JSON file:
|
96
|
+
The following lines will dump data (dictionaries, lists, etc) to a JSON file and load it back:
|
73
97
|
|
74
98
|
```python
|
75
99
|
import dmu.generic.utilities as gut
|
@@ -77,8 +101,11 @@ import dmu.generic.utilities as gut
|
|
77
101
|
data = [1,2,3,4]
|
78
102
|
|
79
103
|
gut.dump_json(data, '/tmp/list.json')
|
104
|
+
data = gut.load_json('/tmp/list.json')
|
80
105
|
```
|
81
106
|
|
107
|
+
and it's meant to allow the user to bypass all the boilerplate and keep their code brief.
|
108
|
+
|
82
109
|
# Physics
|
83
110
|
|
84
111
|
## Truth matching
|
@@ -132,7 +159,8 @@ from dmu.stats.model_factory import ModelFactory
|
|
132
159
|
|
133
160
|
l_pdf = ['cbr'] + 2 * ['cbl']
|
134
161
|
l_shr = ['mu', 'sg']
|
135
|
-
|
162
|
+
d_fix = {'al_cbl' : 3, 'nr_cbr' : 1} # This is optional and will fix two parameters whose names start with the keys
|
163
|
+
mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr, d_fix=d_fix)
|
136
164
|
pdf = mod.get_pdf()
|
137
165
|
```
|
138
166
|
|
@@ -145,10 +173,40 @@ pol1: Polynomial of degree 1
|
|
145
173
|
pol2: Polynomial of degree 2
|
146
174
|
cbr : CrystallBall with right tail
|
147
175
|
cbl : CrystallBall with left tail
|
148
|
-
gauss : Gaussian
|
176
|
+
gauss : Gaussian
|
149
177
|
dscb : Double sided CrystallBall
|
150
178
|
```
|
151
179
|
|
180
|
+
### Model building with reparametrizations
|
181
|
+
|
182
|
+
In order to introduce reparametrizations for the means and the resolutions, such that:
|
183
|
+
|
184
|
+
$\mu\to\mu+\Delta\mu$
|
185
|
+
$\sigma\to\sigma\cdot s_{\sigma}$
|
186
|
+
|
187
|
+
where the reparametrized $\mu$ and $\sigma$ are constant, while the scale and resolution are floating, do:
|
188
|
+
|
189
|
+
```python
|
190
|
+
import zfit
|
191
|
+
from dmu.stats.model_factory import ModelFactory
|
192
|
+
|
193
|
+
l_shr = ['mu', 'sg']
|
194
|
+
l_flt = []
|
195
|
+
d_rep = {'mu' : 'scale', 'sg' : 'reso'}
|
196
|
+
obs = zfit.Space('mass', limits=(5080, 5680))
|
197
|
+
|
198
|
+
mod = ModelFactory(
|
199
|
+
preffix = name,
|
200
|
+
obs = obs,
|
201
|
+
l_pdf = l_name,
|
202
|
+
d_rep = d_rep,
|
203
|
+
l_shared= l_shr,
|
204
|
+
l_float = l_flt)
|
205
|
+
pdf = mod.get_pdf()
|
206
|
+
```
|
207
|
+
|
208
|
+
Here, the floating parameters **should not** be the same as the reparametrized ones.
|
209
|
+
|
152
210
|
### Printing PDFs
|
153
211
|
|
154
212
|
One can print a zfit PDF by doing:
|
@@ -427,7 +485,7 @@ rdf_bkg = _get_rdf(kind='bkg')
|
|
427
485
|
cfg = _get_config()
|
428
486
|
|
429
487
|
obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
|
430
|
-
obj.run()
|
488
|
+
obj.run(skip_fit=False) # skip_fit is False by default; if True, only the feature plots are made and no training happens
|
431
489
|
```
|
432
490
|
|
433
491
|
where the settings for the training go in a config dictionary, which when written to YAML looks like:
|
@@ -549,9 +607,61 @@ When evaluating the model with real data, problems might occur, we deal with the
|
|
549
607
|
```python
|
550
608
|
model.cfg
|
551
609
|
```
|
552
|
-
- For whatever
|
610
|
+
- For whatever features that are still NaN, they will be _patched_ with zeros when evaluated. However, the returned probabilities will be
|
553
611
|
saved as -1. I.e. entries with NaNs will have probabilities of -1.
|
554
612
|
|
613
|
+
## Diagnostics
|
614
|
+
|
615
|
+
To run diagnostics on the trained model do:
|
616
|
+
|
617
|
+
```python
|
618
|
+
from dmu.ml.cv_diagnostics import CVDiagnostics
|
619
|
+
|
620
|
+
# Where l_model is the list of models and cfg is a dictionary with the config
|
621
|
+
cvd = CVDiagnostics(models=l_model, rdf=rdf, cfg=cfg)
|
622
|
+
cvd.run()
|
623
|
+
```
|
624
|
+
|
625
|
+
the configuration can be loaded from a YAML file and would look like:
|
626
|
+
|
627
|
+
```yaml
|
628
|
+
# Directory where plots will go
|
629
|
+
output : /tmp/tests/dmu/ml/cv_diagnostics/overlay
|
630
|
+
# Optional, will assume that the target is already in the input dataframe
|
631
|
+
# and will use it, instead of evaluating models
|
632
|
+
score_from_rdf : mva
|
633
|
+
correlations:
|
634
|
+
# Variables with respect to which the correlations with the features will be measured
|
635
|
+
target :
|
636
|
+
name : mass
|
637
|
+
overlay :
|
638
|
+
wp :
|
639
|
+
- 0.2
|
640
|
+
- 0.5
|
641
|
+
- 0.7
|
642
|
+
- 0.9
|
643
|
+
general:
|
644
|
+
size : [20, 10]
|
645
|
+
saving:
|
646
|
+
plt_dir : /tmp/tests/dmu/ml/cv_diagnostics/from_rdf
|
647
|
+
plots:
|
648
|
+
z :
|
649
|
+
binning : [1000, 4000, 30]
|
650
|
+
yscale : 'linear'
|
651
|
+
labels : ['mass', 'Entries']
|
652
|
+
normalized : true
|
653
|
+
styling :
|
654
|
+
linestyle: '-' # By default there is no line, just pointer
|
655
|
+
methods:
|
656
|
+
- Pearson
|
657
|
+
- Kendall-$\tau$
|
658
|
+
figure:
|
659
|
+
title: Scores from file
|
660
|
+
size : [10, 8]
|
661
|
+
xlabelsize: 18 # Controls size of x axis labels. By default 30
|
662
|
+
rotate : 60 # Will rotate xlabels by 60 degrees
|
663
|
+
```
|
664
|
+
|
555
665
|
# Pandas dataframes
|
556
666
|
|
557
667
|
## Utilities
|
@@ -582,6 +692,19 @@ put.df_to_tex(df,
|
|
582
692
|
caption = 'some caption')
|
583
693
|
```
|
584
694
|
|
695
|
+
### Dataframe to and from YAML
|
696
|
+
|
697
|
+
This extends the existing JSON functionality
|
698
|
+
|
699
|
+
```python
|
700
|
+
import dmu.pdataframe.utilities as put
|
701
|
+
|
702
|
+
df_1 = _get_df()
|
703
|
+
put.to_yaml(df_1, yml_path)
|
704
|
+
df_2 = put.from_yaml(yml_path)
|
705
|
+
```
|
706
|
+
|
707
|
+
and is meant to be less verbose than doing it through the YAML module.
|
585
708
|
# Rdataframes
|
586
709
|
|
587
710
|
These are utility functions meant to be used with ROOT dataframes.
|
@@ -707,6 +830,11 @@ plots:
|
|
707
830
|
labels : ['x', 'Entries'] # Labels are optional, will use varname and Entries as labels if not present
|
708
831
|
title : 'some title can be added for different variable plots'
|
709
832
|
name : 'plot_of_x' # This will ensure that one gets plot_of_x.png as a result, if missing x.png would be saved
|
833
|
+
# Can add styling to specific plots, this should be the argument of
|
834
|
+
# hist.plot(...)
|
835
|
+
styling :
|
836
|
+
label : x
|
837
|
+
linestyle: '-'
|
710
838
|
y :
|
711
839
|
binning : [-5.0, 8.0, 40]
|
712
840
|
yscale : 'linear'
|
@@ -730,6 +858,47 @@ stats:
|
|
730
858
|
|
731
859
|
it's up to the user to build this dictionary and load it.
|
732
860
|
|
861
|
+
### Plugins
|
862
|
+
|
863
|
+
Extra functionality can be `plugged` into the code by using the plugins section like:
|
864
|
+
|
865
|
+
#### FWHM
|
866
|
+
```yaml
|
867
|
+
plugin:
|
868
|
+
fwhm:
|
869
|
+
# Can control each variable fit separately
|
870
|
+
x :
|
871
|
+
plot : true
|
872
|
+
obs : [-2, 4]
|
873
|
+
plot : true
|
874
|
+
format : FWHM={:.3f}
|
875
|
+
add_std: True
|
876
|
+
y :
|
877
|
+
plot : true
|
878
|
+
obs : [-4, 8]
|
879
|
+
plot : true
|
880
|
+
format : FWHM={:.3f}
|
881
|
+
add_std: True
|
882
|
+
```
|
883
|
+
|
884
|
+
where the section will
|
885
|
+
|
886
|
+
- Use a KDE to fit the distribution and plot it on top of the histogram
|
887
|
+
- Add the value of the Full Width at Half Maximum in the title, for each distribution, with a specific formatting.
|
888
|
+
|
889
|
+
#### stats
|
890
|
+
|
891
|
+
```yaml
|
892
|
+
plugin:
|
893
|
+
stats:
|
894
|
+
x :
|
895
|
+
mean : $\mu$={:.2f}
|
896
|
+
rms : $\sigma$={:.2f}
|
897
|
+
sum : $\Sigma$={:.0f}
|
898
|
+
```
|
899
|
+
|
900
|
+
Can be used to print statistics, mean, rms and weighted sum of entries for each distribution.
|
901
|
+
|
733
902
|
## 2D plots
|
734
903
|
|
735
904
|
For the 2D case it would look like:
|
{data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/RECORD
RENAMED
@@ -1,33 +1,42 @@
|
|
1
|
-
data_manipulation_utilities-0.2.
|
1
|
+
data_manipulation_utilities-0.2.7.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
|
2
2
|
dmu/arrays/utilities.py,sha256=PKoYyybPptA2aU-V3KLnJXBudWxTXu4x1uGdIMQ49HY,1722
|
3
|
-
dmu/generic/
|
4
|
-
dmu/generic/
|
3
|
+
dmu/generic/hashing.py,sha256=Mxwr88jyGuEP5Bhqtw1gDva9fNO-l11jb2qA0ceOysw,1167
|
4
|
+
dmu/generic/utilities.py,sha256=dBvrucgdM3vyyETharLvPAC7qEbgF9cL1mLVQ0nxNJM,2020
|
5
|
+
dmu/generic/version_management.py,sha256=j0ImlAq6SVNjTh3xRsF6G7DSoyr1w8kTRY84dNriGRE,3750
|
5
6
|
dmu/logging/log_store.py,sha256=umdvjNDuV3LdezbG26b0AiyTglbvkxST19CQu9QATbA,4184
|
6
7
|
dmu/ml/cv_classifier.py,sha256=ZbzEm_jW9yoTC7k_xBA7hFpc1bDNayiVR3tbaj1_ieE,4228
|
8
|
+
dmu/ml/cv_diagnostics.py,sha256=T6wkNcuLkdLrZ9z1YtAYMeb-M_bxnSChRHtdPeQv_To,7507
|
7
9
|
dmu/ml/cv_predict.py,sha256=4wwYL_jcUExDqLJVfClxEUWSd_QAx8yKHO3rX-mx4vw,6711
|
8
|
-
dmu/ml/train_mva.py,sha256=
|
10
|
+
dmu/ml/train_mva.py,sha256=tkdSODahXG9R77wmcckKFxYceMmc-3O-Y4pJXMe8WAI,21137
|
9
11
|
dmu/ml/utilities.py,sha256=PK_61fW7gBV9aGZyez3PI8zAT7_Fc6IlQzDB7f8iBTM,4133
|
10
|
-
dmu/pdataframe/utilities.py,sha256=
|
12
|
+
dmu/pdataframe/utilities.py,sha256=kwv9PuSmccz518e5bkrASacewCkMLTSPLtLiMH3olBc,2011
|
13
|
+
dmu/plotting/fwhm.py,sha256=Vn0TWlxgNMNk_wb2CD_At-ltVmzI49fyhoJXa0CT0I4,1924
|
11
14
|
dmu/plotting/matrix.py,sha256=pXuUJn-LgOvrI9qGkZQw16BzLjOjeikYQ_ll2VIcIXU,4978
|
12
|
-
dmu/plotting/plotter.py,sha256=
|
13
|
-
dmu/plotting/plotter_1d.py,sha256=
|
15
|
+
dmu/plotting/plotter.py,sha256=f3hoZF1n3G-yjHsQ_2huxqz5cqFelZGs0RiaHYjF0JM,7391
|
16
|
+
dmu/plotting/plotter_1d.py,sha256=zygY6XUk4_wmaAK-bietjYdhkA03TZyft5IYX5acoYI,9142
|
14
17
|
dmu/plotting/plotter_2d.py,sha256=mZhp3D5I-JodOnFTEF1NqHtcLtuI-2WNpCQsrsoXNtw,3017
|
15
18
|
dmu/plotting/utilities.py,sha256=SI9dvtZq2gr-PXVz71KE4o0i09rZOKgqJKD1jzf6KXk,1167
|
16
19
|
dmu/rdataframe/atr_mgr.py,sha256=FdhaQWVpsm4OOe1IRbm7rfrq8VenTNdORyI-lZ2Bs1M,2386
|
17
20
|
dmu/rdataframe/utilities.py,sha256=pNcQARMP7txMhy6k27UnDcYf0buNy5U2fshaJDl_h8o,3661
|
18
21
|
dmu/rfile/rfprinter.py,sha256=mp5jd-oCJAnuokbdmGyL9i6tK2lY72jEfROuBIZ_ums,3941
|
19
22
|
dmu/rfile/utilities.py,sha256=XuYY7HuSBj46iSu3c60UYBHtI6KIPoJU_oofuhb-be0,945
|
20
|
-
dmu/stats/fitter.py,sha256=
|
23
|
+
dmu/stats/fitter.py,sha256=gmSejQmoqsHmg6QPypbNb0MTfDFkSMy-DsDLKk1H3KE,18387
|
21
24
|
dmu/stats/function.py,sha256=yzi_Fvp_ASsFzbWFivIf-comquy21WoeY7is6dgY0Go,9491
|
22
25
|
dmu/stats/gof_calculator.py,sha256=4EN6OhULcztFvsAZ00rxgohJemnjtDNB5o0IBcv6kbk,4657
|
23
|
-
dmu/stats/minimizers.py,sha256=
|
24
|
-
dmu/stats/model_factory.py,sha256=
|
26
|
+
dmu/stats/minimizers.py,sha256=db9R2G0SOV-k0BKi6m4EyB_yp6AtZdP23_28B0315oo,7094
|
27
|
+
dmu/stats/model_factory.py,sha256=N0Q5R7GDfy2r_08xGduzydYO-ALvNTIRE1g0KKcRFyc,15872
|
25
28
|
dmu/stats/utilities.py,sha256=LQy4kd3xSXqpApcWuYfZxkGQyjowaXv2Wr1c4Bj-4ys,4523
|
26
|
-
dmu/stats/
|
27
|
-
dmu/
|
29
|
+
dmu/stats/zfit_models.py,sha256=SI61KJ-OG1UAabDICU1iTh6JPKM3giR2ErDraRjkCV8,1842
|
30
|
+
dmu/stats/zfit_plotter.py,sha256=bnkqFKyxhOEb5hVv4mxh4gpg-MrGeTrCh_5QKIT9hL0,19928
|
31
|
+
dmu/testing/utilities.py,sha256=c91g_PEzIZfS3Kxj2EOh05mc5ia7vFpZM1VEhjh4DA0,4661
|
28
32
|
dmu/text/transformer.py,sha256=4lrGknbAWRm0-rxbvgzOO-eR1-9bkYk61boJUEV3cQ0,6100
|
29
33
|
dmu_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
30
|
-
dmu_data/ml/tests/
|
34
|
+
dmu_data/ml/tests/diagnostics_from_file.yaml,sha256=quvXOPkRducnBsctyape_Rn5_aqMEpPo6nO_UweMORo,404
|
35
|
+
dmu_data/ml/tests/diagnostics_from_model.yaml,sha256=rtCQlmGS9ld2xoQJEE35nA07yfRMklEfQEW0w3gRv2A,261
|
36
|
+
dmu_data/ml/tests/diagnostics_multiple_methods.yaml,sha256=w8Fpmr7kX1Jsb_h6LL2hiuYKf5lYpckFCpYKzWetbA0,265
|
37
|
+
dmu_data/ml/tests/diagnostics_overlay.yaml,sha256=ZVOsxLL8_JQtf41n8Ct-M9Ch10xBwHK54q1fttWPDlE,866
|
38
|
+
dmu_data/ml/tests/train_mva.yaml,sha256=AFUJV5yrXIdHHzA9TzfJv3RWhvk0GUB_CFubePCzfxE,1437
|
39
|
+
dmu_data/ml/tests/train_mva_with_diagnostics.yaml,sha256=ZyKU7W_mDGVvg8xP_wQX0NtO6N_tLSnKO1CyHAaYhuk,2036
|
31
40
|
dmu_data/plotting/tests/2d.yaml,sha256=HSAtER-8CEqIGBY_jdcIdSVOHMfYPYhmgeZghTpVYh8,516
|
32
41
|
dmu_data/plotting/tests/fig_size.yaml,sha256=7ROq49nwZ1A2EbPiySmu6n3G-Jq6YAOkc3d2X3YNZv0,294
|
33
42
|
dmu_data/plotting/tests/high_stat.yaml,sha256=bLglBLCZK6ft0xMhQ5OltxE76cWsBMPMjO6GG0OkDr8,522
|
@@ -35,8 +44,11 @@ dmu_data/plotting/tests/legend.yaml,sha256=wGpj58ig-GOlqbWoN894zrCet2Fj9f5QtY0ri
|
|
35
44
|
dmu_data/plotting/tests/name.yaml,sha256=mkcPAVg8wBAmlSbSRQ1bcaMl4vOS6LXMtpqQeDrrtO4,312
|
36
45
|
dmu_data/plotting/tests/no_bounds.yaml,sha256=8e1QdphBjz-suDr857DoeUC2DXiy6SE-gvkORJQYv80,257
|
37
46
|
dmu_data/plotting/tests/normalized.yaml,sha256=Y0eKtyV5pvlSxvqfsLjytYtv8xYF3HZ5WEdCJdeHGQI,193
|
38
|
-
dmu_data/plotting/tests/
|
47
|
+
dmu_data/plotting/tests/plug_fwhm.yaml,sha256=xl5LXc9Nt66anM-HOXAxCtlaxWNM7zzIXf1Y6U8M4Wg,449
|
48
|
+
dmu_data/plotting/tests/plug_stats.yaml,sha256=ROO8soYXBbZIFYZcGngA_K5XHgIAFCmuAGfZCJgMmd0,384
|
49
|
+
dmu_data/plotting/tests/simple.yaml,sha256=Xc59Pjfb3BKMicLVBxODVqomHFupcb5GvefKbKHCQWQ,195
|
39
50
|
dmu_data/plotting/tests/stats.yaml,sha256=fSZjoV-xPnukpCH2OAXsz_SNPjI113qzDg8Ln3spaaA,165
|
51
|
+
dmu_data/plotting/tests/styling.yaml,sha256=yUV75HewT2tFofS-5i-mU-3s4PSHcpZOZRn5R_iSRAw,218
|
40
52
|
dmu_data/plotting/tests/title.yaml,sha256=bawKp9aGpeRrHzv69BOCbFX8sq9bb3Es9tdsPTE7jIk,333
|
41
53
|
dmu_data/plotting/tests/weights.yaml,sha256=RWQ1KxbCq-uO62WJ2AoY4h5Umc37zG35s-TpKnNMABI,312
|
42
54
|
dmu_data/text/transform.toml,sha256=R-832BZalzHZ6c5gD6jtT_Hj8BCsM5vxa1v6oeiwaP4,94
|
@@ -50,8 +62,8 @@ dmu_scripts/rfile/compare_root_files.py,sha256=T8lDnQxsRNMr37x1Y7YvWD8ySHrJOWZki
|
|
50
62
|
dmu_scripts/rfile/print_trees.py,sha256=Ze4Ccl_iUldl4eVEDVnYBoe4amqBT1fSBR1zN5WSztk,941
|
51
63
|
dmu_scripts/ssh/coned.py,sha256=lhilYNHWRCGxC-jtyJ3LQ4oUgWW33B2l1tYCcyHHsR0,4858
|
52
64
|
dmu_scripts/text/transform_text.py,sha256=9akj1LB0HAyopOvkLjNOJiptZw5XoOQLe17SlcrGMD0,1456
|
53
|
-
data_manipulation_utilities-0.2.
|
54
|
-
data_manipulation_utilities-0.2.
|
55
|
-
data_manipulation_utilities-0.2.
|
56
|
-
data_manipulation_utilities-0.2.
|
57
|
-
data_manipulation_utilities-0.2.
|
65
|
+
data_manipulation_utilities-0.2.7.dist-info/METADATA,sha256=aaUGi8PUMDuJCdslarv5Rn10iI-Jvbu4WkWbEOwmtzw,34464
|
66
|
+
data_manipulation_utilities-0.2.7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
67
|
+
data_manipulation_utilities-0.2.7.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
|
68
|
+
data_manipulation_utilities-0.2.7.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
|
69
|
+
data_manipulation_utilities-0.2.7.dist-info/RECORD,,
|
dmu/generic/hashing.py
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
'''
|
2
|
+
Module with functions needed to provide hashes
|
3
|
+
'''
|
4
|
+
|
5
|
+
import json
|
6
|
+
import hashlib
|
7
|
+
from typing import Any
|
8
|
+
|
9
|
+
import pandas as pnd
|
10
|
+
from dmu.logging.log_store import LogStore
|
11
|
+
|
12
|
+
log=LogStore.add_logger('dmu:generic.hashing')
|
13
|
+
# ------------------------------------
|
14
|
+
def _object_to_string(obj : Any) -> str:
|
15
|
+
try:
|
16
|
+
string = json.dumps(obj)
|
17
|
+
except Exception as exc:
|
18
|
+
raise ValueError(f'Cannot hash object: {obj}') from exc
|
19
|
+
|
20
|
+
return string
|
21
|
+
# ------------------------------------
|
22
|
+
def _dataframe_to_hash(df : pnd.DataFrame) -> str:
|
23
|
+
sr_hash = pnd.util.hash_pandas_object(df, index=True)
|
24
|
+
values = sr_hash.values
|
25
|
+
hsh = hashlib.sha256(values)
|
26
|
+
hsh = hsh.hexdigest()
|
27
|
+
|
28
|
+
return hsh
|
29
|
+
# ------------------------------------
|
30
|
+
def hash_object(obj : Any) -> str:
|
31
|
+
'''
|
32
|
+
Function taking a python object and returning
|
33
|
+
a string representing the hash
|
34
|
+
'''
|
35
|
+
|
36
|
+
if isinstance(obj, pnd.DataFrame):
|
37
|
+
return _dataframe_to_hash(df=obj)
|
38
|
+
|
39
|
+
string = _object_to_string(obj=obj)
|
40
|
+
string_bin = string.encode('utf-8')
|
41
|
+
hsh = hashlib.sha256(string_bin)
|
42
|
+
|
43
|
+
return hsh.hexdigest()
|
44
|
+
# ------------------------------------
|
dmu/generic/utilities.py
CHANGED
@@ -52,7 +52,7 @@ def timeit(f):
|
|
52
52
|
return result
|
53
53
|
return wrap
|
54
54
|
# --------------------------------
|
55
|
-
def dump_json(data, path : str, sort_keys : bool = False):
|
55
|
+
def dump_json(data, path : str, sort_keys : bool = False) -> None:
|
56
56
|
'''
|
57
57
|
Saves data as JSON
|
58
58
|
|
@@ -67,3 +67,16 @@ def dump_json(data, path : str, sort_keys : bool = False):
|
|
67
67
|
with open(path, 'w', encoding='utf-8') as ofile:
|
68
68
|
json.dump(data, ofile, indent=4, sort_keys=sort_keys)
|
69
69
|
# --------------------------------
|
70
|
+
def load_json(path : str):
|
71
|
+
'''
|
72
|
+
Loads data from JSON
|
73
|
+
|
74
|
+
Parameters
|
75
|
+
path : Path to JSON file where data is saved
|
76
|
+
'''
|
77
|
+
|
78
|
+
with open(path, encoding='utf-8') as ofile:
|
79
|
+
data = json.load(ofile)
|
80
|
+
|
81
|
+
return data
|
82
|
+
# --------------------------------
|
@@ -42,8 +42,7 @@ def get_last_version(dir_path : str, version_only : bool = True, main_only : boo
|
|
42
42
|
l_obj = glob.glob(f'{dir_path}/*')
|
43
43
|
|
44
44
|
if len(l_obj) == 0:
|
45
|
-
|
46
|
-
raise ValueError
|
45
|
+
raise ValueError(f'Nothing found in {dir_path}')
|
47
46
|
|
48
47
|
d_dir_org = { os.path.basename(obj).replace('.', '') : obj for obj in l_obj if os.path.isdir(obj) }
|
49
48
|
d_dir_num = { _get_numeric_version(name) : dir_path for name, dir_path in d_dir_org.items() }
|
@@ -52,9 +51,8 @@ def get_last_version(dir_path : str, version_only : bool = True, main_only : boo
|
|
52
51
|
|
53
52
|
try:
|
54
53
|
_, path = c_dir[-1]
|
55
|
-
except:
|
56
|
-
|
57
|
-
raise
|
54
|
+
except Exception as exc:
|
55
|
+
raise ValueError(f'Cannot find path in: {dir_path}') from exc
|
58
56
|
|
59
57
|
name = os.path.basename(path)
|
60
58
|
dirn = os.path.dirname(path)
|
dmu/ml/cv_diagnostics.py
ADDED
@@ -0,0 +1,221 @@
|
|
1
|
+
'''
|
2
|
+
Module containing CVDiagnostics class
|
3
|
+
'''
|
4
|
+
import os
|
5
|
+
|
6
|
+
import numpy
|
7
|
+
import matplotlib
|
8
|
+
import matplotlib.pyplot as plt
|
9
|
+
import pandas as pnd
|
10
|
+
|
11
|
+
from scipy.stats import kendalltau
|
12
|
+
from ROOT import RDataFrame, RDF
|
13
|
+
from dmu.ml.cv_classifier import CVClassifier
|
14
|
+
from dmu.ml.cv_predict import CVPredict
|
15
|
+
from dmu.logging.log_store import LogStore
|
16
|
+
from dmu.plotting.plotter_1d import Plotter1D as Plotter
|
17
|
+
|
18
|
+
NPA = numpy.ndarray
|
19
|
+
Axis= matplotlib.axes._axes.Axes
|
20
|
+
log = LogStore.add_logger('dmu:ml:cv_diagnostics')
|
21
|
+
# -------------------------
|
22
|
+
class CVDiagnostics:
|
23
|
+
'''
|
24
|
+
Class meant to run diagnostics on a classifier
|
25
|
+
|
26
|
+
Correlations
|
27
|
+
------------------
|
28
|
+
Will calculate correlations between features + signal probability and some external target variable specified in the config
|
29
|
+
'''
|
30
|
+
# -------------------------
|
31
|
+
def __init__(self, models : list[CVClassifier], rdf : RDataFrame, cfg : dict):
|
32
|
+
self._l_model = models
|
33
|
+
self._cfg = cfg
|
34
|
+
self._rdf = rdf
|
35
|
+
self._target = cfg['correlations']['target']['name']
|
36
|
+
self._l_feat = self._get_features()
|
37
|
+
self._d_xlab = self._get_xlabels()
|
38
|
+
# -------------------------
|
39
|
+
def _get_features(self) -> list[str]:
|
40
|
+
cfg = self._l_model[0].cfg
|
41
|
+
l_var = cfg['training']['features']
|
42
|
+
|
43
|
+
return l_var
|
44
|
+
# -------------------------
|
45
|
+
def _get_xlabels(self) -> dict[str,str]:
|
46
|
+
cfg = self._l_model[0].cfg
|
47
|
+
d_var = cfg['plotting']['features']['plots']
|
48
|
+
|
49
|
+
d_lab = { varname : d_field['labels'][0] for varname, d_field in d_var.items() }
|
50
|
+
|
51
|
+
target= self._cfg['correlations']['target']['name']
|
52
|
+
if 'overlay' not in self._cfg['correlations']['target']:
|
53
|
+
xlabel = target
|
54
|
+
else:
|
55
|
+
xlabel= self._cfg['correlations']['target']['overlay']['plots'][target]['labels'][0]
|
56
|
+
|
57
|
+
d_lab[target] = xlabel
|
58
|
+
d_lab['score'] = 'score'
|
59
|
+
|
60
|
+
d_lab = { var_id : var_name.replace('MeV', '') for var_id, var_name in d_lab.items() }
|
61
|
+
|
62
|
+
return d_lab
|
63
|
+
# -------------------------
|
64
|
+
def _add_columns(self, rdf : RDataFrame) -> RDataFrame:
|
65
|
+
cfg = self._l_model[0].cfg
|
66
|
+
d_def = cfg['dataset']['define']
|
67
|
+
for var, expr in d_def.items():
|
68
|
+
rdf = rdf.Define(var, expr)
|
69
|
+
|
70
|
+
return rdf
|
71
|
+
# -------------------------
|
72
|
+
def _get_scores(self) -> NPA:
|
73
|
+
if 'score_from_rdf' not in self._cfg:
|
74
|
+
log.debug('Using score from model')
|
75
|
+
prd = CVPredict(models=self._l_model, rdf = self._rdf)
|
76
|
+
|
77
|
+
return prd.predict()
|
78
|
+
|
79
|
+
name = self._cfg['score_from_rdf']
|
80
|
+
log.debug(f'Picking up score from dataframe, column: {name}')
|
81
|
+
arr_score = self._rdf.AsNumpy([name])[name]
|
82
|
+
|
83
|
+
return arr_score
|
84
|
+
# -------------------------
|
85
|
+
def _get_arrays(self) -> dict[str, NPA]:
|
86
|
+
rdf = self._add_columns(self._rdf)
|
87
|
+
l_col = [ name.c_str() for name in rdf.GetColumnNames() ]
|
88
|
+
|
89
|
+
missing= False
|
90
|
+
l_var = self._l_feat + [self._target]
|
91
|
+
for var in l_var:
|
92
|
+
if var not in l_col:
|
93
|
+
log.error(f'{"Missing":<20}{var}')
|
94
|
+
missing=True
|
95
|
+
|
96
|
+
if missing:
|
97
|
+
raise ValueError('Columns missing')
|
98
|
+
|
99
|
+
d_var = rdf.AsNumpy(l_var)
|
100
|
+
d_var['score'] = self._get_scores()
|
101
|
+
|
102
|
+
return d_var
|
103
|
+
# -------------------------
|
104
|
+
def _run_correlations(self, method : str, ax : Axis) -> Axis:
|
105
|
+
d_arr = self._get_arrays()
|
106
|
+
arr_target = d_arr[self._target]
|
107
|
+
|
108
|
+
d_corr= {}
|
109
|
+
for name, arr_val in d_arr.items():
|
110
|
+
if name == self._target:
|
111
|
+
continue
|
112
|
+
|
113
|
+
d_corr[name] = self._calculate_correlations(var=arr_val, target=arr_target, method=method)
|
114
|
+
|
115
|
+
ax = self._plot_correlations(d_corr=d_corr, method=method, ax=ax)
|
116
|
+
|
117
|
+
return ax
|
118
|
+
# -------------------------
|
119
|
+
def _plot_correlations(self, d_corr : dict[str,float], method : str, ax : Axis) -> Axis:
|
120
|
+
df = pnd.DataFrame.from_dict(d_corr, orient="index", columns=[method])
|
121
|
+
df['variable'] = df.index.map(self._d_xlab)
|
122
|
+
|
123
|
+
figsize = self._cfg['correlations']['figure']['size']
|
124
|
+
ax = df.plot(x='variable', y=method,label=method, figsize=figsize, ax=ax)
|
125
|
+
|
126
|
+
# Needed to show all labels on x axis
|
127
|
+
plt.xticks(ticks=range(len(df)), labels=df.variable)
|
128
|
+
if 'xlabelsize' in self._cfg['correlations']['figure']:
|
129
|
+
xlabsize= self._cfg['correlations']['figure']['xlabelsize']
|
130
|
+
else:
|
131
|
+
xlabsize= 30
|
132
|
+
|
133
|
+
ax.tick_params(axis='x', labelsize=xlabsize)
|
134
|
+
|
135
|
+
return ax
|
136
|
+
# -------------------------
|
137
|
+
def _save_plot(self):
|
138
|
+
plot_dir = self._cfg['output']
|
139
|
+
os.makedirs(plot_dir, exist_ok=True)
|
140
|
+
|
141
|
+
plot_path = f'{plot_dir}/correlations.png'
|
142
|
+
log.info(f'Saving to: {plot_path}')
|
143
|
+
|
144
|
+
title = None
|
145
|
+
if 'title' in self._cfg['correlations']['figure']:
|
146
|
+
title = self._cfg['correlations']['figure']['title']
|
147
|
+
|
148
|
+
rotation=30
|
149
|
+
if 'rotate' in self._cfg['correlations']['figure']:
|
150
|
+
rotation = self._cfg['correlations']['figure']['rotate']
|
151
|
+
|
152
|
+
plt.ylim(-1, +1)
|
153
|
+
plt.title(title)
|
154
|
+
plt.xlabel('')
|
155
|
+
plt.ylabel('Correlation')
|
156
|
+
plt.grid()
|
157
|
+
plt.xticks(rotation=rotation)
|
158
|
+
plt.tight_layout()
|
159
|
+
plt.savefig(plot_path)
|
160
|
+
plt.close()
|
161
|
+
# -------------------------
|
162
|
+
def _remove_nans(self, var : NPA, tgt : NPA) -> tuple[NPA,NPA]:
|
163
|
+
arr_nan_var = numpy.isnan(var)
|
164
|
+
arr_nan_tgt = numpy.isnan(tgt)
|
165
|
+
arr_is_nan = numpy.logical_or(arr_nan_var, arr_nan_tgt)
|
166
|
+
arr_not_nan = numpy.logical_not(arr_is_nan)
|
167
|
+
|
168
|
+
var = var[arr_not_nan]
|
169
|
+
tgt = tgt[arr_not_nan]
|
170
|
+
|
171
|
+
return var, tgt
|
172
|
+
# -------------------------
|
173
|
+
def _calculate_correlations(self, var : NPA, target : NPA, method : str) -> float:
|
174
|
+
var, target = self._remove_nans(var, target)
|
175
|
+
|
176
|
+
if method == 'Pearson':
|
177
|
+
mat = numpy.corrcoef(var, target)
|
178
|
+
|
179
|
+
return mat[0,1]
|
180
|
+
|
181
|
+
if method == r'Kendall-$\tau$':
|
182
|
+
tau, _ = kendalltau(var, target)
|
183
|
+
|
184
|
+
return tau
|
185
|
+
|
186
|
+
raise NotImplementedError(f'Correlation coefficient {method} not implemented')
|
187
|
+
# -------------------------
|
188
|
+
def _plot_cutflow(self) -> None:
|
189
|
+
if 'overlay' not in self._cfg['correlations']['target']:
|
190
|
+
log.debug('Not plotting cutflow of target distribution')
|
191
|
+
return
|
192
|
+
|
193
|
+
arr_score = self._get_scores()
|
194
|
+
arr_target= self._rdf.AsNumpy([self._target])[self._target]
|
195
|
+
arr_wp = self._cfg['correlations']['target']['overlay']['wp']
|
196
|
+
rdf = RDF.FromNumpy({'Score' : arr_score, self._target : arr_target})
|
197
|
+
|
198
|
+
d_rdf = {}
|
199
|
+
for wp in arr_wp:
|
200
|
+
name = f'WP > {wp:.2}'
|
201
|
+
expr = f'Score > {wp:.3}'
|
202
|
+
d_rdf[name] = rdf.Filter(expr)
|
203
|
+
|
204
|
+
cfg_target = self._cfg['correlations']['target']['overlay']
|
205
|
+
|
206
|
+
ptr=Plotter(d_rdf=d_rdf, cfg=cfg_target)
|
207
|
+
ptr.run()
|
208
|
+
# -------------------------
|
209
|
+
def run(self) -> None:
|
210
|
+
'''
|
211
|
+
Runs diagnostics
|
212
|
+
'''
|
213
|
+
if 'correlations' in self._cfg:
|
214
|
+
ax = None
|
215
|
+
for method in self._cfg['correlations']['methods']:
|
216
|
+
ax = self._run_correlations(method=method, ax=ax)
|
217
|
+
|
218
|
+
self._save_plot()
|
219
|
+
|
220
|
+
self._plot_cutflow()
|
221
|
+
# -------------------------
|