data-manipulation-utilities 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/METADATA +179 -10
  2. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/RECORD +31 -19
  3. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/WHEEL +1 -1
  4. dmu/generic/hashing.py +44 -0
  5. dmu/generic/utilities.py +14 -1
  6. dmu/generic/version_management.py +3 -5
  7. dmu/ml/cv_diagnostics.py +221 -0
  8. dmu/ml/train_mva.py +143 -46
  9. dmu/pdataframe/utilities.py +36 -3
  10. dmu/plotting/fwhm.py +64 -0
  11. dmu/plotting/plotter.py +2 -0
  12. dmu/plotting/plotter_1d.py +87 -6
  13. dmu/stats/fitter.py +1 -1
  14. dmu/stats/minimizers.py +40 -11
  15. dmu/stats/model_factory.py +248 -44
  16. dmu/stats/zfit_models.py +68 -0
  17. dmu/stats/zfit_plotter.py +29 -21
  18. dmu/testing/utilities.py +31 -4
  19. dmu_data/ml/tests/diagnostics_from_file.yaml +13 -0
  20. dmu_data/ml/tests/diagnostics_from_model.yaml +10 -0
  21. dmu_data/ml/tests/diagnostics_multiple_methods.yaml +10 -0
  22. dmu_data/ml/tests/diagnostics_overlay.yaml +33 -0
  23. dmu_data/ml/tests/train_mva.yaml +19 -10
  24. dmu_data/ml/tests/train_mva_with_diagnostics.yaml +82 -0
  25. dmu_data/plotting/tests/plug_fwhm.yaml +24 -0
  26. dmu_data/plotting/tests/plug_stats.yaml +19 -0
  27. dmu_data/plotting/tests/simple.yaml +4 -3
  28. dmu_data/plotting/tests/styling.yaml +11 -0
  29. {data_manipulation_utilities-0.2.5.data → data_manipulation_utilities-0.2.7.data}/scripts/publish +0 -0
  30. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/entry_points.txt +0 -0
  31. {data_manipulation_utilities-0.2.5.dist-info → data_manipulation_utilities-0.2.7.dist-info}/top_level.txt +0 -0
@@ -1,20 +1,25 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: data_manipulation_utilities
3
- Version: 0.2.5
3
+ Version: 0.2.7
4
4
  Description-Content-Type: text/markdown
5
5
  Requires-Dist: logzero
6
6
  Requires-Dist: PyYAML
7
7
  Requires-Dist: scipy
8
8
  Requires-Dist: awkward
9
9
  Requires-Dist: tqdm
10
- Requires-Dist: joblib
11
- Requires-Dist: scikit-learn
10
+ Requires-Dist: numpy
12
11
  Requires-Dist: toml
13
12
  Requires-Dist: numpy
14
13
  Requires-Dist: matplotlib
15
14
  Requires-Dist: mplhep
16
15
  Requires-Dist: hist[plot]
17
16
  Requires-Dist: pandas
17
+ Provides-Extra: fit
18
+ Requires-Dist: zfit; extra == "fit"
19
+ Requires-Dist: tensorflow==2.18.0; extra == "fit"
20
+ Provides-Extra: ml
21
+ Requires-Dist: scikit-learn; extra == "ml"
22
+ Requires-Dist: joblib; extra == "ml"
18
23
  Provides-Extra: dev
19
24
  Requires-Dist: pytest; extra == "dev"
20
25
 
@@ -51,6 +56,25 @@ Then, for each remote it pushes the tags and the commits.
51
56
 
52
57
  This section describes generic tools that could not be put in a specific category, but tend to be useful.
53
58
 
59
+ ## Hashing
60
+
61
+ The snippet below:
62
+
63
+ ```python
64
+ from dmu.generic import hashing
65
+
66
+ obj = [1, 'name', [1, 'sub', 'list'], {'x' : 1}]
67
+ val = hashing.hash_object(obj)
68
+ ```
69
+
70
+ will:
71
+
72
+ - Make the input object into a JSON string
73
+ - Encode it to utf-8
74
+ - Make a 64-character hash out of it
75
+
76
+ in two lines, thus keeping the user's code clean.
77
+
54
78
  ## Timer
55
79
 
56
80
  In order to benchmark functions do:
@@ -67,9 +91,9 @@ def fun():
67
91
  fun()
68
92
  ```
69
93
 
70
- ## JSON dumper
94
+ ## JSON dumper and loader
71
95
 
72
- The following lines will dump data (dictionaries, lists, etc) to a JSON file:
96
+ The following lines will dump data (dictionaries, lists, etc) to a JSON file and load it back:
73
97
 
74
98
  ```python
75
99
  import dmu.generic.utilities as gut
@@ -77,8 +101,11 @@ import dmu.generic.utilities as gut
77
101
  data = [1,2,3,4]
78
102
 
79
103
  gut.dump_json(data, '/tmp/list.json')
104
+ data = gut.load_json('/tmp/list.json')
80
105
  ```
81
106
 
107
+ and it's meant to allow the user to bypass all the boilerplate and keep their code brief.
108
+
82
109
  # Physics
83
110
 
84
111
  ## Truth matching
@@ -132,7 +159,8 @@ from dmu.stats.model_factory import ModelFactory
132
159
 
133
160
  l_pdf = ['cbr'] + 2 * ['cbl']
134
161
  l_shr = ['mu', 'sg']
135
- mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr)
162
+ d_fix = {'al_cbl' : 3, 'nr_cbr' : 1} # This is optional and will fix two parameters whose names start with the keys
163
+ mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr, d_fix=d_fix)
136
164
  pdf = mod.get_pdf()
137
165
  ```
138
166
 
@@ -145,10 +173,40 @@ pol1: Polynomial of degree 1
145
173
  pol2: Polynomial of degree 2
146
174
  cbr : CrystalBall with right tail
147
175
  cbl : CrystalBall with left tail
148
- gauss : Gaussian
176
+ gauss : Gaussian
149
177
  dscb : Double sided CrystalBall
150
178
  ```
151
179
 
180
+ ### Model building with reparametrizations
181
+
182
+ In order to introduce reparametrizations for the means and the resolutions, such that:
183
+
184
+ $\mu\to\mu+\Delta\mu$
185
+ $\sigma\to\sigma\cdot s_{\sigma}$
186
+
187
+ where the reparametrized $\mu$ and $\sigma$ are constant, while the scale and resolution are floating, do:
188
+
189
+ ```python
190
+ import zfit
191
+ from dmu.stats.model_factory import ModelFactory
192
+
193
+ l_shr = ['mu', 'sg']
194
+ l_flt = []
195
+ d_rep = {'mu' : 'scale', 'sg' : 'reso'}
196
+ obs = zfit.Space('mass', limits=(5080, 5680))
197
+
198
+ mod = ModelFactory(
199
+ preffix = name,
200
+ obs = obs,
201
+ l_pdf = l_name,
202
+ d_rep = d_rep,
203
+ l_shared= l_shr,
204
+ l_float = l_flt)
205
+ pdf = mod.get_pdf()
206
+ ```
207
+
208
+ Here, the floating parameters **should not** be the same as the reparametrized ones.
209
+
152
210
  ### Printing PDFs
153
211
 
154
212
  One can print a zfit PDF by doing:
@@ -427,7 +485,7 @@ rdf_bkg = _get_rdf(kind='bkg')
427
485
  cfg = _get_config()
428
486
 
429
487
  obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
430
- obj.run()
488
+ obj.run(skip_fit=False) # by default it will be false, if true, it will only make plots of features
431
489
  ```
432
490
 
433
491
  where the settings for the training go in a config dictionary, which when written to YAML looks like:
@@ -549,9 +607,61 @@ When evaluating the model with real data, problems might occur, we deal with the
549
607
  ```python
550
608
  model.cfg
551
609
  ```
552
- - For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
610
+ - For whatever features that are still NaN, they will be _patched_ with zeros when evaluated. However, the returned probabilities will be
553
611
  saved as -1. I.e. entries with NaNs will have probabilities of -1.
554
612
 
613
+ ## Diagnostics
614
+
615
+ To run diagnostics on the trained model do:
616
+
617
+ ```python
618
+ from dmu.ml.cv_diagnostics import CVDiagnostics
619
+
620
+ # Where l_model is the list of models and cfg is a dictionary with the config
621
+ cvd = CVDiagnostics(models=l_model, rdf=rdf, cfg=cfg)
622
+ cvd.run()
623
+ ```
624
+
625
+ the configuration can be loaded from a YAML file and would look like:
626
+
627
+ ```yaml
628
+ # Directory where plots will go
629
+ output : /tmp/tests/dmu/ml/cv_diagnostics/overlay
630
+ # Optional, will assume that the target is already in the input dataframe
631
+ # and will use it, instead of evaluating models
632
+ score_from_rdf : mva
633
+ correlations:
634
+ # Variables with respect to which the correlations with the features will be measured
635
+ target :
636
+ name : mass
637
+ overlay :
638
+ wp :
639
+ - 0.2
640
+ - 0.5
641
+ - 0.7
642
+ - 0.9
643
+ general:
644
+ size : [20, 10]
645
+ saving:
646
+ plt_dir : /tmp/tests/dmu/ml/cv_diagnostics/from_rdf
647
+ plots:
648
+ z :
649
+ binning : [1000, 4000, 30]
650
+ yscale : 'linear'
651
+ labels : ['mass', 'Entries']
652
+ normalized : true
653
+ styling :
654
+ linestyle: '-' # By default there is no line, just pointer
655
+ methods:
656
+ - Pearson
657
+ - Kendall-$\tau$
658
+ figure:
659
+ title: Scores from file
660
+ size : [10, 8]
661
+ xlabelsize: 18 # Controls size of x axis labels. By default 30
662
+ rotate : 60 # Will rotate xlabels by 60 degrees
663
+ ```
664
+
555
665
  # Pandas dataframes
556
666
 
557
667
  ## Utilities
@@ -582,6 +692,19 @@ put.df_to_tex(df,
582
692
  caption = 'some caption')
583
693
  ```
584
694
 
695
+ ### Dataframe to and from YAML
696
+
697
+ This extends the existing JSON functionality
698
+
699
+ ```python
700
+ import dmu.pdataframe.utilities as put
701
+
702
+ df_1 = _get_df()
703
+ put.to_yaml(df_1, yml_path)
704
+ df_2 = put.from_yaml(yml_path)
705
+ ```
706
+
707
+ and is meant to be less verbose than doing it through the YAML module.
585
708
  # Rdataframes
586
709
 
587
710
  These are utility functions meant to be used with ROOT dataframes.
@@ -707,6 +830,11 @@ plots:
707
830
  labels : ['x', 'Entries'] # Labels are optional, will use varname and Entries as labels if not present
708
831
  title : 'some title can be added for different variable plots'
709
832
  name : 'plot_of_x' # This will ensure that one gets plot_of_x.png as a result, if missing x.png would be saved
833
+ # Can add styling to specific plots, this should be the argument of
834
+ # hist.plot(...)
835
+ styling :
836
+ label : x
837
+ linestyle: '-'
710
838
  y :
711
839
  binning : [-5.0, 8.0, 40]
712
840
  yscale : 'linear'
@@ -730,6 +858,47 @@ stats:
730
858
 
731
859
  it's up to the user to build this dictionary and load it.
732
860
 
861
+ ### Plugins
862
+
863
+ Extra functionality can be `plugged` into the code by using the plugins section like:
864
+
865
+ #### FWHM
866
+ ```yaml
867
+ plugin:
868
+ fwhm:
869
+ # Can control each variable fit separately
870
+ x :
871
+ plot : true
872
+ obs : [-2, 4]
873
+ plot : true
874
+ format : FWHM={:.3f}
875
+ add_std: True
876
+ y :
877
+ plot : true
878
+ obs : [-4, 8]
879
+ plot : true
880
+ format : FWHM={:.3f}
881
+ add_std: True
882
+ ```
883
+
884
+ where the section will
885
+
886
+ - Use a KDE to fit the distribution and plot it on top of the histogram
887
+ - Add the value of the Full Width at Half Maximum in the title, for each distribution with a specific formatting.
888
+
889
+ #### stats
890
+
891
+ ```yaml
892
+ plugin:
893
+ stats:
894
+ x :
895
+ mean : $\mu$={:.2f}
896
+ rms : $\sigma$={:.2f}
897
+ sum : $\Sigma$={:.0f}
898
+ ```
899
+
900
+ Can be used to print statistics, mean, rms and weighted sum of entries for each distribution.
901
+
733
902
  ## 2D plots
734
903
 
735
904
  For the 2D case it would look like:
@@ -1,33 +1,42 @@
1
- data_manipulation_utilities-0.2.5.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
1
+ data_manipulation_utilities-0.2.7.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
2
2
  dmu/arrays/utilities.py,sha256=PKoYyybPptA2aU-V3KLnJXBudWxTXu4x1uGdIMQ49HY,1722
3
- dmu/generic/utilities.py,sha256=0Xnq9t35wuebAqKxbyAiMk1ISB7IcXK4cFH25MT1fgw,1741
4
- dmu/generic/version_management.py,sha256=G_HjGY-hu8lotZuTdVAg0B8yD0AltE866q2vJxvTg1g,3749
3
+ dmu/generic/hashing.py,sha256=Mxwr88jyGuEP5Bhqtw1gDva9fNO-l11jb2qA0ceOysw,1167
4
+ dmu/generic/utilities.py,sha256=dBvrucgdM3vyyETharLvPAC7qEbgF9cL1mLVQ0nxNJM,2020
5
+ dmu/generic/version_management.py,sha256=j0ImlAq6SVNjTh3xRsF6G7DSoyr1w8kTRY84dNriGRE,3750
5
6
  dmu/logging/log_store.py,sha256=umdvjNDuV3LdezbG26b0AiyTglbvkxST19CQu9QATbA,4184
6
7
  dmu/ml/cv_classifier.py,sha256=ZbzEm_jW9yoTC7k_xBA7hFpc1bDNayiVR3tbaj1_ieE,4228
8
+ dmu/ml/cv_diagnostics.py,sha256=T6wkNcuLkdLrZ9z1YtAYMeb-M_bxnSChRHtdPeQv_To,7507
7
9
  dmu/ml/cv_predict.py,sha256=4wwYL_jcUExDqLJVfClxEUWSd_QAx8yKHO3rX-mx4vw,6711
8
- dmu/ml/train_mva.py,sha256=XzXE92PzyF3cjlx5yMhtp5h4t7wzisRAyO1fBArssvc,17282
10
+ dmu/ml/train_mva.py,sha256=tkdSODahXG9R77wmcckKFxYceMmc-3O-Y4pJXMe8WAI,21137
9
11
  dmu/ml/utilities.py,sha256=PK_61fW7gBV9aGZyez3PI8zAT7_Fc6IlQzDB7f8iBTM,4133
10
- dmu/pdataframe/utilities.py,sha256=ypvLiFfJ82ga94qlW3t5dXnvEFwYOXnbtJb2zHwsbqk,987
12
+ dmu/pdataframe/utilities.py,sha256=kwv9PuSmccz518e5bkrASacewCkMLTSPLtLiMH3olBc,2011
13
+ dmu/plotting/fwhm.py,sha256=Vn0TWlxgNMNk_wb2CD_At-ltVmzI49fyhoJXa0CT0I4,1924
11
14
  dmu/plotting/matrix.py,sha256=pXuUJn-LgOvrI9qGkZQw16BzLjOjeikYQ_ll2VIcIXU,4978
12
- dmu/plotting/plotter.py,sha256=3WRbNOrFBWgI3iW5TbEgT4w_eF7-XUPs_32JL1AW3yY,7359
13
- dmu/plotting/plotter_1d.py,sha256=2AnVxulyhKtwN-2Srhfm6fqdEREZNhcpJolBsJrWcsc,5745
15
+ dmu/plotting/plotter.py,sha256=f3hoZF1n3G-yjHsQ_2huxqz5cqFelZGs0RiaHYjF0JM,7391
16
+ dmu/plotting/plotter_1d.py,sha256=zygY6XUk4_wmaAK-bietjYdhkA03TZyft5IYX5acoYI,9142
14
17
  dmu/plotting/plotter_2d.py,sha256=mZhp3D5I-JodOnFTEF1NqHtcLtuI-2WNpCQsrsoXNtw,3017
15
18
  dmu/plotting/utilities.py,sha256=SI9dvtZq2gr-PXVz71KE4o0i09rZOKgqJKD1jzf6KXk,1167
16
19
  dmu/rdataframe/atr_mgr.py,sha256=FdhaQWVpsm4OOe1IRbm7rfrq8VenTNdORyI-lZ2Bs1M,2386
17
20
  dmu/rdataframe/utilities.py,sha256=pNcQARMP7txMhy6k27UnDcYf0buNy5U2fshaJDl_h8o,3661
18
21
  dmu/rfile/rfprinter.py,sha256=mp5jd-oCJAnuokbdmGyL9i6tK2lY72jEfROuBIZ_ums,3941
19
22
  dmu/rfile/utilities.py,sha256=XuYY7HuSBj46iSu3c60UYBHtI6KIPoJU_oofuhb-be0,945
20
- dmu/stats/fitter.py,sha256=vHNZ16U3apoQyeyM8evq-if49doF48sKB3q9wmA96Fw,18387
23
+ dmu/stats/fitter.py,sha256=gmSejQmoqsHmg6QPypbNb0MTfDFkSMy-DsDLKk1H3KE,18387
21
24
  dmu/stats/function.py,sha256=yzi_Fvp_ASsFzbWFivIf-comquy21WoeY7is6dgY0Go,9491
22
25
  dmu/stats/gof_calculator.py,sha256=4EN6OhULcztFvsAZ00rxgohJemnjtDNB5o0IBcv6kbk,4657
23
- dmu/stats/minimizers.py,sha256=f9cilFY9Kp9UvbSIUsKBGFzOOg7EEWZJLPod-4k-LAQ,6216
24
- dmu/stats/model_factory.py,sha256=ixWnhE8gPiOYW5pCb3eoVIaSvbUopEx4ldkZ3xL54Xg,7714
26
+ dmu/stats/minimizers.py,sha256=db9R2G0SOV-k0BKi6m4EyB_yp6AtZdP23_28B0315oo,7094
27
+ dmu/stats/model_factory.py,sha256=N0Q5R7GDfy2r_08xGduzydYO-ALvNTIRE1g0KKcRFyc,15872
25
28
  dmu/stats/utilities.py,sha256=LQy4kd3xSXqpApcWuYfZxkGQyjowaXv2Wr1c4Bj-4ys,4523
26
- dmu/stats/zfit_plotter.py,sha256=Xs6kisNEmNQXhYRCcjowxO6xHuyAyrfyQIFhGAR61U4,19719
27
- dmu/testing/utilities.py,sha256=moImLqGX9LAt5zJtE5j0gHHkUJ5kpbodryhiVswOsyM,3696
29
+ dmu/stats/zfit_models.py,sha256=SI61KJ-OG1UAabDICU1iTh6JPKM3giR2ErDraRjkCV8,1842
30
+ dmu/stats/zfit_plotter.py,sha256=bnkqFKyxhOEb5hVv4mxh4gpg-MrGeTrCh_5QKIT9hL0,19928
31
+ dmu/testing/utilities.py,sha256=c91g_PEzIZfS3Kxj2EOh05mc5ia7vFpZM1VEhjh4DA0,4661
28
32
  dmu/text/transformer.py,sha256=4lrGknbAWRm0-rxbvgzOO-eR1-9bkYk61boJUEV3cQ0,6100
29
33
  dmu_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- dmu_data/ml/tests/train_mva.yaml,sha256=o0ZIe43qPC-KwLT9y1qfYYw2bbOLlJIKRkCMUnM5sBo,1177
34
+ dmu_data/ml/tests/diagnostics_from_file.yaml,sha256=quvXOPkRducnBsctyape_Rn5_aqMEpPo6nO_UweMORo,404
35
+ dmu_data/ml/tests/diagnostics_from_model.yaml,sha256=rtCQlmGS9ld2xoQJEE35nA07yfRMklEfQEW0w3gRv2A,261
36
+ dmu_data/ml/tests/diagnostics_multiple_methods.yaml,sha256=w8Fpmr7kX1Jsb_h6LL2hiuYKf5lYpckFCpYKzWetbA0,265
37
+ dmu_data/ml/tests/diagnostics_overlay.yaml,sha256=ZVOsxLL8_JQtf41n8Ct-M9Ch10xBwHK54q1fttWPDlE,866
38
+ dmu_data/ml/tests/train_mva.yaml,sha256=AFUJV5yrXIdHHzA9TzfJv3RWhvk0GUB_CFubePCzfxE,1437
39
+ dmu_data/ml/tests/train_mva_with_diagnostics.yaml,sha256=ZyKU7W_mDGVvg8xP_wQX0NtO6N_tLSnKO1CyHAaYhuk,2036
31
40
  dmu_data/plotting/tests/2d.yaml,sha256=HSAtER-8CEqIGBY_jdcIdSVOHMfYPYhmgeZghTpVYh8,516
32
41
  dmu_data/plotting/tests/fig_size.yaml,sha256=7ROq49nwZ1A2EbPiySmu6n3G-Jq6YAOkc3d2X3YNZv0,294
33
42
  dmu_data/plotting/tests/high_stat.yaml,sha256=bLglBLCZK6ft0xMhQ5OltxE76cWsBMPMjO6GG0OkDr8,522
@@ -35,8 +44,11 @@ dmu_data/plotting/tests/legend.yaml,sha256=wGpj58ig-GOlqbWoN894zrCet2Fj9f5QtY0ri
35
44
  dmu_data/plotting/tests/name.yaml,sha256=mkcPAVg8wBAmlSbSRQ1bcaMl4vOS6LXMtpqQeDrrtO4,312
36
45
  dmu_data/plotting/tests/no_bounds.yaml,sha256=8e1QdphBjz-suDr857DoeUC2DXiy6SE-gvkORJQYv80,257
37
46
  dmu_data/plotting/tests/normalized.yaml,sha256=Y0eKtyV5pvlSxvqfsLjytYtv8xYF3HZ5WEdCJdeHGQI,193
38
- dmu_data/plotting/tests/simple.yaml,sha256=N_TvNBh_2dU0-VYgu_LMrtY0kV_hg2HxVuEoDlr1HX8,138
47
+ dmu_data/plotting/tests/plug_fwhm.yaml,sha256=xl5LXc9Nt66anM-HOXAxCtlaxWNM7zzIXf1Y6U8M4Wg,449
48
+ dmu_data/plotting/tests/plug_stats.yaml,sha256=ROO8soYXBbZIFYZcGngA_K5XHgIAFCmuAGfZCJgMmd0,384
49
+ dmu_data/plotting/tests/simple.yaml,sha256=Xc59Pjfb3BKMicLVBxODVqomHFupcb5GvefKbKHCQWQ,195
39
50
  dmu_data/plotting/tests/stats.yaml,sha256=fSZjoV-xPnukpCH2OAXsz_SNPjI113qzDg8Ln3spaaA,165
51
+ dmu_data/plotting/tests/styling.yaml,sha256=yUV75HewT2tFofS-5i-mU-3s4PSHcpZOZRn5R_iSRAw,218
40
52
  dmu_data/plotting/tests/title.yaml,sha256=bawKp9aGpeRrHzv69BOCbFX8sq9bb3Es9tdsPTE7jIk,333
41
53
  dmu_data/plotting/tests/weights.yaml,sha256=RWQ1KxbCq-uO62WJ2AoY4h5Umc37zG35s-TpKnNMABI,312
42
54
  dmu_data/text/transform.toml,sha256=R-832BZalzHZ6c5gD6jtT_Hj8BCsM5vxa1v6oeiwaP4,94
@@ -50,8 +62,8 @@ dmu_scripts/rfile/compare_root_files.py,sha256=T8lDnQxsRNMr37x1Y7YvWD8ySHrJOWZki
50
62
  dmu_scripts/rfile/print_trees.py,sha256=Ze4Ccl_iUldl4eVEDVnYBoe4amqBT1fSBR1zN5WSztk,941
51
63
  dmu_scripts/ssh/coned.py,sha256=lhilYNHWRCGxC-jtyJ3LQ4oUgWW33B2l1tYCcyHHsR0,4858
52
64
  dmu_scripts/text/transform_text.py,sha256=9akj1LB0HAyopOvkLjNOJiptZw5XoOQLe17SlcrGMD0,1456
53
- data_manipulation_utilities-0.2.5.dist-info/METADATA,sha256=d8rJbrtHEg_fOma5NA5qL4ox8bP4MaIV0mbyl6uRiJs,30104
54
- data_manipulation_utilities-0.2.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
55
- data_manipulation_utilities-0.2.5.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
56
- data_manipulation_utilities-0.2.5.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
57
- data_manipulation_utilities-0.2.5.dist-info/RECORD,,
65
+ data_manipulation_utilities-0.2.7.dist-info/METADATA,sha256=aaUGi8PUMDuJCdslarv5Rn10iI-Jvbu4WkWbEOwmtzw,34464
66
+ data_manipulation_utilities-0.2.7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
67
+ data_manipulation_utilities-0.2.7.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
68
+ data_manipulation_utilities-0.2.7.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
69
+ data_manipulation_utilities-0.2.7.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
dmu/generic/hashing.py ADDED
@@ -0,0 +1,44 @@
1
+ '''
2
+ Module with functions needed to provide hashes
3
+ '''
4
+
5
+ import json
6
+ import hashlib
7
+ from typing import Any
8
+
9
+ import pandas as pnd
10
+ from dmu.logging.log_store import LogStore
11
+
12
+ log=LogStore.add_logger('dmu:generic.hashing')
13
+ # ------------------------------------
14
+ def _object_to_string(obj : Any) -> str:
15
+ try:
16
+ string = json.dumps(obj)
17
+ except Exception as exc:
18
+ raise ValueError(f'Cannot hash object: {obj}') from exc
19
+
20
+ return string
21
+ # ------------------------------------
22
+ def _dataframe_to_hash(df : pnd.DataFrame) -> str:
23
+ sr_hash = pnd.util.hash_pandas_object(df, index=True)
24
+ values = sr_hash.values
25
+ hsh = hashlib.sha256(values)
26
+ hsh = hsh.hexdigest()
27
+
28
+ return hsh
29
+ # ------------------------------------
30
+ def hash_object(obj : Any) -> str:
31
+ '''
32
+ Function taking a python object and returning
33
+ a string representing the hash
34
+ '''
35
+
36
+ if isinstance(obj, pnd.DataFrame):
37
+ return _dataframe_to_hash(df=obj)
38
+
39
+ string = _object_to_string(obj=obj)
40
+ string_bin = string.encode('utf-8')
41
+ hsh = hashlib.sha256(string_bin)
42
+
43
+ return hsh.hexdigest()
44
+ # ------------------------------------
dmu/generic/utilities.py CHANGED
@@ -52,7 +52,7 @@ def timeit(f):
52
52
  return result
53
53
  return wrap
54
54
  # --------------------------------
55
- def dump_json(data, path : str, sort_keys : bool = False):
55
+ def dump_json(data, path : str, sort_keys : bool = False) -> None:
56
56
  '''
57
57
  Saves data as JSON
58
58
 
@@ -67,3 +67,16 @@ def dump_json(data, path : str, sort_keys : bool = False):
67
67
  with open(path, 'w', encoding='utf-8') as ofile:
68
68
  json.dump(data, ofile, indent=4, sort_keys=sort_keys)
69
69
  # --------------------------------
70
+ def load_json(path : str):
71
+ '''
72
+ Loads data from JSON
73
+
74
+ Parameters
75
+ path : Path to JSON file where data is saved
76
+ '''
77
+
78
+ with open(path, encoding='utf-8') as ofile:
79
+ data = json.load(ofile)
80
+
81
+ return data
82
+ # --------------------------------
@@ -42,8 +42,7 @@ def get_last_version(dir_path : str, version_only : bool = True, main_only : boo
42
42
  l_obj = glob.glob(f'{dir_path}/*')
43
43
 
44
44
  if len(l_obj) == 0:
45
- log.error(f'Nothing found in {dir_path}')
46
- raise ValueError
45
+ raise ValueError(f'Nothing found in {dir_path}')
47
46
 
48
47
  d_dir_org = { os.path.basename(obj).replace('.', '') : obj for obj in l_obj if os.path.isdir(obj) }
49
48
  d_dir_num = { _get_numeric_version(name) : dir_path for name, dir_path in d_dir_org.items() }
@@ -52,9 +51,8 @@ def get_last_version(dir_path : str, version_only : bool = True, main_only : boo
52
51
 
53
52
  try:
54
53
  _, path = c_dir[-1]
55
- except:
56
- log.error(f'Cannot find path in: {dir_path}')
57
- raise
54
+ except Exception as exc:
55
+ raise ValueError(f'Cannot find path in: {dir_path}') from exc
58
56
 
59
57
  name = os.path.basename(path)
60
58
  dirn = os.path.dirname(path)
@@ -0,0 +1,221 @@
1
+ '''
2
+ Module containing CVDiagnostics class
3
+ '''
4
+ import os
5
+
6
+ import numpy
7
+ import matplotlib
8
+ import matplotlib.pyplot as plt
9
+ import pandas as pnd
10
+
11
+ from scipy.stats import kendalltau
12
+ from ROOT import RDataFrame, RDF
13
+ from dmu.ml.cv_classifier import CVClassifier
14
+ from dmu.ml.cv_predict import CVPredict
15
+ from dmu.logging.log_store import LogStore
16
+ from dmu.plotting.plotter_1d import Plotter1D as Plotter
17
+
18
+ NPA = numpy.ndarray
19
+ Axis= matplotlib.axes._axes.Axes
20
+ log = LogStore.add_logger('dmu:ml:cv_diagnostics')
21
+ # -------------------------
22
+ class CVDiagnostics:
23
+ '''
24
+ Class meant to run diagnostics on classifier
25
+
26
+ Correlations
27
+ ------------------
28
+ Will calculate correlations between features + signal probability and some external target variable specified in the config
29
+ '''
30
+ # -------------------------
31
+ def __init__(self, models : list[CVClassifier], rdf : RDataFrame, cfg : dict):
32
+ self._l_model = models
33
+ self._cfg = cfg
34
+ self._rdf = rdf
35
+ self._target = cfg['correlations']['target']['name']
36
+ self._l_feat = self._get_features()
37
+ self._d_xlab = self._get_xlabels()
38
+ # -------------------------
39
+ def _get_features(self) -> list[str]:
40
+ cfg = self._l_model[0].cfg
41
+ l_var = cfg['training']['features']
42
+
43
+ return l_var
44
+ # -------------------------
45
+ def _get_xlabels(self) -> dict[str,str]:
46
+ cfg = self._l_model[0].cfg
47
+ d_var = cfg['plotting']['features']['plots']
48
+
49
+ d_lab = { varname : d_field['labels'][0] for varname, d_field in d_var.items() }
50
+
51
+ target= self._cfg['correlations']['target']['name']
52
+ if 'overlay' not in self._cfg['correlations']['target']:
53
+ xlabel = target
54
+ else:
55
+ xlabel= self._cfg['correlations']['target']['overlay']['plots'][target]['labels'][0]
56
+
57
+ d_lab[target] = xlabel
58
+ d_lab['score'] = 'score'
59
+
60
+ d_lab = { var_id : var_name.replace('MeV', '') for var_id, var_name in d_lab.items() }
61
+
62
+ return d_lab
63
+ # -------------------------
64
+ def _add_columns(self, rdf : RDataFrame) -> RDataFrame:
65
+ cfg = self._l_model[0].cfg
66
+ d_def = cfg['dataset']['define']
67
+ for var, expr in d_def.items():
68
+ rdf = rdf.Define(var, expr)
69
+
70
+ return rdf
71
+ # -------------------------
72
+ def _get_scores(self) -> NPA:
73
+ if 'score_from_rdf' not in self._cfg:
74
+ log.debug('Using score from model')
75
+ prd = CVPredict(models=self._l_model, rdf = self._rdf)
76
+
77
+ return prd.predict()
78
+
79
+ name = self._cfg['score_from_rdf']
80
+ log.debug(f'Picking up score from dataframe, column: {name}')
81
+ arr_score = self._rdf.AsNumpy([name])[name]
82
+
83
+ return arr_score
84
+ # -------------------------
85
+ def _get_arrays(self) -> dict[str, NPA]:
86
+ rdf = self._add_columns(self._rdf)
87
+ l_col = [ name.c_str() for name in rdf.GetColumnNames() ]
88
+
89
+ missing= False
90
+ l_var = self._l_feat + [self._target]
91
+ for var in l_var:
92
+ if var not in l_col:
93
+ log.error(f'{"Missing":<20}{var}')
94
+ missing=True
95
+
96
+ if missing:
97
+ raise ValueError('Columns missing')
98
+
99
+ d_var = rdf.AsNumpy(l_var)
100
+ d_var['score'] = self._get_scores()
101
+
102
+ return d_var
103
+ # -------------------------
104
+ def _run_correlations(self, method : str, ax : Axis) -> Axis:
105
+ d_arr = self._get_arrays()
106
+ arr_target = d_arr[self._target]
107
+
108
+ d_corr= {}
109
+ for name, arr_val in d_arr.items():
110
+ if name == self._target:
111
+ continue
112
+
113
+ d_corr[name] = self._calculate_correlations(var=arr_val, target=arr_target, method=method)
114
+
115
+ ax = self._plot_correlations(d_corr=d_corr, method=method, ax=ax)
116
+
117
+ return ax
118
+ # -------------------------
119
+ def _plot_correlations(self, d_corr : dict[str,float], method : str, ax : Axis) -> Axis:
120
+ df = pnd.DataFrame.from_dict(d_corr, orient="index", columns=[method])
121
+ df['variable'] = df.index.map(self._d_xlab)
122
+
123
+ figsize = self._cfg['correlations']['figure']['size']
124
+ ax = df.plot(x='variable', y=method,label=method, figsize=figsize, ax=ax)
125
+
126
+ # Needed to show all labels on x axis
127
+ plt.xticks(ticks=range(len(df)), labels=df.variable)
128
+ if 'xlabelsize' in self._cfg['correlations']['figure']:
129
+ xlabsize= self._cfg['correlations']['figure']['xlabelsize']
130
+ else:
131
+ xlabsize= 30
132
+
133
+ ax.tick_params(axis='x', labelsize=xlabsize)
134
+
135
+ return ax
136
+ # -------------------------
137
+ def _save_plot(self):
138
+ plot_dir = self._cfg['output']
139
+ os.makedirs(plot_dir, exist_ok=True)
140
+
141
+ plot_path = f'{plot_dir}/correlations.png'
142
+ log.info(f'Saving to: {plot_path}')
143
+
144
+ title = None
145
+ if 'title' in self._cfg['correlations']['figure']:
146
+ title = self._cfg['correlations']['figure']['title']
147
+
148
+ rotation=30
149
+ if 'rotate' in self._cfg['correlations']['figure']:
150
+ rotation = self._cfg['correlations']['figure']['rotate']
151
+
152
+ plt.ylim(-1, +1)
153
+ plt.title(title)
154
+ plt.xlabel('')
155
+ plt.ylabel('Correlation')
156
+ plt.grid()
157
+ plt.xticks(rotation=rotation)
158
+ plt.tight_layout()
159
+ plt.savefig(plot_path)
160
+ plt.close()
161
+ # -------------------------
162
+ def _remove_nans(self, var : NPA, tgt : NPA) -> tuple[NPA,NPA]:
163
+ arr_nan_var = numpy.isnan(var)
164
+ arr_nan_tgt = numpy.isnan(tgt)
165
+ arr_is_nan = numpy.logical_or(arr_nan_var, arr_nan_tgt)
166
+ arr_not_nan = numpy.logical_not(arr_is_nan)
167
+
168
+ var = var[arr_not_nan]
169
+ tgt = tgt[arr_not_nan]
170
+
171
+ return var, tgt
172
+ # -------------------------
173
+ def _calculate_correlations(self, var : NPA, target : NPA, method : str) -> float:
174
+ var, target = self._remove_nans(var, target)
175
+
176
+ if method == 'Pearson':
177
+ mat = numpy.corrcoef(var, target)
178
+
179
+ return mat[0,1]
180
+
181
+ if method == r'Kendall-$\tau$':
182
+ tau, _ = kendalltau(var, target)
183
+
184
+ return tau
185
+
186
+ raise NotImplementedError(f'Correlation coefficient {method} not implemented')
187
+ # -------------------------
188
+ def _plot_cutflow(self) -> None:
189
+ if 'overlay' not in self._cfg['correlations']['target']:
190
+ log.debug('Not plotting cutflow of target distribution')
191
+ return
192
+
193
+ arr_score = self._get_scores()
194
+ arr_target= self._rdf.AsNumpy([self._target])[self._target]
195
+ arr_wp = self._cfg['correlations']['target']['overlay']['wp']
196
+ rdf = RDF.FromNumpy({'Score' : arr_score, self._target : arr_target})
197
+
198
+ d_rdf = {}
199
+ for wp in arr_wp:
200
+ name = f'WP > {wp:.2}'
201
+ expr = f'Score > {wp:.3}'
202
+ d_rdf[name] = rdf.Filter(expr)
203
+
204
+ cfg_target = self._cfg['correlations']['target']['overlay']
205
+
206
+ ptr=Plotter(d_rdf=d_rdf, cfg=cfg_target)
207
+ ptr.run()
208
+ # -------------------------
209
+ def run(self) -> None:
210
+ '''
211
+ Runs diagnostics
212
+ '''
213
+ if 'correlations' in self._cfg:
214
+ ax = None
215
+ for method in self._cfg['correlations']['methods']:
216
+ ax = self._run_correlations(method=method, ax=ax)
217
+
218
+ self._save_plot()
219
+
220
+ self._plot_cutflow()
221
+ # -------------------------