data-manipulation-utilities 0.2.6__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_manipulation_utilities-0.2.6/src/data_manipulation_utilities.egg-info → data_manipulation_utilities-0.2.7}/PKG-INFO +177 -8
- data_manipulation_utilities-0.2.6/PKG-INFO → data_manipulation_utilities-0.2.7/README.md +168 -24
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/pyproject.toml +5 -3
- data_manipulation_utilities-0.2.6/README.md → data_manipulation_utilities-0.2.7/src/data_manipulation_utilities.egg-info/PKG-INFO +193 -4
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/data_manipulation_utilities.egg-info/SOURCES.txt +12 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/data_manipulation_utilities.egg-info/requires.txt +9 -2
- data_manipulation_utilities-0.2.7/src/dmu/generic/hashing.py +44 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/generic/utilities.py +14 -1
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/generic/version_management.py +3 -5
- data_manipulation_utilities-0.2.7/src/dmu/ml/cv_diagnostics.py +221 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/ml/train_mva.py +124 -31
- data_manipulation_utilities-0.2.7/src/dmu/pdataframe/utilities.py +69 -0
- data_manipulation_utilities-0.2.7/src/dmu/plotting/fwhm.py +64 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/plotting/plotter.py +2 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/plotting/plotter_1d.py +87 -6
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/stats/fitter.py +1 -1
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/stats/model_factory.py +189 -25
- data_manipulation_utilities-0.2.7/src/dmu/stats/zfit_models.py +68 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/stats/zfit_plotter.py +29 -21
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/testing/utilities.py +31 -4
- data_manipulation_utilities-0.2.7/src/dmu_data/ml/tests/diagnostics_from_file.yaml +13 -0
- data_manipulation_utilities-0.2.7/src/dmu_data/ml/tests/diagnostics_from_model.yaml +10 -0
- data_manipulation_utilities-0.2.7/src/dmu_data/ml/tests/diagnostics_multiple_methods.yaml +10 -0
- data_manipulation_utilities-0.2.7/src/dmu_data/ml/tests/diagnostics_overlay.yaml +33 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/ml/tests/train_mva.yaml +15 -9
- data_manipulation_utilities-0.2.7/src/dmu_data/ml/tests/train_mva_with_diagnostics.yaml +82 -0
- data_manipulation_utilities-0.2.7/src/dmu_data/plotting/tests/plug_fwhm.yaml +24 -0
- data_manipulation_utilities-0.2.7/src/dmu_data/plotting/tests/plug_stats.yaml +19 -0
- data_manipulation_utilities-0.2.7/src/dmu_data/plotting/tests/simple.yaml +9 -0
- data_manipulation_utilities-0.2.7/src/dmu_data/plotting/tests/styling.yaml +11 -0
- data_manipulation_utilities-0.2.6/src/dmu/pdataframe/utilities.py +0 -36
- data_manipulation_utilities-0.2.6/src/dmu_data/plotting/tests/simple.yaml +0 -8
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/setup.cfg +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/data_manipulation_utilities.egg-info/dependency_links.txt +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/data_manipulation_utilities.egg-info/entry_points.txt +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/data_manipulation_utilities.egg-info/top_level.txt +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/arrays/utilities.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/logging/log_store.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/ml/cv_classifier.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/ml/cv_predict.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/ml/utilities.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/plotting/matrix.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/plotting/plotter_2d.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/plotting/utilities.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/rdataframe/atr_mgr.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/rdataframe/utilities.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/rfile/rfprinter.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/rfile/utilities.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/stats/function.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/stats/gof_calculator.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/stats/minimizers.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/stats/utilities.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu/text/transformer.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/__init__.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/2d.yaml +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/fig_size.yaml +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/high_stat.yaml +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/legend.yaml +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/name.yaml +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/no_bounds.yaml +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/normalized.yaml +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/stats.yaml +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/title.yaml +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/weights.yaml +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/text/transform.toml +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/text/transform.txt +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/text/transform_set.toml +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/text/transform_set.txt +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_data/text/transform_trf.txt +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_scripts/git/publish +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_scripts/physics/check_truth.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_scripts/rfile/compare_root_files.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_scripts/rfile/print_trees.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_scripts/ssh/coned.py +0 -0
- {data_manipulation_utilities-0.2.6 → data_manipulation_utilities-0.2.7}/src/dmu_scripts/text/transform_text.py +0 -0
@@ -1,20 +1,25 @@
|
|
1
|
-
Metadata-Version: 2.2
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: data_manipulation_utilities
|
3
|
-
Version: 0.2.6
|
3
|
+
Version: 0.2.7
|
4
4
|
Description-Content-Type: text/markdown
|
5
5
|
Requires-Dist: logzero
|
6
6
|
Requires-Dist: PyYAML
|
7
7
|
Requires-Dist: scipy
|
8
8
|
Requires-Dist: awkward
|
9
9
|
Requires-Dist: tqdm
|
10
|
-
Requires-Dist: joblib
|
11
|
-
Requires-Dist: scikit-learn
|
10
|
+
Requires-Dist: numpy
|
12
11
|
Requires-Dist: toml
|
13
12
|
Requires-Dist: numpy
|
14
13
|
Requires-Dist: matplotlib
|
15
14
|
Requires-Dist: mplhep
|
16
15
|
Requires-Dist: hist[plot]
|
17
16
|
Requires-Dist: pandas
|
17
|
+
Provides-Extra: fit
|
18
|
+
Requires-Dist: zfit; extra == "fit"
|
19
|
+
Requires-Dist: tensorflow==2.18.0; extra == "fit"
|
20
|
+
Provides-Extra: ml
|
21
|
+
Requires-Dist: scikit-learn; extra == "ml"
|
22
|
+
Requires-Dist: joblib; extra == "ml"
|
18
23
|
Provides-Extra: dev
|
19
24
|
Requires-Dist: pytest; extra == "dev"
|
20
25
|
|
@@ -51,6 +56,25 @@ Then, for each remote it pushes the tags and the commits.
|
|
51
56
|
|
52
57
|
This section describes generic tools that could not be put in a specific category, but tend to be useful.
|
53
58
|
|
59
|
+
## Hashing
|
60
|
+
|
61
|
+
The snippet below:
|
62
|
+
|
63
|
+
```python
|
64
|
+
from dmu.generic import hashing
|
65
|
+
|
66
|
+
obj = [1, 'name', [1, 'sub', 'list'], {'x' : 1}]
|
67
|
+
val = hashing.hash_object(obj)
|
68
|
+
```
|
69
|
+
|
70
|
+
will:
|
71
|
+
|
72
|
+
- Make the input object into a JSON string
|
73
|
+
- Encode it to utf-8
|
74
|
+
- Make a 64-character hash out of it
|
75
|
+
|
76
|
+
in two lines, thus keeping the user's code clean.
|
77
|
+
|
54
78
|
## Timer
|
55
79
|
|
56
80
|
In order to benchmark functions do:
|
@@ -67,9 +91,9 @@ def fun():
|
|
67
91
|
fun()
|
68
92
|
```
|
69
93
|
|
70
|
-
## JSON dumper
|
94
|
+
## JSON dumper and loader
|
71
95
|
|
72
|
-
The following lines will dump data (dictionaries, lists, etc) to a JSON file:
|
96
|
+
The following lines will dump data (dictionaries, lists, etc) to a JSON file and load it back:
|
73
97
|
|
74
98
|
```python
|
75
99
|
import dmu.generic.utilities as gut
|
@@ -77,8 +101,11 @@ import dmu.generic.utilities as gut
|
|
77
101
|
data = [1,2,3,4]
|
78
102
|
|
79
103
|
gut.dump_json(data, '/tmp/list.json')
|
104
|
+
data = gut.load_json('/tmp/list.json')
|
80
105
|
```
|
81
106
|
|
107
|
+
and it's meant to allow the user to bypass all the boilerplate and keep their code brief.
|
108
|
+
|
82
109
|
# Physics
|
83
110
|
|
84
111
|
## Truth matching
|
@@ -132,7 +159,8 @@ from dmu.stats.model_factory import ModelFactory
|
|
132
159
|
|
133
160
|
l_pdf = ['cbr'] + 2 * ['cbl']
|
134
161
|
l_shr = ['mu', 'sg']
|
135
|
-
|
162
|
+
d_fix = {'al_cbl' : 3, 'nr_cbr' : 1} # This is optional and will fix two parameters whose names start with the keys
|
163
|
+
mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr, d_fix=d_fix)
|
136
164
|
pdf = mod.get_pdf()
|
137
165
|
```
|
138
166
|
|
@@ -145,10 +173,40 @@ pol1: Polynomial of degree 1
|
|
145
173
|
pol2: Polynomial of degree 2
|
146
174
|
cbr : CrystallBall with right tail
|
147
175
|
cbl : CrystallBall with left tail
|
148
|
-
gauss : Gaussian
|
176
|
+
gauss : Gaussian
|
149
177
|
dscb : Double sided CrystallBall
|
150
178
|
```
|
151
179
|
|
180
|
+
### Model building with reparametrizations
|
181
|
+
|
182
|
+
In order to introduce reparametrizations for the means and the resolutions, such that:
|
183
|
+
|
184
|
+
$\mu\to\mu+\Delta\mu$
|
185
|
+
$\sigma\to\sigma\cdot s_{\sigma}$
|
186
|
+
|
187
|
+
where the reparametrized $\mu$ and $\sigma$ are constant, while the scale and resolution are floating, do:
|
188
|
+
|
189
|
+
```python
|
190
|
+
import zfit
|
191
|
+
from dmu.stats.model_factory import ModelFactory
|
192
|
+
|
193
|
+
l_shr = ['mu', 'sg']
|
194
|
+
l_flt = []
|
195
|
+
d_rep = {'mu' : 'scale', 'sg' : 'reso'}
|
196
|
+
obs = zfit.Space('mass', limits=(5080, 5680))
|
197
|
+
|
198
|
+
mod = ModelFactory(
|
199
|
+
preffix = name,
|
200
|
+
obs = obs,
|
201
|
+
l_pdf = l_name,
|
202
|
+
d_rep = d_rep,
|
203
|
+
l_shared= l_shr,
|
204
|
+
l_float = l_flt)
|
205
|
+
pdf = mod.get_pdf()
|
206
|
+
```
|
207
|
+
|
208
|
+
Here, the floating parameters **should not** be the same as the reparametrized ones.
|
209
|
+
|
152
210
|
### Printing PDFs
|
153
211
|
|
154
212
|
One can print a zfit PDF by doing:
|
@@ -552,6 +610,58 @@ When evaluating the model with real data, problems might occur, we deal with the
|
|
552
610
|
- For whatever features that are still NaN, they will be _patched_ with zeros when evaluated. However, the returned probabilities will be
|
553
611
|
saved as -1. I.e. entries with NaNs will have probabilities of -1.
|
554
612
|
|
613
|
+
## Diagnostics
|
614
|
+
|
615
|
+
To run diagnostics on the trained model do:
|
616
|
+
|
617
|
+
```python
|
618
|
+
from dmu.ml.cv_diagnostics import CVDiagnostics
|
619
|
+
|
620
|
+
# Where l_model is the list of models and cfg is a dictionary with the config
|
621
|
+
cvd = CVDiagnostics(models=l_model, rdf=rdf, cfg=cfg)
|
622
|
+
cvd.run()
|
623
|
+
```
|
624
|
+
|
625
|
+
the configuration can be loaded from a YAML file and would look like:
|
626
|
+
|
627
|
+
```yaml
|
628
|
+
# Directory where plots will go
|
629
|
+
output : /tmp/tests/dmu/ml/cv_diagnostics/overlay
|
630
|
+
# Optional, will assume that the target is already in the input dataframe
|
631
|
+
# and will use it, instead of evaluating models
|
632
|
+
score_from_rdf : mva
|
633
|
+
correlations:
|
634
|
+
# Variables with respect to which the correlations with the features will be measured
|
635
|
+
target :
|
636
|
+
name : mass
|
637
|
+
overlay :
|
638
|
+
wp :
|
639
|
+
- 0.2
|
640
|
+
- 0.5
|
641
|
+
- 0.7
|
642
|
+
- 0.9
|
643
|
+
general:
|
644
|
+
size : [20, 10]
|
645
|
+
saving:
|
646
|
+
plt_dir : /tmp/tests/dmu/ml/cv_diagnostics/from_rdf
|
647
|
+
plots:
|
648
|
+
z :
|
649
|
+
binning : [1000, 4000, 30]
|
650
|
+
yscale : 'linear'
|
651
|
+
labels : ['mass', 'Entries']
|
652
|
+
normalized : true
|
653
|
+
styling :
|
654
|
+
linestyle: '-' # By default there is no line, just points
|
655
|
+
methods:
|
656
|
+
- Pearson
|
657
|
+
- Kendall-$\tau$
|
658
|
+
figure:
|
659
|
+
title: Scores from file
|
660
|
+
size : [10, 8]
|
661
|
+
xlabelsize: 18 # Controls size of x axis labels. By default 30
|
662
|
+
rotate : 60 # Will rotate xlabels by 60 degrees
|
663
|
+
```
|
664
|
+
|
555
665
|
# Pandas dataframes
|
556
666
|
|
557
667
|
## Utilities
|
@@ -582,6 +692,19 @@ put.df_to_tex(df,
|
|
582
692
|
caption = 'some caption')
|
583
693
|
```
|
584
694
|
|
695
|
+
### Dataframe to and from YAML
|
696
|
+
|
697
|
+
This extends the existing JSON functionality
|
698
|
+
|
699
|
+
```python
|
700
|
+
import dmu.pdataframe.utilities as put
|
701
|
+
|
702
|
+
df_1 = _get_df()
|
703
|
+
put.to_yaml(df_1, yml_path)
|
704
|
+
df_2 = put.from_yaml(yml_path)
|
705
|
+
```
|
706
|
+
|
707
|
+
and is meant to be less verbose than doing it through the YAML module.
|
585
708
|
# Rdataframes
|
586
709
|
|
587
710
|
These are utility functions meant to be used with ROOT dataframes.
|
@@ -707,6 +830,11 @@ plots:
|
|
707
830
|
labels : ['x', 'Entries'] # Labels are optional, will use varname and Entries as labels if not present
|
708
831
|
title : 'some title can be added for different variable plots'
|
709
832
|
name : 'plot_of_x' # This will ensure that one gets plot_of_x.png as a result, if missing x.png would be saved
|
833
|
+
# Can add styling to specific plots, this should be the argument of
|
834
|
+
# hist.plot(...)
|
835
|
+
styling :
|
836
|
+
label : x
|
837
|
+
linestyle: '-'
|
710
838
|
y :
|
711
839
|
binning : [-5.0, 8.0, 40]
|
712
840
|
yscale : 'linear'
|
@@ -730,6 +858,47 @@ stats:
|
|
730
858
|
|
731
859
|
it's up to the user to build this dictionary and load it.
|
732
860
|
|
861
|
+
### Plugins
|
862
|
+
|
863
|
+
Extra functionality can be `plugged` into the code by using the `plugin` section like:
|
864
|
+
|
865
|
+
#### FWHM
|
866
|
+
```yaml
|
867
|
+
plugin:
|
868
|
+
fwhm:
|
869
|
+
# Can control each variable fit separately
|
870
|
+
x :
|
871
|
+
plot : true
|
872
|
+
obs : [-2, 4]
|
873
|
+
plot : true
|
874
|
+
format : FWHM={:.3f}
|
875
|
+
add_std: True
|
876
|
+
y :
|
877
|
+
plot : true
|
878
|
+
obs : [-4, 8]
|
879
|
+
plot : true
|
880
|
+
format : FWHM={:.3f}
|
881
|
+
add_std: True
|
882
|
+
```
|
883
|
+
|
884
|
+
where the section will
|
885
|
+
|
886
|
+
- Use a KDE to fit the distribution and plot it on top of the histogram
|
887
|
+
- Add the value of the Full Width at Half Maximum in the title, for each distribution with a specific formatting.
|
888
|
+
|
889
|
+
#### stats
|
890
|
+
|
891
|
+
```yaml
|
892
|
+
plugin:
|
893
|
+
stats:
|
894
|
+
x :
|
895
|
+
mean : $\mu$={:.2f}
|
896
|
+
rms : $\sigma$={:.2f}
|
897
|
+
sum : $\Sigma$={:.0f}
|
898
|
+
```
|
899
|
+
|
900
|
+
Can be used to print statistics, mean, rms and weighted sum of entries for each distribution.
|
901
|
+
|
733
902
|
## 2D plots
|
734
903
|
|
735
904
|
For the 2D case it would look like:
|
@@ -1,23 +1,3 @@
|
|
1
|
-
Metadata-Version: 2.2
|
2
|
-
Name: data_manipulation_utilities
|
3
|
-
Version: 0.2.6
|
4
|
-
Description-Content-Type: text/markdown
|
5
|
-
Requires-Dist: logzero
|
6
|
-
Requires-Dist: PyYAML
|
7
|
-
Requires-Dist: scipy
|
8
|
-
Requires-Dist: awkward
|
9
|
-
Requires-Dist: tqdm
|
10
|
-
Requires-Dist: joblib
|
11
|
-
Requires-Dist: scikit-learn
|
12
|
-
Requires-Dist: toml
|
13
|
-
Requires-Dist: numpy
|
14
|
-
Requires-Dist: matplotlib
|
15
|
-
Requires-Dist: mplhep
|
16
|
-
Requires-Dist: hist[plot]
|
17
|
-
Requires-Dist: pandas
|
18
|
-
Provides-Extra: dev
|
19
|
-
Requires-Dist: pytest; extra == "dev"
|
20
|
-
|
21
1
|
# D(ata) M(anipulation) U(tilities)
|
22
2
|
|
23
3
|
These are tools that can be used for different data analysis tasks.
|
@@ -51,6 +31,25 @@ Then, for each remote it pushes the tags and the commits.
|
|
51
31
|
|
52
32
|
This section describes generic tools that could not be put in a specific category, but tend to be useful.
|
53
33
|
|
34
|
+
## Hashing
|
35
|
+
|
36
|
+
The snippet below:
|
37
|
+
|
38
|
+
```python
|
39
|
+
from dmu.generic import hashing
|
40
|
+
|
41
|
+
obj = [1, 'name', [1, 'sub', 'list'], {'x' : 1}]
|
42
|
+
val = hashing.hash_object(obj)
|
43
|
+
```
|
44
|
+
|
45
|
+
will:
|
46
|
+
|
47
|
+
- Make the input object into a JSON string
|
48
|
+
- Encode it to utf-8
|
49
|
+
- Make a 64-character hash out of it
|
50
|
+
|
51
|
+
in two lines, thus keeping the user's code clean.
|
52
|
+
|
54
53
|
## Timer
|
55
54
|
|
56
55
|
In order to benchmark functions do:
|
@@ -67,9 +66,9 @@ def fun():
|
|
67
66
|
fun()
|
68
67
|
```
|
69
68
|
|
70
|
-
## JSON dumper
|
69
|
+
## JSON dumper and loader
|
71
70
|
|
72
|
-
The following lines will dump data (dictionaries, lists, etc) to a JSON file:
|
71
|
+
The following lines will dump data (dictionaries, lists, etc) to a JSON file and load it back:
|
73
72
|
|
74
73
|
```python
|
75
74
|
import dmu.generic.utilities as gut
|
@@ -77,8 +76,11 @@ import dmu.generic.utilities as gut
|
|
77
76
|
data = [1,2,3,4]
|
78
77
|
|
79
78
|
gut.dump_json(data, '/tmp/list.json')
|
79
|
+
data = gut.load_json('/tmp/list.json')
|
80
80
|
```
|
81
81
|
|
82
|
+
and it's meant to allow the user to bypass all the boilerplate and keep their code brief.
|
83
|
+
|
82
84
|
# Physics
|
83
85
|
|
84
86
|
## Truth matching
|
@@ -132,7 +134,8 @@ from dmu.stats.model_factory import ModelFactory
|
|
132
134
|
|
133
135
|
l_pdf = ['cbr'] + 2 * ['cbl']
|
134
136
|
l_shr = ['mu', 'sg']
|
135
|
-
|
137
|
+
d_fix = {'al_cbl' : 3, 'nr_cbr' : 1} # This is optional and will fix two parameters whose names start with the keys
|
138
|
+
mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr, d_fix=d_fix)
|
136
139
|
pdf = mod.get_pdf()
|
137
140
|
```
|
138
141
|
|
@@ -145,10 +148,40 @@ pol1: Polynomial of degree 1
|
|
145
148
|
pol2: Polynomial of degree 2
|
146
149
|
cbr : CrystallBall with right tail
|
147
150
|
cbl : CrystallBall with left tail
|
148
|
-
gauss : Gaussian
|
151
|
+
gauss : Gaussian
|
149
152
|
dscb : Double sided CrystallBall
|
150
153
|
```
|
151
154
|
|
155
|
+
### Model building with reparametrizations
|
156
|
+
|
157
|
+
In order to introduce reparametrizations for the means and the resolutions, such that:
|
158
|
+
|
159
|
+
$\mu\to\mu+\Delta\mu$
|
160
|
+
$\sigma\to\sigma\cdot s_{\sigma}$
|
161
|
+
|
162
|
+
where the reparametrized $\mu$ and $\sigma$ are constant, while the scale and resolution are floating, do:
|
163
|
+
|
164
|
+
```python
|
165
|
+
import zfit
|
166
|
+
from dmu.stats.model_factory import ModelFactory
|
167
|
+
|
168
|
+
l_shr = ['mu', 'sg']
|
169
|
+
l_flt = []
|
170
|
+
d_rep = {'mu' : 'scale', 'sg' : 'reso'}
|
171
|
+
obs = zfit.Space('mass', limits=(5080, 5680))
|
172
|
+
|
173
|
+
mod = ModelFactory(
|
174
|
+
preffix = name,
|
175
|
+
obs = obs,
|
176
|
+
l_pdf = l_name,
|
177
|
+
d_rep = d_rep,
|
178
|
+
l_shared= l_shr,
|
179
|
+
l_float = l_flt)
|
180
|
+
pdf = mod.get_pdf()
|
181
|
+
```
|
182
|
+
|
183
|
+
Here, the floating parameters **should not** be the same as the reparametrized ones.
|
184
|
+
|
152
185
|
### Printing PDFs
|
153
186
|
|
154
187
|
One can print a zfit PDF by doing:
|
@@ -552,6 +585,58 @@ When evaluating the model with real data, problems might occur, we deal with the
|
|
552
585
|
- For whatever features that are still NaN, they will be _patched_ with zeros when evaluated. However, the returned probabilities will be
|
553
586
|
saved as -1. I.e. entries with NaNs will have probabilities of -1.
|
554
587
|
|
588
|
+
## Diagnostics
|
589
|
+
|
590
|
+
To run diagnostics on the trained model do:
|
591
|
+
|
592
|
+
```python
|
593
|
+
from dmu.ml.cv_diagnostics import CVDiagnostics
|
594
|
+
|
595
|
+
# Where l_model is the list of models and cfg is a dictionary with the config
|
596
|
+
cvd = CVDiagnostics(models=l_model, rdf=rdf, cfg=cfg)
|
597
|
+
cvd.run()
|
598
|
+
```
|
599
|
+
|
600
|
+
the configuration can be loaded from a YAML file and would look like:
|
601
|
+
|
602
|
+
```yaml
|
603
|
+
# Directory where plots will go
|
604
|
+
output : /tmp/tests/dmu/ml/cv_diagnostics/overlay
|
605
|
+
# Optional, will assume that the target is already in the input dataframe
|
606
|
+
# and will use it, instead of evaluating models
|
607
|
+
score_from_rdf : mva
|
608
|
+
correlations:
|
609
|
+
# Variables with respect to which the correlations with the features will be measured
|
610
|
+
target :
|
611
|
+
name : mass
|
612
|
+
overlay :
|
613
|
+
wp :
|
614
|
+
- 0.2
|
615
|
+
- 0.5
|
616
|
+
- 0.7
|
617
|
+
- 0.9
|
618
|
+
general:
|
619
|
+
size : [20, 10]
|
620
|
+
saving:
|
621
|
+
plt_dir : /tmp/tests/dmu/ml/cv_diagnostics/from_rdf
|
622
|
+
plots:
|
623
|
+
z :
|
624
|
+
binning : [1000, 4000, 30]
|
625
|
+
yscale : 'linear'
|
626
|
+
labels : ['mass', 'Entries']
|
627
|
+
normalized : true
|
628
|
+
styling :
|
629
|
+
linestyle: '-' # By default there is no line, just points
|
630
|
+
methods:
|
631
|
+
- Pearson
|
632
|
+
- Kendall-$\tau$
|
633
|
+
figure:
|
634
|
+
title: Scores from file
|
635
|
+
size : [10, 8]
|
636
|
+
xlabelsize: 18 # Controls size of x axis labels. By default 30
|
637
|
+
rotate : 60 # Will rotate xlabels by 60 degrees
|
638
|
+
```
|
639
|
+
|
555
640
|
# Pandas dataframes
|
556
641
|
|
557
642
|
## Utilities
|
@@ -582,6 +667,19 @@ put.df_to_tex(df,
|
|
582
667
|
caption = 'some caption')
|
583
668
|
```
|
584
669
|
|
670
|
+
### Dataframe to and from YAML
|
671
|
+
|
672
|
+
This extends the existing JSON functionality
|
673
|
+
|
674
|
+
```python
|
675
|
+
import dmu.pdataframe.utilities as put
|
676
|
+
|
677
|
+
df_1 = _get_df()
|
678
|
+
put.to_yaml(df_1, yml_path)
|
679
|
+
df_2 = put.from_yaml(yml_path)
|
680
|
+
```
|
681
|
+
|
682
|
+
and is meant to be less verbose than doing it through the YAML module.
|
585
683
|
# Rdataframes
|
586
684
|
|
587
685
|
These are utility functions meant to be used with ROOT dataframes.
|
@@ -707,6 +805,11 @@ plots:
|
|
707
805
|
labels : ['x', 'Entries'] # Labels are optional, will use varname and Entries as labels if not present
|
708
806
|
title : 'some title can be added for different variable plots'
|
709
807
|
name : 'plot_of_x' # This will ensure that one gets plot_of_x.png as a result, if missing x.png would be saved
|
808
|
+
# Can add styling to specific plots, this should be the argument of
|
809
|
+
# hist.plot(...)
|
810
|
+
styling :
|
811
|
+
label : x
|
812
|
+
linestyle: '-'
|
710
813
|
y :
|
711
814
|
binning : [-5.0, 8.0, 40]
|
712
815
|
yscale : 'linear'
|
@@ -730,6 +833,47 @@ stats:
|
|
730
833
|
|
731
834
|
it's up to the user to build this dictionary and load it.
|
732
835
|
|
836
|
+
### Plugins
|
837
|
+
|
838
|
+
Extra functionality can be `plugged` into the code by using the `plugin` section like:
|
839
|
+
|
840
|
+
#### FWHM
|
841
|
+
```yaml
|
842
|
+
plugin:
|
843
|
+
fwhm:
|
844
|
+
# Can control each variable fit separately
|
845
|
+
x :
|
846
|
+
plot : true
|
847
|
+
obs : [-2, 4]
|
848
|
+
plot : true
|
849
|
+
format : FWHM={:.3f}
|
850
|
+
add_std: True
|
851
|
+
y :
|
852
|
+
plot : true
|
853
|
+
obs : [-4, 8]
|
854
|
+
plot : true
|
855
|
+
format : FWHM={:.3f}
|
856
|
+
add_std: True
|
857
|
+
```
|
858
|
+
|
859
|
+
where the section will
|
860
|
+
|
861
|
+
- Use a KDE to fit the distribution and plot it on top of the histogram
|
862
|
+
- Add the value of the Full Width at Half Maximum in the title, for each distribution with a specific formatting.
|
863
|
+
|
864
|
+
#### stats
|
865
|
+
|
866
|
+
```yaml
|
867
|
+
plugin:
|
868
|
+
stats:
|
869
|
+
x :
|
870
|
+
mean : $\mu$={:.2f}
|
871
|
+
rms : $\sigma$={:.2f}
|
872
|
+
sum : $\Sigma$={:.0f}
|
873
|
+
```
|
874
|
+
|
875
|
+
Can be used to print statistics, mean, rms and weighted sum of entries for each distribution.
|
876
|
+
|
733
877
|
## 2D plots
|
734
878
|
|
735
879
|
For the 2D case it would look like:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = 'data_manipulation_utilities'
|
3
|
-
version = '0.2.6'
|
3
|
+
version = '0.2.7'
|
4
4
|
readme = 'README.md'
|
5
5
|
dependencies= [
|
6
6
|
'logzero',
|
@@ -8,8 +8,7 @@ dependencies= [
|
|
8
8
|
'scipy',
|
9
9
|
'awkward',
|
10
10
|
'tqdm',
|
11
|
-
'joblib',
|
12
|
-
'scikit-learn',
|
11
|
+
'numpy',
|
13
12
|
'toml',
|
14
13
|
'numpy',
|
15
14
|
'matplotlib',
|
@@ -18,6 +17,9 @@ dependencies= [
|
|
18
17
|
'pandas']
|
19
18
|
|
20
19
|
[project.optional-dependencies]
|
20
|
+
# Use latest tensorflow allowed by zfit
|
21
|
+
fit = ['zfit','tensorflow==2.18.0']
|
22
|
+
ml = ['scikit-learn', 'joblib']
|
21
23
|
dev = ['pytest']
|
22
24
|
|
23
25
|
[tools.setuptools.packages.find]
|