data-manipulation-utilities 0.2.5__tar.gz → 0.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. {data_manipulation_utilities-0.2.5/src/data_manipulation_utilities.egg-info → data_manipulation_utilities-0.2.7}/PKG-INFO +179 -10
  2. data_manipulation_utilities-0.2.5/PKG-INFO → data_manipulation_utilities-0.2.7/README.md +170 -26
  3. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/pyproject.toml +5 -3
  4. data_manipulation_utilities-0.2.5/README.md → data_manipulation_utilities-0.2.7/src/data_manipulation_utilities.egg-info/PKG-INFO +195 -6
  5. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/data_manipulation_utilities.egg-info/SOURCES.txt +12 -0
  6. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/data_manipulation_utilities.egg-info/requires.txt +9 -2
  7. data_manipulation_utilities-0.2.7/src/dmu/generic/hashing.py +44 -0
  8. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/generic/utilities.py +14 -1
  9. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/generic/version_management.py +3 -5
  10. data_manipulation_utilities-0.2.7/src/dmu/ml/cv_diagnostics.py +221 -0
  11. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/ml/train_mva.py +143 -46
  12. data_manipulation_utilities-0.2.7/src/dmu/pdataframe/utilities.py +69 -0
  13. data_manipulation_utilities-0.2.7/src/dmu/plotting/fwhm.py +64 -0
  14. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/plotting/plotter.py +2 -0
  15. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/plotting/plotter_1d.py +87 -6
  16. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/stats/fitter.py +1 -1
  17. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/stats/minimizers.py +40 -11
  18. data_manipulation_utilities-0.2.7/src/dmu/stats/model_factory.py +417 -0
  19. data_manipulation_utilities-0.2.7/src/dmu/stats/zfit_models.py +68 -0
  20. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/stats/zfit_plotter.py +29 -21
  21. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/testing/utilities.py +31 -4
  22. data_manipulation_utilities-0.2.7/src/dmu_data/ml/tests/diagnostics_from_file.yaml +13 -0
  23. data_manipulation_utilities-0.2.7/src/dmu_data/ml/tests/diagnostics_from_model.yaml +10 -0
  24. data_manipulation_utilities-0.2.7/src/dmu_data/ml/tests/diagnostics_multiple_methods.yaml +10 -0
  25. data_manipulation_utilities-0.2.7/src/dmu_data/ml/tests/diagnostics_overlay.yaml +33 -0
  26. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/ml/tests/train_mva.yaml +19 -10
  27. data_manipulation_utilities-0.2.7/src/dmu_data/ml/tests/train_mva_with_diagnostics.yaml +82 -0
  28. data_manipulation_utilities-0.2.7/src/dmu_data/plotting/tests/plug_fwhm.yaml +24 -0
  29. data_manipulation_utilities-0.2.7/src/dmu_data/plotting/tests/plug_stats.yaml +19 -0
  30. data_manipulation_utilities-0.2.7/src/dmu_data/plotting/tests/simple.yaml +9 -0
  31. data_manipulation_utilities-0.2.7/src/dmu_data/plotting/tests/styling.yaml +11 -0
  32. data_manipulation_utilities-0.2.5/src/dmu/pdataframe/utilities.py +0 -36
  33. data_manipulation_utilities-0.2.5/src/dmu/stats/model_factory.py +0 -213
  34. data_manipulation_utilities-0.2.5/src/dmu_data/plotting/tests/simple.yaml +0 -8
  35. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/setup.cfg +0 -0
  36. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/data_manipulation_utilities.egg-info/dependency_links.txt +0 -0
  37. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/data_manipulation_utilities.egg-info/entry_points.txt +0 -0
  38. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/data_manipulation_utilities.egg-info/top_level.txt +0 -0
  39. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/arrays/utilities.py +0 -0
  40. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/logging/log_store.py +0 -0
  41. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/ml/cv_classifier.py +0 -0
  42. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/ml/cv_predict.py +0 -0
  43. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/ml/utilities.py +0 -0
  44. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/plotting/matrix.py +0 -0
  45. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/plotting/plotter_2d.py +0 -0
  46. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/plotting/utilities.py +0 -0
  47. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/rdataframe/atr_mgr.py +0 -0
  48. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/rdataframe/utilities.py +0 -0
  49. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/rfile/rfprinter.py +0 -0
  50. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/rfile/utilities.py +0 -0
  51. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/stats/function.py +0 -0
  52. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/stats/gof_calculator.py +0 -0
  53. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/stats/utilities.py +0 -0
  54. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu/text/transformer.py +0 -0
  55. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/__init__.py +0 -0
  56. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/2d.yaml +0 -0
  57. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/fig_size.yaml +0 -0
  58. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/high_stat.yaml +0 -0
  59. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/legend.yaml +0 -0
  60. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/name.yaml +0 -0
  61. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/no_bounds.yaml +0 -0
  62. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/normalized.yaml +0 -0
  63. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/stats.yaml +0 -0
  64. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/title.yaml +0 -0
  65. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/plotting/tests/weights.yaml +0 -0
  66. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/text/transform.toml +0 -0
  67. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/text/transform.txt +0 -0
  68. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/text/transform_set.toml +0 -0
  69. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/text/transform_set.txt +0 -0
  70. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_data/text/transform_trf.txt +0 -0
  71. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_scripts/git/publish +0 -0
  72. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_scripts/physics/check_truth.py +0 -0
  73. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_scripts/rfile/compare_root_files.py +0 -0
  74. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_scripts/rfile/print_trees.py +0 -0
  75. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_scripts/ssh/coned.py +0 -0
  76. {data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/src/dmu_scripts/text/transform_text.py +0 -0
{data_manipulation_utilities-0.2.5/src/data_manipulation_utilities.egg-info → data_manipulation_utilities-0.2.7}/PKG-INFO

@@ -1,20 +1,25 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: data_manipulation_utilities
- Version: 0.2.5
+ Version: 0.2.7
  Description-Content-Type: text/markdown
  Requires-Dist: logzero
  Requires-Dist: PyYAML
  Requires-Dist: scipy
  Requires-Dist: awkward
  Requires-Dist: tqdm
- Requires-Dist: joblib
- Requires-Dist: scikit-learn
+ Requires-Dist: numpy
  Requires-Dist: toml
  Requires-Dist: numpy
  Requires-Dist: matplotlib
  Requires-Dist: mplhep
  Requires-Dist: hist[plot]
  Requires-Dist: pandas
+ Provides-Extra: fit
+ Requires-Dist: zfit; extra == "fit"
+ Requires-Dist: tensorflow==2.18.0; extra == "fit"
+ Provides-Extra: ml
+ Requires-Dist: scikit-learn; extra == "ml"
+ Requires-Dist: joblib; extra == "ml"
  Provides-Extra: dev
  Requires-Dist: pytest; extra == "dev"

@@ -51,6 +56,25 @@ Then, for each remote it pushes the tags and the commits.

  This section describes generic tools that could not be put in a specific category, but tend to be useful.

+ ## Hashing
+
+ The snippet below:
+
+ ```python
+ from dmu.generic import hashing
+
+ obj = [1, 'name', [1, 'sub', 'list'], {'x' : 1}]
+ val = hashing.hash_object(obj)
+ ```
+
+ will:
+
+ - Make the input object into a JSON string
+ - Encode it to utf-8
+ - Make a 64-character hash out of it
+
+ in two lines, thus keeping the user's code clean.
+
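For reference, the recipe listed above amounts to something like the following sketch (assuming a SHA-256 digest; the actual implementation of `hash_object` in `dmu.generic.hashing` may differ):

```python
# Sketch only: JSON-serialize, encode to utf-8, take a 64-character hex digest.
import json
import hashlib

def hash_object_sketch(obj) -> str:
    text = json.dumps(obj)                    # make the object into a JSON string
    data = text.encode('utf-8')               # encode it to utf-8
    return hashlib.sha256(data).hexdigest()   # 64-character hash

val = hash_object_sketch([1, 'name', [1, 'sub', 'list'], {'x': 1}])
assert len(val) == 64
```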
  ## Timer

  In order to benchmark functions do:
@@ -67,9 +91,9 @@ def fun():
  fun()
  ```

- ## JSON dumper
+ ## JSON dumper and loader

- The following lines will dump data (dictionaries, lists, etc) to a JSON file:
+ The following lines will dump data (dictionaries, lists, etc) to a JSON file and load it back:

  ```python
  import dmu.generic.utilities as gut
@@ -77,8 +101,11 @@ import dmu.generic.utilities as gut
  data = [1,2,3,4]

  gut.dump_json(data, '/tmp/list.json')
+ data = gut.load_json('/tmp/list.json')
  ```

+ and it's meant to allow the user to bypass all the boilerplate and keep their code brief.
+
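The boilerplate being bypassed is roughly the following (a sketch of the plain standard-library route, not the package's code):

```python
# What dump_json/load_json roughly replace: open a file, json.dump, json.load.
import json

data = [1, 2, 3, 4]

with open('/tmp/list.json', 'w', encoding='utf-8') as ofile:
    json.dump(data, ofile)

with open('/tmp/list.json', encoding='utf-8') as ifile:
    data = json.load(ifile)
```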
  # Physics

  ## Truth matching
@@ -132,7 +159,8 @@ from dmu.stats.model_factory import ModelFactory

  l_pdf = ['cbr'] + 2 * ['cbl']
  l_shr = ['mu', 'sg']
- mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr)
+ d_fix = {'al_cbl' : 3, 'nr_cbr' : 1} # This is optional and will fix two parameters whose names start with the keys
+ mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr, d_fix=d_fix)
  pdf = mod.get_pdf()
  ```

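A sketch of what fixing parameters by name prefix amounts to, assuming zfit-style parameters with `name`, `set_value` and `floating` attributes (illustrative only, not `ModelFactory`'s code):

```python
# Illustrative only: fix any parameter of the pdf built above whose name
# starts with one of the keys of d_fix, and set it to the given value.
d_fix = {'al_cbl': 3, 'nr_cbr': 1}

for par in pdf.get_params():
    for prefix, value in d_fix.items():
        if par.name.startswith(prefix):
            par.set_value(value)
            par.floating = False
```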
@@ -145,10 +173,40 @@ pol1: Polynomial of degree 1
  pol2: Polynomial of degree 2
  cbr : CrystallBall with right tail
  cbl : CrystallBall with left tail
- gauss : Gaussian
+ gauss : Gaussian
  dscb : Double sided CrystallBall
  ```

+ ### Model building with reparametrizations
+
+ In order to introduce reparametrizations for the means and the resolutions, such that:
+
+ $\mu\to\mu+\Delta\mu$
+ $\sigma\to\sigma\cdot s_{\sigma}$
+
+ where the reparametrized $\mu$ and $\sigma$ are constant, while the scale and resolution are floating, do:
+
+ ```python
+ import zfit
+ from dmu.stats.model_factory import ModelFactory
+
+ l_shr = ['mu', 'sg']
+ l_flt = []
+ d_rep = {'mu' : 'scale', 'sg' : 'reso'}
+ obs = zfit.Space('mass', limits=(5080, 5680))
+
+ mod = ModelFactory(
+ preffix = name,
+ obs = obs,
+ l_pdf = l_name,
+ d_rep = d_rep,
+ l_shared= l_shr,
+ l_float = l_flt)
+ pdf = mod.get_pdf()
+ ```
+
+ Here, the floating parameters **should not** be the same as the reparametrized ones.
+
  ### Printing PDFs

  One can print a zfit PDF by doing:
@@ -427,7 +485,7 @@ rdf_bkg = _get_rdf(kind='bkg')
  cfg = _get_config()

  obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
- obj.run()
+ obj.run(skip_fit=False) # By default it is False; if True, it will only make plots of the features
  ```

  where the settings for the training go in a config dictionary, which when written to YAML looks like:
@@ -549,9 +607,61 @@ When evaluating the model with real data, problems might occur, we deal with the
  ```python
  model.cfg
  ```
- - For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
+ - Features that are still NaN will be _patched_ with zeros when evaluated. However, the returned probabilities will be
  saved as -1. I.e. entries with NaNs will have probabilities of -1.

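As an illustration of this NaN policy, a sketch with numpy and a scikit-learn-style classifier (not the package's exact code; the `model` and the column layout are assumptions):

```python
# Sketch: patch remaining NaN features with zeros for evaluation, then
# overwrite the probabilities of those entries with -1.
import numpy as np

def predict_with_nan_policy(model, features: np.ndarray) -> np.ndarray:
    has_nan = np.isnan(features).any(axis=1)      # rows that still contain NaNs
    patched = np.nan_to_num(features, nan=0.0)    # zero-patched copy for evaluation
    prob    = model.predict_proba(patched)[:, 1]  # signal probability
    prob[has_nan] = -1                            # flag NaN rows with -1
    return prob
```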
+ ## Diagnostics
+
+ To run diagnostics on the trained model do:
+
+ ```python
+ from dmu.ml.cv_diagnostics import CVDiagnostics
+
+ # Where l_model is the list of models and cfg is a dictionary with the config
+ cvd = CVDiagnostics(models=l_model, rdf=rdf, cfg=cfg)
+ cvd.run()
+ ```
+
+ The configuration can be loaded from a YAML file and would look like:
+
+ ```yaml
+ # Directory where plots will go
+ output : /tmp/tests/dmu/ml/cv_diagnostics/overlay
+ # Optional, will assume that the target is already in the input dataframe
+ # and will use it, instead of evaluating models
+ score_from_rdf : mva
+ correlations:
+ # Variables with respect to which the correlations with the features will be measured
+ target :
+ name : mass
+ overlay :
+ wp :
+ - 0.2
+ - 0.5
+ - 0.7
+ - 0.9
+ general:
+ size : [20, 10]
+ saving:
+ plt_dir : /tmp/tests/dmu/ml/cv_diagnostics/from_rdf
+ plots:
+ z :
+ binning : [1000, 4000, 30]
+ yscale : 'linear'
+ labels : ['mass', 'Entries']
+ normalized : true
+ styling :
+ linestyle: '-' # By default there is no line, just markers
+ methods:
+ - Pearson
+ - Kendall-$\tau$
+ figure:
+ title: Scores from file
+ size : [10, 8]
+ xlabelsize: 18 # Controls the size of the x axis labels. By default 30
+ rotate : 60 # Will rotate xlabels by 60 degrees
+ ```
+
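The correlation part of these diagnostics can be reproduced by hand with pandas, e.g. as in the sketch below (the column names `mva` and `mass` follow the config above and are otherwise assumptions):

```python
# Sketch: correlation of each feature with the target ('mass') for entries
# above a given working point in the MVA score.
import pandas as pd

def feature_target_correlations(df: pd.DataFrame, wp: float) -> pd.Series:
    sel      = df[df['mva'] > wp]                # apply the working point
    features = sel.drop(columns=['mva', 'mass'])
    # method can be 'pearson' or 'kendall', matching the methods list above
    return features.corrwith(sel['mass'], method='pearson')
```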
  # Pandas dataframes

  ## Utilities
@@ -582,6 +692,19 @@ put.df_to_tex(df,
  caption = 'some caption')
  ```

+ ### Dataframe to and from YAML
+
+ This extends the existing JSON functionality
+
+ ```python
+ import dmu.pdataframe.utilities as put
+
+ df_1 = _get_df()
+ put.to_yaml(df_1, yml_path)
+ df_2 = put.from_yaml(yml_path)
+ ```
+
+ and is meant to be less verbose than doing it through the YAML module.
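The more verbose route through the YAML module that this avoids would look roughly like the following sketch (file path and column names are made up):

```python
# Sketch of the plain-PyYAML round trip that to_yaml/from_yaml are meant to replace.
import yaml
import pandas as pd

df_1 = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

with open('/tmp/frame.yaml', 'w', encoding='utf-8') as ofile:
    yaml.safe_dump(df_1.to_dict(orient='list'), ofile)

with open('/tmp/frame.yaml', encoding='utf-8') as ifile:
    df_2 = pd.DataFrame(yaml.safe_load(ifile))
```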
  # Rdataframes

  These are utility functions meant to be used with ROOT dataframes.
@@ -707,6 +830,11 @@ plots:
  labels : ['x', 'Entries'] # Labels are optional, will use varname and Entries as labels if not present
  title : 'some title can be added for different variable plots'
  name : 'plot_of_x' # This will ensure that one gets plot_of_x.png as a result, if missing x.png would be saved
+ # Can add styling to specific plots, this should be the argument of
+ # hist.plot(...)
+ styling :
+ label : x
+ linestyle: '-'
  y :
  binning : [-5.0, 8.0, 40]
  yscale : 'linear'
@@ -730,6 +858,47 @@ stats:

  it's up to the user to build this dictionary and load it.

+ ### Plugins
+
+ Extra functionality can be `plugged` into the code by using the plugins section, as in:
+
+ #### FWHM
+ ```yaml
+ plugin:
+ fwhm:
+ # Can control each variable fit separately
+ x :
+ plot : true
+ obs : [-2, 4]
+ format : FWHM={:.3f}
+ add_std: True
+ y :
+ plot : true
+ obs : [-4, 8]
+ format : FWHM={:.3f}
+ add_std: True
+ ```
+
+ where this section will:
+
+ - Use a KDE to fit the distribution and plot it on top of the histogram
+ - Add the value of the Full Width at Half Maximum to the title of each distribution, with a specific formatting.
+
+ #### stats
+
+ ```yaml
+ plugin:
+ stats:
+ x :
+ mean : $\mu$={:.2f}
+ rms : $\sigma$={:.2f}
+ sum : $\Sigma$={:.0f}
+ ```
+
+ This can be used to print statistics (the mean, the RMS and the weighted sum of entries) for each distribution.
+
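A minimal sketch of the FWHM-from-KDE idea used by the `fwhm` plugin above, written with scipy's `gaussian_kde` (this is not necessarily how the plugin computes it):

```python
# Sketch: fit a KDE to the sample and take the width of the region where the
# density is at least half of its maximum as the FWHM.
import numpy as np
from scipy.stats import gaussian_kde

def fwhm_from_kde(sample: np.ndarray, obs=(-2, 4), npoints=1000) -> float:
    kde   = gaussian_kde(sample)
    x     = np.linspace(obs[0], obs[1], npoints)
    y     = kde(x)
    above = x[y >= 0.5 * y.max()]
    return float(above.max() - above.min())

val = fwhm_from_kde(np.random.normal(loc=1, scale=0.5, size=10_000))
print(f'FWHM={val:.3f}')   # same formatting as in the config above
```

Likewise, the quantities printed by the `stats` plugin can be computed with numpy as in the sketch below (per-entry weights are assumed):

```python
# Sketch of the printed statistics: weighted mean, weighted RMS and sum of weights.
import numpy as np

def weighted_stats(val: np.ndarray, wgt: np.ndarray):
    mean = np.average(val, weights=wgt)
    rms  = np.sqrt(np.average((val - mean) ** 2, weights=wgt))
    return mean, rms, wgt.sum()

mean, rms, total = weighted_stats(np.random.normal(size=1000), np.ones(1000))
print(f'mean={mean:.2f} rms={rms:.2f} sum={total:.0f}')
```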
  ## 2D plots

  For the 2D case it would look like:
data_manipulation_utilities-0.2.5/PKG-INFO → data_manipulation_utilities-0.2.7/README.md

@@ -1,23 +1,3 @@
- Metadata-Version: 2.2
- Name: data_manipulation_utilities
- Version: 0.2.5
- Description-Content-Type: text/markdown
- Requires-Dist: logzero
- Requires-Dist: PyYAML
- Requires-Dist: scipy
- Requires-Dist: awkward
- Requires-Dist: tqdm
- Requires-Dist: joblib
- Requires-Dist: scikit-learn
- Requires-Dist: toml
- Requires-Dist: numpy
- Requires-Dist: matplotlib
- Requires-Dist: mplhep
- Requires-Dist: hist[plot]
- Requires-Dist: pandas
- Provides-Extra: dev
- Requires-Dist: pytest; extra == "dev"
-
  # D(ata) M(anipulation) U(tilities)

  These are tools that can be used for different data analysis tasks.
@@ -51,6 +31,25 @@ Then, for each remote it pushes the tags and the commits.

  This section describes generic tools that could not be put in a specific category, but tend to be useful.

+ ## Hashing
+
+ The snippet below:
+
+ ```python
+ from dmu.generic import hashing
+
+ obj = [1, 'name', [1, 'sub', 'list'], {'x' : 1}]
+ val = hashing.hash_object(obj)
+ ```
+
+ will:
+
+ - Make the input object into a JSON string
+ - Encode it to utf-8
+ - Make a 64-character hash out of it
+
+ in two lines, thus keeping the user's code clean.
+
  ## Timer

  In order to benchmark functions do:
@@ -67,9 +66,9 @@ def fun():
  fun()
  ```

- ## JSON dumper
+ ## JSON dumper and loader

- The following lines will dump data (dictionaries, lists, etc) to a JSON file:
+ The following lines will dump data (dictionaries, lists, etc) to a JSON file and load it back:

  ```python
  import dmu.generic.utilities as gut
@@ -77,8 +76,11 @@ import dmu.generic.utilities as gut
  data = [1,2,3,4]

  gut.dump_json(data, '/tmp/list.json')
+ data = gut.load_json('/tmp/list.json')
  ```

+ and it's meant to allow the user to bypass all the boilerplate and keep their code brief.
+
  # Physics

  ## Truth matching
@@ -132,7 +134,8 @@ from dmu.stats.model_factory import ModelFactory

  l_pdf = ['cbr'] + 2 * ['cbl']
  l_shr = ['mu', 'sg']
- mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr)
+ d_fix = {'al_cbl' : 3, 'nr_cbr' : 1} # This is optional and will fix two parameters whose names start with the keys
+ mod = ModelFactory(obs = Data.obs, l_pdf = l_pdf, l_shared=l_shr, d_fix=d_fix)
  pdf = mod.get_pdf()
  ```

@@ -145,10 +148,40 @@ pol1: Polynomial of degree 1
  pol2: Polynomial of degree 2
  cbr : CrystallBall with right tail
  cbl : CrystallBall with left tail
- gauss : Gaussian
+ gauss : Gaussian
  dscb : Double sided CrystallBall
  ```

+ ### Model building with reparametrizations
+
+ In order to introduce reparametrizations for the means and the resolutions, such that:
+
+ $\mu\to\mu+\Delta\mu$
+ $\sigma\to\sigma\cdot s_{\sigma}$
+
+ where the reparametrized $\mu$ and $\sigma$ are constant, while the scale and resolution are floating, do:
+
+ ```python
+ import zfit
+ from dmu.stats.model_factory import ModelFactory
+
+ l_shr = ['mu', 'sg']
+ l_flt = []
+ d_rep = {'mu' : 'scale', 'sg' : 'reso'}
+ obs = zfit.Space('mass', limits=(5080, 5680))
+
+ mod = ModelFactory(
+ preffix = name,
+ obs = obs,
+ l_pdf = l_name,
+ d_rep = d_rep,
+ l_shared= l_shr,
+ l_float = l_flt)
+ pdf = mod.get_pdf()
+ ```
+
+ Here, the floating parameters **should not** be the same as the reparametrized ones.
+
  ### Printing PDFs

  One can print a zfit PDF by doing:
@@ -427,7 +460,7 @@ rdf_bkg = _get_rdf(kind='bkg')
  cfg = _get_config()

  obj= TrainMva(sig=rdf_sig, bkg=rdf_bkg, cfg=cfg)
- obj.run()
+ obj.run(skip_fit=False) # By default it is False; if True, it will only make plots of the features
  ```

  where the settings for the training go in a config dictionary, which when written to YAML looks like:
@@ -549,9 +582,61 @@ When evaluating the model with real data, problems might occur, we deal with the
  ```python
  model.cfg
  ```
- - For whatever entries that are still NaN, they will be _patched_ with zeros and evaluated. However, before returning, the probabilities will be
+ - Features that are still NaN will be _patched_ with zeros when evaluated. However, the returned probabilities will be
  saved as -1. I.e. entries with NaNs will have probabilities of -1.

+ ## Diagnostics
+
+ To run diagnostics on the trained model do:
+
+ ```python
+ from dmu.ml.cv_diagnostics import CVDiagnostics
+
+ # Where l_model is the list of models and cfg is a dictionary with the config
+ cvd = CVDiagnostics(models=l_model, rdf=rdf, cfg=cfg)
+ cvd.run()
+ ```
+
+ The configuration can be loaded from a YAML file and would look like:
+
+ ```yaml
+ # Directory where plots will go
+ output : /tmp/tests/dmu/ml/cv_diagnostics/overlay
+ # Optional, will assume that the target is already in the input dataframe
+ # and will use it, instead of evaluating models
+ score_from_rdf : mva
+ correlations:
+ # Variables with respect to which the correlations with the features will be measured
+ target :
+ name : mass
+ overlay :
+ wp :
+ - 0.2
+ - 0.5
+ - 0.7
+ - 0.9
+ general:
+ size : [20, 10]
+ saving:
+ plt_dir : /tmp/tests/dmu/ml/cv_diagnostics/from_rdf
+ plots:
+ z :
+ binning : [1000, 4000, 30]
+ yscale : 'linear'
+ labels : ['mass', 'Entries']
+ normalized : true
+ styling :
+ linestyle: '-' # By default there is no line, just markers
+ methods:
+ - Pearson
+ - Kendall-$\tau$
+ figure:
+ title: Scores from file
+ size : [10, 8]
+ xlabelsize: 18 # Controls the size of the x axis labels. By default 30
+ rotate : 60 # Will rotate xlabels by 60 degrees
+ ```
+
  # Pandas dataframes

  ## Utilities
@@ -582,6 +667,19 @@ put.df_to_tex(df,
  caption = 'some caption')
  ```

+ ### Dataframe to and from YAML
+
+ This extends the existing JSON functionality
+
+ ```python
+ import dmu.pdataframe.utilities as put
+
+ df_1 = _get_df()
+ put.to_yaml(df_1, yml_path)
+ df_2 = put.from_yaml(yml_path)
+ ```
+
+ and is meant to be less verbose than doing it through the YAML module.
  # Rdataframes

  These are utility functions meant to be used with ROOT dataframes.
@@ -707,6 +805,11 @@ plots:
  labels : ['x', 'Entries'] # Labels are optional, will use varname and Entries as labels if not present
  title : 'some title can be added for different variable plots'
  name : 'plot_of_x' # This will ensure that one gets plot_of_x.png as a result, if missing x.png would be saved
+ # Can add styling to specific plots, this should be the argument of
+ # hist.plot(...)
+ styling :
+ label : x
+ linestyle: '-'
  y :
  binning : [-5.0, 8.0, 40]
  yscale : 'linear'
@@ -730,6 +833,47 @@ stats:

  it's up to the user to build this dictionary and load it.

+ ### Plugins
+
+ Extra functionality can be `plugged` into the code by using the plugins section, as in:
+
+ #### FWHM
+ ```yaml
+ plugin:
+ fwhm:
+ # Can control each variable fit separately
+ x :
+ plot : true
+ obs : [-2, 4]
+ format : FWHM={:.3f}
+ add_std: True
+ y :
+ plot : true
+ obs : [-4, 8]
+ format : FWHM={:.3f}
+ add_std: True
+ ```
+
+ where this section will:
+
+ - Use a KDE to fit the distribution and plot it on top of the histogram
+ - Add the value of the Full Width at Half Maximum to the title of each distribution, with a specific formatting.
+
+ #### stats
+
+ ```yaml
+ plugin:
+ stats:
+ x :
+ mean : $\mu$={:.2f}
+ rms : $\sigma$={:.2f}
+ sum : $\Sigma$={:.0f}
+ ```
+
+ This can be used to print statistics (the mean, the RMS and the weighted sum of entries) for each distribution.
+
  ## 2D plots

  For the 2D case it would look like:
{data_manipulation_utilities-0.2.5 → data_manipulation_utilities-0.2.7}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = 'data_manipulation_utilities'
- version = '0.2.5'
+ version = '0.2.7'
  readme = 'README.md'
  dependencies= [
  'logzero',
@@ -8,8 +8,7 @@ dependencies= [
  'scipy',
  'awkward',
  'tqdm',
- 'joblib',
- 'scikit-learn',
+ 'numpy',
  'toml',
  'numpy',
  'matplotlib',
@@ -18,6 +17,9 @@ dependencies= [
  'pandas']

  [project.optional-dependencies]
+ # Use latest tensorflow allowed by zfit
+ fit = ['zfit','tensorflow==2.18.0']
+ ml = ['scikit-learn', 'joblib']
  dev = ['pytest']

  [tools.setuptools.packages.find]