data-manipulation-utilities 0.2.4__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/PKG-INFO +43 -15
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/README.md +42 -14
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/pyproject.toml +1 -1
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/data_manipulation_utilities.egg-info/PKG-INFO +43 -15
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/data_manipulation_utilities.egg-info/SOURCES.txt +2 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/ml/cv_classifier.py +16 -2
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/ml/cv_predict.py +5 -5
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/ml/train_mva.py +18 -4
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/ml/utilities.py +11 -5
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/plotting/plotter.py +6 -2
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/plotting/plotter_1d.py +22 -4
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/plotting/plotter_2d.py +10 -9
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/stats/model_factory.py +13 -7
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/testing/utilities.py +36 -27
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/ml/tests/train_mva.yaml +2 -5
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/plotting/tests/2d.yaml +8 -4
- data_manipulation_utilities-0.2.5/src/dmu_data/plotting/tests/legend.yaml +12 -0
- data_manipulation_utilities-0.2.5/src/dmu_data/plotting/tests/stats.yaml +9 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/setup.cfg +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/data_manipulation_utilities.egg-info/dependency_links.txt +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/data_manipulation_utilities.egg-info/entry_points.txt +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/data_manipulation_utilities.egg-info/requires.txt +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/data_manipulation_utilities.egg-info/top_level.txt +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/arrays/utilities.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/generic/utilities.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/generic/version_management.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/logging/log_store.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/pdataframe/utilities.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/plotting/matrix.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/plotting/utilities.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/rdataframe/atr_mgr.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/rdataframe/utilities.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/rfile/rfprinter.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/rfile/utilities.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/stats/fitter.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/stats/function.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/stats/gof_calculator.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/stats/minimizers.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/stats/utilities.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/stats/zfit_plotter.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/text/transformer.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/__init__.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/plotting/tests/fig_size.yaml +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/plotting/tests/high_stat.yaml +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/plotting/tests/name.yaml +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/plotting/tests/no_bounds.yaml +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/plotting/tests/normalized.yaml +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/plotting/tests/simple.yaml +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/plotting/tests/title.yaml +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/plotting/tests/weights.yaml +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/text/transform.toml +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/text/transform.txt +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/text/transform_set.toml +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/text/transform_set.txt +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/text/transform_trf.txt +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_scripts/git/publish +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_scripts/physics/check_truth.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_scripts/rfile/compare_root_files.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_scripts/rfile/print_trees.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_scripts/ssh/coned.py +0 -0
- {data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_scripts/text/transform_text.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: data_manipulation_utilities
|
3
|
-
Version: 0.2.4
|
3
|
+
Version: 0.2.5
|
4
4
|
Description-Content-Type: text/markdown
|
5
5
|
Requires-Dist: logzero
|
6
6
|
Requires-Dist: PyYAML
|
@@ -26,7 +26,7 @@ These are tools that can be used for different data analysis tasks.
|
|
26
26
|
|
27
27
|
## Pushing
|
28
28
|
|
29
|
-
From the root directory of a version controlled project (i.e. a directory with the `.git` subdirectory)
|
29
|
+
From the root directory of a version controlled project (i.e. a directory with the `.git` subdirectory)
|
30
30
|
using a `pyproject.toml` file, run:
|
31
31
|
|
32
32
|
```bash
|
@@ -36,10 +36,10 @@ publish
|
|
36
36
|
such that:
|
37
37
|
|
38
38
|
1. The `pyproject.toml` file is checked and the version of the project is extracted.
|
39
|
-
1. If a tag named as the version exists move to the steps below.
|
39
|
+
1. If a tag named as the version exists move to the steps below.
|
40
40
|
1. If it does not, make a new tag with the name as the version
|
41
41
|
|
42
|
-
Then, for each remote it pushes the tags and the commits.
|
42
|
+
Then, for each remote it pushes the tags and the commits.
|
43
43
|
|
44
44
|
*Why?*
|
45
45
|
|
@@ -137,7 +137,17 @@ pdf = mod.get_pdf()
|
|
137
137
|
```
|
138
138
|
|
139
139
|
where the model is a sum of three `CrystallBall` PDFs, one with a right tail and two with a left tail.
|
140
|
-
The `mu` and `sg` parameters are shared.
|
140
|
+
The `mu` and `sg` parameters are shared. The elementary components that can be plugged are:
|
141
|
+
|
142
|
+
```
|
143
|
+
exp: Exponential
|
144
|
+
pol1: Polynomial of degree 1
|
145
|
+
pol2: Polynomial of degree 2
|
146
|
+
cbr : CrystallBall with right tail
|
147
|
+
cbl : CrystallBall with left tail
|
148
|
+
gauss : Gaussian
|
149
|
+
dscb : Double sided CrystallBall
|
150
|
+
```
|
141
151
|
|
142
152
|
### Printing PDFs
|
143
153
|
|
@@ -299,7 +309,7 @@ this will:
|
|
299
309
|
- Try fitting at most 10 times
|
300
310
|
- After each fit, calculate the goodness of fit (in this case the p-value)
|
301
311
|
- Stop when the number of tries has been exhausted or the p-value reached is higher than `0.05`
|
302
|
-
- If the fit has not succeeded because of convergence, validity or goodness of fit issues,
|
312
|
+
- If the fit has not succeeded because of convergence, validity or goodness of fit issues,
|
303
313
|
randomize the parameters and try again.
|
304
314
|
- If the desired goodness of fit has not been achieved, pick the best result.
|
305
315
|
- Return the `FitResult` object and set the PDF to the final fit result.
|
@@ -337,11 +347,11 @@ bkg = zfit.pdf.Exponential(obs=obs, lam=lm)
|
|
337
347
|
nbk = zfit.Parameter('nbk', 1000, 0, 10000)
|
338
348
|
ebkg= bkg.create_extended(nbk, name='expo')
|
339
349
|
|
340
|
-
# Add them
|
350
|
+
# Add them
|
341
351
|
pdf = zfit.pdf.SumPDF([ebkg, esig])
|
342
352
|
sam = pdf.create_sampler()
|
343
353
|
|
344
|
-
# Plot them
|
354
|
+
# Plot them
|
345
355
|
obj = ZFitPlotter(data=sam, model=pdf)
|
346
356
|
d_leg = {'gauss': 'New Gauss'}
|
347
357
|
obj.plot(nbins=50, d_leg=d_leg, stacked=True, plot_range=(0, 10), ext_text='Extra text here')
|
@@ -353,7 +363,7 @@ obj.axs[1].plot([0, 10], [0, 0], linestyle='--', color='black')
|
|
353
363
|
this class supports:
|
354
364
|
|
355
365
|
- Handling title, legend, plots size.
|
356
|
-
- Adding pulls.
|
366
|
+
- Adding pulls.
|
357
367
|
- Stacking and overlaying of PDFs.
|
358
368
|
- Blinding.
|
359
369
|
|
@@ -434,7 +444,7 @@ dataset:
|
|
434
444
|
nan:
|
435
445
|
x : 0
|
436
446
|
y : 0
|
437
|
-
z : -999
|
447
|
+
z : -999
|
438
448
|
training :
|
439
449
|
nfold : 10
|
440
450
|
features : [x, y, z]
|
@@ -497,7 +507,7 @@ When training on real data, several things might go wrong and the code will try
|
|
497
507
|
will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
|
498
508
|
entries will be removed before training.
|
499
509
|
|
500
|
-
- **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
|
510
|
+
- **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
|
501
511
|
- Can use the `nan` section shown above to replace `NaN` values with something else
|
502
512
|
- For whatever remains we remove the entries from the training.
|
503
513
|
|
@@ -674,6 +684,9 @@ ptr.run()
|
|
674
684
|
where the config dictionary `cfg_dat` in YAML would look like:
|
675
685
|
|
676
686
|
```yaml
|
687
|
+
general:
|
688
|
+
# This will set the figure size
|
689
|
+
size : [20, 10]
|
677
690
|
selection:
|
678
691
|
#Will do at most 50K random entries. Will only happen if the dataset has more than 50K entries
|
679
692
|
max_ran_entries : 50000
|
@@ -703,6 +716,16 @@ plots:
|
|
703
716
|
yscale : 'linear'
|
704
717
|
labels : ['x + y', 'Entries']
|
705
718
|
normalized : true #This should normalize to the area
|
719
|
+
# Some vertical dashed lines are drawn by default
|
720
|
+
# If you see them, you can turn them off with this
|
721
|
+
style:
|
722
|
+
skip_lines : true
|
723
|
+
# This can pass arguments to legend making function `plt.legend()` in matplotlib
|
724
|
+
legend:
|
725
|
+
# The line below would place the legend outside the figure to avoid overlaps with the histogram
|
726
|
+
bbox_to_anchor : [1.2, 1]
|
727
|
+
stats:
|
728
|
+
nentries : '{:.2e}' # This will add number of entries in legend box
|
706
729
|
```
|
707
730
|
|
708
731
|
it's up to the user to build this dictionary and load it.
|
@@ -724,14 +747,19 @@ The config would look like:
|
|
724
747
|
```yaml
|
725
748
|
saving:
|
726
749
|
plt_dir : tests/plotting/2d
|
750
|
+
selection:
|
751
|
+
cuts:
|
752
|
+
xlow : x > -1.5
|
727
753
|
general:
|
728
754
|
size : [20, 10]
|
729
755
|
plots_2d:
|
730
756
|
# Column x and y
|
731
757
|
# Name of column where weights are, null for no weights
|
732
758
|
# Name of output plot, e.g. xy_x.png
|
733
|
-
|
734
|
-
- [x, y,
|
759
|
+
# Bool signaling to use log scale for z axis
|
760
|
+
- [x, y, weights, 'xy_w', false]
|
761
|
+
- [x, y, null, 'xy_r', false]
|
762
|
+
- [x, y, null, 'xy_l', true]
|
735
763
|
axes:
|
736
764
|
x :
|
737
765
|
binning : [-5.0, 8.0, 40]
|
@@ -823,7 +851,7 @@ Directory/Treename
|
|
823
851
|
B_ENDVERTEX_CHI2DOF Double_t
|
824
852
|
```
|
825
853
|
|
826
|
-
## Comparing ROOT files
|
854
|
+
## Comparing ROOT files
|
827
855
|
|
828
856
|
Given two ROOT files the command below:
|
829
857
|
|
@@ -885,7 +913,7 @@ last_file = get_latest_file(dir_path = file_dir, wc='name_*.txt')
|
|
885
913
|
# of directories in `dir_path`, e.g.:
|
886
914
|
|
887
915
|
oversion=get_last_version(dir_path=dir_path, version_only=True) # This will return only the version, e.g. v3.2
|
888
|
-
oversion=get_last_version(dir_path=dir_path, version_only=False) # This will return full path, e.g. /a/b/c/v3.2
|
916
|
+
oversion=get_last_version(dir_path=dir_path, version_only=False) # This will return full path, e.g. /a/b/c/v3.2
|
889
917
|
```
|
890
918
|
|
891
919
|
The function above should work for numeric (e.g. `v1.2`) and non-numeric (e.g. `va`, `vb`) versions.
|
@@ -6,7 +6,7 @@ These are tools that can be used for different data analysis tasks.
|
|
6
6
|
|
7
7
|
## Pushing
|
8
8
|
|
9
|
-
From the root directory of a version controlled project (i.e. a directory with the `.git` subdirectory)
|
9
|
+
From the root directory of a version controlled project (i.e. a directory with the `.git` subdirectory)
|
10
10
|
using a `pyproject.toml` file, run:
|
11
11
|
|
12
12
|
```bash
|
@@ -16,10 +16,10 @@ publish
|
|
16
16
|
such that:
|
17
17
|
|
18
18
|
1. The `pyproject.toml` file is checked and the version of the project is extracted.
|
19
|
-
1. If a tag named as the version exists move to the steps below.
|
19
|
+
1. If a tag named as the version exists move to the steps below.
|
20
20
|
1. If it does not, make a new tag with the name as the version
|
21
21
|
|
22
|
-
Then, for each remote it pushes the tags and the commits.
|
22
|
+
Then, for each remote it pushes the tags and the commits.
|
23
23
|
|
24
24
|
*Why?*
|
25
25
|
|
@@ -117,7 +117,17 @@ pdf = mod.get_pdf()
|
|
117
117
|
```
|
118
118
|
|
119
119
|
where the model is a sum of three `CrystallBall` PDFs, one with a right tail and two with a left tail.
|
120
|
-
The `mu` and `sg` parameters are shared.
|
120
|
+
The `mu` and `sg` parameters are shared. The elementary components that can be plugged are:
|
121
|
+
|
122
|
+
```
|
123
|
+
exp: Exponential
|
124
|
+
pol1: Polynomial of degree 1
|
125
|
+
pol2: Polynomial of degree 2
|
126
|
+
cbr : CrystallBall with right tail
|
127
|
+
cbl : CrystallBall with left tail
|
128
|
+
gauss : Gaussian
|
129
|
+
dscb : Double sided CrystallBall
|
130
|
+
```
|
121
131
|
|
122
132
|
### Printing PDFs
|
123
133
|
|
@@ -279,7 +289,7 @@ this will:
|
|
279
289
|
- Try fitting at most 10 times
|
280
290
|
- After each fit, calculate the goodness of fit (in this case the p-value)
|
281
291
|
- Stop when the number of tries has been exhausted or the p-value reached is higher than `0.05`
|
282
|
-
- If the fit has not succeeded because of convergence, validity or goodness of fit issues,
|
292
|
+
- If the fit has not succeeded because of convergence, validity or goodness of fit issues,
|
283
293
|
randomize the parameters and try again.
|
284
294
|
- If the desired goodness of fit has not been achieved, pick the best result.
|
285
295
|
- Return the `FitResult` object and set the PDF to the final fit result.
|
@@ -317,11 +327,11 @@ bkg = zfit.pdf.Exponential(obs=obs, lam=lm)
|
|
317
327
|
nbk = zfit.Parameter('nbk', 1000, 0, 10000)
|
318
328
|
ebkg= bkg.create_extended(nbk, name='expo')
|
319
329
|
|
320
|
-
# Add them
|
330
|
+
# Add them
|
321
331
|
pdf = zfit.pdf.SumPDF([ebkg, esig])
|
322
332
|
sam = pdf.create_sampler()
|
323
333
|
|
324
|
-
# Plot them
|
334
|
+
# Plot them
|
325
335
|
obj = ZFitPlotter(data=sam, model=pdf)
|
326
336
|
d_leg = {'gauss': 'New Gauss'}
|
327
337
|
obj.plot(nbins=50, d_leg=d_leg, stacked=True, plot_range=(0, 10), ext_text='Extra text here')
|
@@ -333,7 +343,7 @@ obj.axs[1].plot([0, 10], [0, 0], linestyle='--', color='black')
|
|
333
343
|
this class supports:
|
334
344
|
|
335
345
|
- Handling title, legend, plots size.
|
336
|
-
- Adding pulls.
|
346
|
+
- Adding pulls.
|
337
347
|
- Stacking and overlaying of PDFs.
|
338
348
|
- Blinding.
|
339
349
|
|
@@ -414,7 +424,7 @@ dataset:
|
|
414
424
|
nan:
|
415
425
|
x : 0
|
416
426
|
y : 0
|
417
|
-
z : -999
|
427
|
+
z : -999
|
418
428
|
training :
|
419
429
|
nfold : 10
|
420
430
|
features : [x, y, z]
|
@@ -477,7 +487,7 @@ When training on real data, several things might go wrong and the code will try
|
|
477
487
|
will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
|
478
488
|
entries will be removed before training.
|
479
489
|
|
480
|
-
- **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
|
490
|
+
- **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
|
481
491
|
- Can use the `nan` section shown above to replace `NaN` values with something else
|
482
492
|
- For whatever remains we remove the entries from the training.
|
483
493
|
|
@@ -654,6 +664,9 @@ ptr.run()
|
|
654
664
|
where the config dictionary `cfg_dat` in YAML would look like:
|
655
665
|
|
656
666
|
```yaml
|
667
|
+
general:
|
668
|
+
# This will set the figure size
|
669
|
+
size : [20, 10]
|
657
670
|
selection:
|
658
671
|
#Will do at most 50K random entries. Will only happen if the dataset has more than 50K entries
|
659
672
|
max_ran_entries : 50000
|
@@ -683,6 +696,16 @@ plots:
|
|
683
696
|
yscale : 'linear'
|
684
697
|
labels : ['x + y', 'Entries']
|
685
698
|
normalized : true #This should normalize to the area
|
699
|
+
# Some vertical dashed lines are drawn by default
|
700
|
+
# If you see them, you can turn them off with this
|
701
|
+
style:
|
702
|
+
skip_lines : true
|
703
|
+
# This can pass arguments to legend making function `plt.legend()` in matplotlib
|
704
|
+
legend:
|
705
|
+
# The line below would place the legend outside the figure to avoid overlaps with the histogram
|
706
|
+
bbox_to_anchor : [1.2, 1]
|
707
|
+
stats:
|
708
|
+
nentries : '{:.2e}' # This will add number of entries in legend box
|
686
709
|
```
|
687
710
|
|
688
711
|
it's up to the user to build this dictionary and load it.
|
@@ -704,14 +727,19 @@ The config would look like:
|
|
704
727
|
```yaml
|
705
728
|
saving:
|
706
729
|
plt_dir : tests/plotting/2d
|
730
|
+
selection:
|
731
|
+
cuts:
|
732
|
+
xlow : x > -1.5
|
707
733
|
general:
|
708
734
|
size : [20, 10]
|
709
735
|
plots_2d:
|
710
736
|
# Column x and y
|
711
737
|
# Name of column where weights are, null for no weights
|
712
738
|
# Name of output plot, e.g. xy_x.png
|
713
|
-
|
714
|
-
- [x, y,
|
739
|
+
# Bool signaling to use log scale for z axis
|
740
|
+
- [x, y, weights, 'xy_w', false]
|
741
|
+
- [x, y, null, 'xy_r', false]
|
742
|
+
- [x, y, null, 'xy_l', true]
|
715
743
|
axes:
|
716
744
|
x :
|
717
745
|
binning : [-5.0, 8.0, 40]
|
@@ -803,7 +831,7 @@ Directory/Treename
|
|
803
831
|
B_ENDVERTEX_CHI2DOF Double_t
|
804
832
|
```
|
805
833
|
|
806
|
-
## Comparing ROOT files
|
834
|
+
## Comparing ROOT files
|
807
835
|
|
808
836
|
Given two ROOT files the command below:
|
809
837
|
|
@@ -865,7 +893,7 @@ last_file = get_latest_file(dir_path = file_dir, wc='name_*.txt')
|
|
865
893
|
# of directories in `dir_path`, e.g.:
|
866
894
|
|
867
895
|
oversion=get_last_version(dir_path=dir_path, version_only=True) # This will return only the version, e.g. v3.2
|
868
|
-
oversion=get_last_version(dir_path=dir_path, version_only=False) # This will return full path, e.g. /a/b/c/v3.2
|
896
|
+
oversion=get_last_version(dir_path=dir_path, version_only=False) # This will return full path, e.g. /a/b/c/v3.2
|
869
897
|
```
|
870
898
|
|
871
899
|
The function above should work for numeric (e.g. `v1.2`) and non-numeric (e.g. `va`, `vb`) versions.
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: data_manipulation_utilities
|
3
|
-
Version: 0.2.4
|
3
|
+
Version: 0.2.5
|
4
4
|
Description-Content-Type: text/markdown
|
5
5
|
Requires-Dist: logzero
|
6
6
|
Requires-Dist: PyYAML
|
@@ -26,7 +26,7 @@ These are tools that can be used for different data analysis tasks.
|
|
26
26
|
|
27
27
|
## Pushing
|
28
28
|
|
29
|
-
From the root directory of a version controlled project (i.e. a directory with the `.git` subdirectory)
|
29
|
+
From the root directory of a version controlled project (i.e. a directory with the `.git` subdirectory)
|
30
30
|
using a `pyproject.toml` file, run:
|
31
31
|
|
32
32
|
```bash
|
@@ -36,10 +36,10 @@ publish
|
|
36
36
|
such that:
|
37
37
|
|
38
38
|
1. The `pyproject.toml` file is checked and the version of the project is extracted.
|
39
|
-
1. If a tag named as the version exists move to the steps below.
|
39
|
+
1. If a tag named as the version exists move to the steps below.
|
40
40
|
1. If it does not, make a new tag with the name as the version
|
41
41
|
|
42
|
-
Then, for each remote it pushes the tags and the commits.
|
42
|
+
Then, for each remote it pushes the tags and the commits.
|
43
43
|
|
44
44
|
*Why?*
|
45
45
|
|
@@ -137,7 +137,17 @@ pdf = mod.get_pdf()
|
|
137
137
|
```
|
138
138
|
|
139
139
|
where the model is a sum of three `CrystallBall` PDFs, one with a right tail and two with a left tail.
|
140
|
-
The `mu` and `sg` parameters are shared.
|
140
|
+
The `mu` and `sg` parameters are shared. The elementary components that can be plugged are:
|
141
|
+
|
142
|
+
```
|
143
|
+
exp: Exponential
|
144
|
+
pol1: Polynomial of degree 1
|
145
|
+
pol2: Polynomial of degree 2
|
146
|
+
cbr : CrystallBall with right tail
|
147
|
+
cbl : CrystallBall with left tail
|
148
|
+
gauss : Gaussian
|
149
|
+
dscb : Double sided CrystallBall
|
150
|
+
```
|
141
151
|
|
142
152
|
### Printing PDFs
|
143
153
|
|
@@ -299,7 +309,7 @@ this will:
|
|
299
309
|
- Try fitting at most 10 times
|
300
310
|
- After each fit, calculate the goodness of fit (in this case the p-value)
|
301
311
|
- Stop when the number of tries has been exhausted or the p-value reached is higher than `0.05`
|
302
|
-
- If the fit has not succeeded because of convergence, validity or goodness of fit issues,
|
312
|
+
- If the fit has not succeeded because of convergence, validity or goodness of fit issues,
|
303
313
|
randomize the parameters and try again.
|
304
314
|
- If the desired goodness of fit has not been achieved, pick the best result.
|
305
315
|
- Return the `FitResult` object and set the PDF to the final fit result.
|
@@ -337,11 +347,11 @@ bkg = zfit.pdf.Exponential(obs=obs, lam=lm)
|
|
337
347
|
nbk = zfit.Parameter('nbk', 1000, 0, 10000)
|
338
348
|
ebkg= bkg.create_extended(nbk, name='expo')
|
339
349
|
|
340
|
-
# Add them
|
350
|
+
# Add them
|
341
351
|
pdf = zfit.pdf.SumPDF([ebkg, esig])
|
342
352
|
sam = pdf.create_sampler()
|
343
353
|
|
344
|
-
# Plot them
|
354
|
+
# Plot them
|
345
355
|
obj = ZFitPlotter(data=sam, model=pdf)
|
346
356
|
d_leg = {'gauss': 'New Gauss'}
|
347
357
|
obj.plot(nbins=50, d_leg=d_leg, stacked=True, plot_range=(0, 10), ext_text='Extra text here')
|
@@ -353,7 +363,7 @@ obj.axs[1].plot([0, 10], [0, 0], linestyle='--', color='black')
|
|
353
363
|
this class supports:
|
354
364
|
|
355
365
|
- Handling title, legend, plots size.
|
356
|
-
- Adding pulls.
|
366
|
+
- Adding pulls.
|
357
367
|
- Stacking and overlaying of PDFs.
|
358
368
|
- Blinding.
|
359
369
|
|
@@ -434,7 +444,7 @@ dataset:
|
|
434
444
|
nan:
|
435
445
|
x : 0
|
436
446
|
y : 0
|
437
|
-
z : -999
|
447
|
+
z : -999
|
438
448
|
training :
|
439
449
|
nfold : 10
|
440
450
|
features : [x, y, z]
|
@@ -497,7 +507,7 @@ When training on real data, several things might go wrong and the code will try
|
|
497
507
|
will end up in different folds. The tool checks whether a model is evaluated for an entry that was used for training and raises an exception. Thus, repeated
|
498
508
|
entries will be removed before training.
|
499
509
|
|
500
|
-
- **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
|
510
|
+
- **NaNs**: Entries with NaNs will break the training with the scikit `GradientBoostClassifier` base class. Thus, we:
|
501
511
|
- Can use the `nan` section shown above to replace `NaN` values with something else
|
502
512
|
- For whatever remains we remove the entries from the training.
|
503
513
|
|
@@ -674,6 +684,9 @@ ptr.run()
|
|
674
684
|
where the config dictionary `cfg_dat` in YAML would look like:
|
675
685
|
|
676
686
|
```yaml
|
687
|
+
general:
|
688
|
+
# This will set the figure size
|
689
|
+
size : [20, 10]
|
677
690
|
selection:
|
678
691
|
#Will do at most 50K random entries. Will only happen if the dataset has more than 50K entries
|
679
692
|
max_ran_entries : 50000
|
@@ -703,6 +716,16 @@ plots:
|
|
703
716
|
yscale : 'linear'
|
704
717
|
labels : ['x + y', 'Entries']
|
705
718
|
normalized : true #This should normalize to the area
|
719
|
+
# Some vertical dashed lines are drawn by default
|
720
|
+
# If you see them, you can turn them off with this
|
721
|
+
style:
|
722
|
+
skip_lines : true
|
723
|
+
# This can pass arguments to legend making function `plt.legend()` in matplotlib
|
724
|
+
legend:
|
725
|
+
# The line below would place the legend outside the figure to avoid overlaps with the histogram
|
726
|
+
bbox_to_anchor : [1.2, 1]
|
727
|
+
stats:
|
728
|
+
nentries : '{:.2e}' # This will add number of entries in legend box
|
706
729
|
```
|
707
730
|
|
708
731
|
it's up to the user to build this dictionary and load it.
|
@@ -724,14 +747,19 @@ The config would look like:
|
|
724
747
|
```yaml
|
725
748
|
saving:
|
726
749
|
plt_dir : tests/plotting/2d
|
750
|
+
selection:
|
751
|
+
cuts:
|
752
|
+
xlow : x > -1.5
|
727
753
|
general:
|
728
754
|
size : [20, 10]
|
729
755
|
plots_2d:
|
730
756
|
# Column x and y
|
731
757
|
# Name of column where weights are, null for no weights
|
732
758
|
# Name of output plot, e.g. xy_x.png
|
733
|
-
|
734
|
-
- [x, y,
|
759
|
+
# Bool signaling to use log scale for z axis
|
760
|
+
- [x, y, weights, 'xy_w', false]
|
761
|
+
- [x, y, null, 'xy_r', false]
|
762
|
+
- [x, y, null, 'xy_l', true]
|
735
763
|
axes:
|
736
764
|
x :
|
737
765
|
binning : [-5.0, 8.0, 40]
|
@@ -823,7 +851,7 @@ Directory/Treename
|
|
823
851
|
B_ENDVERTEX_CHI2DOF Double_t
|
824
852
|
```
|
825
853
|
|
826
|
-
## Comparing ROOT files
|
854
|
+
## Comparing ROOT files
|
827
855
|
|
828
856
|
Given two ROOT files the command below:
|
829
857
|
|
@@ -885,7 +913,7 @@ last_file = get_latest_file(dir_path = file_dir, wc='name_*.txt')
|
|
885
913
|
# of directories in `dir_path`, e.g.:
|
886
914
|
|
887
915
|
oversion=get_last_version(dir_path=dir_path, version_only=True) # This will return only the version, e.g. v3.2
|
888
|
-
oversion=get_last_version(dir_path=dir_path, version_only=False) # This will return full path, e.g. /a/b/c/v3.2
|
916
|
+
oversion=get_last_version(dir_path=dir_path, version_only=False) # This will return full path, e.g. /a/b/c/v3.2
|
889
917
|
```
|
890
918
|
|
891
919
|
The function above should work for numeric (e.g. `v1.2`) and non-numeric (e.g. `va`, `vb`) versions.
|
@@ -38,10 +38,12 @@ src/dmu_data/ml/tests/train_mva.yaml
|
|
38
38
|
src/dmu_data/plotting/tests/2d.yaml
|
39
39
|
src/dmu_data/plotting/tests/fig_size.yaml
|
40
40
|
src/dmu_data/plotting/tests/high_stat.yaml
|
41
|
+
src/dmu_data/plotting/tests/legend.yaml
|
41
42
|
src/dmu_data/plotting/tests/name.yaml
|
42
43
|
src/dmu_data/plotting/tests/no_bounds.yaml
|
43
44
|
src/dmu_data/plotting/tests/normalized.yaml
|
44
45
|
src/dmu_data/plotting/tests/simple.yaml
|
46
|
+
src/dmu_data/plotting/tests/stats.yaml
|
45
47
|
src/dmu_data/plotting/tests/title.yaml
|
46
48
|
src/dmu_data/plotting/tests/weights.yaml
|
47
49
|
src/dmu_data/text/transform.toml
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/ml/cv_classifier.py
RENAMED
@@ -1,15 +1,15 @@
|
|
1
1
|
'''
|
2
2
|
Module holding cv_classifier class
|
3
3
|
'''
|
4
|
-
|
4
|
+
import os
|
5
5
|
from typing import Union
|
6
6
|
from sklearn.ensemble import GradientBoostingClassifier
|
7
7
|
|
8
|
+
import yaml
|
8
9
|
from dmu.logging.log_store import LogStore
|
9
10
|
import dmu.ml.utilities as ut
|
10
11
|
|
11
12
|
log = LogStore.add_logger('dmu:ml:CVClassifier')
|
12
|
-
|
13
13
|
# ---------------------------------------
|
14
14
|
class CVSameData(Exception):
|
15
15
|
'''
|
@@ -61,6 +61,20 @@ class CVClassifier(GradientBoostingClassifier):
|
|
61
61
|
|
62
62
|
return self._cfg
|
63
63
|
# ----------------------------------
|
64
|
+
def save_cfg(self, path : str):
|
65
|
+
'''
|
66
|
+
Will save configuration used to train this classifier to YAML
|
67
|
+
|
68
|
+
path: Path to YAML file
|
69
|
+
'''
|
70
|
+
dir_name = os.path.dirname(path)
|
71
|
+
os.makedirs(dir_name, exist_ok=True)
|
72
|
+
|
73
|
+
with open(path, 'w', encoding='utf-8') as ofile:
|
74
|
+
yaml.safe_dump(self._cfg, ofile, indent=2)
|
75
|
+
|
76
|
+
log.info(f'Saved config to: {path}')
|
77
|
+
# ----------------------------------
|
64
78
|
def __str__(self):
|
65
79
|
nhash = len(self._s_hash)
|
66
80
|
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/ml/cv_predict.py
RENAMED
@@ -73,11 +73,11 @@ class CVPredict:
|
|
73
73
|
log.debug('Not doing any NaN replacement')
|
74
74
|
return df
|
75
75
|
|
76
|
-
log.
|
76
|
+
log.info(60 * '-')
|
77
77
|
log.info('Doing NaN replacements')
|
78
|
-
log.
|
78
|
+
log.info(60 * '-')
|
79
79
|
for var, val in self._d_nan_rep.items():
|
80
|
-
log.
|
80
|
+
log.info(f'{var:<20}{"--->":20}{val:<20.3f}')
|
81
81
|
df[var] = df[var].fillna(val)
|
82
82
|
|
83
83
|
return df
|
@@ -155,7 +155,7 @@ class CVPredict:
|
|
155
155
|
ndif = len(s_dif_hash)
|
156
156
|
ndat = len(s_dat_hash)
|
157
157
|
nmod = len(s_mod_hash)
|
158
|
-
log.debug(f'{ndif:<
|
158
|
+
log.debug(f'{ndif:<10}{"=":5}{ndat:<10}{"-":5}{nmod:<10}')
|
159
159
|
|
160
160
|
df_ft_group= df_ft.loc[df_ft.index.isin(s_dif_hash)]
|
161
161
|
|
@@ -173,7 +173,7 @@ class CVPredict:
|
|
173
173
|
return arr_prb
|
174
174
|
|
175
175
|
nentries = len(self._arr_patch)
|
176
|
-
log.warning(f'Patching {nentries} probabilities')
|
176
|
+
log.warning(f'Patching {nentries} probabilities with -1')
|
177
177
|
arr_prb[self._arr_patch] = -1
|
178
178
|
|
179
179
|
return arr_prb
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/ml/train_mva.py
RENAMED
@@ -69,14 +69,20 @@ class TrainMva:
|
|
69
69
|
return df, arr_lab
|
70
70
|
# ---------------------------------------------
|
71
71
|
def _pre_process_nans(self, df : pnd.DataFrame) -> pnd.DataFrame:
|
72
|
+
if 'dataset' not in self._cfg:
|
73
|
+
return df
|
74
|
+
|
72
75
|
if 'nan' not in self._cfg['dataset']:
|
73
76
|
log.debug('dataset/nan section not found, not pre-processing NaNs')
|
74
77
|
return df
|
75
78
|
|
76
79
|
d_name_val = self._cfg['dataset']['nan']
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
+
log.info(60 * '-')
|
81
|
+
log.info('Doing NaN replacements')
|
82
|
+
log.info(60 * '-')
|
83
|
+
for var, val in d_name_val.items():
|
84
|
+
log.info(f'{var:<20}{"--->":20}{val:<20.3f}')
|
85
|
+
df[var] = df[var].fillna(val)
|
80
86
|
|
81
87
|
return df
|
82
88
|
# ---------------------------------------------
|
@@ -406,6 +412,9 @@ class TrainMva:
|
|
406
412
|
self._save_hyperparameters_to_tex()
|
407
413
|
# ---------------------------------------------
|
408
414
|
def _save_nan_conversion(self) -> None:
|
415
|
+
if 'dataset' not in self._cfg:
|
416
|
+
return
|
417
|
+
|
409
418
|
if 'nan' not in self._cfg['dataset']:
|
410
419
|
log.debug('NaN section not found, not saving it')
|
411
420
|
return
|
@@ -434,13 +443,18 @@ class TrainMva:
|
|
434
443
|
os.makedirs(val_dir, exist_ok=True)
|
435
444
|
put.df_to_tex(df, f'{val_dir}/hyperparameters.tex')
|
436
445
|
# ---------------------------------------------
|
437
|
-
def run(self):
|
446
|
+
def run(self, skip_fit : bool = False) -> None:
|
438
447
|
'''
|
439
448
|
Will do the training
|
449
|
+
|
450
|
+
skip_fit: By default false, if True, it will only do the plots of features and save tables
|
440
451
|
'''
|
441
452
|
self._save_settings_to_tex()
|
442
453
|
self._plot_features()
|
443
454
|
|
455
|
+
if skip_fit:
|
456
|
+
return
|
457
|
+
|
444
458
|
l_mod = self._get_models()
|
445
459
|
for ifold, mod in enumerate(l_mod):
|
446
460
|
self._save_model(mod, ifold)
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/ml/utilities.py
RENAMED
@@ -16,7 +16,7 @@ log = LogStore.add_logger('dmu:ml:utilities')
|
|
16
16
|
# ---------------------------------------------
|
17
17
|
def patch_and_tag(df : pnd.DataFrame, value : float = 0) -> pnd.DataFrame:
|
18
18
|
'''
|
19
|
-
Takes
|
19
|
+
Takes pandas dataframe, replaces NaNs with value introduced, by default 0
|
20
20
|
Returns array of indices where the replacement happened
|
21
21
|
'''
|
22
22
|
l_nan = df.index[df.isna().any(axis=1)].tolist()
|
@@ -25,7 +25,13 @@ def patch_and_tag(df : pnd.DataFrame, value : float = 0) -> pnd.DataFrame:
|
|
25
25
|
log.debug('No NaNs found')
|
26
26
|
return df
|
27
27
|
|
28
|
-
log.warning(f'Found {nnan} NaNs
|
28
|
+
log.warning(f'Found {nnan} NaNs')
|
29
|
+
|
30
|
+
df_nan_frq = df.isna().sum()
|
31
|
+
df_nan_frq = df_nan_frq[df_nan_frq > 0]
|
32
|
+
print(df_nan_frq)
|
33
|
+
|
34
|
+
log.warning(f'Attaching array with NaN {nnan} indexes and removing NaNs from dataframe')
|
29
35
|
|
30
36
|
df_pa = df.fillna(value)
|
31
37
|
|
@@ -57,7 +63,7 @@ def _remove_nans(df : pnd.DataFrame) -> pnd.DataFrame:
|
|
57
63
|
log.info('Found columns with NaNs')
|
58
64
|
for name in l_na_name:
|
59
65
|
nan_count = df[name].isna().sum()
|
60
|
-
log.info(f'{nan_count:<10}{name
|
66
|
+
log.info(f'{nan_count:<10}{name}')
|
61
67
|
|
62
68
|
ninit = len(df)
|
63
69
|
df = df.dropna()
|
@@ -75,10 +81,10 @@ def _remove_repeated(df : pnd.DataFrame) -> pnd.DataFrame:
|
|
75
81
|
nfinl = len(s_hash)
|
76
82
|
|
77
83
|
if ninit == nfinl:
|
78
|
-
log.debug('No
|
84
|
+
log.debug('No overlap between training and application found')
|
79
85
|
return df
|
80
86
|
|
81
|
-
log.warning(f'
|
87
|
+
log.warning(f'Overlap between training and application found, cleaning up: {ninit} -> {nfinl}')
|
82
88
|
|
83
89
|
df['hash_index'] = l_hash
|
84
90
|
df = df.set_index('hash_index', drop=True)
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/plotting/plotter.py
RENAMED
@@ -107,7 +107,7 @@ class Plotter:
|
|
107
107
|
|
108
108
|
d_cut = self._d_cfg['selection']['cuts']
|
109
109
|
|
110
|
-
log.
|
110
|
+
log.debug('Applying cuts')
|
111
111
|
for name, cut in d_cut.items():
|
112
112
|
log.debug(f'{name:<50}{cut:<150}')
|
113
113
|
rdf = rdf.Filter(cut, name)
|
@@ -212,7 +212,11 @@ class Plotter:
|
|
212
212
|
|
213
213
|
var (str) : Name of variable, needed for plot name
|
214
214
|
'''
|
215
|
-
|
215
|
+
d_leg = {}
|
216
|
+
if 'style' in self._d_cfg and 'legend' in self._d_cfg['style']:
|
217
|
+
d_leg = self._d_cfg['style']['legend']
|
218
|
+
|
219
|
+
plt.legend(**d_leg)
|
216
220
|
|
217
221
|
plt_dir = self._d_cfg['saving']['plt_dir']
|
218
222
|
os.makedirs(plt_dir, exist_ok=True)
|
@@ -77,17 +77,33 @@ class Plotter1D(Plotter):
|
|
77
77
|
|
78
78
|
l_bc_all = []
|
79
79
|
for name, arr_val in d_data.items():
|
80
|
+
label = self._label_from_name(name, arr_val)
|
80
81
|
arr_wgt = d_wgt[name] if d_wgt is not None else numpy.ones_like(arr_val)
|
81
82
|
arr_wgt = self._normalize_weights(arr_wgt, var)
|
82
|
-
hst = Hist.new.Reg(bins=bins, start=minx, stop=maxx, name='x'
|
83
|
+
hst = Hist.new.Reg(bins=bins, start=minx, stop=maxx, name='x').Weight()
|
83
84
|
hst.fill(x=arr_val, weight=arr_wgt)
|
84
|
-
hst.plot(label=
|
85
|
+
hst.plot(label=label)
|
85
86
|
l_bc_all += hst.values().tolist()
|
86
87
|
|
87
88
|
max_y = max(l_bc_all)
|
88
89
|
|
89
90
|
return max_y
|
90
91
|
# --------------------------------------------
|
92
|
+
def _label_from_name(self, name : str, arr_val : numpy.ndarray) -> str:
|
93
|
+
if 'stats' not in self._d_cfg:
|
94
|
+
return name
|
95
|
+
|
96
|
+
d_stat = self._d_cfg['stats']
|
97
|
+
if 'nentries' not in d_stat:
|
98
|
+
return name
|
99
|
+
|
100
|
+
form = d_stat['nentries']
|
101
|
+
|
102
|
+
nentries = len(arr_val)
|
103
|
+
nentries = form.format(nentries)
|
104
|
+
|
105
|
+
return f'{name}{nentries}'
|
106
|
+
# --------------------------------------------
|
91
107
|
def _normalize_weights(self, arr_wgt : numpy.ndarray, var : str) -> numpy.ndarray:
|
92
108
|
cfg_var = self._d_cfg['plots'][var]
|
93
109
|
if 'normalized' not in cfg_var:
|
@@ -104,7 +120,6 @@ class Plotter1D(Plotter):
|
|
104
120
|
|
105
121
|
return arr_wgt
|
106
122
|
# --------------------------------------------
|
107
|
-
|
108
123
|
def _style_plot(self, var : str, max_y : float) -> None:
|
109
124
|
d_cfg = self._d_cfg['plots'][var]
|
110
125
|
yscale = d_cfg['yscale' ] if 'yscale' in d_cfg else 'linear'
|
@@ -124,12 +139,15 @@ class Plotter1D(Plotter):
|
|
124
139
|
plt.legend()
|
125
140
|
plt.title(title)
|
126
141
|
# --------------------------------------------
|
127
|
-
def _plot_lines(self, var : str):
|
142
|
+
def _plot_lines(self, var : str) -> None:
|
128
143
|
'''
|
129
144
|
Will plot vertical lines for some variables
|
130
145
|
|
131
146
|
var (str) : name of variable
|
132
147
|
'''
|
148
|
+
if 'style' in self._d_cfg and 'skip_lines' in self._d_cfg['style'] and self._d_cfg['style']['skip_lines']:
|
149
|
+
return
|
150
|
+
|
133
151
|
if var in ['B_const_mass_M', 'B_M']:
|
134
152
|
plt.axvline(x=5280, color='r', label=r'$B^+$' , linestyle=':')
|
135
153
|
elif var == 'Jpsi_M':
|
@@ -10,6 +10,7 @@ import matplotlib.pyplot as plt
|
|
10
10
|
|
11
11
|
from hist import Hist
|
12
12
|
from ROOT import RDataFrame
|
13
|
+
from matplotlib.colors import LogNorm
|
13
14
|
from dmu.logging.log_store import LogStore
|
14
15
|
from dmu.plotting.plotter import Plotter
|
15
16
|
|
@@ -28,11 +29,8 @@ class Plotter2D(Plotter):
|
|
28
29
|
cfg (dict): Dictionary with configuration, e.g. binning, ranges, etc
|
29
30
|
'''
|
30
31
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
self._d_cfg : dict = cfg
|
35
|
-
self._rdf : RDataFrame = super()._preprocess_rdf(rdf)
|
32
|
+
super().__init__({'single_rdf' : rdf}, cfg)
|
33
|
+
self._rdf : RDataFrame = self._d_rdf['single_rdf']
|
36
34
|
|
37
35
|
self._wgt : numpy.ndarray
|
38
36
|
# --------------------------------------------
|
@@ -61,7 +59,7 @@ class Plotter2D(Plotter):
|
|
61
59
|
|
62
60
|
return arr_wgt
|
63
61
|
# --------------------------------------------
|
64
|
-
def _plot_vars(self, varx : str, vary : str, wgt_name : str) -> None:
|
62
|
+
def _plot_vars(self, varx : str, vary : str, wgt_name : str, use_log : bool) -> None:
|
65
63
|
log.info(f'Plotting {varx} vs {vary} with weights {wgt_name}')
|
66
64
|
|
67
65
|
ax_x = self._get_axis(varx)
|
@@ -72,7 +70,10 @@ class Plotter2D(Plotter):
|
|
72
70
|
hst = Hist(ax_x, ax_y)
|
73
71
|
hst.fill(arr_x, arr_y, weight=arr_w)
|
74
72
|
|
75
|
-
|
73
|
+
if use_log:
|
74
|
+
mplhep.hist2dplot(hst, norm=LogNorm())
|
75
|
+
else:
|
76
|
+
mplhep.hist2dplot(hst)
|
76
77
|
# --------------------------------------------
|
77
78
|
def run(self):
|
78
79
|
'''
|
@@ -80,8 +81,8 @@ class Plotter2D(Plotter):
|
|
80
81
|
'''
|
81
82
|
|
82
83
|
fig_size = self._get_fig_size()
|
83
|
-
for [varx, vary, wgt_name, plot_name] in self._d_cfg['plots_2d']:
|
84
|
+
for [varx, vary, wgt_name, plot_name, use_log] in self._d_cfg['plots_2d']:
|
84
85
|
plt.figure(plot_name, figsize=fig_size)
|
85
|
-
self._plot_vars(varx, vary, wgt_name)
|
86
|
+
self._plot_vars(varx, vary, wgt_name, use_log)
|
86
87
|
self._save_plot(plot_name)
|
87
88
|
# --------------------------------------------
|
@@ -1,7 +1,7 @@
|
|
1
1
|
'''
|
2
2
|
Module storing ZModel class
|
3
3
|
'''
|
4
|
-
# pylint: disable=too-many-lines, import-error
|
4
|
+
# pylint: disable=too-many-lines, import-error, too-many-positional-arguments, too-many-arguments
|
5
5
|
|
6
6
|
from typing import Callable, Union
|
7
7
|
|
@@ -69,12 +69,18 @@ class ModelFactory:
|
|
69
69
|
|
70
70
|
self._d_par : dict[str,zpar] = {}
|
71
71
|
#-----------------------------------------
|
72
|
+
def _fltname_from_name(self, name : str) -> str:
|
73
|
+
if name in ['mu', 'sg']:
|
74
|
+
return f'{name}_flt'
|
75
|
+
|
76
|
+
return name
|
77
|
+
#-----------------------------------------
|
72
78
|
def _get_name(self, name : str, suffix : str) -> str:
|
73
79
|
for can_be_shared in self._l_can_be_shared:
|
74
80
|
if name.startswith(f'{can_be_shared}_') and can_be_shared in self._l_shr:
|
75
|
-
return can_be_shared
|
81
|
+
return self._fltname_from_name(can_be_shared)
|
76
82
|
|
77
|
-
return f'{name}{suffix}'
|
83
|
+
return self._fltname_from_name(f'{name}{suffix}')
|
78
84
|
#-----------------------------------------
|
79
85
|
def _get_parameter(self,
|
80
86
|
name : str,
|
@@ -129,8 +135,8 @@ class ModelFactory:
|
|
129
135
|
def _get_cbl(self, suffix : str = '') -> zpdf:
|
130
136
|
mu = self._get_parameter('mu_cbl', suffix, 5300, 5250, 5350)
|
131
137
|
sg = self._get_parameter('sg_cbl', suffix, 10, 2, 300)
|
132
|
-
al = self._get_parameter('ac_cbl', suffix, 2, 1.,
|
133
|
-
nl = self._get_parameter('nc_cbl', suffix, 1, 0.5,
|
138
|
+
al = self._get_parameter('ac_cbl', suffix, 2, 1., 14.)
|
139
|
+
nl = self._get_parameter('nc_cbl', suffix, 1, 0.5, 15.)
|
134
140
|
|
135
141
|
pdf = zfit.pdf.CrystalBall(mu, sg, al, nl, self._obs)
|
136
142
|
|
@@ -151,8 +157,8 @@ class ModelFactory:
|
|
151
157
|
sg = self._get_parameter('sg_dscb', suffix, 10, 2, 30)
|
152
158
|
ar = self._get_parameter('ar_dscb', suffix, 1, 0, 5)
|
153
159
|
al = self._get_parameter('al_dscb', suffix, 1, 0, 5)
|
154
|
-
nr = self._get_parameter('nr_dscb', suffix, 2, 1,
|
155
|
-
nl = self._get_parameter('nl_dscb', suffix, 2, 0,
|
160
|
+
nr = self._get_parameter('nr_dscb', suffix, 2, 1, 15)
|
161
|
+
nl = self._get_parameter('nl_dscb', suffix, 2, 0, 15)
|
156
162
|
|
157
163
|
pdf = zfit.pdf.DoubleCB(mu, sg, al, nl, ar, nr, self._obs)
|
158
164
|
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/testing/utilities.py
RENAMED
@@ -2,6 +2,7 @@
|
|
2
2
|
Module containing utility functions needed by unit tests
|
3
3
|
'''
|
4
4
|
import os
|
5
|
+
import math
|
5
6
|
from typing import Union
|
6
7
|
from dataclasses import dataclass
|
7
8
|
from importlib.resources import files
|
@@ -21,56 +22,64 @@ class Data:
|
|
21
22
|
'''
|
22
23
|
Class storing shared data
|
23
24
|
'''
|
24
|
-
nentries = 3000
|
25
25
|
# -------------------------------
|
26
|
-
def _double_data(
|
27
|
-
|
28
|
-
df_2 = pnd.DataFrame(d_data)
|
29
|
-
|
26
|
+
def _double_data(df_1 : pnd.DataFrame) -> pnd.DataFrame:
|
27
|
+
df_2 = df_1.copy()
|
30
28
|
df = pnd.concat([df_1, df_2], axis=0)
|
31
29
|
|
32
|
-
|
33
|
-
|
34
|
-
return d_data
|
30
|
+
return df
|
35
31
|
# -------------------------------
|
36
|
-
def _add_nans(
|
37
|
-
|
38
|
-
|
39
|
-
|
32
|
+
def _add_nans(df : pnd.DataFrame, columns : list[str]) -> pnd.DataFrame:
|
33
|
+
size = len(df) * 0.2
|
34
|
+
size = math.floor(size)
|
35
|
+
|
36
|
+
l_col = df.columns.tolist()
|
37
|
+
if columns is None:
|
38
|
+
l_col_index = range(len(l_col))
|
39
|
+
else:
|
40
|
+
l_col_index = [ l_col.index(column) for column in columns ]
|
40
41
|
|
41
|
-
|
42
|
-
|
42
|
+
log.debug('Replacing randomly with {size} NaNs')
|
43
|
+
for _ in range(size):
|
44
|
+
irow = numpy.random.randint(0, df.shape[0]) # Random row index
|
45
|
+
icol = numpy.random.choice(l_col_index) # Random column index
|
43
46
|
|
44
|
-
|
47
|
+
df.iat[irow, icol] = numpy.nan
|
48
|
+
|
49
|
+
return df
|
45
50
|
# -------------------------------
|
46
51
|
def get_rdf(kind : Union[str,None] = None,
|
47
52
|
repeated : bool = False,
|
48
|
-
|
53
|
+
nentries : int = 3_000,
|
54
|
+
add_nans : list[str] = None):
|
49
55
|
'''
|
50
56
|
Return ROOT dataframe with toy data
|
51
57
|
'''
|
58
|
+
|
52
59
|
d_data = {}
|
53
60
|
if kind == 'sig':
|
54
|
-
d_data['w'] = numpy.random.normal(0, 1, size=
|
55
|
-
d_data['x'] = numpy.random.normal(0, 1, size=
|
56
|
-
d_data['y'] = numpy.random.normal(0, 1, size=
|
57
|
-
d_data['z'] = numpy.random.normal(0, 1, size=
|
61
|
+
d_data['w'] = numpy.random.normal(0, 1, size=nentries)
|
62
|
+
d_data['x'] = numpy.random.normal(0, 1, size=nentries)
|
63
|
+
d_data['y'] = numpy.random.normal(0, 1, size=nentries)
|
64
|
+
d_data['z'] = numpy.random.normal(0, 1, size=nentries)
|
58
65
|
elif kind == 'bkg':
|
59
|
-
d_data['w'] = numpy.random.normal(1, 1, size=
|
60
|
-
d_data['x'] = numpy.random.normal(1, 1, size=
|
61
|
-
d_data['y'] = numpy.random.normal(1, 1, size=
|
62
|
-
d_data['z'] = numpy.random.normal(1, 1, size=
|
66
|
+
d_data['w'] = numpy.random.normal(1, 1, size=nentries)
|
67
|
+
d_data['x'] = numpy.random.normal(1, 1, size=nentries)
|
68
|
+
d_data['y'] = numpy.random.normal(1, 1, size=nentries)
|
69
|
+
d_data['z'] = numpy.random.normal(1, 1, size=nentries)
|
63
70
|
else:
|
64
71
|
log.error(f'Invalid kind: {kind}')
|
65
72
|
raise ValueError
|
66
73
|
|
74
|
+
df = pnd.DataFrame(d_data)
|
75
|
+
|
67
76
|
if repeated:
|
68
|
-
|
77
|
+
df = _double_data(df)
|
69
78
|
|
70
79
|
if add_nans:
|
71
|
-
|
80
|
+
df = _add_nans(df, columns=add_nans)
|
72
81
|
|
73
|
-
rdf = RDF.
|
82
|
+
rdf = RDF.FromPandas(df)
|
74
83
|
|
75
84
|
return rdf
|
76
85
|
# -------------------------------
|
@@ -1,6 +1,7 @@
|
|
1
1
|
dataset:
|
2
2
|
nan :
|
3
|
-
x :
|
3
|
+
x : 1
|
4
|
+
y : 2
|
4
5
|
training :
|
5
6
|
nfold : 3
|
6
7
|
features : [x, y, z]
|
@@ -33,10 +34,6 @@ plotting:
|
|
33
34
|
saving:
|
34
35
|
plt_dir : '/tmp/dmu/ml/tests/train_mva/features'
|
35
36
|
plots:
|
36
|
-
w :
|
37
|
-
binning : [-4, 4, 100]
|
38
|
-
yscale : 'linear'
|
39
|
-
labels : ['w', '']
|
40
37
|
x :
|
41
38
|
binning : [-4, 4, 100]
|
42
39
|
yscale : 'linear'
|
@@ -1,13 +1,17 @@
|
|
1
1
|
saving:
|
2
|
-
plt_dir : tests/plotting/2d_weighted
|
2
|
+
plt_dir : /tmp/dmu/tests/plotting/2d_weighted
|
3
|
+
selection:
|
4
|
+
cuts:
|
5
|
+
xlow : x > -1.5
|
3
6
|
definitions:
|
4
7
|
z : x + y
|
5
8
|
general:
|
6
9
|
size : [20, 10]
|
7
10
|
plots_2d:
|
8
|
-
- [x, y, weights, '
|
9
|
-
- [x, y, null, '
|
10
|
-
- [x, z, null, '
|
11
|
+
- [x, y, weights, 'xy_wgt', false]
|
12
|
+
- [x, y, null, 'xy_raw', false]
|
13
|
+
- [x, z, null, 'xz_raw', false]
|
14
|
+
- [x, z, null, 'xz_log', true]
|
11
15
|
axes:
|
12
16
|
x :
|
13
17
|
binning : [-3.0, 3.0, 40]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/arrays/utilities.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/generic/utilities.py
RENAMED
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/logging/log_store.py
RENAMED
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/plotting/matrix.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/rfile/rfprinter.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/rfile/utilities.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/stats/fitter.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/stats/function.py
RENAMED
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/stats/minimizers.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/stats/utilities.py
RENAMED
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu/text/transformer.py
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_data/__init__.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_scripts/git/publish
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{data_manipulation_utilities-0.2.4 → data_manipulation_utilities-0.2.5}/src/dmu_scripts/ssh/coned.py
RENAMED
File without changes
|
File without changes
|