data-manipulation-utilities 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data_manipulation_utilities-0.0.1.dist-info/METADATA +713 -0
  2. data_manipulation_utilities-0.0.1.dist-info/RECORD +45 -0
  3. data_manipulation_utilities-0.0.1.dist-info/WHEEL +5 -0
  4. data_manipulation_utilities-0.0.1.dist-info/entry_points.txt +6 -0
  5. data_manipulation_utilities-0.0.1.dist-info/top_level.txt +3 -0
  6. dmu/arrays/utilities.py +55 -0
  7. dmu/dataframe/dataframe.py +36 -0
  8. dmu/generic/utilities.py +69 -0
  9. dmu/logging/log_store.py +129 -0
  10. dmu/ml/cv_classifier.py +122 -0
  11. dmu/ml/cv_predict.py +152 -0
  12. dmu/ml/train_mva.py +257 -0
  13. dmu/ml/utilities.py +132 -0
  14. dmu/plotting/plotter.py +227 -0
  15. dmu/plotting/plotter_1d.py +113 -0
  16. dmu/plotting/plotter_2d.py +87 -0
  17. dmu/rdataframe/atr_mgr.py +79 -0
  18. dmu/rdataframe/utilities.py +72 -0
  19. dmu/rfile/rfprinter.py +91 -0
  20. dmu/rfile/utilities.py +34 -0
  21. dmu/stats/fitter.py +515 -0
  22. dmu/stats/function.py +314 -0
  23. dmu/stats/utilities.py +134 -0
  24. dmu/testing/utilities.py +119 -0
  25. dmu/text/transformer.py +182 -0
  26. dmu_data/__init__.py +0 -0
  27. dmu_data/ml/tests/train_mva.yaml +37 -0
  28. dmu_data/plotting/tests/2d.yaml +14 -0
  29. dmu_data/plotting/tests/fig_size.yaml +13 -0
  30. dmu_data/plotting/tests/high_stat.yaml +22 -0
  31. dmu_data/plotting/tests/name.yaml +14 -0
  32. dmu_data/plotting/tests/no_bounds.yaml +12 -0
  33. dmu_data/plotting/tests/simple.yaml +8 -0
  34. dmu_data/plotting/tests/title.yaml +14 -0
  35. dmu_data/plotting/tests/weights.yaml +13 -0
  36. dmu_data/text/transform.toml +4 -0
  37. dmu_data/text/transform.txt +6 -0
  38. dmu_data/text/transform_set.toml +8 -0
  39. dmu_data/text/transform_set.txt +6 -0
  40. dmu_data/text/transform_trf.txt +12 -0
  41. dmu_scripts/physics/check_truth.py +121 -0
  42. dmu_scripts/rfile/compare_root_files.py +299 -0
  43. dmu_scripts/rfile/print_trees.py +35 -0
  44. dmu_scripts/ssh/coned.py +168 -0
  45. dmu_scripts/text/transform_text.py +46 -0
@@ -0,0 +1,182 @@
1
+ '''
2
+ Module used to hold transformer class
3
+ '''
4
+
5
+ import os
6
+ import pprint
7
+
8
+ import toml
9
+ import numpy
10
+
11
+ from dmu.logging.log_store import LogStore
12
+
13
+ log = LogStore.add_logger('dmu:text:transformer')
14
+ # -------------------------------------------------------------------------------------------
15
class transformer:
    # pylint: disable=invalid-name
    '''
    Class used to apply transformations to text files

    The input text file is read once, the transformations listed in the `trf`
    section of a TOML config are applied to a copy of its lines and the result
    is written to disk by `save_as`.
    '''
    # -----------------------------------------
    def __init__(self, txt_path=None, cfg_path=None):
        '''
        txt_path (str): Path to text file to be transformed, can have any extension, py, txt, log, etc
        cfg_path (str): Path to TOML file holding configuration needed for transformations
        '''
        self._txt_path = txt_path
        self._cfg_path = cfg_path
        self._suffix   = 'trf'

        # Both are filled lazily by _initialize
        self._l_line   = None
        self._cfg      = None

        self._initialized = False
    # -----------------------------------------
    def _initialize(self):
        '''
        Checks that both input files exist, then loads the text lines and the TOML config.
        Does nothing after the first successful call.
        '''
        if self._initialized:
            return

        self._check_file(self._txt_path)
        self._check_file(self._cfg_path)
        self._load_input()
        self._cfg = toml.load(self._cfg_path)

        self._initialized = True
    # -----------------------------------------
    def _check_file(self, file_path):
        '''
        Will raise exception if path not found

        file_path (str): path to file

        Raises ValueError if no path was specified and
        FileNotFoundError if the path is not an existing file
        '''
        # os.path.isfile(None) fails with an opaque TypeError, report the real problem instead
        if file_path is None:
            raise ValueError('File path was not specified')

        if not os.path.isfile(file_path):
            raise FileNotFoundError(f'File not found: {file_path}')

        log.debug(f'Found: {file_path}')
    # -----------------------------------------
    def _load_input(self):
        '''
        Will open self._txt_path and put the lines in self._l_line
        '''
        # Explicit encoding, so that the result does not depend on the locale of the machine
        with open(self._txt_path, encoding='utf-8') as ifile:
            self._l_line = ifile.read().splitlines()

        nline = len(self._l_line)
        log.info(f'Found {nline} lines in {self._txt_path}')
    # -----------------------------------------
    def _get_out_path(self, out_path):
        '''
        Will return name of output file
        If arg is not None, will make directory (in case it does not exist) and return arg
        If arg is None, will rename input path using suffix and return, e.g.:

        /some/dir/file.txt -> /some/dir/file_trf.txt
        /some/dir/file     -> /some/dir/file_trf
        file.txt           -> file_trf.txt
        '''
        if out_path is not None:
            dir_name = os.path.dirname(out_path)
            # A bare file name has no directory part and os.makedirs('') would raise
            if dir_name:
                os.makedirs(dir_name, exist_ok=True)

            return out_path

        file_dir, file_name = os.path.split(self._txt_path)
        stem, ext           = os.path.splitext(file_name)

        # os.path.join drops an empty directory, so that a relative input
        # path does not turn into a path under the filesystem root
        return os.path.join(file_dir, f'{stem}_{self._suffix}{ext}')
    # -----------------------------------------
    def _transform(self, l_line, trf):
        '''
        Applies the transformation called `trf` to the list of lines and returns the result

        Raises ValueError if the transformation is not supported, only `append` is
        '''
        log.info(f'{"":<4}{trf}')

        if trf != 'append':
            raise ValueError(f'Invalid transformation: {trf}')

        return self._apply_append(l_line)
    # -----------------------------------------
    def _apply_append(self, l_line):
        '''
        Will take list of lines
        and return list of lines with extra lines appended
        according to config file

        The input list is modified in place and also returned
        Raises RuntimeError if any of the targets cannot be found among the lines
        '''
        d_append = self._cfg['trf']['append']

        for target, l_to_be_added in d_append.items():
            l_to_be_added = self._format_lines(l_to_be_added)
            l_flag        = self._find_append_index(l_line, target)
            l_index       = [ index for index, flag in enumerate(l_flag) if flag ]

            if not l_index:
                pprint.pprint(l_line)
                raise RuntimeError(f'No instance of "{target}" found in "{self._txt_path}"')

            # The extra lines are glued to the matching line with newlines,
            # this way the indices of the remaining lines do not shift
            ext_line = '\n'.join(l_to_be_added)
            for index in l_index:
                org_line      = l_line[index]
                l_line[index] = f'{org_line}\n{ext_line}'

        return l_line
    # -----------------------------------------
    def _find_append_index(self, l_line, target):
        '''
        Returns list of flags denoting if target was or not found in list l_line
        target can be exact or included in the l_line elements, depending on
        the `as_substring` flag of the `settings` section, which defaults to False
        '''
        is_subst = self._cfg.get('settings', {}).get('as_substring', False)

        if not is_subst:
            log.debug(f'Searching exact matches for target: {target}')
            l_flag = [ target == element for element in l_line ]
        else:
            log.debug(f'Searching with substrings for target: {target}')
            l_flag = [ target in element for element in l_line ]

        return l_flag
    # -----------------------------------------
    def _format_lines(self, l_line):
        '''
        If format was specified in the settings section, will format the
        elements of the input list of lines, otherwise the input is returned untouched
        '''
        d_setting = self._cfg.get('settings', {})
        if 'format' not in d_setting:
            return l_line

        fmt = d_setting['format']

        return [ fmt.format(line) for line in l_line ]
    # -----------------------------------------
    def save_as(self, out_path=None):
        '''
        Saves text file after transformation to `out_path`
        If no path is passed, will name as:

        /some/dir/file.txt -> /some/dir/file_trf.txt
        '''
        self._initialize()

        log.info(20 * '-')
        log.info('Applying transformations')
        log.info(20 * '-')

        # Transform a copy, otherwise a second call would transform already transformed lines
        l_line = list(self._l_line)
        for trf in self._cfg['trf']:
            l_line = self._transform(l_line, trf)

        out_path = self._get_out_path(out_path)
        log.info(f'Saving to: {out_path}')
        with open(out_path, 'w', encoding='utf-8') as ofile:
            text = '\n'.join(l_line)
            ofile.write(text)
182
+ # -------------------------------------------------------------------------------------------
dmu_data/__init__.py ADDED
File without changes
@@ -0,0 +1,37 @@
1
+ training :
2
+ nfold : 3
3
+ features : [x, y, z]
4
+ rdm_stat : 1
5
+ hyper :
6
+ loss : log_loss
7
+ n_estimators : 100
8
+ max_depth : 3
9
+ learning_rate : 0.1
10
+ min_samples_split : 2
11
+ saving:
12
+ path : 'tests/ml/train_mva/model.pkl'
13
+ plotting:
14
+ roc :
15
+ min : [0, 0]
16
+ val_dir : 'tests/ml/train_mva'
17
+ features:
18
+ saving:
19
+ plt_dir : 'tests/ml/train_mva/features'
20
+ plots:
21
+ w :
22
+ binning : [-4, 4, 100]
23
+ yscale : 'linear'
24
+ labels : ['w', '']
25
+ x :
26
+ binning : [-4, 4, 100]
27
+ yscale : 'linear'
28
+ labels : ['x', '']
29
+ y :
30
+ binning : [-4, 4, 100]
31
+ yscale : 'linear'
32
+ labels : ['y', '']
33
+ z :
34
+ binning : [-4, 4, 100]
35
+ yscale : 'linear'
36
+ labels : ['z', '']
37
+
@@ -0,0 +1,14 @@
1
+ saving:
2
+ plt_dir : tests/plotting/2d_weighted
3
+ general:
4
+ size : [20, 10]
5
+ plots_2d:
6
+ - [x, y, weights, 'xy_w']
7
+ - [x, y, null, 'xy_r']
8
+ axes:
9
+ x :
10
+ binning : [-5.0, 8.0, 40]
11
+ label : 'x'
12
+ y :
13
+ binning : [-5.0, 8.0, 40]
14
+ label : 'y'
@@ -0,0 +1,13 @@
1
+ saving:
2
+ plt_dir : tests/plotting/fig_size
3
+ general:
4
+ size : [20, 10]
5
+ plots:
6
+ x :
7
+ binning : [-5.0, 8.0, 40]
8
+ yscale : 'linear'
9
+ labels : ['x', 'Entries']
10
+ y :
11
+ binning : [-5.0, 8.0, 40]
12
+ yscale : 'linear'
13
+ labels : ['y', 'Entries']
@@ -0,0 +1,22 @@
1
+ selection:
2
+ max_ran_entries : 50000
3
+ cuts:
4
+ z : 'z > 0'
5
+ saving:
6
+ plt_dir : tests/plotting/high_stat
7
+ definitions:
8
+ z : 'x + y'
9
+ plots:
10
+ x :
11
+ binning : [-5.0, 8.0, 40]
12
+ yscale : 'linear'
13
+ labels : ['x', 'Entries']
14
+ y :
15
+ binning : [-5.0, 8.0, 40]
16
+ yscale : 'linear'
17
+ labels : ['y', 'Entries']
18
+ z :
19
+ binning : [-5.0, 8.0, 40]
20
+ yscale : 'linear'
21
+ labels : ['x + y', 'Normalized']
22
+ normalized : true
@@ -0,0 +1,14 @@
1
+ saving:
2
+ plt_dir : tests/plotting/name
3
+
4
+ plots:
5
+ x :
6
+ binning : [-5.0, 8.0, 40]
7
+ yscale : 'linear'
8
+ labels : ['x', 'Entries']
9
+ name : 'xvar'
10
+ y :
11
+ binning : [-5.0, 8.0, 40]
12
+ yscale : 'linear'
13
+ labels : ['y', 'Entries']
14
+ name : 'yvar'
@@ -0,0 +1,12 @@
1
+ saving:
2
+ plt_dir : tests/plotting/no_bounds
3
+
4
+ plots:
5
+ x :
6
+ binning : [1, 1, 40]
7
+ yscale : 'linear'
8
+ labels : ['x', 'Entries']
9
+ y :
10
+ binning : [1, 1, 40]
11
+ yscale : 'linear'
12
+ labels : ['y', 'Entries']
@@ -0,0 +1,8 @@
1
+ saving:
2
+ plt_dir : tests/plotting/simple
3
+
4
+ plots:
5
+ x :
6
+ binning : [-5.0, 8.0, 40]
7
+ y :
8
+ binning : [-5.0, 8.0, 40]
@@ -0,0 +1,14 @@
1
+ saving:
2
+ plt_dir : tests/plotting/title
3
+
4
+ plots:
5
+ x :
6
+ binning : [-5.0, 8.0, 40]
7
+ yscale : 'linear'
8
+ labels : ['x', 'Entries']
9
+ title : 'Title for X plot'
10
+ y :
11
+ binning : [-5.0, 8.0, 40]
12
+ yscale : 'linear'
13
+ labels : ['y', 'Entries']
14
+ title : 'Title for Y plot'
@@ -0,0 +1,13 @@
1
+ saving:
2
+ plt_dir : tests/plotting/weights
3
+ plots:
4
+ x :
5
+ weights : weights
6
+ binning : [-5.0, 8.0, 40]
7
+ yscale : 'linear'
8
+ labels : ['x', 'Entries']
9
+ y :
10
+ weights : weights
11
+ binning : [-5.0, 8.0, 40]
12
+ yscale : 'linear'
13
+ labels : ['y', 'Entries']
@@ -0,0 +1,4 @@
1
+ [trf]
2
+ [trf.append]
3
+ 'primes are'=['2', '3', '5']
4
+ 'days are'=['Monday', 'Tuesday', 'Wednesday']
@@ -0,0 +1,6 @@
1
+ the
2
+ first
3
+ primes are
4
+ and
5
+ the first
6
+ days are
@@ -0,0 +1,8 @@
1
+ [settings]
2
+ as_substring=true
3
+ format ='--> {} <--'
4
+
5
+ [trf]
6
+ [trf.append]
7
+ 'primes are'=['2', '3', '5']
8
+ 'days are'=['Monday', 'Tuesday', 'Wednesday']
@@ -0,0 +1,6 @@
1
+ the
2
+ first
3
+ primes are:
4
+ and
5
+ the first
6
+ days are:
@@ -0,0 +1,12 @@
1
+ the
2
+ first
3
+ primes are
4
+ 2
5
+ 3
6
+ 5
7
+ and
8
+ the first
9
+ days are
10
+ Monday
11
+ Tuesday
12
+ Wednesday
@@ -0,0 +1,121 @@
1
+ '''
2
+ Script meant to do truth matching checks
3
+ '''
4
+ import os
5
+ import copy
6
+ import argparse
7
+
8
+ import yaml
9
+ import mplhep
10
+ import matplotlib.pyplot as plt
11
+
12
+ from ROOT import RDataFrame
13
+
14
+ from dmu.logging.log_store import LogStore
15
+ from dmu.plotting.plotter_1d import Plotter1D as Plotter
16
+
17
+ log=LogStore.add_logger('dmu:physics:check_truth')
18
+ # ----------------------------------
19
def _set_logs() -> None:
    '''
    Sets the level of the plotting loggers to 30
    '''
    # NOTE(review): 30 presumably maps to WARNING, as in the logging module -- confirm in LogStore
    for logger_name in ('dmu:plotting:Plotter', 'dmu:plotting:Plotter1D'):
        LogStore.set_level(logger_name, 30)
22
+ # ----------------------------------
23
+ def _get_args() -> argparse.Namespace:
24
+ '''
25
+ Parse args
26
+ '''
27
+ parser = argparse.ArgumentParser(description='Script used to carry out checks on truth matching mechanisms for MC')
28
+ parser.add_argument('-c', '--conf' , type=str, help='Path to config file', required=True)
29
+ args = parser.parse_args()
30
+
31
+ return args
32
+ # ----------------------------------
33
+ def _get_config(args : argparse.Namespace) -> dict:
34
+ path = args.conf
35
+ if not os.path.isfile(path):
36
+ raise FileNotFoundError(f'Cannot find {path}')
37
+
38
+ with open(path, encoding='utf-8') as ifile:
39
+ cfg = yaml.safe_load(ifile)
40
+
41
+ return cfg
42
+ # ----------------------------------
43
def _get_rdf(file_path : str, tree_path : str) -> RDataFrame:
    '''
    Builds a dataframe from the tree `tree_path` in the file `file_path`,
    logs its number of entries at debug level and returns it
    '''
    log.debug(f'Picking inputs from: {file_path}/{tree_path}')
    frame = RDataFrame(tree_path, file_path)

    # The entries are counted regardless of the log level, only the message is conditional
    log.debug(f'Found {frame.Count().GetValue()} entries')

    return frame
51
+ # ----------------------------------
52
def _preprocess_rdf(rdf : RDataFrame, cfg : dict) -> RDataFrame:
    '''
    Returns the dataframe restricted with Range(max_entries) if the config
    has a `max_entries` key, otherwise returns the input dataframe untouched
    '''
    if 'max_entries' not in cfg:
        return rdf

    return rdf.Range(cfg['max_entries'])
58
+ # ----------------------------------
59
def _check(cfg : dict) -> None:
    '''
    For each sample in the `samples` section of the config, logs one row per
    truth matching method and plots the entries passing and failing each method
    '''
    separator = 110 * '-'
    log.info(separator)
    log.info(f'{"Sample":<20}{"Method":<20}{"Initial":<15}{"":<15}{"Final":<15}{"":15}{"Efficiency":<10}')
    log.info(separator)

    for sample_name, cfg_sample in cfg['samples'].items():
        rdf = _get_rdf(cfg_sample['file_path'], cfg_sample['tree_path'])
        rdf = _preprocess_rdf(rdf, cfg)

        d_method = cfg_sample['methods']
        for method, cut in d_method.items():
            _check_kind(rdf, sample_name, method, cut)

        # Each cut selects the matched entries, its negation selects the rest
        d_cut_true = dict(d_method)
        d_cut_fake = { method : f'({cut}) == 0' for method, cut in d_method.items() }
        log.info('')

        _plot_distributions(cfg, sample_name, rdf, d_cut_true, kind='matched')
        _plot_distributions(cfg, sample_name, rdf, d_cut_fake, kind='anti_matched')
81
+ # ----------------------------------
82
def _plot_distributions(cfg : dict, sample_name : str, rdf : RDataFrame, d_cut : dict[str,str], kind : str) -> None:
    '''
    Plots, for the sample `sample_name`, the dataframe filtered by each of the cuts in `d_cut`

    cfg        : Full config, the plotting settings are taken from cfg['samples'][sample_name]['plot']
    d_cut      : Dictionary mapping the name of the method to the cut
    kind       : Label used to name and title the plots
    '''
    # Work on a copy, _add_suffix modifies the plotting config in place
    cfg_plt = copy.deepcopy(cfg)['samples'][sample_name]['plot']
    cfg_plt = _add_suffix(cfg_plt, sample_name, kind)

    d_rdf = {}
    for method, cut in d_cut.items():
        d_rdf[method] = rdf.Filter(cut)

    Plotter(d_rdf=d_rdf, cfg=cfg_plt).run()
90
+ # ----------------------------------
91
+ def _add_suffix(cfg : dict, sample_name : str, kind : str) -> dict:
92
+ d_var = cfg['plots']
93
+ for var in d_var:
94
+ d_var[var]['name'] = f'{var}_{kind}'
95
+ d_var[var]['title'] = f'{sample_name}; {kind}'
96
+
97
+ cfg['plots'] = d_var
98
+
99
+ return cfg
100
+ # ----------------------------------
101
def _check_kind(rdf : RDataFrame, sample : str, name : str, cut : str) -> None:
    '''
    Logs the number of entries before and after applying a cut, together with the efficiency in percent

    rdf    : Dataframe the cut is applied to
    sample : Name of the sample, used only for the log message
    name   : Name of the truth matching method, used to name the filter and in the log message
    cut    : Expression passed to the dataframe filter
    '''
    nini = rdf.Count().GetValue()
    nfnl = rdf.Filter(cut, name).Count().GetValue()

    # An empty input has no defined efficiency, report nan instead of dividing by zero
    eff  = nfnl / nini * 100 if nini > 0 else float('nan')

    log.info(f'{sample:<20}{name:<20}{nini:<15}{"":<15}{nfnl:<15}{"-->":15}{eff:10.2f}')
108
+ # ----------------------------------
109
def main():
    '''
    Entry point: sets log levels, loads the config given in the command line,
    picks the plotting style and runs the checks
    '''
    _set_logs()
    cfg = _get_config(_get_args())
    plt.style.use(mplhep.style.LHCb2)

    _check(cfg)
# ----------------------------------
if __name__ == '__main__':
    main()