data-manipulation-utilities 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_manipulation_utilities-0.0.1.dist-info/METADATA +713 -0
- data_manipulation_utilities-0.0.1.dist-info/RECORD +45 -0
- data_manipulation_utilities-0.0.1.dist-info/WHEEL +5 -0
- data_manipulation_utilities-0.0.1.dist-info/entry_points.txt +6 -0
- data_manipulation_utilities-0.0.1.dist-info/top_level.txt +3 -0
- dmu/arrays/utilities.py +55 -0
- dmu/dataframe/dataframe.py +36 -0
- dmu/generic/utilities.py +69 -0
- dmu/logging/log_store.py +129 -0
- dmu/ml/cv_classifier.py +122 -0
- dmu/ml/cv_predict.py +152 -0
- dmu/ml/train_mva.py +257 -0
- dmu/ml/utilities.py +132 -0
- dmu/plotting/plotter.py +227 -0
- dmu/plotting/plotter_1d.py +113 -0
- dmu/plotting/plotter_2d.py +87 -0
- dmu/rdataframe/atr_mgr.py +79 -0
- dmu/rdataframe/utilities.py +72 -0
- dmu/rfile/rfprinter.py +91 -0
- dmu/rfile/utilities.py +34 -0
- dmu/stats/fitter.py +515 -0
- dmu/stats/function.py +314 -0
- dmu/stats/utilities.py +134 -0
- dmu/testing/utilities.py +119 -0
- dmu/text/transformer.py +182 -0
- dmu_data/__init__.py +0 -0
- dmu_data/ml/tests/train_mva.yaml +37 -0
- dmu_data/plotting/tests/2d.yaml +14 -0
- dmu_data/plotting/tests/fig_size.yaml +13 -0
- dmu_data/plotting/tests/high_stat.yaml +22 -0
- dmu_data/plotting/tests/name.yaml +14 -0
- dmu_data/plotting/tests/no_bounds.yaml +12 -0
- dmu_data/plotting/tests/simple.yaml +8 -0
- dmu_data/plotting/tests/title.yaml +14 -0
- dmu_data/plotting/tests/weights.yaml +13 -0
- dmu_data/text/transform.toml +4 -0
- dmu_data/text/transform.txt +6 -0
- dmu_data/text/transform_set.toml +8 -0
- dmu_data/text/transform_set.txt +6 -0
- dmu_data/text/transform_trf.txt +12 -0
- dmu_scripts/physics/check_truth.py +121 -0
- dmu_scripts/rfile/compare_root_files.py +299 -0
- dmu_scripts/rfile/print_trees.py +35 -0
- dmu_scripts/ssh/coned.py +168 -0
- dmu_scripts/text/transform_text.py +46 -0
dmu/text/transformer.py
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
'''
|
2
|
+
Module used to hold transformer class
|
3
|
+
'''
|
4
|
+
|
5
|
+
import os
|
6
|
+
import pprint
|
7
|
+
|
8
|
+
import toml
|
9
|
+
import numpy
|
10
|
+
|
11
|
+
from dmu.logging.log_store import LogStore
|
12
|
+
|
13
|
+
# Module-level logger, registered with LogStore under the name 'dmu:text:transformer'
log = LogStore.add_logger('dmu:text:transformer')
|
14
|
+
# -------------------------------------------------------------------------------------------
|
15
|
+
class transformer:
    # pylint: disable=invalid-name
    '''
    Class used to apply transformations to text files

    The transformations are the keys of the `trf` table of a TOML config file and
    are applied, in order, to the lines of the input text file.
    Only the `append` transformation is implemented.

    Optional keys read from the `settings` table of the config:

    as_substring: If true, targets are searched as substrings of the lines, otherwise exact matches are needed
    format      : String on which `.format(line)` is called for each line to be appended
    '''
    # -----------------------------------------
    def __init__(self, txt_path=None, cfg_path=None):
        '''
        txt_path (str): Path to text file to be transformed, can have any extension, py, txt, log, etc
        cfg_path (str): Path to TOML file holding configuration needed for transformations
        '''
        self._txt_path = txt_path
        self._cfg_path = cfg_path
        self._suffix   = 'trf'

        # Both are filled lazily by _initialize
        self._l_line   = None
        self._cfg      = None

        self._initialized = False
    # -----------------------------------------
    def _initialize(self):
        '''
        Checks that the inputs exist and loads them
        Does nothing if it was already called
        '''
        if self._initialized:
            return

        self._check_file(self._txt_path)
        self._check_file(self._cfg_path)
        self._load_input()
        self._cfg = toml.load(self._cfg_path)

        self._initialized = True
    # -----------------------------------------
    def _check_file(self, file_path):
        '''
        Will raise FileNotFoundError if path was not specified or not found

        file_path (str): path to file
        '''
        # The constructor defaults are None, os.path.isfile(None) would raise an obscure TypeError
        if file_path is None:
            raise FileNotFoundError('File path was not specified')

        if not os.path.isfile(file_path):
            raise FileNotFoundError(f'File not found: {file_path}')

        log.debug(f'Found: {file_path}')
    # -----------------------------------------
    def _load_input(self):
        '''
        Will open self._txt_path and put the lines in self._l_line
        '''
        with open(self._txt_path, encoding='utf-8') as ifile:
            self._l_line = ifile.read().splitlines()

        nline = len(self._l_line)
        log.info(f'Found {nline} lines in {self._txt_path}')
    # -----------------------------------------
    def _get_out_path(self, out_path):
        '''
        Will return name of output file
        If arg is not None, will make directory (in case it does not exist) and return arg
        If arg is None, will rename input path using suffix and return, e.g.:

        /some/dir/file.txt -> /some/dir/file_trf.txt
        /some/dir/file     -> /some/dir/file_trf
        file.txt           -> file_trf.txt
        '''
        if out_path is not None:
            dir_name = os.path.dirname(out_path)
            # dirname is empty for a bare file name and makedirs('') would raise
            if dir_name:
                os.makedirs(dir_name, exist_ok=True)

            return out_path

        file_dir, file_name = os.path.split(self._txt_path)
        # Suffix goes before the last extension, if any
        root, ext = os.path.splitext(file_name)
        file_name = f'{root}_{self._suffix}{ext}'

        # join does not prepend a separator when file_dir is empty
        return os.path.join(file_dir, file_name)
    # -----------------------------------------
    def _transform(self, l_line, trf):
        '''
        Will apply transformation named `trf` to list of lines and return the transformed list

        Raises ValueError if the transformation is not implemented
        '''
        log.info(f'{"":<4}{trf}')

        if trf == 'append':
            return self._apply_append(l_line)

        raise ValueError(f'Invalid transformation: {trf}')
    # -----------------------------------------
    def _apply_append(self, l_line):
        '''
        Will take list of lines
        and return list of lines with extra lines appended
        according to config file

        The input list is modified in place and returned
        Raises RuntimeError if a target is not found in any line
        '''
        d_append = self._cfg['trf']['append']

        for target, l_to_be_added in d_append.items():
            l_to_be_added = self._format_lines(l_to_be_added)
            l_flag        = self._find_append_index(l_line, target)
            l_index       = [ index for index, flag in enumerate(l_flag) if flag ]

            if not l_index:
                log.debug(pprint.pformat(l_line))
                raise RuntimeError(f'No instance of "{target}" found in "{self._txt_path}"')

            ext_line = '\n'.join(l_to_be_added)
            for index in l_index:
                # New lines are stored in the same element as the target, they get split when the text is saved
                org_line      = l_line[index]
                l_line[index] = f'{org_line}\n{ext_line}'

        return l_line
    # -----------------------------------------
    def _find_append_index(self, l_line, target):
        '''
        Returns list of flags denoting if target was or not found in list l_line
        target can be exact or included in the l_line elements
        '''
        try:
            is_subst = self._cfg['settings']['as_substring']
        except (KeyError, TypeError):
            # Setting is optional, default to exact matches
            is_subst = False

        if not is_subst:
            log.debug(f'Searching exact matches for target: {target}')
            l_flag = [ target == element for element in l_line ]
        else:
            log.debug(f'Searching with substrings for target: {target}')
            l_flag = [ target in element for element in l_line ]

        return l_flag
    # -----------------------------------------
    def _format_lines(self, l_line):
        '''
        If format was specified in the settings section, will format the
        elements of the input list of lines
        '''
        if 'settings' not in self._cfg:
            return l_line

        if 'format' not in self._cfg['settings']:
            return l_line

        fmt = self._cfg['settings']['format']

        return [ fmt.format(line) for line in l_line ]
    # -----------------------------------------
    def save_as(self, out_path=None):
        '''
        Saves text file after transformation to `out_path`
        If no path is passed, will name as:

        /some/dir/file.txt -> /some/dir/file_trf.txt
        '''
        self._initialize()

        log.info(20 * '-')
        log.info('Applying transformations')
        log.info(20 * '-')
        for trf in self._cfg['trf']:
            self._l_line = self._transform(self._l_line, trf)

        out_path = self._get_out_path(out_path)
        log.info(f'Saving to: {out_path}')
        with open(out_path, 'w', encoding='utf-8') as ofile:
            text = '\n'.join(self._l_line)
            ofile.write(text)
|
182
|
+
# -------------------------------------------------------------------------------------------
|
dmu_data/__init__.py
ADDED
File without changes
|
@@ -0,0 +1,37 @@
|
|
1
|
+
training :
|
2
|
+
nfold : 3
|
3
|
+
features : [x, y, z]
|
4
|
+
rdm_stat : 1
|
5
|
+
hyper :
|
6
|
+
loss : log_loss
|
7
|
+
n_estimators : 100
|
8
|
+
max_depth : 3
|
9
|
+
learning_rate : 0.1
|
10
|
+
min_samples_split : 2
|
11
|
+
saving:
|
12
|
+
path : 'tests/ml/train_mva/model.pkl'
|
13
|
+
plotting:
|
14
|
+
roc :
|
15
|
+
min : [0, 0]
|
16
|
+
val_dir : 'tests/ml/train_mva'
|
17
|
+
features:
|
18
|
+
saving:
|
19
|
+
plt_dir : 'tests/ml/train_mva/features'
|
20
|
+
plots:
|
21
|
+
w :
|
22
|
+
binning : [-4, 4, 100]
|
23
|
+
yscale : 'linear'
|
24
|
+
labels : ['w', '']
|
25
|
+
x :
|
26
|
+
binning : [-4, 4, 100]
|
27
|
+
yscale : 'linear'
|
28
|
+
labels : ['x', '']
|
29
|
+
y :
|
30
|
+
binning : [-4, 4, 100]
|
31
|
+
yscale : 'linear'
|
32
|
+
labels : ['y', '']
|
33
|
+
z :
|
34
|
+
binning : [-4, 4, 100]
|
35
|
+
yscale : 'linear'
|
36
|
+
labels : ['z', '']
|
37
|
+
|
@@ -0,0 +1,22 @@
|
|
1
|
+
selection:
|
2
|
+
max_ran_entries : 50000
|
3
|
+
cuts:
|
4
|
+
z : 'z > 0'
|
5
|
+
saving:
|
6
|
+
plt_dir : tests/plotting/high_stat
|
7
|
+
definitions:
|
8
|
+
z : 'x + y'
|
9
|
+
plots:
|
10
|
+
x :
|
11
|
+
binning : [-5.0, 8.0, 40]
|
12
|
+
yscale : 'linear'
|
13
|
+
labels : ['x', 'Entries']
|
14
|
+
y :
|
15
|
+
binning : [-5.0, 8.0, 40]
|
16
|
+
yscale : 'linear'
|
17
|
+
labels : ['y', 'Entries']
|
18
|
+
z :
|
19
|
+
binning : [-5.0, 8.0, 40]
|
20
|
+
yscale : 'linear'
|
21
|
+
labels : ['x + y', 'Normalized']
|
22
|
+
normalized : true
|
@@ -0,0 +1,14 @@
|
|
1
|
+
saving:
|
2
|
+
plt_dir : tests/plotting/title
|
3
|
+
|
4
|
+
plots:
|
5
|
+
x :
|
6
|
+
binning : [-5.0, 8.0, 40]
|
7
|
+
yscale : 'linear'
|
8
|
+
labels : ['x', 'Entries']
|
9
|
+
title : 'Title for X plot'
|
10
|
+
y :
|
11
|
+
binning : [-5.0, 8.0, 40]
|
12
|
+
yscale : 'linear'
|
13
|
+
labels : ['y', 'Entries']
|
14
|
+
title : 'Title for Y plot'
|
@@ -0,0 +1,13 @@
|
|
1
|
+
saving:
|
2
|
+
plt_dir : tests/plotting/weights
|
3
|
+
plots:
|
4
|
+
x :
|
5
|
+
weights : weights
|
6
|
+
binning : [-5.0, 8.0, 40]
|
7
|
+
yscale : 'linear'
|
8
|
+
labels : ['x', 'Entries']
|
9
|
+
y :
|
10
|
+
weights : weights
|
11
|
+
binning : [-5.0, 8.0, 40]
|
12
|
+
yscale : 'linear'
|
13
|
+
labels : ['y', 'Entries']
|
@@ -0,0 +1,121 @@
|
|
1
|
+
'''
|
2
|
+
Script meant to do truth matching checks
|
3
|
+
'''
|
4
|
+
import os
|
5
|
+
import copy
|
6
|
+
import argparse
|
7
|
+
|
8
|
+
import yaml
|
9
|
+
import mplhep
|
10
|
+
import matplotlib.pyplot as plt
|
11
|
+
|
12
|
+
from ROOT import RDataFrame
|
13
|
+
|
14
|
+
from dmu.logging.log_store import LogStore
|
15
|
+
from dmu.plotting.plotter_1d import Plotter1D as Plotter
|
16
|
+
|
17
|
+
# Module-level logger, registered with LogStore under the name 'dmu:physics:check_truth'
log=LogStore.add_logger('dmu:physics:check_truth')
|
18
|
+
# ----------------------------------
|
19
|
+
def _set_logs() -> None:
    '''
    Sets the level of the plotting loggers to 30
    (WARNING in the standard logging module, presumably LogStore follows the same convention)
    '''
    for logger_name in ('dmu:plotting:Plotter', 'dmu:plotting:Plotter1D'):
        LogStore.set_level(logger_name, 30)
|
22
|
+
# ----------------------------------
|
23
|
+
def _get_args() -> argparse.Namespace:
    '''
    Builds the command line parser and returns the parsed arguments

    The only argument is the mandatory -c/--conf, with the path to the config file
    '''
    description = 'Script used to carry out checks on truth matching mechanisms for MC'
    parser      = argparse.ArgumentParser(description=description)
    parser.add_argument('-c', '--conf', required=True, type=str, help='Path to config file')

    return parser.parse_args()
|
32
|
+
# ----------------------------------
|
33
|
+
def _get_config(args : argparse.Namespace) -> dict:
    '''
    Loads the YAML config file whose path is in args.conf and returns its contents

    Raises FileNotFoundError if the path does not point to a file
    '''
    cfg_path = args.conf
    if os.path.isfile(cfg_path):
        with open(cfg_path, encoding='utf-8') as ifile:
            return yaml.safe_load(ifile)

    raise FileNotFoundError(f'Cannot find {cfg_path}')
|
42
|
+
# ----------------------------------
|
43
|
+
def _get_rdf(file_path : str, tree_path : str) -> RDataFrame:
    '''
    Builds and returns a dataframe from the tree at `tree_path` in the file `file_path`

    The number of entries is counted only to be shown in the debug log
    '''
    log.debug(f'Picking inputs from: {file_path}/{tree_path}')
    frame   = RDataFrame(tree_path, file_path)
    n_found = frame.Count().GetValue()
    log.debug(f'Found {n_found} entries')

    return frame
|
51
|
+
# ----------------------------------
|
52
|
+
def _preprocess_rdf(rdf : RDataFrame, cfg : dict) -> RDataFrame:
    '''
    Returns the dataframe restricted with Range to cfg['max_entries'],
    or the untouched dataframe if that key is not in the config
    '''
    if 'max_entries' not in cfg:
        return rdf

    return rdf.Range(cfg['max_entries'])
|
58
|
+
# ----------------------------------
|
59
|
+
def _check(cfg : dict) -> None:
    '''
    For each sample in cfg['samples'], logs a table with the efficiency of each
    truth matching method and plots the distributions of the entries passing
    (matched) and failing (anti_matched) each method's cut
    '''
    separator = 110 * '-'
    log.info(separator)
    log.info(f'{"Sample":<20}{"Method":<20}{"Initial":<15}{"":<15}{"Final":<15}{"":15}{"Efficiency":<10}')
    log.info(separator)

    for sample_name, cfg_sample in cfg['samples'].items():
        rdf = _get_rdf(cfg_sample['file_path'], cfg_sample['tree_path'])
        rdf = _preprocess_rdf(rdf, cfg)

        d_cut_true = dict(cfg_sample['methods'])
        for method, cut in d_cut_true.items():
            _check_kind(rdf, sample_name, method, cut)
        log.info('')

        # Anti-matched entries are those for which the cut evaluates to zero
        d_cut_fake = { method : f'({cut}) == 0' for method, cut in d_cut_true.items() }

        _plot_distributions(cfg, sample_name, rdf, d_cut_true, kind='matched')
        _plot_distributions(cfg, sample_name, rdf, d_cut_fake, kind='anti_matched')
|
81
|
+
# ----------------------------------
|
82
|
+
def _plot_distributions(cfg : dict, sample_name : str, rdf : RDataFrame, d_cut : dict[str,str], kind : str) -> None:
    '''
    Plots the variables in the sample's `plot` config, with one filtered dataframe per method

    cfg         : Full configuration, it is deep-copied so the caller's dictionary is not modified
    sample_name : Key of the sample in cfg['samples']
    rdf         : Dataframe on which the cuts are applied
    d_cut       : Maps method name to the cut used to filter the dataframe
    kind        : Label used to name and title the plots, e.g. matched
    '''
    cfg_copy = copy.deepcopy(cfg)
    cfg_plt  = _add_suffix(cfg_copy['samples'][sample_name]['plot'], sample_name, kind)

    d_rdf = {}
    for method, cut in d_cut.items():
        d_rdf[method] = rdf.Filter(cut)

    Plotter(d_rdf=d_rdf, cfg=cfg_plt).run()
|
90
|
+
# ----------------------------------
|
91
|
+
def _add_suffix(cfg : dict, sample_name : str, kind : str) -> dict:
    '''
    Sets the `name` and `title` of every variable in cfg['plots'] and returns cfg

    name  -> {variable}_{kind}
    title -> {sample_name}; {kind}

    The input dictionary is modified in place
    '''
    title = f'{sample_name}; {kind}'
    for var, cfg_var in cfg['plots'].items():
        cfg_var['name' ] = f'{var}_{kind}'
        cfg_var['title'] = title

    return cfg
|
100
|
+
# ----------------------------------
|
101
|
+
def _check_kind(rdf : RDataFrame, sample : str, name : str, cut : str) -> None:
    '''
    Logs the number of entries before and after the cut, with the efficiency in percent

    rdf    : Dataframe on which the cut is checked
    sample : Name of the sample, used only in the log
    name   : Name of the truth matching method, used in the log and as the filter's name
    cut    : Expression used to filter the dataframe

    Nothing is returned, the filtered dataframe is used only for counting
    '''
    nini = rdf.Count().GetValue()
    nfnl = rdf.Filter(cut, name).Count().GetValue()
    # The efficiency is undefined for an empty dataframe, avoid a ZeroDivisionError
    eff  = nfnl / nini * 100 if nini > 0 else float('nan')

    log.info(f'{sample:<20}{name:<20}{nini:<15}{"":<15}{nfnl:<15}{"-->":15}{eff:10.2f}')
|
108
|
+
# ----------------------------------
|
109
|
+
def main():
    '''
    Script starts here

    Sets log levels, loads the config passed by command line,
    picks the LHCb2 plotting style and runs the checks
    '''
    _set_logs()
    cfg = _get_config(_get_args())
    plt.style.use(mplhep.style.LHCb2)

    _check(cfg)
|
119
|
+
# ----------------------------------
|
120
|
+
# Run the checks only when the file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
|