data-manipulation-utilities 0.2.7__py3-none-any.whl → 0.2.8.dev714__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/METADATA +641 -44
  2. data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD +93 -0
  3. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/WHEEL +1 -1
  4. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/entry_points.txt +1 -0
  5. dmu/__init__.py +0 -0
  6. dmu/generic/hashing.py +34 -8
  7. dmu/generic/utilities.py +164 -11
  8. dmu/logging/log_store.py +34 -2
  9. dmu/logging/messages.py +96 -0
  10. dmu/ml/cv_classifier.py +3 -3
  11. dmu/ml/cv_diagnostics.py +3 -0
  12. dmu/ml/cv_performance.py +58 -0
  13. dmu/ml/cv_predict.py +149 -46
  14. dmu/ml/train_mva.py +482 -100
  15. dmu/ml/utilities.py +29 -10
  16. dmu/pdataframe/utilities.py +28 -3
  17. dmu/plotting/fwhm.py +2 -2
  18. dmu/plotting/matrix.py +1 -1
  19. dmu/plotting/plotter.py +23 -3
  20. dmu/plotting/plotter_1d.py +96 -32
  21. dmu/plotting/plotter_2d.py +5 -0
  22. dmu/rdataframe/utilities.py +54 -3
  23. dmu/rfile/ddfgetter.py +102 -0
  24. dmu/stats/fit_stats.py +129 -0
  25. dmu/stats/fitter.py +55 -22
  26. dmu/stats/gof_calculator.py +7 -0
  27. dmu/stats/model_factory.py +153 -62
  28. dmu/stats/parameters.py +100 -0
  29. dmu/stats/utilities.py +443 -12
  30. dmu/stats/wdata.py +187 -0
  31. dmu/stats/zfit.py +17 -0
  32. dmu/stats/zfit_plotter.py +147 -36
  33. dmu/testing/utilities.py +102 -24
  34. dmu/workflow/__init__.py +0 -0
  35. dmu/workflow/cache.py +266 -0
  36. dmu_data/ml/tests/train_mva.yaml +9 -7
  37. dmu_data/ml/tests/train_mva_def.yaml +75 -0
  38. dmu_data/ml/tests/train_mva_with_diagnostics.yaml +10 -5
  39. dmu_data/ml/tests/train_mva_with_preffix.yaml +58 -0
  40. dmu_data/plotting/tests/2d.yaml +5 -5
  41. dmu_data/plotting/tests/line.yaml +15 -0
  42. dmu_data/plotting/tests/styling.yaml +8 -1
  43. dmu_data/rfile/friends.yaml +13 -0
  44. dmu_data/stats/fitter/test_simple.yaml +28 -0
  45. dmu_data/stats/kde_optimizer/control.json +1 -0
  46. dmu_data/stats/kde_optimizer/signal.json +1 -0
  47. dmu_data/stats/parameters/data.yaml +178 -0
  48. dmu_data/tests/config.json +6 -0
  49. dmu_data/tests/config.yaml +4 -0
  50. dmu_data/tests/pdf_to_tex.txt +34 -0
  51. dmu_scripts/kerberos/check_expiration +21 -0
  52. dmu_scripts/kerberos/convert_certificate +22 -0
  53. dmu_scripts/ml/compare_classifiers.py +85 -0
  54. data_manipulation_utilities-0.2.7.dist-info/RECORD +0 -69
  55. {data_manipulation_utilities-0.2.7.data → data_manipulation_utilities-0.2.8.dev714.data}/scripts/publish +0 -0
  56. {data_manipulation_utilities-0.2.7.dist-info → data_manipulation_utilities-0.2.8.dev714.dist-info}/top_level.txt +0 -0
dmu/workflow/cache.py ADDED
@@ -0,0 +1,266 @@
1
+ '''
2
+ This module contains the Cache class, used to cache the outputs of workflows by hashing their inputs
3
+ '''
4
+ import os
5
+ import sys
6
+ import shutil
7
+ from types import NoneType
8
+ from pathlib import Path
9
+ from contextlib import contextmanager
10
+
11
+ from dmu.generic import hashing
12
+ from dmu.logging.log_store import LogStore
13
+
14
+ log=LogStore.add_logger('dmu:workflow:cache')
15
+ # ---------------------------
16
+ class Cache:
17
+ '''
18
+ Class meant to wrap other classes in order to
19
+
20
+ - Keep track of the inputs through hashes
21
+ - Load cached data, if found, and prevent calculations
22
+
23
+ The following directories will be important:
24
+
25
+ out_dir : Directory where the outputs will go, specified by the user
26
+ cache_dir: Subdirectory of out_dir, ${out_dir}/.cache
27
+ hash_dir : Subdirectory of out_dir, ${out_dir}/.cache/{hash}
28
+ Where {hash} is a 10 alphanumeric representing the has of the inputs
29
+
30
+ # On skipping caching
31
+
32
+ This is controlled by `_l_skip_class` which is a list of class names:
33
+
34
+ - These classes will have the caching turned off
35
+ - If the list is empty, caching runs for everything
36
+ - If the list is None, caching is turned off for everything
37
+ '''
38
+ _cache_root : str|None = None
39
+ _l_skip_class : list[str]|None = []
40
+ # ---------------------------
41
+ def __init__(self, out_path : str, **kwargs):
42
+ '''
43
+ Parameters
44
+ ---------------
45
+ out_path: Path to directory where outputs will go
46
+ kwargs : Key word arguments symbolizing identity of inputs, used for hashing
47
+ '''
48
+ if Cache._cache_root is None:
49
+ raise ValueError('Caching directory not set')
50
+
51
+ log.debug(f'Using {Cache._cache_root} root directory for caching')
52
+ if 'code' in kwargs:
53
+ raise ValueError('Cannot append hashing data with key "code", already used')
54
+
55
+ kwargs['code'] = self._get_code_hash()
56
+
57
+ self._out_path = os.path.normpath(f'{Cache._cache_root}/{out_path}')
58
+ log.debug(f'Using {self._out_path} output path')
59
+ os.makedirs(self._out_path, exist_ok=True)
60
+
61
+ self._dat_hash = kwargs
62
+
63
+ self._cache_dir = self._get_dir(kind='cache')
64
+ self._hash_dir : str
65
+ # ---------------------------
66
+ @classmethod
67
+ def set_cache_root(cls, root : str) -> None:
68
+ '''
69
+ Sets the path to the directory WRT which the _out_path_
70
+ will be placed
71
+ '''
72
+ if cls._cache_root is not None:
73
+ raise ValueError(f'Trying to set {root}, but already found {cls._cache_root}')
74
+
75
+ os.makedirs(root, exist_ok=True)
76
+
77
+ cls._cache_root = root
78
+ # ---------------------------
79
+ def _get_code_hash(self) -> str:
80
+ '''
81
+ If `MyTool` inherits from `Cache`. `mytool.py` git commit hash
82
+ should be returned
83
+ '''
84
+ cls = self.__class__
85
+ mod = sys.modules.get(cls.__module__)
86
+ if mod is None:
87
+ raise ValueError(f'Module not found: {cls.__module__}')
88
+
89
+ if mod.__file__ is None:
90
+ raise ValueError(f'Cannot extract file path for module: {cls.__module__}')
91
+
92
+ fname = mod.__file__
93
+ fpath = os.path.abspath(fname)
94
+ val = hashing.hash_file(path=fpath)
95
+
96
+ log.debug(f'Using hash for: {fpath} = {val}')
97
+
98
+ return val
99
+ # ---------------------------
100
+ def _get_dir(
101
+ self,
102
+ kind : str,
103
+ make : bool = True) -> str:
104
+ '''
105
+ Parameters
106
+ --------------
107
+ kind : Kind of directory, cash, hash
108
+ make : If True (default) will try to make directory
109
+ '''
110
+ if kind == 'cache':
111
+ dir_path = f'{self._out_path}/.cache'
112
+ elif kind == 'hash':
113
+ cache_dir = self._get_dir(kind='cache')
114
+ hsh = hashing.hash_object(self._dat_hash)
115
+ dir_path = f'{cache_dir}/{hsh}'
116
+ else:
117
+ raise ValueError(f'Invalid directory kind: {kind}')
118
+
119
+ if make:
120
+ os.makedirs(dir_path, exist_ok=True)
121
+
122
+ return dir_path
123
+ # ---------------------------
124
+ def _cache(self) -> None:
125
+ '''
126
+ Meant to be called after all the calculations finish
127
+ It will copy all the outputs of the processing
128
+ to a hashed directory
129
+ '''
130
+ self._hash_dir = self._get_dir(kind= 'hash')
131
+ log.info(f'Caching outputs to: {self._hash_dir}')
132
+
133
+ for source in Path(self._out_path).glob('*'):
134
+ if str(source) == self._cache_dir:
135
+ continue
136
+
137
+ log.debug(str(source))
138
+ log.debug('-->')
139
+ log.debug(self._hash_dir)
140
+ log.debug('')
141
+
142
+ if source.is_dir():
143
+ shutil.copytree(source, self._hash_dir+'/'+source.name, dirs_exist_ok=True)
144
+ else:
145
+ shutil.copy2(source, self._hash_dir)
146
+
147
+ self._delete_from_output(only_links=False)
148
+ self._copy_from_hashdir()
149
+ # ---------------------------
150
+ def _delete_from_output(self, only_links : bool) -> None:
151
+ '''
152
+ Delete all objects from _out_path directory, except for `.cache`
153
+
154
+ only_links: If true will only delete links
155
+ '''
156
+ for path in Path(self._out_path).iterdir():
157
+ if str(path) == self._cache_dir:
158
+ log.debug(f'Skipping cache dir: {self._cache_dir}')
159
+ continue
160
+
161
+ # These will always be symbolic links
162
+ if only_links and not path.is_symlink():
163
+ log.warning(f'Found a non-symlink not deleting: {path}')
164
+ continue
165
+
166
+ log.debug(f'Deleting {path}')
167
+ if path.is_dir() and not path.is_symlink():
168
+ shutil.rmtree(path)
169
+ else:
170
+ path.unlink()
171
+ # ---------------------------
172
+ def _copy_from_hashdir(self) -> None:
173
+ '''
174
+ Copies all the objects from _hash_dir to _out_path
175
+ '''
176
+ for source in Path(self._hash_dir).iterdir():
177
+ target = f'{self._out_path}/{source.name}'
178
+ log.debug(f'{str(source):<50}{"-->"}{target}')
179
+
180
+ os.symlink(source, target)
181
+ # ---------------------------
182
+ def _dont_cache(self) -> bool:
183
+ '''
184
+ Returns
185
+ ---------------
186
+ Flag that if:
187
+
188
+ True : Will stop the derived class from using caching (i.e. caching is off)
189
+ False: Cache
190
+ '''
191
+ if Cache._l_skip_class is None:
192
+ log.info('No class will be cached')
193
+ return True
194
+
195
+ if len(Cache._l_skip_class) == 0:
196
+ log.debug('All classes will be cached')
197
+ return False
198
+
199
+ class_name = self.__class__.__name__
200
+
201
+ skip = class_name in Cache._l_skip_class
202
+
203
+ if skip:
204
+ log.warning(f'Caching turned off for {class_name}')
205
+ else:
206
+ log.debug(f'Caching turned on for {class_name}')
207
+
208
+ return skip
209
+ # ---------------------------
210
+ def _copy_from_cache(self) -> bool:
211
+ '''
212
+ Checks if hash directory exists:
213
+
214
+ No : Returns False
215
+ Yes:
216
+ - Removes contents of `out_path`, except for .cache
217
+ - Copies the contents of `hash_dir` to `out_dir`
218
+
219
+ Returns
220
+ ---------------
221
+ True if the object, cached was found, false otherwise.
222
+ '''
223
+ if self._dont_cache():
224
+ # If not copying from cache, will need to remove what is
225
+ # in the output directory, so that it gets replaced with
226
+ # new outputs
227
+ self._delete_from_output(only_links=False)
228
+ log.info('Not picking already cached outputs, remaking them')
229
+ return False
230
+
231
+ hash_dir = self._get_dir(kind='hash', make=False)
232
+ if not os.path.isdir(hash_dir):
233
+ log.debug(f'Hash directory {hash_dir} not found, not caching')
234
+ self._delete_from_output(only_links=False)
235
+ return False
236
+
237
+ self._hash_dir = hash_dir
238
+ log.debug(f'Data found in hash directory: {self._hash_dir}')
239
+
240
+ self._delete_from_output(only_links=False)
241
+ self._copy_from_hashdir()
242
+
243
+ return True
244
+ # ---------------------------
245
+ @contextmanager
246
+ @staticmethod
247
+ def turn_off_cache(val : list[str]|None):
248
+ '''
249
+ Parameters
250
+ ------------------
251
+ val: List of names of classes that inherit from `Cache`.
252
+ If None, will not cache for any class.
253
+ By default this is an empty list and it will cache for every class
254
+ '''
255
+ if not isinstance(val, (NoneType, list)):
256
+ log.error('This manager expects: list[str]|None')
257
+ raise ValueError(f'Invalid value: {val}')
258
+
259
+ old_val = Cache._l_skip_class
260
+
261
+ Cache._l_skip_class = val
262
+ try:
263
+ yield
264
+ finally:
265
+ Cache._l_skip_class = old_val
266
+ # ---------------------------
@@ -6,16 +6,21 @@ dataset:
6
6
  y : -3
7
7
  training :
8
8
  nfold : 3
9
- features : [x, y, r]
9
+ features :
10
+ - x
11
+ - y
12
+ - r
10
13
  rdm_stat : 1
11
14
  hyper :
12
15
  loss : log_loss
16
+ max_features : sqrt
13
17
  n_estimators : 100
14
- max_depth : 3
15
- learning_rate : 0.1
18
+ max_depth : 5
16
19
  min_samples_split : 2
20
+ subsample : 0.8
21
+ learning_rate : 0.1
17
22
  saving:
18
- path : '/tmp/tests/dmu/ml/train_mva/model.pkl'
23
+ output : /tmp/tests/dmu/ml/train_mva
19
24
  plotting:
20
25
  roc :
21
26
  min : [0.0, 0.0]
@@ -31,10 +36,7 @@ plotting:
31
36
  title : 'Correlation matrix'
32
37
  size : [10, 10]
33
38
  mask_value : 0
34
- val_dir : '/tmp/tests/dmu/ml/train_mva'
35
39
  features:
36
- saving:
37
- plt_dir : '/tmp/tests/dmu/ml/train_mva/features'
38
40
  plots:
39
41
  r :
40
42
  binning : [-6, 6, 100]
@@ -0,0 +1,75 @@
1
+ # This config file is used for testing training and evaluation
2
+ # when there is a variable that is defined in different ways
3
+ # for the `sig` and `bkg` samples
4
+
5
+ dataset:
6
+ samples:
7
+ sig :
8
+ definitions:
9
+ n : x + y
10
+ bkg :
11
+ definitions:
12
+ n : x - y
13
+ define :
14
+ r : z + x
15
+ nan :
16
+ n : -3
17
+ y : -3
18
+ training :
19
+ nfold : 3
20
+ features :
21
+ - n
22
+ - y
23
+ - r
24
+ rdm_stat : 1
25
+ hyper :
26
+ loss : log_loss
27
+ max_features : sqrt
28
+ n_estimators : 100
29
+ max_depth : 5
30
+ min_samples_split : 2
31
+ subsample : 0.8
32
+ learning_rate : 0.1
33
+ saving:
34
+ output : /tmp/tests/dmu/ml/train_mva
35
+ plotting:
36
+ roc :
37
+ min : [0.0, 0.0]
38
+ max : [1.2, 1.2]
39
+ annotate:
40
+ sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9]
41
+ form : '{:.2f}'
42
+ color: 'green'
43
+ xoff : -15
44
+ yoff : -15
45
+ size : 10
46
+ correlation:
47
+ title : 'Correlation matrix'
48
+ size : [10, 10]
49
+ mask_value : 0
50
+ features:
51
+ plots:
52
+ r :
53
+ binning : [-6, 6, 100]
54
+ yscale : 'linear'
55
+ labels : ['$r$', '']
56
+ n :
57
+ binning : [-4, 4, 100]
58
+ yscale : 'linear'
59
+ labels : ['$n$', '']
60
+ w :
61
+ binning : [-4, 4, 100]
62
+ yscale : 'linear'
63
+ labels : ['$w$', '']
64
+ x :
65
+ binning : [-4, 4, 100]
66
+ yscale : 'linear'
67
+ labels : ['$x$', '']
68
+ y :
69
+ binning : [-4, 4, 100]
70
+ yscale : 'linear'
71
+ labels : ['$y$', '']
72
+ z :
73
+ binning : [-4, 4, 100]
74
+ yscale : 'linear'
75
+ labels : ['$z$', '']
@@ -15,7 +15,7 @@ training :
15
15
  learning_rate : 0.1
16
16
  min_samples_split : 2
17
17
  saving:
18
- path : '/tmp/tests/dmu/ml/train_mva/model.pkl'
18
+ output : /tmp/tests/dmu/ml/train_mva
19
19
  plotting:
20
20
  roc :
21
21
  min : [0.0, 0.0]
@@ -60,15 +60,20 @@ diagnostics:
60
60
  output : /tmp/tests/dmu/ml/train_mva/diagnostics
61
61
  correlations:
62
62
  target :
63
- name : z
64
- overlay :
63
+ name : z
64
+ overlay :
65
+ wp :
66
+ - 0.2
67
+ - 0.6
68
+ - 0.8
69
+ - 0.9
65
70
  general:
66
71
  size : [20, 10]
67
72
  saving:
68
73
  plt_dir : /tmp/tests/dmu/ml/train_mva/diagnostics
69
74
  plots:
70
75
  z :
71
- binning : [-4, +4, 30]
76
+ binning : [-4, +4, 30]
72
77
  yscale : 'linear'
73
78
  labels : ['z', 'Entries']
74
79
  normalized : true
@@ -78,5 +83,5 @@ diagnostics:
78
83
  - Pearson
79
84
  - Kendall-$\tau$
80
85
  figure:
81
- title: Training diagnostics
86
+ title: Training diagnostics
82
87
  size : [10, 8]
@@ -0,0 +1,58 @@
1
+ dataset:
2
+ define :
3
+ r : z + preffix.x.suffix
4
+ nan :
5
+ preffix.x.suffix : -3
6
+ y : -3
7
+ training :
8
+ nfold : 2
9
+ features :
10
+ - preffix.x.suffix
11
+ - y
12
+ - r
13
+ rdm_stat : 1
14
+ hyper :
15
+ loss : log_loss
16
+ n_estimators : 100
17
+ max_depth : 3
18
+ learning_rate : 0.1
19
+ min_samples_split : 2
20
+ saving:
21
+ output : /tmp/tests/dmu/ml/train_mva
22
+ plotting:
23
+ roc :
24
+ min : [0.0, 0.0]
25
+ max : [1.2, 1.2]
26
+ annotate:
27
+ sig_eff : [0.5, 0.6, 0.7, 0.8, 0.9]
28
+ form : '{:.2f}'
29
+ color: 'green'
30
+ xoff : -15
31
+ yoff : -15
32
+ size : 10
33
+ correlation:
34
+ title : 'Correlation matrix'
35
+ size : [10, 10]
36
+ mask_value : 0
37
+ features:
38
+ plots:
39
+ r :
40
+ binning : [-6, 6, 100]
41
+ yscale : 'linear'
42
+ labels : ['$r$', '']
43
+ w :
44
+ binning : [-4, 4, 100]
45
+ yscale : 'linear'
46
+ labels : ['$w$', '']
47
+ preffix.x.suffix :
48
+ binning : [-4, 4, 100]
49
+ yscale : 'linear'
50
+ labels : ['$x$', '']
51
+ y :
52
+ binning : [-4, 4, 100]
53
+ yscale : 'linear'
54
+ labels : ['$y$', '']
55
+ z :
56
+ binning : [-4, 4, 100]
57
+ yscale : 'linear'
58
+ labels : ['$z$', '']
@@ -1,5 +1,5 @@
1
1
  saving:
2
- plt_dir : /tmp/dmu/tests/plotting/2d_weighted
2
+ plt_dir : /tmp/tests/dmu/plotting
3
3
  selection:
4
4
  cuts:
5
5
  xlow : x > -1.5
@@ -8,10 +8,10 @@ definitions:
8
8
  general:
9
9
  size : [20, 10]
10
10
  plots_2d:
11
- - [x, y, weights, 'xy_wgt', false]
12
- - [x, y, null, 'xy_raw', false]
13
- - [x, z, null, 'xz_raw', false]
14
- - [x, z, null, 'xz_log', true]
11
+ - [x, y, weights, 'xy_wgt_lin', false]
12
+ - [x, z, weights, 'xz_wgt_log', true]
13
+ - [x, y, null, 'xy_raw_lin', false]
14
+ - [x, z, null, 'xz_raw_log', true]
15
15
  axes:
16
16
  x :
17
17
  binning : [-3.0, 3.0, 40]
@@ -0,0 +1,15 @@
1
+ saving:
2
+ plt_dir : tests/plotting/line
3
+ plots:
4
+ x :
5
+ binning : [-5.0, 8.0, 40]
6
+ title : x distribution
7
+ vline :
8
+ x : 0
9
+ label : label
10
+ ls : --
11
+ c : blue
12
+ lw : 1
13
+ y :
14
+ binning : [-5.0, 8.0, 40]
15
+ title : y distribution
@@ -5,7 +5,14 @@ plots:
5
5
  binning : [-5.0, 8.0, 40]
6
6
  title : x distribution
7
7
  styling :
8
- histtype : step
8
+ class A:
9
+ histtype : fill
10
+ color : gray
11
+ alpha : 0.3
12
+ class B:
13
+ color : red
14
+ histtype : step
15
+ linestyle: '-'
9
16
  y :
10
17
  binning : [-5.0, 8.0, 40]
11
18
  title : y distribution
@@ -0,0 +1,13 @@
1
+ tree : tree_name
2
+ primary_keys:
3
+ - index
4
+ files :
5
+ - file_001.root
6
+ - file_002.root
7
+ - file_003.root
8
+ samples:
9
+ - /tmp/tests/dmu/rfile/main
10
+ - /tmp/tests/dmu/rfile/frn1
11
+ - /tmp/tests/dmu/rfile/frn2
12
+ - /tmp/tests/dmu/rfile/frn3
13
+ - /tmp/tests/dmu/rfile/frn4
@@ -0,0 +1,28 @@
1
+ # The strategies below are exclusive, only one should be used at a time
2
+ strategy :
3
+ # This strategy will fit multiple times and retry the fit until either
4
+ # ntries is exhausted or the pvalue is reached.
5
+ retry :
6
+ ntries : 4 #Number of tries
7
+ pvalue_thresh : 0.05 #Pvalue threshold, if the fit is better than this, the loop ends
8
+ ignore_status : true #Will pick invalid fits if this is true, otherwise only valid fits will be counted
9
+ # This will fit smaller datasets and get the value of the shape parameters to allow
10
+ # these shapes to float only around this value and within nsigma
11
+ # Fit can be carried out multiple times with larger and larger samples to tighten parameters
12
+ steps :
13
+ nsteps : [1e3, 1e4] #Number of entries to use
14
+ nsigma : [5.0, 2.0] #Number of sigmas for the range of the parameter, for each step
15
+ # The lines below will split the range of the data [0-10] into two subranges, such that the NLL is built
16
+ # only in those ranges. The ranges need to be tuples
17
+ ranges :
18
+ - !!python/tuple [0, 3]
19
+ - !!python/tuple [6, 9]
20
+ #The lines below will allow using constraints for each parameter, where the first element is the mean and the second
21
+ #the width of a Gaussian constraint. No correlations are implemented, yet.
22
+ constraints :
23
+ mu : [5.0, 1.0]
24
+ sg : [1.0, 0.1]
25
+ #After each fit, the parameters specified below will be printed, for debugging purposes
26
+ print_pars : ['mu', 'sg']
27
+ likelihood :
28
+ binned : false