data-manipulation-utilities 0.2.8.dev714__py3-none-any.whl → 0.2.8.dev725__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,36 @@
1
- Metadata-Version: 2.4
2
- Name: data_manipulation_utilities
3
- Version: 0.2.8.dev714
1
+ Metadata-Version: 2.3
2
+ Name: data-manipulation-utilities
3
+ Version: 0.2.8.dev725
4
4
  Summary: Project storing utilities needed to reduce boilerplate code when analyzing data
5
+ Requires-Python: >=3.10,<3.13
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: Programming Language :: Python :: 3.10
8
+ Classifier: Programming Language :: Python :: 3.11
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Provides-Extra: dev
11
+ Provides-Extra: fit
12
+ Provides-Extra: ml
13
+ Requires-Dist: PyYAML
14
+ Requires-Dist: awkward
15
+ Requires-Dist: awkward-pandas
16
+ Requires-Dist: dask[dataframe,distributed]
17
+ Requires-Dist: hist[plot]
18
+ Requires-Dist: joblib ; extra == "ml"
19
+ Requires-Dist: logzero
20
+ Requires-Dist: matplotlib
21
+ Requires-Dist: mplhep
22
+ Requires-Dist: numpy
23
+ Requires-Dist: omegaconf
24
+ Requires-Dist: optuna ; extra == "ml"
25
+ Requires-Dist: pandas
26
+ Requires-Dist: pytest ; extra == "dev"
27
+ Requires-Dist: scikit-learn ; extra == "ml"
28
+ Requires-Dist: scipy
29
+ Requires-Dist: tensorflow
30
+ Requires-Dist: toml
31
+ Requires-Dist: tqdm
32
+ Requires-Dist: uproot
33
+ Requires-Dist: zfit (==0.26.0) ; extra == "fit"
5
34
  Description-Content-Type: text/markdown
6
35
 
7
36
  [TOC]
@@ -1793,3 +1822,4 @@ lxplus:
1793
1822
  ```
1794
1823
 
1795
1824
  and should be placed in `$HOME/.config/dmu/ssh/servers.yaml`
1825
+
@@ -1,4 +1,3 @@
1
- data_manipulation_utilities-0.2.8.dev714.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
2
1
  dmu/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
2
  dmu/arrays/utilities.py,sha256=PKoYyybPptA2aU-V3KLnJXBudWxTXu4x1uGdIMQ49HY,1722
4
3
  dmu/generic/hashing.py,sha256=QR5Gbv6-ANvi5hL232UNMrw9DONpU27BWTynXGxQLGU,1806
@@ -15,7 +14,7 @@ dmu/ml/utilities.py,sha256=A9j3tBh-jfaFdwwLUleo1QnttfawN7XDiQRh4VTvqVY,4597
15
14
  dmu/pdataframe/utilities.py,sha256=xl6iLVKUccqVXYjuHsDUZ6UrCKQPw1k8D-f6407Yq30,2742
16
15
  dmu/plotting/fwhm.py,sha256=4e8n6624pxWLcOOtayCQ_hDSSMKU21-3UsdmbkX1ojk,1949
17
16
  dmu/plotting/matrix.py,sha256=s_5W8O3yXF3u8OX3f4J4hCoxIVZt1TF8S-qJsFBh2Go,5005
18
- dmu/plotting/plotter.py,sha256=oc_n9ug0JPaQZycrW_TJkgNxjr0LHNrVJcijqmiLUR4,8136
17
+ dmu/plotting/plotter.py,sha256=5N5mLdQAqOUs43ukX5mT9nRaYD5dkn_sED5NoZJV5A0,8483
19
18
  dmu/plotting/plotter_1d.py,sha256=Kyoyh-QyZLXXqX19wqEDUWCD1nJEvEonGp9nlgEaoZE,10936
20
19
  dmu/plotting/plotter_2d.py,sha256=dXC-7Rsquibe5cn7622ryoKpuv7KCAmouIIXwQ_VEFM,3172
21
20
  dmu/plotting/utilities.py,sha256=SI9dvtZq2gr-PXVz71KE4o0i09rZOKgqJKD1jzf6KXk,1167
@@ -77,17 +76,7 @@ dmu_data/text/transform.txt,sha256=EX760da6Vkf-_EPxnQlC5hGSkfFhJCCGCD19NU-1Qto,4
77
76
  dmu_data/text/transform_set.toml,sha256=Jeh7BTz82idqvbOQJtl9-ur56mZkzDn5WtvmIb48LoE,150
78
77
  dmu_data/text/transform_set.txt,sha256=1KivMoP9LxPn9955QrRmOzjEqduEjhTetQ9MXykO5LY,46
79
78
  dmu_data/text/transform_trf.txt,sha256=zxBRTgcSmX7RdqfmWF88W1YqbyNHa4Ccruf1MmnYv2A,74
80
- dmu_scripts/git/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
81
- dmu_scripts/kerberos/check_expiration,sha256=PRJopcyFSeiAHdWpLEZp6mu_OKctUdIJj0HZfC0EWxg,308
82
- dmu_scripts/kerberos/convert_certificate,sha256=_4k4fmxpK-MbSLkkRYEPLQc9twfYBqOIiYZqL9yAXKE,445
83
- dmu_scripts/ml/compare_classifiers.py,sha256=XuHdcVyDLFGoKfvfv6YrgIavRpjpMrnBSqUnlliD7ew,2312
84
- dmu_scripts/physics/check_truth.py,sha256=b1P_Pa9ef6VcFtyY6Y9KS9Om9L-QrCBjDKp4dqca0PQ,3964
85
- dmu_scripts/rfile/compare_root_files.py,sha256=T8lDnQxsRNMr37x1Y7YvWD8ySHrJOWZki7ZQynxXX9Q,9540
86
- dmu_scripts/rfile/print_trees.py,sha256=Ze4Ccl_iUldl4eVEDVnYBoe4amqBT1fSBR1zN5WSztk,941
87
- dmu_scripts/ssh/coned.py,sha256=lhilYNHWRCGxC-jtyJ3LQ4oUgWW33B2l1tYCcyHHsR0,4858
88
- dmu_scripts/text/transform_text.py,sha256=9akj1LB0HAyopOvkLjNOJiptZw5XoOQLe17SlcrGMD0,1456
89
- data_manipulation_utilities-0.2.8.dev714.dist-info/METADATA,sha256=M5n-tPUt3o_0kY4viuQj6lbP4JQxWhpxkSnWCW29PFg,50263
90
- data_manipulation_utilities-0.2.8.dev714.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
91
- data_manipulation_utilities-0.2.8.dev714.dist-info/entry_points.txt,sha256=-02cr8ibY6L_reX-_Owz2N7OUQyTAwydRIvLr9kKZK0,332
92
- data_manipulation_utilities-0.2.8.dev714.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
93
- data_manipulation_utilities-0.2.8.dev714.dist-info/RECORD,,
79
+ data_manipulation_utilities-0.2.8.dev725.dist-info/METADATA,sha256=_4bxAW7aoKgPY3H1rKhp626lTEYrD2UWrwZX9avU750,51153
80
+ data_manipulation_utilities-0.2.8.dev725.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
81
+ data_manipulation_utilities-0.2.8.dev725.dist-info/entry_points.txt,sha256=M0C8_u9B_xSmyfemdPwdIBh9QuPIkjhEpG060Y5_Pjw,321
82
+ data_manipulation_utilities-0.2.8.dev725.dist-info/RECORD,,
@@ -1,5 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: poetry-core 2.1.3
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
-
@@ -0,0 +1,8 @@
1
+ [console_scripts]
2
+ check_truth=dmu_scripts.physics.check_truth:main
3
+ compare_classifiers=dmu_scripts.ml.compare_classifiers:main
4
+ compare_root_files=dmu_scripts.rfile.compare_root_files:main
5
+ coned=dmu_scripts.ssh.coned:main
6
+ print_trees=dmu_scripts.rfile.print_trees:main
7
+ transform_text=dmu_scripts.text.transform_text:main
8
+
dmu/plotting/plotter.py CHANGED
@@ -5,12 +5,12 @@ Module containing plotter class
5
5
  import os
6
6
  import json
7
7
  import math
8
- from typing import Union
9
8
 
10
9
  import numpy
11
10
  import matplotlib.pyplot as plt
12
11
 
13
- from ROOT import RDataFrame
12
+ from ROOT import RDataFrame, RDF
13
+ from omegaconf import DictConfig
14
14
  from dmu.logging.log_store import LogStore
15
15
 
16
16
  log = LogStore.add_logger('dmu:plotting:Plotter')
@@ -20,16 +20,28 @@ class Plotter:
20
20
  Base class of Plotter1D and Plotter2D
21
21
  '''
22
22
  #-------------------------------------
23
- def __init__(self, d_rdf=None, cfg=None):
24
- if not isinstance( cfg, dict):
23
+ def __init__(
24
+ self,
25
+ d_rdf: dict|None =None,
26
+ cfg : dict|DictConfig|None =None):
27
+ '''
28
+ Parameters
29
+ --------------
30
+ d_rdf: Dictionary where
31
+ key : Identifier of dataset
32
+ value: ROOT dataframe representing dataset
33
+
34
+ cfg : Dictionary or DictConfig instance holding configuration
35
+ '''
36
+ if not isinstance( cfg, (dict,DictConfig)):
25
37
  raise ValueError('Config dictionary not passed')
26
38
 
27
39
  if not isinstance(d_rdf, dict):
28
40
  raise ValueError('Dataframe dictionary not passed')
29
41
 
30
42
  self._d_cfg = cfg
31
- self._d_rdf : dict[str, RDataFrame] = { name : self._preprocess_rdf(rdf) for name, rdf in d_rdf.items()}
32
- self._d_wgt : Union[dict[str, Union[numpy.ndarray, None]], None]
43
+ self._d_rdf : dict[str, RDF.RNode] = { name : self._preprocess_rdf(rdf) for name, rdf in d_rdf.items()}
44
+ self._d_wgt : dict[str, numpy.ndarray|None] | None
33
45
 
34
46
  self._title : str = ''
35
47
  #-------------------------------------
@@ -68,9 +80,9 @@ class Plotter:
68
80
 
69
81
  return minx, maxx
70
82
  #-------------------------------------
71
- def _preprocess_rdf(self, rdf : RDataFrame) -> RDataFrame:
83
+ def _preprocess_rdf(self, rdf : RDF.RNode) -> RDF.RNode:
72
84
  '''
73
- rdf (RDataFrame): ROOT dataframe
85
+ rdf (RDF.RNode): ROOT dataframe
74
86
 
75
87
  returns preprocessed dataframe
76
88
  '''
@@ -146,7 +158,7 @@ class Plotter:
146
158
 
147
159
  return rdf
148
160
  # --------------------------------------------
149
- def _print_weights(self, arr_wgt : Union[numpy.ndarray, None], var : str, sample : str) -> None:
161
+ def _print_weights(self, arr_wgt : numpy.ndarray|None, var : str, sample : str) -> None:
150
162
  if arr_wgt is None:
151
163
  log.debug(f'Not using weights for {sample}:{var}')
152
164
  return
@@ -171,7 +183,7 @@ class Plotter:
171
183
 
172
184
  return fig_size
173
185
  #-------------------------------------
174
- def _get_weights(self, var) -> Union[dict[str, Union[numpy.ndarray, None]], None]:
186
+ def _get_weights(self, var) -> dict[str, numpy.ndarray|None]| None:
175
187
  d_cfg = self._d_cfg['plots'][var]
176
188
  if 'weights' not in d_cfg:
177
189
  return None
@@ -186,7 +198,7 @@ class Plotter:
186
198
 
187
199
  return d_weight
188
200
  # --------------------------------------------
189
- def _read_weights(self, name : str, rdf : RDataFrame) -> numpy.ndarray:
201
+ def _read_weights(self, name : str, rdf : RDF.RNode) -> numpy.ndarray:
190
202
  v_col = rdf.GetColumnNames()
191
203
  l_col = [ col.c_str() for col in v_col ]
192
204
 
@@ -1,89 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- # --------------------------
4
- display_help()
5
- {
6
- echo "Script meant to:"
7
- echo ""
8
- echo "1. Check if version in pyproject.toml has been modified"
9
- echo "2. If it has create new tag following version name"
10
- echo "3. Push to remote "
11
- }
12
- # --------------------------
13
- get_opts()
14
- {
15
- while getopts :hf: option; do
16
- case "${option}" in
17
- h)
18
- display_help
19
- exit 0
20
- ;;
21
- \?) echo "Invalid option: -${OPTARG}"
22
- display_help
23
- exit 1
24
- ;;
25
- :) echo "$0: Arguments needed"
26
- display_help
27
- exit 1
28
- ;;
29
- esac
30
- done
31
- }
32
- # --------------------------
33
- # Picks up version from pyproject.toml
34
- get_version()
35
- {
36
- if [[ ! -f pyproject.toml ]];then
37
- echo "Cannot find pyproject.toml"
38
- exit 1
39
- fi
40
-
41
- VERSION_LINE=$(grep version pyproject.toml)
42
-
43
- if [[ $? -ne 0 ]];then
44
- ehco "Could not extract version from pyproject.toml"
45
- exit 1
46
- fi
47
-
48
- if [[ "$VERSION_LINE" =~ .*([0-9]\.[0-9]\.[0-9]).* ]];then
49
- VERSION=${BASH_REMATCH[1]}
50
- echo "Using version: $VERSION"
51
- return
52
- fi
53
-
54
- echo "Could not extract version from: $VERSION_LINE"
55
- exit 1
56
- }
57
- # --------------------------
58
- create_tag()
59
- {
60
- git tag -n | grep $VERSION
61
-
62
- if [[ $? -eq 0 ]];then
63
- echo "Version found among tags, not tagging"
64
- return
65
- fi
66
-
67
- echo "Version $VERSION not found among tags, creating new tag"
68
-
69
- git tag -a $VERSION
70
- }
71
- # --------------------------
72
- push_all()
73
- {
74
- for REMOTE in $(git remote);do
75
- echo "Pushing tags and commits to remote: $REMOTE"
76
- git add pyproject.toml
77
- git commit -m "Publication commit"
78
-
79
- git pull $REMOTE HEAD
80
- git push -u $REMOTE HEAD
81
- git push $REMOTE --tags
82
- done
83
- }
84
- # --------------------------
85
- get_opts "$@"
86
-
87
- get_version
88
- create_tag
89
- push_all
@@ -1,7 +0,0 @@
1
- [console_scripts]
2
- check_truth = dmu_scripts.physics.check_truth:main
3
- compare_classifiers = dmu_scripts.ml.compare_classifiers:main
4
- compare_root_files = dmu_scripts.rfile.compare_root_files:main
5
- coned = dmu_scripts.ssh.coned:main
6
- print_trees = dmu_scripts.rfile.print_trees:main
7
- transform_text = dmu_scripts.text.transform_text:main
@@ -1,3 +0,0 @@
1
- dmu
2
- dmu_data
3
- dmu_scripts
dmu_scripts/git/publish DELETED
@@ -1,89 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- # --------------------------
4
- display_help()
5
- {
6
- echo "Script meant to:"
7
- echo ""
8
- echo "1. Check if version in pyproject.toml has been modified"
9
- echo "2. If it has create new tag following version name"
10
- echo "3. Push to remote "
11
- }
12
- # --------------------------
13
- get_opts()
14
- {
15
- while getopts :hf: option; do
16
- case "${option}" in
17
- h)
18
- display_help
19
- exit 0
20
- ;;
21
- \?) echo "Invalid option: -${OPTARG}"
22
- display_help
23
- exit 1
24
- ;;
25
- :) echo "$0: Arguments needed"
26
- display_help
27
- exit 1
28
- ;;
29
- esac
30
- done
31
- }
32
- # --------------------------
33
- # Picks up version from pyproject.toml
34
- get_version()
35
- {
36
- if [[ ! -f pyproject.toml ]];then
37
- echo "Cannot find pyproject.toml"
38
- exit 1
39
- fi
40
-
41
- VERSION_LINE=$(grep version pyproject.toml)
42
-
43
- if [[ $? -ne 0 ]];then
44
- ehco "Could not extract version from pyproject.toml"
45
- exit 1
46
- fi
47
-
48
- if [[ "$VERSION_LINE" =~ .*([0-9]\.[0-9]\.[0-9]).* ]];then
49
- VERSION=${BASH_REMATCH[1]}
50
- echo "Using version: $VERSION"
51
- return
52
- fi
53
-
54
- echo "Could not extract version from: $VERSION_LINE"
55
- exit 1
56
- }
57
- # --------------------------
58
- create_tag()
59
- {
60
- git tag -n | grep $VERSION
61
-
62
- if [[ $? -eq 0 ]];then
63
- echo "Version found among tags, not tagging"
64
- return
65
- fi
66
-
67
- echo "Version $VERSION not found among tags, creating new tag"
68
-
69
- git tag -a $VERSION
70
- }
71
- # --------------------------
72
- push_all()
73
- {
74
- for REMOTE in $(git remote);do
75
- echo "Pushing tags and commits to remote: $REMOTE"
76
- git add pyproject.toml
77
- git commit -m "Publication commit"
78
-
79
- git pull $REMOTE HEAD
80
- git push -u $REMOTE HEAD
81
- git push $REMOTE --tags
82
- done
83
- }
84
- # --------------------------
85
- get_opts "$@"
86
-
87
- get_version
88
- create_tag
89
- push_all
@@ -1,21 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- : '
4
- This script is meant to check the expiration date of a grid certificate
5
-
6
- Usage:
7
-
8
- ./check_expiration
9
- '
10
-
11
- check()
12
- {
13
- PEMFILE=$1
14
- if [[ ! -f $PEMFILE ]];then
15
- echo "Cannot find PEM file: $PEMFILE"
16
- fi
17
-
18
- openssl x509 -enddate -noout -in $PEMFILE
19
- }
20
-
21
- check usercert.pem
@@ -1,22 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- : '
4
- This script is used to convert p12 grid certificate files into PEM files
5
-
6
- Usage:
7
-
8
- ./convert_certificate cert.p12
9
- '
10
-
11
- CERTIFICATE=$1
12
-
13
- if [[ ! -f $CERTIFICATE ]];then
14
- echo "ERROR::Certificate \"$CERTIFICATE\" does not exist."
15
- kill -INT $$
16
- fi
17
-
18
- openssl pkcs12 -in $CERTIFICATE -clcerts -nokeys -out usercert.pem
19
- openssl pkcs12 -in $CERTIFICATE -nocerts -out userkey.pem
20
- chmod 400 userkey.pem
21
- chmod 444 usercert.pem
22
-
@@ -1,85 +0,0 @@
1
- '''
2
- Script used to compare performance of classifiers
3
- '''
4
- import os
5
- import argparse
6
- import yaml
7
- import mplhep
8
- import matplotlib.pyplot as plt
9
- import pandas as pnd
10
-
11
- from sklearn.metrics import auc
12
- from dmu.logging.log_store import LogStore
13
-
14
- log=LogStore.add_logger('dmu:ml:compare_classifiers')
15
- # ------------------------------
16
- class Data:
17
- '''
18
- Data class
19
- '''
20
- out_path : str
21
- cfg_path : str
22
- logl : int
23
- cfg : dict
24
-
25
- plt.style.use(mplhep.style.LHCb2)
26
- # ------------------------------
27
- def _initialize() -> None:
28
- log.info(f'Loading settings from: {Data.cfg_path}')
29
- with open(Data.cfg_path, encoding='utf-8') as ifile:
30
- Data.cfg = yaml.safe_load(ifile)
31
-
32
- Data.out_path = Data.cfg['out_dir']
33
- os.makedirs(Data.out_path, exist_ok=True)
34
- # ------------------------------
35
- def _parse_args():
36
- parser = argparse.ArgumentParser(description='Used to perform comparisons of classifier performances')
37
- parser.add_argument('-c', '--conf' , help='Path to configuration path', required=True)
38
- parser.add_argument('-l', '--logl' , help='Logging level', choices=[10, 20, 30], default=20)
39
- args = parser.parse_args()
40
-
41
- Data.cfg_path = args.conf
42
- Data.logl = args.logl
43
- # ------------------------------
44
- def _plot_roc(name : str, path : str) -> None:
45
- roc_path = f'{path}/fold_all/roc.json'
46
- df = pnd.read_json(roc_path)
47
-
48
- plt.figure(num='ROC')
49
- xval = df['x'].to_numpy()
50
- yval = df['y'].to_numpy()
51
- area = auc(xval, yval)
52
-
53
- plt.plot(xval, yval, label=f'{name}: {area:.3f}')
54
- # ------------------------------
55
- def _compare():
56
- for name, cls_path in Data.cfg['classifiers'].items():
57
- _plot_roc(name=name, path=cls_path)
58
-
59
- _save_roc()
60
- # ------------------------------
61
- def _save_roc():
62
- d_set = Data.cfg['roc']
63
- if 'xrange' in d_set:
64
- plt.xlim(d_set['xrange'])
65
-
66
- if 'yrange' in d_set:
67
- plt.ylim(d_set['yrange'])
68
-
69
- plt.figure(num='ROC')
70
- plt.legend()
71
- plt.grid()
72
- plt.xlabel('Signal Efficiency')
73
- plt.ylabel('Background Rejection')
74
- plt.savefig(f'{Data.out_path}/roc.png')
75
- # ------------------------------
76
- def main():
77
- '''
78
- Start here
79
- '''
80
- _parse_args()
81
- _initialize()
82
- _compare()
83
- # ------------------------------
84
- if __name__ == '__main__':
85
- main()
@@ -1,121 +0,0 @@
1
- '''
2
- Script meant to do truth matching checks
3
- '''
4
- import os
5
- import copy
6
- import argparse
7
-
8
- import yaml
9
- import mplhep
10
- import matplotlib.pyplot as plt
11
-
12
- from ROOT import RDataFrame
13
-
14
- from dmu.logging.log_store import LogStore
15
- from dmu.plotting.plotter_1d import Plotter1D as Plotter
16
-
17
- log=LogStore.add_logger('dmu:physics:check_truth')
18
- # ----------------------------------
19
- def _set_logs() -> None:
20
- LogStore.set_level('dmu:plotting:Plotter' , 30)
21
- LogStore.set_level('dmu:plotting:Plotter1D', 30)
22
- # ----------------------------------
23
- def _get_args() -> argparse.Namespace:
24
- '''
25
- Parse args
26
- '''
27
- parser = argparse.ArgumentParser(description='Script used to carry out checks on truth matching mechanisms for MC')
28
- parser.add_argument('-c', '--conf' , type=str, help='Path to config file', required=True)
29
- args = parser.parse_args()
30
-
31
- return args
32
- # ----------------------------------
33
- def _get_config(args : argparse.Namespace) -> dict:
34
- path = args.conf
35
- if not os.path.isfile(path):
36
- raise FileNotFoundError(f'Cannot find {path}')
37
-
38
- with open(path, encoding='utf-8') as ifile:
39
- cfg = yaml.safe_load(ifile)
40
-
41
- return cfg
42
- # ----------------------------------
43
- def _get_rdf(file_path : str, tree_path : str) -> RDataFrame:
44
- log.debug(f'Picking inputs from: {file_path}/{tree_path}')
45
- rdf = RDataFrame(tree_path, file_path)
46
-
47
- nentries = rdf.Count().GetValue()
48
- log.debug(f'Found {nentries} entries')
49
-
50
- return rdf
51
- # ----------------------------------
52
- def _preprocess_rdf(rdf : RDataFrame, cfg : dict) -> RDataFrame:
53
- if 'max_entries' in cfg:
54
- max_entries = cfg['max_entries']
55
- rdf = rdf.Range(max_entries)
56
-
57
- return rdf
58
- # ----------------------------------
59
- def _check(cfg : dict) -> None:
60
- log.info(110 * '-')
61
- log.info(f'{"Sample":<20}{"Method":<20}{"Initial":<15}{"":<15}{"Final":<15}{"":15}{"Efficiency":<10}')
62
- log.info(110 * '-')
63
-
64
- for sample_name in cfg['samples']:
65
- file_path = cfg['samples'][sample_name]['file_path']
66
- tree_path = cfg['samples'][sample_name]['tree_path']
67
- rdf = _get_rdf(file_path, tree_path)
68
- rdf = _preprocess_rdf(rdf, cfg)
69
-
70
- d_cut_true = {}
71
- d_cut_fake = {}
72
- for method, cut in cfg['samples'][sample_name]['methods'].items():
73
- _check_kind(rdf, sample_name, method, cut)
74
-
75
- d_cut_true[method] = cut
76
- d_cut_fake[method] = f'({cut}) == 0'
77
- log.info('')
78
-
79
- _plot_distributions(cfg, sample_name, rdf, d_cut_true, kind='matched')
80
- _plot_distributions(cfg, sample_name, rdf, d_cut_fake, kind='anti_matched')
81
- # ----------------------------------
82
- def _plot_distributions(cfg : dict, sample_name : str, rdf : RDataFrame, d_cut : dict[str,str], kind : str) -> None:
83
- cfg = copy.deepcopy(cfg)
84
- cfg_plt = cfg['samples'][sample_name]['plot']
85
- cfg_plt = _add_suffix(cfg_plt, sample_name, kind)
86
- d_rdf = { method : rdf.Filter(cut) for method, cut in d_cut.items() }
87
-
88
- ptr=Plotter(d_rdf=d_rdf, cfg=cfg_plt)
89
- ptr.run()
90
- # ----------------------------------
91
- def _add_suffix(cfg : dict, sample_name : str, kind : str) -> dict:
92
- d_var = cfg['plots']
93
- for var in d_var:
94
- d_var[var]['name'] = f'{var}_{kind}'
95
- d_var[var]['title'] = f'{sample_name}; {kind}'
96
-
97
- cfg['plots'] = d_var
98
-
99
- return cfg
100
- # ----------------------------------
101
- def _check_kind(rdf : RDataFrame, sample : str, name : str, cut : str) -> RDataFrame:
102
- nini = rdf.Count().GetValue()
103
- rdf = rdf.Filter(cut, name)
104
- nfnl = rdf.Count().GetValue()
105
- eff = nfnl / nini * 100
106
-
107
- log.info(f'{sample:<20}{name:<20}{nini:<15}{"":<15}{nfnl:<15}{"-->":15}{eff:10.2f}')
108
- # ----------------------------------
109
- def main():
110
- '''
111
- Script starts here
112
- '''
113
- _set_logs()
114
- args = _get_args()
115
- cfg = _get_config(args)
116
- plt.style.use(mplhep.style.LHCb2)
117
-
118
- _check(cfg)
119
- # ----------------------------------
120
- if __name__ == '__main__':
121
- main()
@@ -1,299 +0,0 @@
1
- '''
2
- Script used to compare ROOT files
3
- '''
4
-
5
- import re
6
- import os
7
- from dataclasses import dataclass
8
- from typing import ClassVar
9
-
10
- import argparse
11
-
12
- import yaml
13
- import numpy
14
- from dmu.logging.log_store import LogStore
15
-
16
- from ROOT import TFile, TTree, RDataFrame
17
-
18
- import dmu.rfile.utilities as rfut
19
-
20
-
21
- log=LogStore.add_logger('rx_scripts:compare_files')
22
- #------------------
23
- @dataclass
24
- class Data:
25
- '''
26
- Class used to store shared attributes
27
- '''
28
- max_entries : int
29
- max_trees : int
30
- l_exclude : list[str]
31
- raise_if_diff : bool
32
- file_name_1 : str
33
- file_name_2 : str
34
-
35
- d_summary : ClassVar[dict]= {}
36
- #------------------
37
- def _print_trees_difference(l_val_1 : list[str], l_val_2 : list[str]) -> None:
38
- s_val_1 = set(l_val_1)
39
- s_val_2 = set(l_val_2)
40
-
41
- s_only_1 = s_val_1 - s_val_2
42
- s_only_2 = s_val_2 - s_val_1
43
-
44
- Data.d_summary[f'Trees only in {Data.file_name_1}'] = list(s_only_1)
45
- Data.d_summary[f'Trees only in {Data.file_name_2}'] = list(s_only_2)
46
-
47
- nonly_1 = len(s_only_1)
48
- nonly_2 = len(s_only_2)
49
-
50
- if nonly_1 > 0:
51
- log.info(f'Found {nonly_1} trees in first file but not second')
52
- for name in s_only_1:
53
- log.info(f'{"":<4}{name:<20}')
54
-
55
- if nonly_2 > 0:
56
- log.info(f'Found {nonly_2} trees in second file but not first')
57
- for name in s_only_2:
58
- log.info(f'{"":<4}{name:<20}')
59
- #------------------
60
- def _check_trees(d_tree_1 : dict[str, TTree], d_tree_2 : dict[str, TTree]):
61
- '''
62
- Check if dictionaries have same trees
63
- For corresponding trees, check if number of entries is the same
64
- '''
65
- l_treename_1 = list(d_tree_1.keys())
66
- l_treename_2 = list(d_tree_2.keys())
67
-
68
- if l_treename_1 != l_treename_2:
69
- log.warning('Files contain different trees')
70
- _print_trees_difference(l_treename_1, l_treename_2)
71
-
72
- s_treename_1 = set(l_treename_1)
73
- s_treename_2 = set(l_treename_2)
74
- s_treename = s_treename_1 & s_treename_2
75
-
76
- for treename in s_treename:
77
- if treename in Data.l_exclude:
78
- continue
79
-
80
- tree_1 = d_tree_1[treename]
81
- tree_2 = d_tree_2[treename]
82
-
83
- entries_1 = tree_1.GetEntries()
84
- entries_2 = tree_2.GetEntries()
85
-
86
- if entries_1 != entries_2:
87
- raise ValueError(f'Tree {treename} differs in entries {entries_1}/{entries_2}')
88
-
89
- return list(s_treename)
90
- #------------------
91
- def _get_data(tree : TTree) -> dict[str, numpy.ndarray]:
92
- rdf = RDataFrame(tree)
93
- if Data.max_entries > 0:
94
- log.warning(f'Limiting to {Data.max_entries} entries')
95
- rdf = rdf.Range(Data.max_entries)
96
-
97
- d_data = rdf.AsNumpy(exclude=[])
98
-
99
- return d_data
100
- #------------------
101
- def _check_branches(tree_name : str, l_branch_1 : list[str], l_branch_2 : list[str]) -> None:
102
- '''
103
- Takes lists of branch names
104
- Checks if they are the same, if not print differences
105
-
106
- if raise_if_diff is True, will raise exception if branches are not the same
107
- '''
108
- if l_branch_1 == l_branch_2:
109
- return
110
-
111
- s_branch_1 = set(l_branch_1)
112
- s_branch_2 = set(l_branch_2)
113
-
114
- s_branch_1_m_2 = s_branch_1.difference(s_branch_2)
115
- log.info(f'Found len({s_branch_1_m_2}) branches in first tree but not second')
116
- for branch_name in s_branch_1_m_2:
117
- log.debug(f'{"":<4}{branch_name:<20}')
118
-
119
- s_branch_2_m_1 = s_branch_2.difference(s_branch_1)
120
- log.info(f'Found len({s_branch_2_m_1}) branches in second tree but not first')
121
- for branch_name in s_branch_2_m_1:
122
- log.debug(f'{"":<4}{branch_name:<20}')
123
-
124
- Data.d_summary[tree_name] = {
125
- f'Only {Data.file_name_1}' : list(s_branch_1_m_2),
126
- f'Only {Data.file_name_2}' : list(s_branch_2_m_1),
127
- }
128
-
129
- if Data.raise_if_diff:
130
- raise ValueError('Branches differ')
131
- #------------------
132
- def _compare_branches(tree_name : str, d_data_1 : dict[str, list], d_data_2 : dict[str, list]) -> list[str]:
133
- '''
134
- Will check for different branches in trees
135
- Will return list of branch names for common branches
136
- '''
137
- l_branch_1 = list(d_data_1.keys())
138
- l_branch_2 = list(d_data_2.keys())
139
-
140
- l_branch_1.sort()
141
- l_branch_2.sort()
142
- _check_branches(tree_name, l_branch_1, l_branch_2)
143
-
144
- s_branch_1 = set(l_branch_1)
145
- s_branch_2 = set(l_branch_2)
146
-
147
- s_branch = s_branch_1.intersection(s_branch_2)
148
-
149
- return list(s_branch)
150
- #------------------
151
- def _compare(tree_name : str, d_data_1, d_data_2) -> None:
152
- log.info('')
153
- log.debug('Comparing branches')
154
- l_branch_name = _compare_branches(tree_name, d_data_1, d_data_2)
155
-
156
- log.debug('Comparing contents of branches')
157
- l_diff_branch = []
158
- for branch_name in l_branch_name:
159
- arr_val_1 = d_data_1[branch_name]
160
- arr_val_2 = d_data_2[branch_name]
161
-
162
- if _contents_differ(tree_name, branch_name, arr_val_1, arr_val_2):
163
- l_diff_branch.append(branch_name)
164
-
165
- ndiff = len(l_diff_branch)
166
- ntot = len(l_branch_name)
167
-
168
- Data.d_summary[f'Branches that differ for tree: {tree_name}'] = l_diff_branch
169
-
170
- if ndiff == 0:
171
- log.debug(f'Trees {tree_name} have same contents')
172
- return
173
-
174
- log.warning(f'{ndiff:<10}{"differing branches out of":<20}{ntot:<10}{"in":<10}{tree_name:<50}')
175
- for branch_name in l_diff_branch:
176
- log.debug(f'{"":<4}{branch_name:<20}')
177
- #------------------
178
- def _contents_differ(tree_name : str, branch_name : str, arr_val_1 : numpy.ndarray, arr_val_2 : numpy.ndarray) -> bool:
179
- is_different = False
180
- str_type = str(arr_val_1.dtype)
181
- if str_type == 'object':
182
- return is_different
183
-
184
- if str_type not in ['bool', 'int32', 'uint32', 'uint64', 'float64', 'float32']:
185
- log.info(f'Skipping {branch_name}, {str_type}')
186
- return is_different
187
-
188
- if not numpy.array_equal(arr_val_1, arr_val_2):
189
- is_different = True
190
-
191
- log.debug(20 * '-')
192
- log.debug(f'Branch {branch_name} in tree {tree_name} differ')
193
- log.debug(20 * '-')
194
- log.debug(arr_val_1)
195
- log.debug(arr_val_2)
196
- log.debug(20 * '-')
197
-
198
- return is_different
199
- #------------------
200
- def _update_keys(d_tree):
201
- d_out = {}
202
-
203
- for key, val in d_tree.items():
204
- #Remove everything before .root/ and use it as new key
205
- new_key = re.sub(r'^.*\.root/', '', key)
206
- d_out[new_key] = val
207
-
208
- return d_out
209
- #------------------
210
- def _check_file_existence(path : str) -> None:
211
- if not os.path.isfile(path):
212
- raise FileNotFoundError(f'Cannot find {path}')
213
- #------------------
214
- def _validate(file_1 : str, file_2 : str) -> None:
215
- _check_file_existence(file_1)
216
- _check_file_existence(file_2)
217
-
218
- ifile_1 = TFile(file_1)
219
- ifile_2 = TFile(file_2)
220
-
221
- d_tree_1 = rfut.get_trees_from_file(ifile_1)
222
- d_tree_1 = _update_keys(d_tree_1)
223
-
224
- d_tree_2 = rfut.get_trees_from_file(ifile_2)
225
- d_tree_2 = _update_keys(d_tree_2)
226
-
227
- l_tree_name = _check_trees(d_tree_1, d_tree_2)
228
-
229
- if Data.max_trees > -1:
230
- log.warning(f'Limiting to {Data.max_trees} trees')
231
- l_tree_name = l_tree_name[:Data.max_trees]
232
-
233
- ncommon = len(l_tree_name)
234
- log.debug(f'Found common {ncommon} trees')
235
- for name in l_tree_name:
236
- log.debug(f'{"":<4}{name}')
237
-
238
- log.info('Checking trees')
239
- for treename in l_tree_name:
240
- if treename in Data.l_exclude:
241
- log.debug(f'Skipping {treename}')
242
- continue
243
-
244
- log.debug(f'{"":<4}{treename}')
245
-
246
- tree_1 = d_tree_1[treename]
247
- tree_2 = d_tree_2[treename]
248
-
249
- log.debug('Getting data from reference')
250
- d_data_1= _get_data(tree_1)
251
-
252
- log.debug('Getting data from new')
253
- d_data_2= _get_data(tree_2)
254
-
255
- log.debug(f'Comparing {treename}')
256
- _compare(treename, d_data_1, d_data_2)
257
-
258
- ifile_1.Close()
259
- ifile_2.Close()
260
- #------------------
261
- def _save_summary() -> None:
262
- '''
263
- Saves Data.d_summary to summary.yaml
264
- '''
265
-
266
- with open('summary.yaml', 'w', encoding='utf-8') as ofile:
267
- yaml.dump(Data.d_summary, ofile, indent=2, default_flow_style=False)
268
- #------------------
269
- def main():
270
- '''
271
- Script starts here
272
- '''
273
- parser = argparse.ArgumentParser(description='Used to validate versions of code that produce potentially different files')
274
- parser.add_argument('-f', '--files' , nargs= 2, help='List of files to compare')
275
- parser.add_argument('-n', '--max_entries' , type=int , help='Limit running over this number of entries. By default will run over everything', default=-1)
276
- parser.add_argument('-t', '--max_trees' , type=int , help='Limit running over this number of trees. By default will run over everything' , default=-1)
277
- parser.add_argument('-l', '--log_level' , type=int , help='Logging level' , default=20, choices=[10, 20, 30, 40])
278
- parser.add_argument('-e', '--exclude' , nargs='+', help='List of trees that should not be compared' , default=[], )
279
- parser.add_argument('-r', '--raise_if_diff' , help='If used, will fail as soon as it finds trees with different branches.', action='store_true')
280
-
281
- args = parser.parse_args()
282
-
283
- LogStore.set_level('rx_scripts:compare_files', args.log_level)
284
-
285
- Data.max_entries = args.max_entries
286
- Data.max_trees = args.max_trees
287
- Data.l_exclude = args.exclude
288
- Data.raise_if_diff = args.raise_if_diff
289
-
290
- [file_1, file_2] = args.files
291
-
292
- Data.file_name_1 = file_1
293
- Data.file_name_2 = file_2
294
-
295
- _validate(file_1, file_2)
296
- _save_summary()
297
- #------------------
298
- if __name__ == '__main__':
299
- main()
@@ -1,35 +0,0 @@
1
- '''
2
- Script used to print contents of root files
3
- '''
4
-
5
- import argparse
6
-
7
- from dmu.rfile.rfprinter import RFPrinter
8
-
9
- # -----------------------------
10
- class Data:
11
- '''
12
- Data class holding shared attributes
13
- '''
14
- path : str
15
- screen : bool
16
- # -----------------------------
17
- def _get_args():
18
- parser = argparse.ArgumentParser(description='Script used to print information about ROOT files and dump it to text')
19
- parser.add_argument('-p', '--path' , type=str, help='Path to ROOT file')
20
- parser.add_argument('-s', '--screen', help='If used, will dump output to screen', action='store_true')
21
- args = parser.parse_args()
22
-
23
- Data.path = args.path
24
- Data.screen= args.screen
25
- # -----------------------------
26
- def main():
27
- '''
28
- Execution starts here
29
- '''
30
- _get_args()
31
- prt = RFPrinter(path = Data.path)
32
- prt.save(to_screen = Data.screen)
33
- # -----------------------------
34
- if __name__ == '__main__':
35
- main()
dmu_scripts/ssh/coned.py DELETED
@@ -1,168 +0,0 @@
1
- '''
2
- Script used to implement connection to servers
3
- '''
4
-
5
- import os
6
- import copy
7
- import argparse
8
-
9
- import yaml
10
- from dmu.logging.log_store import LogStore
11
-
12
- log = LogStore.add_logger('dmu:scripts:coned')
13
- #---------------------------------------
14
- class Data:
15
- '''
16
- Class used to store shared data
17
- '''
18
- logl : int
19
- dry : bool
20
- prnt : bool
21
- cfg : dict
22
- l_ad : list[str]
23
- l_rm : list[str]
24
- #----------------------------
25
- def _print_configs():
26
- '''
27
- Prints configuration
28
- '''
29
-
30
- yaml_output = yaml.dump(Data.cfg, default_flow_style=False)
31
- print(yaml_output)
32
- #----------------------------
33
- def _initialize():
34
- _load_config()
35
-
36
- LogStore.set_level('dmu:scripts:coned', Data.logl)
37
-
38
- log.debug(f'Running at {Data.logl} logging level')
39
- #----------------------------
40
- def _get_args():
41
- '''
42
- Will parse arguments
43
- '''
44
- parser = argparse.ArgumentParser(description='Used to edit and print server list specified by ~/.config/connect/servers.yaml')
45
- parser.add_argument('-p', '--print' , help ='Prints config settings and exits', action='store_true')
46
- parser.add_argument('-l', '--log_lvl', type =int, help='Logging level', default=20, choices=[10,20,30])
47
- parser.add_argument('-a', '--add' , nargs=3 , help='Adds task to given server, e.g. task 123 server' , default=[])
48
- parser.add_argument('-r', '--rem' , nargs=3 , help='Removes task from given server, e.g. task 123 server', default=[])
49
- parser.add_argument('-d', '--dry' , help='Run dry run, for adding and removing entries', action='store_true')
50
- args = parser.parse_args()
51
-
52
- Data.prnt = args.print
53
- Data.logl = args.log_lvl
54
- Data.l_ad = args.add
55
- Data.l_rm = args.rem
56
- Data.dry = args.dry
57
- #---------------------------------------
58
- def _load_config():
59
- home_dir = os.environ['HOME']
60
- config_path = f'{home_dir}/.config/dmu/ssh/servers.yaml'
61
- if not os.path.isfile(config_path):
62
- raise FileNotFoundError(f'Config not found: {config_path}')
63
-
64
- with open(config_path, encoding='utf-8') as ifile:
65
- Data.cfg = yaml.safe_load(ifile)
66
- #---------------------------------------
67
- def _dump_config(cfg : dict):
68
- if cfg == Data.cfg:
69
- log.debug('Config was not modified, will not save it')
70
- return
71
-
72
- home_dir = os.environ['HOME']
73
- config_path = f'{home_dir}/.config/dmu/ssh/servers.yaml'
74
- if not os.path.isfile(config_path):
75
- raise FileNotFoundError(f'Config not found: {config_path}')
76
-
77
- if Data.dry:
78
- content = yaml.dump(cfg, default_flow_style=False)
79
- print(content)
80
- return
81
-
82
- with open(config_path, 'w', encoding='utf-8') as ofile:
83
- yaml.dump(cfg, ofile, default_flow_style=False)
84
- #---------------------------------------
85
- def _get_updated_config() -> dict:
86
- log.debug('Getting updated config')
87
-
88
- cfg = copy.deepcopy(Data.cfg)
89
- cfg = _add_task(cfg)
90
- cfg = _remove_task(cfg)
91
-
92
- return cfg
93
- #---------------------------------------
94
- def _add_task(cfg : dict) -> dict:
95
- if len(Data.l_ad) == 0:
96
- log.debug('No task added')
97
- return cfg
98
-
99
- [task, machine, server] = Data.l_ad
100
- if server not in cfg:
101
- cfg[server] = {}
102
-
103
- if machine not in cfg[server]:
104
- cfg[server][machine] = []
105
-
106
- cfg[server][machine].append(task)
107
-
108
- log.info(f'{"Added":<10}{server:<20}{machine:<10}{task:<20}')
109
-
110
- return cfg
111
- #---------------------------------------
112
- def _remove_task(cfg : dict) -> dict:
113
- if len(Data.l_rm) == 0:
114
- log.debug('No task removed')
115
- return cfg
116
-
117
- [task, machine, server] = Data.l_rm
118
- if server not in cfg:
119
- log.warning(f'Server {server} not found')
120
- return cfg
121
-
122
- if machine not in cfg[server]:
123
- log.warning(f'Machine {machine} not found in server {server}')
124
- return cfg
125
-
126
- l_task = cfg[server][machine]
127
- if task not in l_task:
128
- log.warning(f'Task {task} not found in {server}:{machine}')
129
- return cfg
130
-
131
- index = l_task.index(task)
132
- del l_task[index]
133
- cfg[server][machine] = l_task
134
-
135
- log.info(f'{"Removed":<10}{server:<20}{machine:<10}{task:<20}')
136
-
137
- cfg = _trim_config(cfg, machine, server)
138
-
139
- return cfg
140
- #---------------------------------------
141
- def _trim_config(cfg : dict, machine : str, server : str) -> dict:
142
- if cfg[server][machine] == []:
143
- log.debug(f'Trimming {server}:{machine}')
144
- del cfg[server][machine]
145
-
146
- if cfg[server] == {}:
147
- log.debug(f'Trimming {server}')
148
- del cfg[server]
149
-
150
- return cfg
151
- #---------------------------------------
152
- def main():
153
- '''
154
- Starts here
155
- '''
156
- _get_args()
157
- _initialize()
158
-
159
- if Data.prnt:
160
- log.debug('Printing and returning')
161
- _print_configs()
162
- return
163
-
164
- cfg = _get_updated_config()
165
- _dump_config(cfg)
166
- #---------------------------------------
167
- if __name__ == '__main__':
168
- main()
@@ -1,46 +0,0 @@
1
- #!/usr/bin/env python3
2
-
3
- from dmu.text.transformer import transformer as txt_trf
4
-
5
- import argparse
6
- import logging
7
-
8
- log = logging.getLogger('dmu_scripts:text:transformer')
9
- #---------------------------------
10
- class data:
11
- txt = None
12
- out = None
13
- cfg = None
14
- lvl = None
15
- #---------------------------------
16
- def get_args():
17
- parser=argparse.ArgumentParser(description='Will transform a text file following a set of rules')
18
- parser.add_argument('-i', '--input' , type=str, help='Path to input file' , required=True)
19
- parser.add_argument('-o', '--output', type=str, help='Path to output file, if not passed, it will be same as input, but with trf before extension')
20
- parser.add_argument('-c', '--config', type=str, help='Path to config file', required=True)
21
- parser.add_argument('-l', '--loglvl', type=int, help='Log level' , default=20, choices=[10, 20, 30, 40])
22
- args = parser.parse_args()
23
-
24
- data.txt = args.input
25
- data.out = args.output
26
- data.cfg = args.config
27
- data.lvl = args.loglvl
28
- #---------------------------------
29
- def set_logs():
30
- logging.basicConfig()
31
-
32
- log_tr = logging.getLogger('dmu:text:transformer')
33
-
34
- log_tr.setLevel(data.lvl)
35
- log.setLevel(data.lvl)
36
- #---------------------------------
37
- def main():
38
- get_args()
39
- set_logs()
40
-
41
- trf = txt_trf(txt_path=data.txt, cfg_path=data.cfg)
42
- trf.save_as(data.out)
43
- #---------------------------------
44
- if __name__ == '__main__':
45
- main()
46
-