data-manipulation-utilities 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_manipulation_utilities-0.2.2.dist-info → data_manipulation_utilities-0.2.4.dist-info}/METADATA +53 -1
- {data_manipulation_utilities-0.2.2.dist-info → data_manipulation_utilities-0.2.4.dist-info}/RECORD +8 -7
- dmu/generic/version_management.py +132 -0
- dmu/rdataframe/utilities.py +27 -1
- {data_manipulation_utilities-0.2.2.data → data_manipulation_utilities-0.2.4.data}/scripts/publish +0 -0
- {data_manipulation_utilities-0.2.2.dist-info → data_manipulation_utilities-0.2.4.dist-info}/WHEEL +0 -0
- {data_manipulation_utilities-0.2.2.dist-info → data_manipulation_utilities-0.2.4.dist-info}/entry_points.txt +0 -0
- {data_manipulation_utilities-0.2.2.dist-info → data_manipulation_utilities-0.2.4.dist-info}/top_level.txt +0 -0
{data_manipulation_utilities-0.2.2.dist-info → data_manipulation_utilities-0.2.4.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: data_manipulation_utilities
|
3
|
-
Version: 0.2.2
|
3
|
+
Version: 0.2.4
|
4
4
|
Description-Content-Type: text/markdown
|
5
5
|
Requires-Dist: logzero
|
6
6
|
Requires-Dist: PyYAML
|
@@ -578,6 +578,24 @@ These are utility functions meant to be used with ROOT dataframes.
|
|
578
578
|
|
579
579
|
## Adding a column from a numpy array
|
580
580
|
|
581
|
+
### With numba
|
582
|
+
|
583
|
+
For this do:
|
584
|
+
|
585
|
+
```python
|
586
|
+
import dmu.rdataframe.utilities as ut
|
587
|
+
|
588
|
+
arr_val = numpy.array([10, 20, 30])
|
589
|
+
rdf = ut.add_column_with_numba(rdf, arr_val, 'values', identifier='some_name')
|
590
|
+
```
|
591
|
+
|
592
|
+
where the identifier must be unique every time the function is called.
|
593
|
+
This is required because the column is added internally by declaring a numba function whose name
|
594
|
+
cannot be repeated, as mentioned
|
595
|
+
[here](https://root-forum.cern.ch/t/ways-to-work-around-the-redefinition-of-compiled-functions-in-one-single-notebook-session/41442/1)
|
596
|
+
|
597
|
+
### With awkward
|
598
|
+
|
581
599
|
For this do:
|
582
600
|
|
583
601
|
```python
|
@@ -838,6 +856,40 @@ Trees only in file_2.root:
|
|
838
856
|
- Hlt2RD_BsToPhiMuMu_MVA/DecayTree
|
839
857
|
```
|
840
858
|
|
859
|
+
# File system
|
860
|
+
|
861
|
+
## Versions
|
862
|
+
|
863
|
+
The utilities below allow the user to deal with versioned files and directories
|
864
|
+
|
865
|
+
```python
|
866
|
+
from dmu.generic.version_management import get_last_version
|
867
|
+
from dmu.generic.version_management import get_next_version
|
868
|
+
from dmu.generic.version_management import get_latest_file
|
869
|
+
|
870
|
+
# get_next_version will take a version and provide the next one, e.g.
|
871
|
+
get_next_version('v1') # -> 'v2'
|
872
|
+
get_next_version('v1.1') # -> 'v2.1'
|
873
|
+
get_next_version('v10.1') # -> 'v11.1'
|
874
|
+
|
875
|
+
get_next_version('/a/b/c/v1') # -> '/a/b/c/v2'
|
876
|
+
get_next_version('/a/b/c/v1.1') # -> '/a/b/c/v2.1'
|
877
|
+
get_next_version('/a/b/c/v10.1') # -> '/a/b/c/v11.1'
|
878
|
+
|
879
|
+
# `get_latest_file` will return the path to the file with the highest version
|
880
|
+
# in the `dir_path` directory that matches a wildcard, e.g.:
|
881
|
+
|
882
|
+
last_file = get_latest_file(dir_path = file_dir, wc='name_*.txt')
|
883
|
+
|
884
|
+
# `get_last_version` will return the string with the latest version
|
885
|
+
# of directories in `dir_path`, e.g.:
|
886
|
+
|
887
|
+
oversion=get_last_version(dir_path=dir_path, version_only=True) # This will return only the version, e.g. v3.2
|
888
|
+
oversion=get_last_version(dir_path=dir_path, version_only=False) # This will return full path, e.g. /a/b/c/v3.2
|
889
|
+
```
|
890
|
+
|
891
|
+
The function above should work for numeric (e.g. `v1.2`) and non-numeric (e.g. `va`, `vb`) versions.
|
892
|
+
|
841
893
|
# Text manipulation
|
842
894
|
|
843
895
|
## Transformations
|
{data_manipulation_utilities-0.2.2.dist-info → data_manipulation_utilities-0.2.4.dist-info}/RECORD
RENAMED
@@ -1,6 +1,7 @@
|
|
1
|
-
data_manipulation_utilities-0.2.
|
1
|
+
data_manipulation_utilities-0.2.4.data/scripts/publish,sha256=-3K_Y2_4CfWCV50rPB8CRuhjxDu7xMGswinRwPovgLs,1976
|
2
2
|
dmu/arrays/utilities.py,sha256=PKoYyybPptA2aU-V3KLnJXBudWxTXu4x1uGdIMQ49HY,1722
|
3
3
|
dmu/generic/utilities.py,sha256=0Xnq9t35wuebAqKxbyAiMk1ISB7IcXK4cFH25MT1fgw,1741
|
4
|
+
dmu/generic/version_management.py,sha256=G_HjGY-hu8lotZuTdVAg0B8yD0AltE866q2vJxvTg1g,3749
|
4
5
|
dmu/logging/log_store.py,sha256=umdvjNDuV3LdezbG26b0AiyTglbvkxST19CQu9QATbA,4184
|
5
6
|
dmu/ml/cv_classifier.py,sha256=8Jwx6xMhJaRLktlRdq0tFl32v6t8i63KmpxrlnXlomU,3759
|
6
7
|
dmu/ml/cv_predict.py,sha256=4G7F_1yOvnLftsDC6zUpdvkxuHXGkPemhj0RsYySYDM,6708
|
@@ -13,7 +14,7 @@ dmu/plotting/plotter_1d.py,sha256=g6H2xAgsL9a6vRkpbqHICb3qwV_qMiQPZxxw_oOSf9M,51
|
|
13
14
|
dmu/plotting/plotter_2d.py,sha256=J-gKnagoHGfJFU7HBrhDFpGYH5Rxy0_zF5l8eE_7ZHE,2944
|
14
15
|
dmu/plotting/utilities.py,sha256=SI9dvtZq2gr-PXVz71KE4o0i09rZOKgqJKD1jzf6KXk,1167
|
15
16
|
dmu/rdataframe/atr_mgr.py,sha256=FdhaQWVpsm4OOe1IRbm7rfrq8VenTNdORyI-lZ2Bs1M,2386
|
16
|
-
dmu/rdataframe/utilities.py,sha256=
|
17
|
+
dmu/rdataframe/utilities.py,sha256=pNcQARMP7txMhy6k27UnDcYf0buNy5U2fshaJDl_h8o,3661
|
17
18
|
dmu/rfile/rfprinter.py,sha256=mp5jd-oCJAnuokbdmGyL9i6tK2lY72jEfROuBIZ_ums,3941
|
18
19
|
dmu/rfile/utilities.py,sha256=XuYY7HuSBj46iSu3c60UYBHtI6KIPoJU_oofuhb-be0,945
|
19
20
|
dmu/stats/fitter.py,sha256=vHNZ16U3apoQyeyM8evq-if49doF48sKB3q9wmA96Fw,18387
|
@@ -47,8 +48,8 @@ dmu_scripts/rfile/compare_root_files.py,sha256=T8lDnQxsRNMr37x1Y7YvWD8ySHrJOWZki
|
|
47
48
|
dmu_scripts/rfile/print_trees.py,sha256=Ze4Ccl_iUldl4eVEDVnYBoe4amqBT1fSBR1zN5WSztk,941
|
48
49
|
dmu_scripts/ssh/coned.py,sha256=lhilYNHWRCGxC-jtyJ3LQ4oUgWW33B2l1tYCcyHHsR0,4858
|
49
50
|
dmu_scripts/text/transform_text.py,sha256=9akj1LB0HAyopOvkLjNOJiptZw5XoOQLe17SlcrGMD0,1456
|
50
|
-
data_manipulation_utilities-0.2.
|
51
|
-
data_manipulation_utilities-0.2.
|
52
|
-
data_manipulation_utilities-0.2.
|
53
|
-
data_manipulation_utilities-0.2.
|
54
|
-
data_manipulation_utilities-0.2.
|
51
|
+
data_manipulation_utilities-0.2.4.dist-info/METADATA,sha256=Gc-ZuL88YHEK3pOK1IfQmaN6rKCcVVqrFS2VlT70jyk,29229
|
52
|
+
data_manipulation_utilities-0.2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
53
|
+
data_manipulation_utilities-0.2.4.dist-info/entry_points.txt,sha256=1TIZDed651KuOH-DgaN5AoBdirKmrKE_oM1b6b7zTUU,270
|
54
|
+
data_manipulation_utilities-0.2.4.dist-info/top_level.txt,sha256=n_x5J6uWtSqy9mRImKtdA2V2NJNyU8Kn3u8DTOKJix0,25
|
55
|
+
data_manipulation_utilities-0.2.4.dist-info/RECORD,,
|
@@ -0,0 +1,132 @@
|
|
1
|
+
'''
|
2
|
+
Module containing functions used to find the latest version, the next version, etc. of a path.
|
3
|
+
'''
|
4
|
+
|
5
|
+
import glob
|
6
|
+
import os
|
7
|
+
import re
|
8
|
+
|
9
|
+
from dmu.logging.log_store import LogStore
|
10
|
+
|
11
|
+
log=LogStore.add_logger('dmu:version_management')
|
12
|
+
#---------------------------------------
|
13
|
+
def _get_numeric_version(version : str) -> int:
|
14
|
+
'''
|
15
|
+
Takes string with numbers at the end (padded or not)
|
16
|
+
Returns integer version of numbers
|
17
|
+
'''
|
18
|
+
#Skip these directories
|
19
|
+
if version in ['__pycache__']:
|
20
|
+
return -1
|
21
|
+
|
22
|
+
regex=r'[a-z]+(\d+)'
|
23
|
+
mtch =re.match(regex, version)
|
24
|
+
if not mtch:
|
25
|
+
log.debug(f'Cannot extract numeric version from: {version}')
|
26
|
+
return -1
|
27
|
+
|
28
|
+
str_val = mtch.group(1)
|
29
|
+
val = int(str_val)
|
30
|
+
|
31
|
+
return val
|
32
|
+
#---------------------------------------
|
33
|
+
def get_last_version(dir_path : str, version_only : bool = True, main_only : bool = False):
|
34
|
+
'''Returns path or just version associated to latest version found in given path
|
35
|
+
|
36
|
+
Parameters
|
37
|
+
---------------------
|
38
|
+
dir_path (str) : Path to directory where versioned subdirectories exist
|
39
|
+
version_only (bool): Returns only vxxxx if True, otherwise, full path to directory
|
40
|
+
main_only (bool): Returns vX where X is a number. Otherwise it will return vx.y in case version has subversion
|
41
|
+
'''
|
42
|
+
l_obj = glob.glob(f'{dir_path}/*')
|
43
|
+
|
44
|
+
if len(l_obj) == 0:
|
45
|
+
log.error(f'Nothing found in {dir_path}')
|
46
|
+
raise ValueError
|
47
|
+
|
48
|
+
d_dir_org = { os.path.basename(obj).replace('.', '') : obj for obj in l_obj if os.path.isdir(obj) }
|
49
|
+
d_dir_num = { _get_numeric_version(name) : dir_path for name, dir_path in d_dir_org.items() }
|
50
|
+
|
51
|
+
c_dir = sorted(d_dir_num.items())
|
52
|
+
|
53
|
+
try:
|
54
|
+
_, path = c_dir[-1]
|
55
|
+
except:
|
56
|
+
log.error(f'Cannot find path in: {dir_path}')
|
57
|
+
raise
|
58
|
+
|
59
|
+
name = os.path.basename(path)
|
60
|
+
dirn = os.path.dirname(path)
|
61
|
+
|
62
|
+
if main_only and '.' in name:
|
63
|
+
ind = name.index('.')
|
64
|
+
name= name[:ind]
|
65
|
+
|
66
|
+
if version_only:
|
67
|
+
return name
|
68
|
+
|
69
|
+
return f'{dirn}/{name}'
|
70
|
+
#---------------------------------------
|
71
|
+
def get_latest_file(dir_path : str, wc : str) -> str:
|
72
|
+
'''Will find latest file in a given directory
|
73
|
+
|
74
|
+
Parameters
|
75
|
+
--------------------
|
76
|
+
dir_path (str): Directory where files are found
|
77
|
+
wc (str): Wildcard associated to files, e.g. file_*.txt
|
78
|
+
|
79
|
+
Returns
|
80
|
+
--------------------
|
81
|
+
Path to latest file, according to version
|
82
|
+
'''
|
83
|
+
l_path = glob.glob(f'{dir_path}/{wc}')
|
84
|
+
if len(l_path) == 0:
|
85
|
+
log.error(f'Cannot find files in: {dir_path}/{wc}')
|
86
|
+
raise ValueError
|
87
|
+
|
88
|
+
l_path.sort()
|
89
|
+
|
90
|
+
return l_path[-1]
|
91
|
+
#---------------------------------------
|
92
|
+
def get_next_version(version : str) -> str:
|
93
|
+
'''Pick up string symbolizing version and return next version
|
94
|
+
Parameters
|
95
|
+
-------------------------
|
96
|
+
version (str) : Of the form vx.y or vx where x and y are integers. It can also be a full path
|
97
|
+
|
98
|
+
Returns
|
99
|
+
-------------------------
|
100
|
+
String equal to the argument, but with the main version augmented by 1, e.g. vx+1.y
|
101
|
+
|
102
|
+
Examples:
|
103
|
+
-------------------------
|
104
|
+
|
105
|
+
get_next_version('v1.1') = 'v2.1'
|
106
|
+
get_next_version('v1' ) = 'v2'
|
107
|
+
'''
|
108
|
+
if '/' in version:
|
109
|
+
path = version
|
110
|
+
dirname = os.path.dirname(path)
|
111
|
+
version = os.path.basename(path)
|
112
|
+
else:
|
113
|
+
dirname = None
|
114
|
+
|
115
|
+
rgx = r'v(\d+)(\.\d+)?'
|
116
|
+
|
117
|
+
mtch = re.match(rgx, version)
|
118
|
+
if not mtch:
|
119
|
+
log.error(f'Cannot match {version} with {rgx}')
|
120
|
+
raise ValueError
|
121
|
+
|
122
|
+
ver_org = mtch.group(1)
|
123
|
+
ver_nxt = int(ver_org) + 1
|
124
|
+
ver_nxt = str(ver_nxt)
|
125
|
+
|
126
|
+
version = version.replace(f'v{ver_org}', f'v{ver_nxt}')
|
127
|
+
|
128
|
+
if dirname is not None:
|
129
|
+
version = f'{dirname}/{version}'
|
130
|
+
|
131
|
+
return version
|
132
|
+
#---------------------------------------
|
dmu/rdataframe/utilities.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
'''
|
2
2
|
Module containing utility functions to be used with ROOT dataframes
|
3
3
|
'''
|
4
|
+
# pylint: disable=no-name-in-module
|
4
5
|
|
5
6
|
import re
|
6
7
|
from dataclasses import dataclass
|
@@ -10,7 +11,7 @@ import pandas as pnd
|
|
10
11
|
import awkward as ak
|
11
12
|
import numpy
|
12
13
|
|
13
|
-
from ROOT import RDataFrame, RDF
|
14
|
+
from ROOT import RDataFrame, RDF, Numba
|
14
15
|
|
15
16
|
from dmu.logging.log_store import LogStore
|
16
17
|
|
@@ -34,6 +35,8 @@ def add_column(rdf : RDataFrame, arr_val : Union[numpy.ndarray,None], name : str
|
|
34
35
|
exclude_re : Regex with pattern of column names that we won't pick
|
35
36
|
'''
|
36
37
|
|
38
|
+
log.warning(f'Adding column {name} with awkward')
|
39
|
+
|
37
40
|
d_opt = {} if d_opt is None else d_opt
|
38
41
|
if arr_val is None:
|
39
42
|
raise ValueError('Array of values not introduced')
|
@@ -72,6 +75,29 @@ def add_column(rdf : RDataFrame, arr_val : Union[numpy.ndarray,None], name : str
|
|
72
75
|
|
73
76
|
return rdf
|
74
77
|
# ---------------------------------------------------------------------
|
78
|
+
def add_column_with_numba(
|
79
|
+
rdf : RDataFrame,
|
80
|
+
arr_val : Union[numpy.ndarray,None],
|
81
|
+
name : str,
|
82
|
+
identifier : str) -> RDataFrame:
|
83
|
+
'''
|
84
|
+
Will take a dataframe, an array of numbers and a string
|
85
|
+
Will add the array as a column to the dataframe
|
86
|
+
|
87
|
+
The `identifier` argument is a string needed in order to avoid collisions
|
88
|
+
when using Numba to define a function to get the value from.
|
89
|
+
'''
|
90
|
+
identifier=f'fun_{identifier}'
|
91
|
+
|
92
|
+
@Numba.Declare(['int'], 'float', name=identifier)
|
93
|
+
def get_value(index):
|
94
|
+
return arr_val[index]
|
95
|
+
|
96
|
+
log.debug(f'Adding column {name} with numba')
|
97
|
+
rdf = rdf.Define(name, f'Numba::{identifier}(rdfentry_)')
|
98
|
+
|
99
|
+
return rdf
|
100
|
+
# ---------------------------------------------------------------------
|
75
101
|
def rdf_report_to_df(rep : RDF.RCutFlowReport) -> pnd.DataFrame:
|
76
102
|
'''
|
77
103
|
Takes the output of rdf.Report(), i.e. an RDataFrame cutflow report.
|
{data_manipulation_utilities-0.2.2.data → data_manipulation_utilities-0.2.4.data}/scripts/publish
RENAMED
File without changes
|
{data_manipulation_utilities-0.2.2.dist-info → data_manipulation_utilities-0.2.4.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|