seabirdfilehandler 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only.
Potentially problematic release: this version of seabirdfilehandler might be problematic.
- seabirdfilehandler/__init__.py +4 -2
- seabirdfilehandler/bottlefile.py +181 -0
- seabirdfilehandler/bottlelogfile.py +151 -0
- seabirdfilehandler/cnvfile.py +284 -0
- seabirdfilehandler/datafiles.py +265 -0
- seabirdfilehandler/file_collection.py +19 -18
- seabirdfilehandler/parameter.py +29 -3
- {seabirdfilehandler-0.4.3.dist-info → seabirdfilehandler-0.5.1.dist-info}/METADATA +1 -1
- seabirdfilehandler-0.5.1.dist-info/RECORD +14 -0
- {seabirdfilehandler-0.4.3.dist-info → seabirdfilehandler-0.5.1.dist-info}/WHEEL +1 -1
- seabirdfilehandler/dataframe_meta_accessor.py +0 -184
- seabirdfilehandler/datatablefiles.py +0 -930
- seabirdfilehandler/logging.yaml +0 -23
- seabirdfilehandler/seabirdfiles.py +0 -210
- seabirdfilehandler-0.4.3.dist-info/RECORD +0 -14
- {seabirdfilehandler-0.4.3.dist-info → seabirdfilehandler-0.5.1.dist-info}/LICENSE +0 -0
seabirdfilehandler/datafiles.py
ADDED
@@ -0,0 +1,265 @@
+from pathlib import Path
+import xmltodict
+import pandas as pd
+import numpy as np
+import logging
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - [%(levelname)s] - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    handlers=[
+        logging.FileHandler("filehandler.log"),
+        logging.StreamHandler(),
+    ],
+)
+
+logger = logging.getLogger(__name__)
+
+
+class DataFile:
+    """Collection of methods for the SeaBird files that feature some kind of
+    data table that is represented in a pandas dataframe.
+
+    Parameters
+    ----------
+
+    Returns
+    -------
+
+    """
+
+    def __init__(
+        self,
+        path_to_file: Path | str,
+        only_header: bool = False,
+    ):
+        self.path_to_file = Path(path_to_file)
+        self.file_name = self.path_to_file.stem
+        self.file_dir = self.path_to_file.parent
+        self.only_header = only_header
+        self.raw_file_data = []  # the text file input
+        self.header = []  # the full file header
+        self.sbe9_data = []  # device specific information
+        self.metadata = {}  # non-SeaBird metadata
+        self.metadata_list = []  # unstructured metadata for easier export
+        self.data_table_description = []  # the column names and other info
+        self.sensor_data = []
+        self.sensors = {}  # xml-parsed sensor data
+        self.processing_info = []  # everything after the sensor data
+        self.data = []  # the data table
+        self.file_data = self.raw_file_data  # variable file information
+        self.read_file()
+        self.metadata = self.structure_metadata(self.metadata_list)
+        if len(self.sensor_data) > 0:
+            self.sensors = self.sensor_xml_to_flattened_dict(
+                "".join(self.sensor_data)
+            )
+
+    def __str__(self) -> str:
+        return "/n".join(self.file_data)
+
+    def __repr__(self) -> str:
+        return str(self.path_to_file.absolute())
+
+    def __eq__(self, other) -> bool:
+        return self.file_data == other.file_data
+
+    def read_file(self):
+        """Reads and structures all the different information present in the
+        file. Lists and Dictionaries are the data structures of choice. Uses
+        basic prefix checking to distinguish different header information.
+
+        Parameters
+        ----------
+
+        Returns
+        -------
+
+        """
+        past_sensors = False
+        with self.path_to_file.open("r", encoding="latin-1") as file:
+            for line in file:
+                self.raw_file_data.append(line)
+                line_prefix = line[:2]
+                if line_prefix == "* ":
+                    self.header.append(line)
+                    self.sbe9_data.append(line[2:])
+                elif line_prefix == "**":
+                    self.header.append(line)
+                    self.metadata_list.append(line[3:])
+                elif line_prefix == "# ":
+                    self.header.append(line)
+                    if line[2:].strip()[0] == "<":
+                        self.sensor_data.append(line[2:])
+                        past_sensors = True
+                    else:
+                        if past_sensors:
+                            self.processing_info.append(line[2:])
+                        else:
+                            self.data_table_description.append(line[2:])
+                else:
+                    if line.startswith("*END*"):
+                        self.header.append(line)
+                        if self.only_header:
+                            break
+                    else:
+                        self.data.append(line)
+
+    def sensor_xml_to_flattened_dict(
+        self, sensor_data: str
+    ) -> list[dict] | dict:
+        """Reads the pure xml sensor input and creates a multilevel dictionary,
+        dropping the first two dictionaries, as they are single entry only
+
+        Parameters
+        ----------
+
+        Returns
+        -------
+
+        """
+        full_sensor_dict = xmltodict.parse(sensor_data, process_comments=True)
+        try:
+            sensors = full_sensor_dict["Sensors"]["sensor"]
+        except KeyError as error:
+            logger.error(f"XML is not formatted as expected: {error}")
+            return full_sensor_dict
+        else:
+            # create a tidied version of the xml-parsed sensor dict
+            tidied_sensor_list = []
+            for entry in sensors:
+                # use comment value as type descriptor
+                comment = entry["#comment"]
+                split_comment = comment.split(",")
+                new_entry = split_comment[1].strip()
+                if split_comment[-1] == " 2":
+                    new_entry += " 2"
+                # remove second-level dict
+                calibration_info = list(entry.values())[-1]
+                try:
+                    new_dict = {
+                        "Channel": entry["@Channel"],
+                        "SensorName": new_entry,
+                        **calibration_info,
+                    }
+                except TypeError:
+                    new_dict = {
+                        "Channel": entry["@Channel"],
+                        "SensorName": new_entry,
+                        "Info": calibration_info,
+                    }
+                tidied_sensor_list.append(new_dict)
+            return tidied_sensor_list
+
+    def structure_metadata(self, metadata_list: list) -> dict:
+        """Creates a dictionary to store the metadata that is added by using
+        werums dship API.
+
+        Parameters
+        ----------
+        metadata_list: list :
+            a list of the individual lines of metadata found in the file
+
+        Returns
+        -------
+        a dictionary of the lines of metadata divided into key-value pairs
+        """
+        out_dict = {}
+        for line in metadata_list:
+            try:
+                (key, val) = line.split("=")
+            except ValueError:
+                out_dict["text"] = line
+            else:
+                out_dict[key.strip()] = val.strip()
+        return out_dict
+
+    def define_output_path(
+        self,
+        file_path: Path | str | None = None,
+        file_name: str | None = None,
+        file_type: str = ".csv",
+    ) -> Path:
+        """Creates a Path object holding the desired output path.
+
+        Parameters
+        ----------
+        file_path : Path :
+            directory the file sits in (Default value = self.file_dir)
+        file_name : str :
+            the original file name (Default value = self.file_name)
+        file_type : str :
+            the output file type (Default = '.csv')
+        Returns
+        -------
+        a Path object consisting of the full path of the new file
+
+        """
+        file_path = self.file_dir if file_path is None else file_path
+        file_name = self.file_name if file_name is None else file_name
+        if file_type[0] != ".":
+            file_type = "." + file_type
+        return Path(file_path).joinpath(file_name).with_suffix(file_type)
+
+    def to_csv(
+        self,
+        data: pd.DataFrame | np.ndarray,
+        with_header: bool = True,
+        output_file_path: Path | str | None = None,
+        output_file_name: str | None = None,
+    ):
+        """Writes a csv from the current dataframe. Takes a list of columns to
+        use, a boolean for writing the header and the output file parameters.
+
+        Parameters
+        ----------
+        selected_columns : list :
+            a list of columns to include in the csv
+            (Default value = self.df.columns)
+        with_header : boolean :
+            indicating whether the header shall appear in the output
+            (Default value = True)
+        output_file_path : Path :
+            file directory (Default value = None)
+        output_file_name : str :
+            original file name (Default value = None)
+
+        Returns
+        -------
+
+        """
+        new_file_path = self.define_output_path(
+            output_file_path, output_file_name
+        )
+        if with_header:
+            with open(new_file_path, "w") as file:
+                for line in self.header:
+                    file.write(line)
+        if isinstance(data, pd.DataFrame):
+            data.to_csv(new_file_path, index=False, mode="a")
+        else:
+            np.savetxt(new_file_path, data, delimiter=",")
+
+    def selecting_columns(
+        self,
+        list_of_columns: list | str,
+        df: pd.DataFrame,
+    ):
+        """Alters the dataframe to only hold the given columns.
+
+        Parameters
+        ----------
+        list_of_columns: list or str : a collection of columns
+        df : pandas.Dataframe :
+            Dataframe (Default value = None)
+
+        Returns
+        -------
+
+        """
+        # ensure that the input is a list, so that isin() can do its job
+        if isinstance(list_of_columns, str):
+            list_of_columns = [list_of_columns]
+        if isinstance(df, pd.DataFrame):
+            self.df = df[list_of_columns].reset_index(drop=True)
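The new `DataFile` base class does one pass over a SeaBird text file and routes each line by prefix: `* ` lines into the SBE 9 device block, `**` lines into the cruise metadata, `# ` lines into the column description, the sensor XML, or the processing log, and everything below `*END*` into the raw data table. Two things stand out on review: `__str__` joins on the literal string `"/n"` rather than the newline escape, and the `to_csv` docstring still describes a `selected_columns` parameter that is not in the signature. A minimal usage sketch follows; the file name is a placeholder, not something shipped with the package:

```python
from seabirdfilehandler.datafiles import DataFile

# "station_007.cnv" stands in for any SeaBird file with the usual
# '*'/'**'/'#' header prefixes and an '*END*' marker.
ctd = DataFile("station_007.cnv", only_header=True)  # stop reading at *END*
print(ctd.metadata)  # '** key = value' header lines parsed into a dict
print(ctd.sensors)   # sensor XML flattened into per-channel dicts
```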
seabirdfilehandler/file_collection.py
CHANGED
@@ -1,16 +1,15 @@
 from pathlib import Path
 import logging
 from collections import UserList
-from typing import Type
+from typing import Callable, Type
 import pandas as pd
 import numpy as np
 from seabirdfilehandler import (
-    SeaBirdFile,
     CnvFile,
     BottleFile,
     BottleLogFile,
 )
-from seabirdfilehandler
+from seabirdfilehandler import DataFile
 from seabirdfilehandler.utils import get_unique_sensor_data

 logger = logging.getLogger(__name__)
@@ -34,23 +33,19 @@ class FileCollection(UserList):
         self,
         path_to_files: str | Path,
         file_suffix: str,
-        pattern: str | None = None,
         only_metadata: bool = False,
+        sorting_key: Callable | None = None,
     ):
         super().__init__()
         self.path_to_files = Path(path_to_files)
         self.file_suffix = file_suffix.strip(".")
-        self.file_type: Type[
+        self.file_type: Type[DataFile]
         self.extract_file_type()
         self.individual_file_paths = []
-        self.collect_files()
-
-        # TODO: implement pattern handling
-        self.pattern = pattern
-        else:
-            self.load_files(only_metadata)
+        self.collect_files(sorting_key=sorting_key)
+        self.load_files(only_metadata)
         if not only_metadata:
-            if self.file_type ==
+            if self.file_type == DataFile:
                 self.df_list = self.get_dataframes()
                 self.df = self.get_collection_dataframe(self.df_list)
             if self.file_type == CnvFile:
@@ -74,13 +69,19 @@ class FileCollection(UserList):
             self.file_type = value
             break
         else:
-            self.file_type =
+            self.file_type = DataFile

-    def collect_files(
+    def collect_files(
+        self,
+        sorting_key: Callable | None = lambda file: int(
+            file.stem.split("_")[3]
+        ),
+    ):
         """ """
-
-        self.
-
+        self.individual_file_paths = sorted(
+            self.path_to_files.rglob(f"*{self.file_suffix}"),
+            key=sorting_key,
+        )

     def load_files(self, only_metadata: bool = False):
         """ """
@@ -254,4 +255,4 @@ class FileCollection(UserList):

     def get_data_table_meta_info(self) -> list[list[dict]]:
         """ """
-        return [file.
+        return [file.parameters.metadata for file in self.data]
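`FileCollection` drops the never-implemented `pattern` argument (its TODO is gone as well) in favour of a `sorting_key` callable that is passed to `sorted()` over the `rglob` results. The default key on `collect_files` assumes file stems whose fourth underscore-separated token is an integer; note, however, that `__init__` forwards its own default of `None`, so that lambda only applies when `collect_files` is called directly, and a plain `FileCollection(...)` sorts paths lexicographically. A hedged sketch of the new hook, with a made-up directory and naming scheme:

```python
from seabirdfilehandler.file_collection import FileCollection

collection = FileCollection(
    "cruise_data",      # hypothetical directory, searched recursively
    file_suffix="cnv",
    # These files do not follow the "<a>_<b>_<c>_<cast>" stem convention
    # assumed by the default key, so sort by modification time instead:
    sorting_key=lambda file: file.stat().st_mtime,
)
```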
seabirdfilehandler/parameter.py
CHANGED
@@ -113,6 +113,20 @@ class Parameters(UserDict):
         )
         return parameter_dict

+    def _form_data_table_info(self) -> list:
+        """Recreates the data table descriptions, like column names and spans
+        from the structured dictionaries these values were stored in."""
+        new_table_info = []
+        for key, value in self.data_table_stats.items():
+            new_table_info.append(f"{key} = {value}\n")
+        for index, (name, _) in enumerate(self.data_table_names_and_spans):
+            new_table_info.append(f"name {index} = {name}\n")
+        for index, (_, span) in enumerate(self.data_table_names_and_spans):
+            new_table_info.append(f"span {index} = {span}\n")
+        for key, value in self.data_table_misc.items():
+            new_table_info.append(f"{key} = {value}\n")
+        return new_table_info
+
     def differentiate_table_description(self):
         """
         The original method that structures data table metadata.
@@ -144,7 +158,10 @@
             (name, span)
             for name, span in zip(column_names, column_value_spans)
         ]
-        self.data_table_misc =
+        self.data_table_misc = {
+            line.split("=")[0].strip(): line.split("=")[1].strip()
+            for line in post
+        }

     def add_parameter(self, parameter: Parameter):
         """
@@ -201,7 +218,6 @@
             data = np.full(
                 fill_value=data,
                 shape=self.full_data_array.shape[0],
-                dtype=type(data),
             )
             parameter = Parameter(data=data, metadata=metadata)
             self.add_parameter(parameter)
@@ -263,7 +279,17 @@
         ).T
         columns = [parameter.name for parameter in self.get_parameter_list()]
         assert data.shape[1] == len(columns)
-
+        df = pd.DataFrame(data=data, columns=columns)
+        for column in df.columns:
+            if column.lower() not in [
+                "latitude",
+                "longitude",
+                "event",
+                "cast",
+                "flag",
+            ]:
+                df[column].astype("float64")
+        return df

     def with_name_type(self, name_type: str = "shortname"):
         """
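With `_form_data_table_info`, the data-table header becomes round-trippable: `differentiate_table_description` now stores the trailing `key = value` lines in the `data_table_misc` dict, and the new method re-emits the stats, `name i = ...`, `span i = ...`, and misc lines in their original order. One review note on the dataframe hunk: `df[column].astype("float64")` returns a new Series that is never assigned back, so the cast is a no-op as written. A standalone sketch of the round-trip, with invented values standing in for a parsed CNV header:

```python
# All values below are made up for illustration.
data_table_stats = {"nquan": "2", "nvalues": "4320"}
names_and_spans = [
    ("prDM: Pressure, Digiquartz [db]", "1.000, 2000.000"),
    ("t090C: Temperature [ITS-90, deg C]", "-1.000, 30.000"),
]
data_table_misc = {"bad_flag": "-9.990e-29"}

lines = [f"{key} = {value}\n" for key, value in data_table_stats.items()]
lines += [f"name {i} = {name}\n" for i, (name, _) in enumerate(names_and_spans)]
lines += [f"span {i} = {span}\n" for i, (_, span) in enumerate(names_and_spans)]
lines += [f"{key} = {value}\n" for key, value in data_table_misc.items()]
print("".join(lines), end="")
```

Dropping `dtype=type(data)` from the `np.full` call also lets NumPy infer the dtype from the fill value itself, which is more robust for scalars whose Python type has no direct NumPy equivalent.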
seabirdfilehandler-0.5.1.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
+seabirdfilehandler/__init__.py,sha256=5JTzYE3oRdrxkC9_etAnFQ1cy10PHtpmesdR6n5PoPQ,192
+seabirdfilehandler/bottlefile.py,sha256=nnfoDczPMG_ge40dT2rHNhifR7-NRgnZNFrfPM_9OSQ,5925
+seabirdfilehandler/bottlelogfile.py,sha256=MtMmEebdAktO3mk6KbmJC7dfx9sRLbV5qqDQt2qtpJE,4310
+seabirdfilehandler/cnvfile.py,sha256=LXpJcC3ukiD-2b5vy4aKESCbIvwV12TwQy1G6Y25_GE,9709
+seabirdfilehandler/datafiles.py,sha256=lqENvdGSwRKT6PyNFN2etaWKMA-4OONG0x-up1W5ezo,8991
+seabirdfilehandler/file_collection.py,sha256=b5iJaP4F34Vq7-FiJOlPvfS4IePGWsYx20XwWbZQw1A,6882
+seabirdfilehandler/parameter.py,sha256=UuwFzege94sqPt0kOjEqtMGGol4hjuFjj2_EH7o0pzA,14374
+seabirdfilehandler/utils.py,sha256=5KXdB8Hdv65dv5tPyXxNMct1mCEOyA3S8XP54AFAnx0,1745
+seabirdfilehandler/validation_modules.py,sha256=eZ6x0giftUtlxnRMOnK_vCkgccdwUXPrDjajFa-E6n0,4698
+seabirdfilehandler/xmlfiles.py,sha256=L_puQf8eg0ojv85AyEMID4jnwkOlV_fgZP3W5yeSUBY,4668
+seabirdfilehandler-0.5.1.dist-info/LICENSE,sha256=Ifd1VPmYv32oJd2QVh3wIQP9X05vYJlcY6kONz360ws,34603
+seabirdfilehandler-0.5.1.dist-info/METADATA,sha256=2VrJmgeRr-Par2zU5A--xDS5r_7VsKzi-HLi8SlPUX4,1289
+seabirdfilehandler-0.5.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+seabirdfilehandler-0.5.1.dist-info/RECORD,,
seabirdfilehandler/dataframe_meta_accessor.py
REMOVED
@@ -1,184 +0,0 @@
-import pandas as pd
-import logging
-from pandas.api.extensions import register_series_accessor
-from pandas.api.extensions import register_dataframe_accessor
-import warnings
-
-
-logger = logging.getLogger(__name__)
-
-
-class MetadataHandler:
-    """
-    The base class for the pandas series and dataframe accessors.
-    Offers a very basic metadata handling, by using a dictionary as metadata
-    store. The accessors then allow to access this metadata store and
-    corresponding methods by calling 'df.meta' or 'series.meta', respectively.
-    Mainly targeted for usage with dataframes featuring data from CNV files,
-    it for example allows the attachement of parameter metadata found in the
-    CNV header to individual dataframe columns.
-
-    This approach was chosen over others, like directly subclassing the pandas
-    dataframe or series class, or a seperate metadata storage, due to its
-    simplicity and ability to keep using the full powerfull pandas library
-    without the need to implement each and every transformation. Of course,
-    the 'attrs' attribute does offer a similar metadata storage. But at the
-    time of writing this, it is still in a very experimental condition and does
-    not propagate reliably.
-    """
-
-    def __init__(self, pandas_obj):
-        self._obj = pandas_obj
-        if not hasattr(self._obj, "_metadata_store"):
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                self._obj._metadata_store = {}
-
-    @property
-    def metadata(self):
-        return self._obj._metadata_store
-
-    @metadata.setter
-    def metadata(self, value):
-        self._obj._metadata_store = value
-
-    def get(self, key, default=None):
-        return self._obj._metadata_store.get(key, default)
-
-    def set(self, key, value):
-        self._obj._metadata_store[key] = value
-
-    def clear(self):
-        self._obj._metadata_store.clear()
-
-
-@register_series_accessor("meta")
-class SeriesMetaAccessor(MetadataHandler):
-    """
-    Series implementation of the Metadata Accessor.
-    Does not offer anything more than the base class at the moment.
-    """
-
-    def __init__(self, pandas_obj):
-        super().__init__(pandas_obj)
-
-
-@register_dataframe_accessor("meta")
-class DataFrameMetaAccessor(MetadataHandler):
-    """
-    DataFrame implementation of the Metadata Accessor.
-    Introduces another attribute, '_header_level_detail', that stores the
-    currently displayed metadata as column names. Additionally offers methods
-    to sync metadata between the dataframe and its series, and the handling of
-    common operations, like renaming or the addition of new columns.
-    """
-
-    def __init__(self, pandas_obj):
-        super().__init__(pandas_obj)
-        if not hasattr(self._obj, "_header_level_detail"):
-            self._obj._header_level_detail = "shortname"
-        # Initialize DataFrame metadata
-        self.aggregate_series_metadata()
-
-    @property
-    def header_detail(self):
-        return self._obj._header_level_detail
-
-    @header_detail.setter
-    def header_detail(self, value):
-        self._obj._header_level_detail = value
-
-    @property
-    def metadata(self):
-        return self._obj._metadata_store
-
-    @metadata.setter
-    def metadata(self, value):
-        meta_dict = {
-            shortname: self.add_default_metadata(shortname, metainfo)
-            for shortname, metainfo in value.items()
-        }
-        self._obj._metadata_store = meta_dict
-        self.propagate_metadata_to_series()
-
-    def aggregate_series_metadata(self):
-        """Aggregate metadata from Series within the DataFrame."""
-        for column in self._obj.columns:
-            if isinstance(self._obj[column], pd.Series) and hasattr(
-                self._obj[column], "meta"
-            ):
-                self.metadata[column] = self._obj[column].meta.metadata
-
-    def propagate_metadata_to_series(self):
-        """Propagate DataFrame-level metadata back to Series."""
-        for column in self._obj.columns:
-            if isinstance(self._obj[column], pd.Series) and hasattr(
-                self._obj[column], "meta"
-            ):
-                for key, value in self.metadata.items():
-                    if key == column:
-                        try:
-                            self._obj[column].meta.metadata = value
-                        except TypeError:
-                            logger.error(f"{column}: {value}")
-
-    def update_metadata_on_rename(self, rename_dict):
-        """Update metadata when columns are renamed."""
-        new_metadata = {}
-        for old_name, new_name in rename_dict.items():
-            for key, value in self.metadata.items():
-                if key == old_name:
-                    new_metadata[new_name] = value
-        self.metadata = new_metadata
-        self.propagate_metadata_to_series()
-
-    def rename(self, rename_key):
-        """Rename the column names by using a metadata point."""
-        rename_dict = {
-            column: (
-                self._obj[column].meta.get(rename_key)
-                if rename_key in list(self._obj[column].meta.metadata.keys())
-                else column
-            )
-            for column in self._obj.columns
-        }
-        self._obj.rename(columns=rename_dict, inplace=True)
-        self.header_detail = rename_key
-        self.update_metadata_on_rename(rename_dict)
-
-    def add_column(
-        self,
-        name: str,
-        data: pd.Series | list,
-        location: int | None = None,
-        metadata: dict = {},
-    ):
-        """Add a column and use or generate metadata for it."""
-        location = len(self._obj.columns) if location is None else location
-        self._obj.insert(
-            loc=location,
-            column=name,
-            value=data,
-            allow_duplicates=False,
-        )
-        self.metadata[name] = self.add_default_metadata(name, metadata)
-        self.propagate_metadata_to_series()
-
-    def add_default_metadata(
-        self,
-        name: str,
-        metadata: dict = {},
-        list_of_keys: list = [
-            "shortname",
-            "longinfo",
-            "name",
-            "metainfo",
-            "unit",
-        ],
-    ) -> dict:
-        """Fill up missing metadata points with a default value."""
-        default = {}
-        for key in list_of_keys:
-            if key not in list(metadata.keys()):
-                default[key] = name
-        return {**metadata, **default}
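The deleted `dataframe_meta_accessor` module kept per-column metadata on the pandas objects themselves via `register_dataframe_accessor`/`register_series_accessor`; 0.5.1 removes it together with `datatablefiles.py` and `seabirdfiles.py`. For reference, the basic pattern against 0.4.3 looked roughly like this (a sketch, no longer a supported API in 0.5.1):

```python
import pandas as pd
import seabirdfilehandler.dataframe_meta_accessor  # noqa: F401  registers ".meta" (0.4.3 only)

df = pd.DataFrame({"t090C": [8.1, 8.0, 7.9]})
df.meta.set("t090C", {"unit": "deg C"})  # plain dict stored on the DataFrame
print(df.meta.get("t090C"))              # {'unit': 'deg C'}
```

As the module's own docstring concedes, this relied on attribute storage that pandas does not promise to carry through transformations, the same propagation problem it cites for the experimental `attrs` mechanism.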