seabirdfilehandler 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of seabirdfilehandler has been flagged as potentially problematic.

@@ -0,0 +1,269 @@
+ from pathlib import Path
+ import logging
+ from collections import UserList
+ from typing import Type
+ import pandas as pd
+ import numpy as np
+ from seabirdfilehandler import SeaBirdFile, CnvFile, BottleFile, BottleLogFile
+ from seabirdfilehandler.datatablefiles import DataTableFile
+
+ logger = logging.getLogger(__name__)
+
+
+ class FileCollection(UserList):
+     """A representation of multiple files of the same kind. These files share
+     the same suffix and are otherwise closely connected to each other. A common
+     use case would be the collection of CNVs to allow for easier processing or
+     the integration of field calibration measurements.
+
+     Parameters
+     ----------
+     path_to_files: str | Path :
+         The directory that holds the files of interest
+     file_suffix: str :
+         The suffix (file extension) shared by all files in the collection
+     pattern: str | None :
+         An optional file name pattern to select files
+         (Default value = None)
+     only_metadata: bool :
+         Whether to load only the file metadata, skipping the data tables
+         (Default value = False)
+     """
+
+     def __init__(
+         self,
+         path_to_files: str | Path,
+         file_suffix: str,
+         pattern: str | None = None,
+         only_metadata: bool = False,
+     ):
+         super().__init__()
+         self.path_to_files = Path(path_to_files)
+         self.file_suffix = file_suffix.strip(".")
+         self.file_type: Type[SeaBirdFile]
+         self.extract_file_type()
+         self.individual_file_paths = []
+         self.collect_files()
+         if pattern:
+             # TODO: implement pattern handling
+             self.pattern = pattern
+         else:
+             self.load_files(only_metadata)
+             if not only_metadata:
+                 if self.file_type == DataTableFile:
+                     self.df_list = self.get_dataframes()
+                     self.df = self.get_collection_dataframe(self.df_list)
+                 if self.file_type == CnvFile:
+                     self.data_meta_info = self.get_data_table_meta_info()
+                     self.sensor_data = self.get_sensor_data()
+
+     def __str__(self):
+         return "\n".join(str(file) for file in self.data)
+
+     def extract_file_type(self):
+         """Maps the file suffix to the corresponding file type."""
+         mapping_suffix_to_type = {
+             "cnv": CnvFile,
+             "btl": BottleFile,
+             "bl": BottleLogFile,
+         }
+         self.file_type = mapping_suffix_to_type.get(self.file_suffix, SeaBirdFile)
+
+     def collect_files(self):
+         """Gathers all matching file paths below the target directory."""
+         for path in self.path_to_files.rglob(f"*.{self.file_suffix}"):
+             self.individual_file_paths.append(path)
+         self.individual_file_paths.sort()
+
+     def load_files(self, only_metadata: bool = False):
+         """Instantiates the collection's file type for every collected path."""
+         for file in self.individual_file_paths:
+             try:
+                 self.data.append(self.file_type(file))
+             except TypeError:
+                 logger.error(
+                     f"Could not open file {file} with the type {self.file_type}."
+                 )
+                 continue
+
+     def get_dataframes(
+         self,
+         event_log: bool = False,
+         coordinates: bool = False,
+         time_correction: bool = False,
+         cast_identifier: bool = False,
+         long_header_names: bool = False,
+         full_data_header: bool = True,
+     ) -> list[pd.DataFrame]:
+         """Collects the individual file dataframes, optionally enriched.
+
+         Parameters
+         ----------
+         event_log: bool :
+             Whether to add station and event columns
+             (Default value = False)
+         coordinates: bool :
+             Whether to add position columns
+             (Default value = False)
+         time_correction: bool :
+             Whether to compute absolute time and add the start time
+             (Default value = False)
+         cast_identifier: bool :
+             Whether to add a cast number column
+             (Default value = False)
+         long_header_names: bool :
+             Whether to use parameter names as column headers
+             (Default value = False)
+         full_data_header: bool :
+             Whether to use the full long info as column headers
+             (Default value = True)
+
+         Returns
+         -------
+         A list of the individual file dataframes
+
+         """
+         for index, file in enumerate(self.data):
+             if full_data_header:
+                 file.rename_dataframe_header(header_detail_level="longinfo")
+             elif long_header_names:
+                 file.rename_dataframe_header(header_detail_level="name")
+             if event_log:
+                 file.add_station_and_event_column()
+             if coordinates:
+                 file.add_position_columns()
+             if time_correction:
+                 file.absolute_time_calculation()
+                 file.add_start_time()
+             if cast_identifier:
+                 file.add_cast_number(index + 1)
+         return [file.df for file in self.data]
+
+     def get_collection_dataframe(
+         self, list_of_dfs: list[pd.DataFrame] | None = None
+     ) -> pd.DataFrame:
+         """Concatenates the individual file dataframes into one.
+
+         Parameters
+         ----------
+         list_of_dfs: list[pd.DataFrame] | None :
+             The dataframes to concatenate; defaults to the collection's own
+             (Default value = None)
+
+         Returns
+         -------
+         One dataframe holding the data of all files in the collection
+
+         """
+         if not list_of_dfs:
+             list_of_dfs = self.get_dataframes()
+         df = pd.concat(list_of_dfs, ignore_index=True)
+         # df.meta.metadata = list_of_dfs[0].meta.metadata
+         return df
+
+     def tidy_collection_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
+         """Replaces bad flags with NaN, casts to float and drops non-scan rows.
+
+         Parameters
+         ----------
+         df: pd.DataFrame :
+             The collection dataframe to tidy up
+
+         Returns
+         -------
+         The tidied dataframe
+
+         """
+         df = self.use_bad_flag_for_nan(df)
+         df = self.set_dtype_to_float(df)
+         return self.select_real_scan_data(df)
+
+     def use_bad_flag_for_nan(self, df: pd.DataFrame) -> pd.DataFrame:
+         """Replaces all bad-flag values found in the file headers with NaN.
+
+         Parameters
+         ----------
+         df: pd.DataFrame :
+             The dataframe to work on
+
+         Returns
+         -------
+         The dataframe with bad flags replaced by NaN
+
+         """
+         bad_flags = set()
+         for file in self.data:
+             for line in file.data_table_description:
+                 if line.startswith("bad_flag"):
+                     flag = line.split("=")[1].strip()
+                     bad_flags.add(flag)
+         for flag in bad_flags:
+             df.replace(to_replace=flag, value=np.nan, inplace=True)
+         return df
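For reference, a minimal sketch of the bad-flag substitution in isolation (the header line and values here are representative examples; `-9.990e-29` is the usual Seasave sentinel):

    import numpy as np
    import pandas as pd

    line = "bad_flag = -9.990e-29"
    flag = line.split("=")[1].strip()
    df = pd.DataFrame({"t090C": ["7.1234", flag, "7.2001"]})
    df = df.replace(to_replace=flag, value=np.nan)  # the sentinel row becomes NaN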
+
+     def set_dtype_to_float(self, df: pd.DataFrame) -> pd.DataFrame:
+         """Casts all non-datetime columns to float, where possible.
+
+         Parameters
+         ----------
+         df: pd.DataFrame :
+             The dataframe to work on
+
+         Returns
+         -------
+         The dataframe with float columns
+
+         """
+         for parameter in df.columns:
+             if parameter in ["datetime"]:
+                 continue
+             try:
+                 df[parameter] = df[parameter].astype("float")
+             except (ValueError, TypeError):
+                 continue
+         return df
+
+     def select_real_scan_data(self, df: pd.DataFrame) -> pd.DataFrame:
+         """Keeps only rows that carry a scan count, dropping metadata rows.
+
+         Parameters
+         ----------
+         df: pd.DataFrame :
+             The dataframe to work on
+
+         Returns
+         -------
+         The dataframe reduced to real scan rows
+
+         """
+         # TODO: fix this hardcoded name
+         try:
+             df = df.loc[df["Scan Count"].notna()]
+         except KeyError:
+             pass
+         return df
+
+     def to_csv(self, file_name):
+         """Writes the whole collection dataframe to a csv file.
+
+         Parameters
+         ----------
+         file_name :
+             The target csv file name
+
+         """
+         self.get_collection_dataframe().to_csv(file_name)
+
+     def get_data_table_meta_info(self) -> list[list[dict]]:
+         """Returns the data header meta info of every file."""
+         return [file.data_header_meta_info for file in self.data]
+
+     def get_sensor_data(self) -> list[tuple[str, list[dict]]]:
+         """Collects the sensor metadata of each file, keeping only changes."""
+         unique = []
+         last_unique = None
+         for file in self.data:
+             cast_sensors = file.sensors
+             if last_unique is None:
+                 unique.append((file.file_name, cast_sensors))
+             else:
+                 differing_dicts = [
+                     current_dict
+                     for last_dict, current_dict in zip(last_unique, cast_sensors)
+                     if current_dict != last_dict
+                 ]
+                 if differing_dicts:
+                     unique.append((file.file_name, differing_dicts))
+             last_unique = cast_sensors
+         return unique
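A usage sketch for the collection as a whole (illustrative; the directory name and the exact import path are assumptions, not taken from the package):

    # hypothetical import path and data directory
    from seabirdfilehandler.filecollection import FileCollection

    collection = FileCollection("cruise_data/", file_suffix="cnv")
    df = collection.get_collection_dataframe()
    df = collection.tidy_collection_dataframe(df)
    collection.to_csv("all_casts.csv")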
@@ -0,0 +1,23 @@
+ version: 1
+ formatters:
+   simple:
+     format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ handlers:
+   console:
+     class: logging.StreamHandler
+     level: DEBUG
+     formatter: simple
+     stream: ext://sys.stdout
+   file:
+     class: logging.FileHandler
+     filename: seabirdfiles.log
+     level: DEBUG
+     formatter: simple
+ loggers:
+   simpleExample:
+     level: DEBUG
+     handlers: [console]
+     propagate: no
+ root:
+   level: DEBUG
+   handlers: [console]
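A sketch of how a dictConfig file like this is typically applied, assuming it is saved as `logging.yaml` and PyYAML is installed (YAML reads `no` as `False` for `propagate`):

    import logging
    import logging.config
    import yaml

    with open("logging.yaml") as f:
        logging.config.dictConfig(yaml.safe_load(f))

    logging.getLogger("simpleExample").debug("logging configured")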
@@ -0,0 +1,410 @@
+ from __future__ import annotations
+ from typing import Tuple
+ import re
+ from collections import UserDict
+ import numpy as np
+ import pandas as pd
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ class Parameters(UserDict):
+     """
+     A collection of all the parameters in a CnvFile.
+
+     Allows for a much cleaner handling of parameter data and their metadata.
+     Will be heavily expanded.
+
+     Parameters
+     ----------
+     data: list :
+         The raw data as extracted by SeaBirdFile
+     metadata: list :
+         The raw metadata as extracted by SeaBirdFile
+     """
+
+     def __init__(
+         self,
+         data: list,
+         metadata: list,
+     ):
+         self.raw_input_data = data
+         self.raw_metadata = metadata
+         self.full_data_array = self.create_full_ndarray()
+         self.differentiate_table_description()
+         self.metadata, self.duplicate_columns = self.reading_data_header(metadata)
+         self.data = self.create_parameter_instances()
+
+     def get_parameter_list(self) -> list[Parameter]:
+         """Returns all parameter instances as a list."""
+         return list(self.data.values())
+
+     def create_full_ndarray(self, data_table: list = []) -> np.ndarray:
+         """
+         Builds a numpy array representing the data table in a cnv file.
+
+         Parameters
+         ----------
+         data_table: list :
+             The data to work with
+             (Default value = [])
+
+         Returns
+         -------
+         A numpy array of the same shape as the cnv files data table
+
+         """
+         data_table = self.raw_input_data if len(data_table) == 0 else data_table
+         # cnv data tables are laid out in fixed-width, 11-character columns
+         n = 11
+         row_list = []
+         for line in data_table:
+             row_list.append(
+                 [line[i : i + n].split()[0] for i in range(0, len(line) - n, n)]
+             )
+         return np.array(row_list, dtype=float)
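To illustrate the fixed-width parsing above (a fabricated two-column line; note that the `len(line) - n` bound relies on each raw line keeping its trailing newline, otherwise the last column would be dropped):

    line = "    24.6274     7.1234\n"  # two 11-character fields plus newline
    n = 11
    values = [line[i : i + n].split()[0] for i in range(0, len(line) - n, n)]
    # values == ["24.6274", "7.1234"]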
+
+     def create_parameter_instances(
+         self,
+         metadata: dict[str, dict] = {},
+     ) -> dict[str, Parameter]:
+         """
+         Differentiates the individual parameter columns into separate parameter
+         instances.
+
+         Parameters
+         ----------
+         metadata: dict[str, dict] :
+             The structured metadata dictionary
+             (Default value = {})
+
+         Returns
+         -------
+         A dictionary of parameter instances
+
+         """
+         metadata = self.metadata if not metadata else metadata
+         parameter_dict = {}
+         list_of_metadata_shortnames = list(metadata.keys())
+         if self.full_data_array.shape[1] != len(list_of_metadata_shortnames):
+             raise AssertionError(
+                 f"{self.full_data_array.shape} and metadata: {metadata}"
+             )
+         for i in range(self.full_data_array.shape[1]):
+             column_data = self.full_data_array[:, i]
+             key = list_of_metadata_shortnames[i]
+             parameter_dict[key] = Parameter(data=column_data, metadata=metadata[key])
+         return parameter_dict
+
+     def differentiate_table_description(self):
+         """
+         The original method that structures data table metadata.
+
+         Needs heavy refactoring.
+         """
+         past_spans = False
+         pre = []
+         column_names = []
+         column_value_spans = []
+         post = []
+         for line in self.raw_metadata:
+             if line.startswith("name"):
+                 column_names.append(line.split("=")[1].strip())
+             elif line.startswith("span"):
+                 past_spans = True
+                 column_value_spans.append(line.split("=")[1].strip())
+             else:
+                 if not past_spans:
+                     pre.append(line)
+                 else:
+                     post.append(line)
+         assert len(column_names) == len(column_value_spans)
+         self.data_table_stats = {
+             line.split("=")[0].strip(): line.split("=")[1].strip() for line in pre
+         }
+         self.data_table_names_and_spans = list(zip(column_names, column_value_spans))
+         self.data_table_misc = post
+
+     def add_parameter(self, parameter: Parameter):
+         """
+         Adds one parameter instance to the collection.
+
+         Parameters
+         ----------
+         parameter: Parameter :
+             The new parameter
+
+         """
+         self.data[parameter.name] = parameter
+
+     def create_parameter(
+         self,
+         data: np.ndarray | int | float | str,
+         metadata: dict = {},
+         name: str = "",
+     ) -> Parameter:
+         """
+         Creates a new parameter instance with the given data and metadata.
+
+         The input data is either a numpy array or a single value. A single
+         value will be broadcast to the shape of the data table. A use case
+         would be the addition of an 'event' or 'cast' column.
+
+         Parameters
+         ----------
+         data: np.ndarray | int | float | str :
+             Data to use or expand
+         metadata: dict :
+             Metadata for the new parameter
+             (Default value = {})
+         name: str :
+             Name to use for missing metadata values
+             (Default value = "")
+
+         Returns
+         -------
+         The new parameter instance
+
+         """
+         if len(metadata) < 5:
+             if len(name) > 0:
+                 metadata = self.add_default_metadata(name=name, metadata=metadata)
+             else:
+                 raise ValueError(
+                     "Please specify either a name or sufficient metadata"
+                 )
+         if not isinstance(data, np.ndarray):
+             data = np.full(
+                 fill_value=data,
+                 shape=self.full_data_array.shape[0],
+                 dtype=type(data),
+             )
+         parameter = Parameter(data=data, metadata=metadata)
+         self.add_parameter(parameter)
+         return parameter
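The broadcasting path in isolation (names here are illustrative): a scalar such as a cast number is expanded with np.full into a column matching the data table's row count:

    import numpy as np

    n_rows = 5  # stand-in for full_data_array.shape[0]
    column = np.full(shape=n_rows, fill_value=3, dtype=int)
    # column == array([3, 3, 3, 3, 3]); create_parameter wraps this in a
    # Parameter, e.g. parameters.create_parameter(data=3, name="cast")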
+
+     def add_default_metadata(
+         self,
+         name: str,
+         metadata: dict = {},
+         list_of_keys: list = [
+             "shortname",
+             "longinfo",
+             "name",
+             "metainfo",
+             "unit",
+         ],
+     ) -> dict:
+         """
+         Fills up missing metadata points with a default value.
+
+         Parameters
+         ----------
+         name: str :
+             The value to use as default
+         metadata: dict :
+             The present metadata
+             (Default value = {})
+         list_of_keys: list :
+             The expected metadata keys
+
+         Returns
+         -------
+         The full metadata dictionary
+
+         """
+         default = {}
+         for key in list_of_keys:
+             if key not in metadata:
+                 if key in ["metainfo", "unit"]:
+                     default[key] = ""
+                 else:
+                     default[key] = name
+         return {**metadata, **default}
+
+     def update_spans(self):
+         """Updates all spans of the parameters."""
+         for parameter in self.get_parameter_list():
+             parameter.update_span()
+
+     def get_spans(self) -> list[tuple[int, int]]:
+         """Returns all span tuples of the parameters."""
+         # update spans first
+         self.update_spans()
+         return [parameter.span for parameter in self.get_parameter_list()]
+
+     def get_pandas_dataframe(self) -> pd.DataFrame:
+         """Returns a pandas DataFrame of the current parameter data."""
+         data = np.array(
+             [parameter.data for parameter in self.get_parameter_list()]
+         ).T
+         columns = [parameter.name for parameter in self.get_parameter_list()]
+         assert data.shape[1] == len(columns)
+         return pd.DataFrame(data=data, columns=columns, dtype=float)
+
+     def with_name_type(self, name_type: str = "shortname"):
+         """
+         Uses the given name_type as column descriptors.
+
+         Parameters
+         ----------
+         name_type: str :
+             The metadata name to use
+             (Default value = "shortname")
+
+         """
+         for parameter in self.get_parameter_list():
+             parameter.use_name(name_type)
+
+     def reading_data_header(
+         self, header_info: list = []
+     ) -> Tuple[dict[str, dict], list[int]]:
+         """Reads the data table header information from the file header.
+
+         Parameters
+         ----------
+         header_info: list :
+             The header values from the file
+             (Default value = [])
+
+         Returns
+         -------
+         A dictionary of per-parameter metadata and a list of the indices of
+         duplicate columns
+
+         """
+         if len(header_info) == 0:
+             header_info = self.raw_metadata
+         table_header = {}
+         duplicate_columns = []
+         for line in header_info:
+             if line.startswith("name"):
+                 header_meta_info = {}
+                 # get basic shortname and the full, non-differentiated info
+                 shortname = longinfo = line_info = line.split("=")[1].strip()
+                 try:
+                     shortname, longinfo = line_info.split(":")
+                 except ValueError:
+                     pass
+                 finally:
+                     shortname = shortname.strip()
+                     if shortname in table_header:
+                         try:
+                             duplicate_columns.append(
+                                 int(line.split("=")[0].strip().split()[1])
+                             )
+                         except (IndexError, ValueError) as error:
+                             logger.error(
+                                 f"Could not resolve duplicate column: {shortname}, {error}"
+                             )
+                     else:
+                         header_meta_info["shortname"] = shortname
+                         header_meta_info["longinfo"] = longinfo.strip()
+                         metainfo = self._extract_data_header_meta_info(
+                             longinfo.strip()
+                         )
+                         header_meta_info = {**header_meta_info, **metainfo}
+                         table_header[shortname] = header_meta_info
+         return table_header, duplicate_columns
+
+     def _extract_data_header_meta_info(self, line: str) -> dict:
+         """Extracts the individual information bits inside of the header lines.
+
+         Parameters
+         ----------
+         line: str :
+             One header line, trimmed of the 'name =' prefix and the shortname
+
+         Returns
+         -------
+         A dictionary with the extracted name, metainfo and unit values
+
+         """
+         regex_string = r"(?:(?P<name0>.+),\s(?P<metainfo0>.+)\s\[(?P<unit0>.+)\]|(?P<name2>.+)\s\[(?P<unit2>.+)\]|(?P<name3>.+),\s(?P<metainfo2>.[^\s]+)|(?P<name4>.+))"
+         regex_check = re.search(regex_string, line, flags=re.IGNORECASE)
+         if regex_check:
+             regex_info = dict(regex_check.groupdict())
+             # strip the trailing alternative index off the group names
+             regex_info = {
+                 key[:-1]: value
+                 for key, value in regex_info.items()
+                 if value is not None
+             }
+             if len(regex_info) > 2:
+                 # check for second sensors and adjust their names
+                 if regex_info["metainfo"][-1] == "2":
+                     regex_info["name"] = regex_info["name"] + " 2"
+                     regex_info["metainfo"] = regex_info["metainfo"][:-1]
+                     if len(regex_info["metainfo"]) == 0:
+                         regex_info.pop("metainfo")
+             if regex_info["name"] == "flag":
+                 regex_info["metainfo"] = regex_info["name"]
+                 regex_info["unit"] = regex_info["name"]
+             return regex_info
+         return {}
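For illustration, the regex applied to two representative cnv header descriptions (example strings, not taken from the package):

    import re

    regex_string = r"(?:(?P<name0>.+),\s(?P<metainfo0>.+)\s\[(?P<unit0>.+)\]|(?P<name2>.+)\s\[(?P<unit2>.+)\]|(?P<name3>.+),\s(?P<metainfo2>.[^\s]+)|(?P<name4>.+))"
    for line in ["Pressure, Digiquartz [db]", "Temperature [ITS-90, deg C]"]:
        match = re.search(regex_string, line, flags=re.IGNORECASE)
        print({k: v for k, v in match.groupdict().items() if v is not None})
    # {'name0': 'Pressure', 'metainfo0': 'Digiquartz', 'unit0': 'db'}
    # {'name2': 'Temperature', 'unit2': 'ITS-90, deg C'}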
+
+
+ class Parameter:
+     """A representation of one parameter in a cnv file.
+
+     Consists of the values of the parameter as well as the metadata.
+
+     Parameters
+     ----------
+     data: np.ndarray :
+         The parameter's column of the data table
+     metadata: dict :
+         The parameter's metadata, including its shortname
+     """
+
+     def __init__(
+         self,
+         data: np.ndarray,
+         metadata: dict,
+     ) -> None:
+         self.data = data
+         self.metadata = metadata
+         self.name = metadata["shortname"]
+         self.update_span()
+
+     def __str__(self) -> str:
+         return str(self.metadata["longinfo"])
+
+     def __repr__(self) -> str:
+         return self.__str__()
+
+     def __eq__(self, other) -> bool:
+         # np.array_equal returns a single bool, unlike elementwise ==
+         return bool(np.array_equal(self.data, other.data))
+
+     def get_pandas_series(self) -> pd.Series:
+         """Returns a pandas Series of the current parameter data."""
+         return pd.Series(data=self.data, name=self.name)
+
+     def use_name(self, name_type: str = "shortname"):
+         """
+         Uses the given name as parameter descriptor.
+
+         Parameters
+         ----------
+         name_type: str :
+             The metadata name to use
+             (Default value = "shortname")
+
+         """
+         try:
+             self.name = self.metadata[name_type]
+         except KeyError:
+             return
+
+     def update_span(self):
+         """
+         Updates the data span.
+
+         Uses the first value if the dtype is not numeric.
+         """
+         if np.issubdtype(self.data.dtype, np.number):
+             self.span = (self.data.min(), self.data.max())
+         else:
+             self.span = (self.data[0], self.data[0])
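A usage sketch for Parameter in isolation (the metadata values are illustrative):

    import numpy as np

    temperature = Parameter(
        data=np.array([7.1, 7.3, 7.2]),
        metadata={"shortname": "t090C", "longinfo": "Temperature [ITS-90, deg C]"},
    )
    temperature.span                 # (7.1, 7.3)
    temperature.get_pandas_series()  # pd.Series named "t090C"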