seabirdfilehandler 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of seabirdfilehandler might be problematic.
- seabirdfilehandler/__init__.py +4 -2
- seabirdfilehandler/bottlefile.py +181 -0
- seabirdfilehandler/bottlelogfile.py +151 -0
- seabirdfilehandler/cnvfile.py +284 -0
- seabirdfilehandler/datafiles.py +265 -0
- seabirdfilehandler/file_collection.py +19 -18
- seabirdfilehandler/parameter.py +29 -3
- {seabirdfilehandler-0.4.3.dist-info → seabirdfilehandler-0.5.1.dist-info}/METADATA +1 -1
- seabirdfilehandler-0.5.1.dist-info/RECORD +14 -0
- {seabirdfilehandler-0.4.3.dist-info → seabirdfilehandler-0.5.1.dist-info}/WHEEL +1 -1
- seabirdfilehandler/dataframe_meta_accessor.py +0 -184
- seabirdfilehandler/datatablefiles.py +0 -930
- seabirdfilehandler/logging.yaml +0 -23
- seabirdfilehandler/seabirdfiles.py +0 -210
- seabirdfilehandler-0.4.3.dist-info/RECORD +0 -14
- {seabirdfilehandler-0.4.3.dist-info → seabirdfilehandler-0.5.1.dist-info}/LICENSE +0 -0
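Note: judging by the file list, 0.5.1 removes the monolithic `datatablefiles.py` (930 lines, shown in full below) along with `dataframe_meta_accessor.py`, `logging.yaml` and `seabirdfiles.py`, and introduces per-type modules: `bottlefile.py`, `bottlelogfile.py`, `cnvfile.py` and `datafiles.py`. The import sketch below is an inference from those file names, not confirmed against the 0.5.1 sources:

```python
# 0.4.3 layout (removed in this release):
from seabirdfilehandler.datatablefiles import CnvFile, BottleFile, BottleLogFile

# Presumed 0.5.1 layout, inferred from the new module names; the updated
# __init__.py (+4 -2) suggests the classes are re-exported at package level:
from seabirdfilehandler import CnvFile, BottleFile, BottleLogFile  # assumption
```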
--- seabirdfilehandler/datatablefiles.py
+++ /dev/null
@@ -1,930 +0,0 @@
-from pathlib import Path
-from typing import Union, Tuple
-from datetime import datetime, time, timedelta
-import re
-import pandas as pd
-import numpy as np
-import logging
-import warnings
-from seabirdfilehandler.parameter import Parameters
-from seabirdfilehandler.validation_modules import CnvValidationList
-from seabirdfilehandler.seabirdfiles import SeaBirdFile
-from seabirdfilehandler.dataframe_meta_accessor import (
-    SeriesMetaAccessor,  # noqa: F401
-    DataFrameMetaAccessor,  # noqa: F401
-)
-
-logger = logging.getLogger(__name__)
-
-
-class DataTableFile(SeaBirdFile):
-    """Collection of methods for the SeaBird files that feature some kind of
-    data table that is represented in a pandas dataframe.
-
-    Parameters
-    ----------
-
-    Returns
-    -------
-
-    """
-
-    def __init__(self, path_to_file):
-        super().__init__(path_to_file)
-        self.original_df: pd.DataFrame
-        self.df: pd.DataFrame
-
-    def define_output_path(
-        self,
-        file_path: Path | str | None = None,
-        file_name: str | None = None,
-        file_type: str = ".csv",
-    ) -> Path:
-        """Creates a Path object holding the desired output path.
-
-        Parameters
-        ----------
-        file_path : Path :
-            directory the file sits in (Default value = self.file_dir)
-        file_name : str :
-            the original file name (Default value = self.file_name)
-        file_type : str :
-            the output file type (Default = '.csv')
-        Returns
-        -------
-        a Path object consisting of the full path of the new file
-
-        """
-        file_path = self.file_dir if file_path is None else file_path
-        file_name = self.file_name if file_name is None else file_name
-        if file_type[0] != ".":
-            file_type = "." + file_type
-        return Path(file_path).joinpath(file_name).with_suffix(file_type)
-
-    def to_csv(
-        self,
-        selected_columns: list | None = None,
-        with_header: bool = True,
-        output_file_path: Path | str | None = None,
-        output_file_name: str | None = None,
-    ):
-        """Writes a csv from the current dataframe. Takes a list of columns to
-        use, a boolean for writing the header and the output file parameters.
-
-        Parameters
-        ----------
-        selected_columns : list :
-            a list of columns to include in the csv
-            (Default value = self.df.columns)
-        with_header : boolean :
-            indicating whether the header shall appear in the output
-            (Default value = True)
-        output_file_path : Path :
-            file directory (Default value = None)
-        output_file_name : str :
-            original file name (Default value = None)
-
-        Returns
-        -------
-
-        """
-        selected_columns = (
-            self.df.columns if selected_columns is None else selected_columns
-        )
-        df = self.df[selected_columns].reset_index(drop=True)
-        new_file_path = self.define_output_path(
-            output_file_path, output_file_name
-        )
-        if with_header:
-            with open(new_file_path, "w") as file:
-                for line in self.header:
-                    file.write(line)
-            df.to_csv(new_file_path, index=False, mode="a")
-        else:
-            df.to_csv(new_file_path, index=False, mode="w")
-        logger.info(f"Wrote file {self.path_to_file} to {new_file_path}.")
-
-    def selecting_columns(
-        self,
-        list_of_columns: list | str,
-        df: pd.DataFrame | None = None,
-    ):
-        """Alters the dataframe to only hold the given columns.
-
-        Parameters
-        ----------
-        list_of_columns: list or str : a collection of columns
-        df : pandas.Dataframe :
-            Dataframe (Default value = None)
-
-        Returns
-        -------
-
-        """
-        df = self.df if df is None else df
-        # ensure that the input is a list, so that isin() can do its job
-        if isinstance(list_of_columns, str):
-            list_of_columns = [list_of_columns]
-        if isinstance(df, pd.DataFrame):
-            self.df = df[list_of_columns].reset_index(drop=True)
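Note: `DataTableFile` is the shared base for every table-bearing file type below. One detail of `define_output_path` worth knowing is that `with_suffix` replaces an existing extension rather than appending one; this is plain `pathlib` behaviour, shown standalone:

```python
from pathlib import Path

# with_suffix swaps the extension, so "cast01.cnv" becomes "cast01.csv"
print(Path("/data").joinpath("cast01.cnv").with_suffix(".csv"))  # /data/cast01.csv (POSIX)
```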
-
-
-class BottleFile(DataTableFile):
-    """Class that represents a SeaBird Bottle File. Organizes the files table
-    information into a pandas dataframe. This allows the usage of this
-    powerful library for statistics, visualization, data manipulation, export,
-    etc.
-
-    Parameters
-    ----------
-
-    Returns
-    -------
-
-    """
-
-    def __init__(self, path_to_file):
-        super().__init__(path_to_file)
-        self.original_df = self.create_dataframe()
-        self.df = self.original_df
-        self.setting_dataframe_dtypes()
-        self.adding_timestamp_column()
-
-    def create_dataframe(self):
-        """Creates a dataframe out of the btl file. Manages the double data
-        header correctly.
-
-        Parameters
-        ----------
-
-        Returns
-        -------
-
-        """
-        # TODO: this needs to be broken down into smaller pieces...
-        top_names, bottom_names = self.reading_data_header()
-        # creating statistics column to store the row type information:
-        # 4 rows per bottle, average, standard deviation, max value, min value
-        top_names.append("Statistic")
-        # TODO: sexier way to construct dataframe than opening the file a
-        # second time
-        # # df = pd.DataFrame(self.data, index=None, columns=top_names)
-        df: pd.DataFrame = pd.read_fwf(
-            self.path_to_file,
-            index_col=False,
-            skiprows=len(self.header) + 2,
-            header=None,
-            names=top_names,
-        )
-
-        # handling the double row header
-        rowtypes = df[df.columns[-1]].unique()
-
-        # TODO: can this be made a little pretier?
-        def separate_double_header_row(df, column, length):
-            """
-
-            Parameters
-            ----------
-            df :
-            column :
-            length :
-
-            Returns
-            -------
-
-            """
-            column_idx = df.columns.get_loc(column)
-            old_column = df.iloc[::length, column_idx].reset_index(drop=True)
-            new_column = df.iloc[1::length, column_idx].reset_index(drop=True)
-            old_column_expanded = pd.Series(
-                np.repeat(old_column, length)
-            ).reset_index(drop=True)
-            new_column_expanded = pd.Series(
-                np.repeat(new_column, length)
-            ).reset_index(drop=True)
-            df[column] = old_column_expanded
-            df.insert(
-                column_idx + 1, bottom_names[column_idx], new_column_expanded
-            )
-            return df
-
-        df = separate_double_header_row(df, "Date", len(rowtypes))
-        df = separate_double_header_row(df, top_names[0], len(rowtypes))
-        # remove brackets around statistics values
-        df["Statistic"] = df["Statistic"].str.strip("()")
-        df = df.rename(mapper={"Btl_ID": "Bottle_ID"}, axis=1)
-        return df
-
-    def adding_timestamp_column(self):
-        """Creates a timestamp column that holds both, Date and Time
-        information.
-
-        Parameters
-        ----------
-
-        Returns
-        -------
-
-        """
-        # constructing timestamp column
-        timestamp = []
-        for datepoint, timepoint in zip(self.df.Date, self.df.Time):
-            timestamp.append(
-                datetime.combine(datepoint, time.fromisoformat(str(timepoint)))
-            )
-        self.df.insert(2, "Timestamp", timestamp)
-        self.df.Timestamp = pd.to_datetime(self.df.Timestamp)
-
-    def setting_dataframe_dtypes(self):
-        """Sets the types for the column values in the dataframe."""
-        # setting dtypes
-        # TODO: extending this to the other columns!
-        self.df.Date = pd.to_datetime(self.df.Date)
-        self.df.Bottle_ID = self.df.Bottle_ID.astype(int)
-
-    def selecting_rows(
-        self, df=None, statistic_of_interest: Union[list, str] = ["avg"]
-    ):
-        """Creates a dataframe with the given row identifier, using the
-        statistics column. A single string or a list of strings can be
-        processed.
-
-        Parameters
-        ----------
-        df : pandas.Dataframe :
-            the files Pandas representation (Default value = self.df)
-        statistic_of_interest: list or str :
-            collection of values of the 'statistics' column in self.df
-            (Default value = ['avg'])
-
-        Returns
-        -------
-
-        """
-        df = self.df if df is None else df
-        # ensure that the input is a list, so that isin() can do its job
-        if isinstance(statistic_of_interest, str):
-            statistic_of_interest = [statistic_of_interest]
-        self.df = df.loc[df["Statistic"].isin(statistic_of_interest)]
-
-    def reading_data_header(self):
-        """Identifies and separatly collects the rows that specify the data
-        tables headers.
-
-        Parameters
-        ----------
-
-        Returns
-        -------
-
-        """
-        n = 11  # fix column width of a seabird btl file
-        top_line = self.data[0]
-        second_line = self.data[1]
-        top_names = [
-            top_line[i : i + n].split()[0]
-            for i in range(0, len(top_line) - n, n)
-        ]
-        bottom_names = [
-            second_line[i : i + n].split()[0] for i in range(0, 2 * n, n)
-        ]
-        return top_names, bottom_names
-
-    def add_station_and_event_column(self):
-        event_list = [self.metadata["Station"] for _ in self.data]
-        self.df.insert(0, "Event", pd.Series(event_list))
-
-    def add_position_columns(self):
-        latitude_list = [self.metadata["GPS_Lat"] for _ in self.data]
-        self.df.insert(1, "Latitude", pd.Series(latitude_list))
-        longitude_list = [self.metadata["GPS_Lon"] for _ in self.data]
-        self.df.insert(2, "Longitude", pd.Series(longitude_list))
-
-
-class CnvFile(DataTableFile):
-    """
-    A representation of a cnv-file as used by SeaBird.
-
-    This class intends to fully extract and organize the different types of
-    data and metadata present inside of such a file. Downstream libraries shall
-    be able to use this representation for all applications concerning cnv
-    files, like data processing, transformation or visualization.
-
-    To achieve that, the metadata header is organized by the grandparent-class,
-    SeaBirdFile, while the data table is extracted by this class. The data
-    representation of choice is a pandas Dataframe. Inside this class, there
-    are methods to parse cnv data into dataframes, do the reverse of writing a
-    dataframe into cnv compliant form and to manipulate the dataframe in
-    various ways.
-
-    Parameters
-    ----------
-    path_to_file: Path | str:
-        the path to the file
-    full_data_header: bool:
-        whether to use the full data column descriptions for the dataframe
-    long_header_names: bool:
-        whether to use long header names in the dateframe
-    absolute_time_calculation: bool:
-        whether to use a real timestamp instead of the second count
-    event_log_column: bool:
-        whether to add a station and device event column from DSHIP
-    coordinate_columns: bool:
-        whether to add longitude and latitude from the extra metadata header
-
-    """
-
-    def __init__(
-        self,
-        path_to_file: Path | str,
-        create_dataframe: bool = True,
-        absolute_time_calculation: bool = False,
-        event_log_column: bool = False,
-        coordinate_columns: bool = False,
-        data_table_info_level: str = "shortname",
-    ):
-        super().__init__(path_to_file)
-        self.validation_modules = self.obtaining_validation_modules()
-        self.start_time = self.reading_start_time()
-        if create_dataframe:
-            warnings.warn(
-                "The default of constructing a pandas Dataframe will soon be replaced by using the Parameters class that works on numpy arrays.",
-                DeprecationWarning,
-                stacklevel=2,  # Ensures the warning points to the caller's line
-            )
-            self.data_header_meta_info, self.duplicate_columns = (
-                self.reading_data_header(self.data_table_description)
-            )
-            self.original_df = self.create_dataframe(data_table_info_level)
-            self.df = self.original_df
-            if absolute_time_calculation:
-                self.absolute_time_calculation()
-            if event_log_column:
-                self.add_station_and_event_column()
-            if coordinate_columns:
-                self.add_position_columns()
-        else:
-            self.parameters = Parameters(
-                self.data, self.data_table_description
-            )
-
-    def reading_data_header(
-        self, header_info: list = []
-    ) -> Tuple[dict[str, dict], list[int]]:
-        """Reads the tables header data from the header.
-
-        Parameters
-        ----------
-        header_info: list:
-            the header values from the file
-
-        Returns
-        -------
-        a list of dictionaries, that organize the table header information
-
-        """
-        if header_info == []:
-            header_info = self.data_table_description
-        table_header = {}
-        duplicate_columns = []
-        for line in header_info:
-            if line.startswith("name"):
-                header_meta_info = {}
-                # get basic shortname and the full, non-differentiated info
-                shortname = longinfo = line_info = line.split("=")[1].strip()
-                try:
-                    shortname, longinfo = line_info.split(":")
-                except IndexError:
-                    pass
-                finally:
-                    shortname = shortname.strip()
-                if shortname in list(table_header.keys()):
-                    try:
-                        duplicate_columns.append(
-                            int(line.split("=")[0].strip().split()[1])
-                        )
-                    except IndexError as error:
-                        logger.error(
-                            f"Could not resolve duplicate column: {
-                                shortname
-                            }, {error}"
-                        )
-                else:
-                    header_meta_info["shortname"] = shortname
-                    header_meta_info["longinfo"] = longinfo.strip()
-                    metainfo = self._extract_data_header_meta_info(
-                        longinfo.strip()
-                    )
-                    header_meta_info = {**header_meta_info, **metainfo}
-                    table_header[shortname.strip()] = header_meta_info
-        return table_header, duplicate_columns
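Note: one apparent bug in `reading_data_header`: unpacking `line_info.split(":")` raises `ValueError`, not `IndexError`, when a name line carries no colon (or more than one), so the `except IndexError: pass` fallback can never fire. A hypothetical corrected sketch, not the shipped code:

```python
# ValueError is the exception actually raised by tuple unpacking, and
# maxsplit=1 tolerates extra colons inside the long description.
shortname = longinfo = line_info = line.split("=")[1].strip()
try:
    shortname, longinfo = line_info.split(":", 1)
except ValueError:
    pass  # no colon: shortname and longinfo both stay the full string
shortname = shortname.strip()
```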
-
-    def _extract_data_header_meta_info(self, line: str) -> dict:
-        """Extracts the individual information bits inside of the header lines
-
-        Parameters
-        ----------
-        line: str:
-            one header line, trimmed by the 'name =' prefix and the shortname
-
-        Returns
-        -------
-        a dictionary with the information stored
-
-        """
-        regex_string = r"(?:(?P<name0>.+),\s(?P<metainfo0>.+)\s\[(?P<unit0>.+)\]|(?P<name2>.+)\s\[(?P<unit2>.+)\]|(?P<name3>.+),\s(?P<metainfo2>.[^\s]+)|(?P<name4>.+))"
-        regex_check = re.search(regex_string, line, flags=re.IGNORECASE)
-        if regex_check:
-            regex_info = dict(regex_check.groupdict())
-            regex_info = {
-                key[:-1]: value
-                for key, value in regex_info.items()
-                if value is not None
-            }
-            if len(regex_info) > 2:
-                # check for second sensors and adjust their names
-                if regex_info["metainfo"][-1] == "2":
-                    regex_info["name"] = regex_info["name"] + " 2"
-                    regex_info["metainfo"] = regex_info["metainfo"][:-1]
-                    if len(regex_info["metainfo"]) == 0:
-                        regex_info.pop("metainfo")
-            if regex_info["name"] == "flag":
-                regex_info["metainfo"] = regex_info["name"]
-                regex_info["unit"] = regex_info["name"]
-            return regex_info
-        return {}
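Note: the alternation tries, in order, "name, metainfo [unit]", "name [unit]", "name, metainfo", and a bare name; the numeric suffixes on the group names are stripped by the `key[:-1]` comprehension (and the method afterwards special-cases `flag`). A quick standalone check of how typical cnv column descriptions fall through the branches:

```python
import re

REGEX = (
    r"(?:(?P<name0>.+),\s(?P<metainfo0>.+)\s\[(?P<unit0>.+)\]"
    r"|(?P<name2>.+)\s\[(?P<unit2>.+)\]"
    r"|(?P<name3>.+),\s(?P<metainfo2>.[^\s]+)"
    r"|(?P<name4>.+))"
)

for info in ("Salinity, Practical [PSU]", "Temperature [ITS-90, deg C]", "flag"):
    groups = re.search(REGEX, info, flags=re.IGNORECASE).groupdict()
    print({k[:-1]: v for k, v in groups.items() if v is not None})
# {'name': 'Salinity', 'metainfo': 'Practical', 'unit': 'PSU'}
# {'name': 'Temperature', 'unit': 'ITS-90, deg C'}
# {'name': 'flag'}
```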
-
-    def create_dataframe(
-        self,
-        header_info_detail_level: str = "shortname",
-    ) -> pd.DataFrame:
-        """Creates a pandas dataframe by splitting each dataline every 11
-        characters, as SeaBird defines its tables this way.
-
-        Parameters
-        ----------
-        uns_full_header_names: bool:
-            whether to use all header information as dataframe header
-        uns_long_header_names: bool:
-            whether to use header longnames as dataframe header
-
-        Returns
-        -------
-        a pandas.Dataframe that represents the data values inside the cnv file
-
-        """
-        n = 11
-        row_list = []
-        for line in self.data:
-            row_list.append(
-                [
-                    line[i : i + n].split()[0]
-                    for i in range(0, len(line) - n, n)
-                ]
-            )
-        df = pd.DataFrame(row_list, dtype=float)
-        header_names = [
-            metainfo[header_info_detail_level]
-            for metainfo in list(self.data_header_meta_info.values())
-        ]
-        # remove duplicate columns
-        df.drop(labels=self.duplicate_columns, axis=1, inplace=True)
-        self.duplicate_columns = []
-        try:
-            df.columns = header_names
-        except ValueError as error:
-            logger.error(
-                f"Could not set dataframe header for {self.file_name}: {error}"
-            )
-            logger.error(header_names)
-        else:
-            df.meta.metadata = self.data_header_meta_info
-            # df.meta.propagate_metadata_to_series()
-        return df
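Note: the parser slices each data line into fixed 11-character fields, the column width Sea-Bird uses. Because `range` stops at `len(line) - n`, a trailing newline is what keeps the last field inside the loop; a minimal illustration:

```python
n = 11  # Sea-Bird's fixed column width
line = "     3.0000    25.1234     4.5678  0.000e+00\n"
print([line[i:i + n].split()[0] for i in range(0, len(line) - n, n)])
# ['3.0000', '25.1234', '4.5678', '0.000e+00']
```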
-
-    def rename_dataframe_header(
-        self,
-        df: pd.DataFrame | None = None,
-        header_detail_level: str = "shortname",
-    ) -> list:
-        df = self.df if df is None else df
-        df.meta.rename(header_detail_level)
-        return [column for column in df.columns]
-
-    def reading_start_time(
-        self,
-        time_source: str = "System UTC",
-    ) -> datetime | None:
-        """
-        Extracts the Cast start time from the metadata header.
-        """
-        for line in self.sbe9_data:
-            if line.startswith(time_source):
-                start_time = line.split("=")[1]
-                start_time = datetime.strptime(
-                    start_time, " %b %d %Y %H:%M:%S "
-                )
-                return start_time
-        return None
-
-    def absolute_time_calculation(self) -> bool:
-        """
-        Replaces the basic cnv time representation of counting relative to the
-        casts start point, by real UTC timestamps.
-        This operation will act directly on the dataframe.
-
-        """
-        time_parameter = None
-        for parameter in self.df.columns:
-            if parameter.lower().startswith("time"):
-                time_parameter = parameter
-        if time_parameter and self.start_time:
-            self.df.meta.add_column(
-                name="datetime",
-                data=[
-                    timedelta(days=float(time)) + self.start_time
-                    if time_parameter == "timeJ"
-                    else timedelta(seconds=float(time)) + self.start_time
-                    for time in self.df[time_parameter]
-                ],
-            )
-            return True
-        return False
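Note: the time column is detected by prefix, and `timeJ` values are converted with a day-based delta while the other time counts are treated as seconds since the cast start. A reduced sketch of the arithmetic, with an invented start time:

```python
from datetime import datetime, timedelta

start_time = datetime(2024, 1, 2, 10, 15, 0)  # hypothetical cast start
elapsed_seconds = [0.0, 0.25, 0.5]            # a timeS-style column: seconds since start
print([start_time + timedelta(seconds=s) for s in elapsed_seconds])
```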
-
-    def add_start_time(self) -> bool:
-        """
-        Adds the Cast start time to the dataframe.
-        Necessary for joins on the time.
-        """
-        if self.start_time:
-            self.df.meta.add_column(
-                name="start_time",
-                data=pd.Series([self.start_time for _ in self.data]),
-            )
-            return True
-        return False
-
-    def obtaining_validation_modules(self) -> CnvValidationList:
-        """
-        Collects the individual validation modules and their respective
-        information, usually present in key-value pairs.
-        """
-        validation_modules = self.processing_info
-        return CnvValidationList(validation_modules)
-
-    def df2cnv(
-        self,
-        header_names: list | None = None,
-        header_detail_level: str | None = None,
-    ) -> list:
-        """
-        Parses a pandas dataframe into a list that represents the lines inside
-        of a cnv data table.
-
-        Parameters
-        ----------
-        header_names: list:
-            a list of dataframe columns that will be parsed
-
-        Returns
-        -------
-        a list of lines in the cnv data table format
-
-        """
-        if not header_detail_level:
-            header_detail_level = self.df.meta.header_detail
-        if not header_names:
-            header_names = [
-                header[header_detail_level]
-                for header in list(self.data_header_meta_info.values())
-            ]
-        df = self.df.drop(
-            labels=[
-                column
-                for column in list(self.df.meta.metadata.keys())
-                if column not in header_names
-            ],
-            axis=1,
-            errors="ignore",
-        )
-        cnv_out = []
-        for _, row in df.iterrows():
-            cnv_like_row = "".join(
-                (lambda column: f"{str(column):>11}")(value) for value in row
-            )
-            cnv_out.append(cnv_like_row + "\n")
-        return cnv_out
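Note: the write path right-justifies every value into the same 11-character cells the parser expects; the immediately-invoked lambda is equivalent to formatting each value directly:

```python
row = [3.0, 25.1234, 0.0]
print(repr("".join(f"{str(value):>11}" for value in row)))
# '        3.0    25.1234        0.0'
```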
-
-    def array2cnv(self) -> list:
-        result = []
-        for row in self.parameters.full_data_array:
-            formatted_row = "".join(f"{elem:11}" for elem in row)
-            result.append(formatted_row + "\n")
-        return result
-
-    def to_cnv(
-        self,
-        file_name: Path | str | None = None,
-        use_dataframe: bool = True,
-        header_list: list | None = None,
-    ):
-        """
-        Writes the values inside of this instance as a new cnv file to disc.
-
-        Parameters
-        ----------
-        file_name: Path:
-            the new file name to use for writing
-        use_current_df: bool:
-            whether to use the current dataframe as data table
-        use_current_validation_header: bool:
-            whether to use the current processing module list
-        header_list: list:
-            the data columns to use for the export
-
-        """
-        file_name = self.path_to_file if file_name is None else file_name
-        # content construction
-        if use_dataframe:
-            data = self.df2cnv(header_list)
-        else:
-            data = self.array2cnv()
-        self._update_header()
-        self.file_data = [*self.header, *data]
-        # writing content out
-        try:
-            with open(file_name, "w", encoding="latin-1") as file:
-                for line in self.file_data:
-                    file.write(line)
-
-        except IOError as error:
-            logger.error(f"Could not write cnv file: {error}")
-
-    def _update_header(self):
-        """Re-creates the cnv header."""
-        self.data_table_description = self._form_data_table_info()
-        self.header = [
-            *[f"* {data}" for data in self.sbe9_data[:-1]],
-            *[f"** {data}" for data in self.metadata_list],
-            f"* {self.sbe9_data[-1]}",
-            *[f"# {data}" for data in self.data_table_description],
-            *[f"# {data}" for data in self.sensor_data],
-            *[f"# {data}" for data in self.processing_info],
-            "*END*\n",
-        ]
-
-    def _form_data_table_info(self) -> list:
-        """Recreates the data table descriptions, like column names and spans
-        from the structured dictionaries these values were stored in."""
-        new_table_info = []
-        for key, value in self.data_table_stats.items():
-            new_table_info.append(f"{key} = {value}\n")
-        for index, (name, _) in enumerate(self.data_table_names_and_spans):
-            new_table_info.append(f"name {index} = {name}\n")
-        for index, (_, span) in enumerate(self.data_table_names_and_spans):
-            new_table_info.append(f"span {index} = {span}\n")
-        for key, value in self.data_table_misc.items():
-            new_table_info.append(f"{key} = {value}\n")
-        return new_table_info
-
-    def add_processing_metadata(self, addition: str | list):
-        """
-        Adds new processing lines to the list of processing module information
-
-        Parameters
-        ----------
-        addition: str:
-            the new information line
-
-        """
-        # TODO: use CnvprocessingList here
-        if isinstance(addition, str):
-            addition = [addition]
-        for line in addition:
-            self.file_data.append(line)
-            # add the new info line *before* the 'file_type = ascii' line
-            self.processing_info.insert(-1, line)
-
-    def add_station_and_event_column(self) -> bool:
-        """
-        Adds a column with the DSHIP station and device event numbers to the
-        dataframe. These must be present inside the extra metadata header.
-
-        """
-        try:
-            event_list = [self.metadata["Station"] for _ in self.data]
-        except KeyError:
-            return False
-        else:
-            self.df.meta.add_column(
-                name="Event", data=pd.Series(event_list), location=0
-            )
-            return True
-
-    def add_position_columns(self) -> bool:
-        """
-        Adds a column with the longitude and latitude to the dataframe.
-        These must be present inside the extra metadata header.
-
-        """
-        if ("latitude" or "longitude") in [
-            column["shortname"]
-            for column in list(self.df.meta.metadata.values())
-        ]:
-            return True
-        try:
-            latitude_list = [self.metadata["GPS_Lat"] for _ in self.data]
-            longitude_list = [self.metadata["GPS_Lon"] for _ in self.data]
-        except KeyError:
-            return False
-        else:
-            self.df.meta.add_column(
-                name="Latitude", data=pd.Series(latitude_list), location=1
-            )
-            self.df.meta.add_column(
-                name="Longitude", data=pd.Series(longitude_list), location=2
-            )
-            return True
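Note: the guard `("latitude" or "longitude") in [...]` evaluates the parenthesised expression first, and `"latitude" or "longitude"` is simply `"latitude"`, so a longitude-only column would never short-circuit this method. A hypothetical fix, not the shipped code:

```python
# Check both shortnames instead of only "latitude":
existing = {column["shortname"] for column in self.df.meta.metadata.values()}
if {"latitude", "longitude"} & existing:
    return True
```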
-
-    def add_cast_number(self, number: int | None = None) -> bool:
-        """
-        Adds a column with the cast number to the dataframe.
-
-        Parameters
-        ----------
-        number: int:
-            the cast number of this files cast
-
-        """
-        if ("Cast" in self.metadata.keys()) and (not number):
-            number = int(self.metadata["Cast"])
-        try:
-            self.df.meta.add_column(
-                name="Cast",
-                data=pd.Series([number for _ in self.data]),
-                location=0,
-            )
-        except ValueError:
-            # Cast is already part of the dataframe, so nothing left to do
-            return False
-        else:
-            return True
-
-
-class BottleLogFile(DataTableFile):
-    """Bottle Log file representation, that extracts the three different data
-    types from the file: reset time and the table with bottle IDs and
-    corresponding data ranges.
-
-    Parameters
-    ----------
-
-    Returns
-    -------
-
-    """
-
-    def __init__(self, path_to_file, create_dataframe=False):
-        super().__init__(path_to_file)
-        self.reset_time = self.obtaining_reset_time()
-        self.origin_cnv = self.raw_file_data[0].strip()
-        self.data = self.data_whitespace_removal()
-
-        if create_dataframe:
-            self.original_df = self.create_dataframe()
-            self.df = self.original_df
-        else:
-            self.data_list = self.create_list()
-
-    def data_whitespace_removal(self) -> list:
-        """Strips the input from whitespace characters, in this case especially
-        newline characters.
-
-        Parameters
-        ----------
-
-        Returns
-        -------
-        the original data stripped off the whitespaces
-
-        """
-        temp_data = []
-        for line in self.raw_file_data[2:]:
-            temp_data.append(line.strip())
-        return temp_data
-
-    def obtaining_reset_time(self) -> datetime:
-        """Reading reset time with small input check.
-
-        Parameters
-        ----------
-
-        Returns
-        -------
-        a datetime.datetime object of the device reset time
-
-        """
-
-        regex_check = re.search(
-            r"RESET\s(\w{3}\s\d+\s\d{4}\s\d\d:\d\d:\d\d)",
-            self.raw_file_data[1],
-        )
-        if regex_check:
-            return datetime.strptime(regex_check.group(1), "%b %d %Y %H:%M:%S")
-        else:
-            error_message = """BottleLogFile is not formatted as expected:
-            Reset time could not be extracted."""
-            logger.error(error_message)
-            raise IOError(error_message)
-
-    def create_list(self) -> list:
-        """Creates a list of usable data from the list specified in self.data.
-        the list consists of: an array of ID's representing the bottles, the date and time of the data sample
-        and the lines of the cnv corresponding to the bottles
-
-        Parameters
-        ----------
-
-        Returns
-        -------
-        a list representing the bl files table information
-        """
-        content_array = []
-        for i in range(len(self.data)):
-            bottles = [int(x) for x in self.data[i].split(",")[:2]]
-            date = self.convert_date(self.data[i].split(",")[2])
-            lines = tuple([int(x) for x in self.data[i].split(",")[3:]])
-
-            content_array.append([bottles, date, lines])
-
-        return content_array
-
-    def convert_date(self, date: str):
-        """Converts the Dates of the .bl files to an ISO 8601 standard
-
-        Parameters
-        ----------
-
-        Returns
-        -------
-        a string with the date in the form of "yymmddThhmmss"
-        """
-        date = date.strip()
-        month_list = [
-            "Jan",
-            "Feb",
-            "Mar",
-            "Apr",
-            "May",
-            "Jun",
-            "Jul",
-            "Aug",
-            "Sep",
-            "Oct",
-            "Nov",
-            "Dec",
-        ]
-
-        month_ind = month_list.index(date.split(" ")[0]) + 1
-        if month_ind < 10:
-            month = "0" + str(month_ind)
-        else:
-            month = str(month_ind)
-        day = date.split(" ")[1]
-        year = (date.split(" ")[2])[2:]
-        time = date.split(" ")[3].replace(":", "")
-        return year + month + day + "T" + time
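Note: `convert_date` hand-rolls the month lookup and zero padding; for well-formed input, the same "yymmddThhmmss" string falls out of a strptime/strftime pair, sketched here:

```python
from datetime import datetime

def convert_date(date: str) -> str:
    # Equivalent to the hand-rolled version above for well-formed input
    return datetime.strptime(date.strip(), "%b %d %Y %H:%M:%S").strftime("%y%m%dT%H%M%S")

print(convert_date("Jan 02 2024 10:15:02"))  # 240102T101502
```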
-
-    def create_dataframe(self) -> pd.DataFrame:
-        """Creates a dataframe from the list specified in self.data.
-
-        Parameters
-        ----------
-
-        Returns
-        -------
-        a pandas.Dataframe representing the bl files table information
-        """
-        data_lists = []
-        for line in self.data:
-            inner_list = line.split(",")
-            # dropping first column as its the index
-            data_lists.append(inner_list[1:])
-        df = pd.DataFrame(data_lists)
-        df.columns = ["Bottle ID", "Datetime", "start_range", "end_range"]
-        return df
-
-
-class FieldCalibrationFile(DataTableFile):
-    def __init__(self, path_to_file):
-        super().__init__(path_to_file)
-        self.original_df = self.create_dataframe()
-        self.df = self.original_df
-
-    def create_dataframe(self) -> pd.DataFrame:
-        try:
-            return pd.read_csv(self.path_to_file, skiprows=len(self.header))
-        except IOError as error:
-            logger.error(f"Could not read field calibration file: {error}.")
-            return pd.DataFrame()
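Note: `FieldCalibrationFile` is the thinnest class here: it skips its metadata header and hands the rest to `pandas.read_csv`. Malformed content raises `pandas.errors.ParserError`, a `ValueError` subclass, so the `except IOError` above only covers missing or unreadable files. A sketch of a wider guard, with a hypothetical file name and header offset:

```python
import pandas as pd
from pandas.errors import ParserError  # a ValueError subclass, not an IOError

try:
    df = pd.read_csv("calibration.csv", skiprows=12)  # hypothetical file and offset
except (IOError, ParserError):
    df = pd.DataFrame()
```

In 0.5.1 these classes presumably continue life in the new per-type modules listed at the top of this diff.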