seabirdfilehandler 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of seabirdfilehandler might be problematic. Click here for more details.

@@ -1,5 +1,7 @@
1
- from .seabirdfiles import *
2
- from .datatablefiles import *
1
+ from .datafiles import *
2
+ from .bottlefile import *
3
+ from .bottlelogfile import *
4
+ from .cnvfile import *
3
5
  from .xmlfiles import *
4
6
  from .validation_modules import *
5
7
  from .file_collection import *
@@ -0,0 +1,181 @@
1
+ from typing import Union
2
+ from datetime import datetime, time
3
+ import pandas as pd
4
+ import numpy as np
5
+ import logging
6
+ from seabirdfilehandler import DataFile
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class BottleFile(DataFile):
    """Representation of a Sea-Bird bottle (.btl) file.

    Organizes the file's table information into a pandas DataFrame, which
    allows the usage of that powerful library for statistics, visualization,
    data manipulation, export, etc.

    Parameters
    ----------
    path_to_file :
        path to the .btl file on disk

    Attributes
    ----------
    original_df :
        the untouched DataFrame as parsed from the file
    df :
        the working DataFrame that all subsequent operations act on
    """

    def __init__(self, path_to_file):
        super().__init__(path_to_file)
        self.original_df = self.create_dataframe()
        # Work on a copy: adding_timestamp_column() and the dtype conversions
        # below mutate self.df in place and would otherwise also alter
        # self.original_df through aliasing.
        self.df = self.original_df.copy()
        self.setting_dataframe_dtypes()
        self.adding_timestamp_column()

    def create_dataframe(self):
        """Creates a dataframe out of the btl file.

        Manages the double data header correctly by splitting the doubled
        columns (date/time and bottle id) into separate columns.

        Returns
        -------
        pandas.DataFrame :
            the parsed table, one row per statistic line
        """
        top_names, bottom_names = self.reading_data_header()
        # creating statistics column to store the row type information:
        # 4 rows per bottle, average, standard deviation, max value, min value
        top_names.append("Statistic")
        # TODO: sexier way to construct dataframe than opening the file a
        # second time
        df: pd.DataFrame = pd.read_fwf(
            self.path_to_file,
            index_col=False,
            skiprows=len(self.header) + 2,
            header=None,
            names=top_names,
        )

        # row type labels present in the statistics column (avg, sdev, ...)
        rowtypes = df[df.columns[-1]].unique()

        def separate_double_header_row(df, column, length):
            """Splits a doubled header column into two separate columns.

            The .btl format stacks two header rows: in every block of
            `length` statistic rows, the first row carries the first
            header's value and the second row the second header's value.

            Parameters
            ----------
            df : pandas.DataFrame :
                the table being fixed up
            column : str :
                name of the doubled column
            length : int :
                number of statistic rows per bottle block

            Returns
            -------
            pandas.DataFrame :
                the table with the doubled column separated
            """
            column_idx = df.columns.get_loc(column)
            first_values = df.iloc[::length, column_idx].reset_index(drop=True)
            second_values = df.iloc[1::length, column_idx].reset_index(
                drop=True
            )
            # expand each per-block value back to one value per row
            first_expanded = pd.Series(
                np.repeat(first_values, length)
            ).reset_index(drop=True)
            second_expanded = pd.Series(
                np.repeat(second_values, length)
            ).reset_index(drop=True)
            df[column] = first_expanded
            df.insert(
                column_idx + 1, bottom_names[column_idx], second_expanded
            )
            return df

        df = separate_double_header_row(df, "Date", len(rowtypes))
        df = separate_double_header_row(df, top_names[0], len(rowtypes))
        # remove brackets around statistics values, e.g. "(avg)" -> "avg"
        df["Statistic"] = df["Statistic"].str.strip("()")
        df = df.rename(mapper={"Btl_ID": "Bottle_ID"}, axis=1)
        return df

    def adding_timestamp_column(self):
        """Creates a timestamp column that holds both, Date and Time
        information.

        Acts in place on self.df.
        """
        timestamp = [
            datetime.combine(datepoint, time.fromisoformat(str(timepoint)))
            for datepoint, timepoint in zip(self.df.Date, self.df.Time)
        ]
        self.df.insert(2, "Timestamp", timestamp)
        self.df.Timestamp = pd.to_datetime(self.df.Timestamp)

    def setting_dataframe_dtypes(self):
        """Sets the types for the column values in the dataframe."""
        # TODO: extending this to the other columns!
        self.df.Date = pd.to_datetime(self.df.Date)
        self.df.Bottle_ID = self.df.Bottle_ID.astype(int)

    def selecting_rows(
        self,
        df=None,
        statistic_of_interest: Union[list, str, None] = None,
    ):
        """Creates a dataframe with the given row identifier, using the
        statistics column. A single string or a list of strings can be
        processed.

        Parameters
        ----------
        df : pandas.Dataframe :
            the files Pandas representation (Default value = self.df)
        statistic_of_interest: list or str :
            collection of values of the 'statistics' column in self.df
            (Default value = ['avg'])
        """
        # None sentinel instead of a mutable default argument
        if statistic_of_interest is None:
            statistic_of_interest = ["avg"]
        df = self.df if df is None else df
        # ensure that the input is a list, so that isin() can do its job
        if isinstance(statistic_of_interest, str):
            statistic_of_interest = [statistic_of_interest]
        self.df = df.loc[df["Statistic"].isin(statistic_of_interest)]

    def reading_data_header(self):
        """Identifies and separatly collects the rows that specify the data
        tables headers.

        Returns
        -------
        tuple :
            the column names of the first and the second header line
        """
        n = 11  # fix column width of a seabird btl file
        top_line = self.data[0]
        second_line = self.data[1]
        top_names = [
            top_line[i : i + n].split()[0]
            for i in range(0, len(top_line) - n, n)
        ]
        bottom_names = [
            second_line[i : i + n].split()[0] for i in range(0, 2 * n, n)
        ]
        return top_names, bottom_names

    def add_station_and_event_column(self):
        """Inserts an 'Event' column holding the station metadata value."""
        # a scalar value broadcasts to every row on insert; a Series built
        # from self.data could have a different length than the dataframe
        self.df.insert(0, "Event", self.metadata["Station"])

    def add_position_columns(self):
        """Inserts 'Latitude'/'Longitude' columns from the extra metadata."""
        self.df.insert(1, "Latitude", self.metadata["GPS_Lat"])
        self.df.insert(2, "Longitude", self.metadata["GPS_Lon"])
@@ -0,0 +1,151 @@
1
+ from datetime import datetime
2
+ import re
3
+ import logging
4
+ import pandas as pd
5
+ from seabirdfilehandler import DataFile
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
class BottleLogFile(DataFile):
    """Bottle Log file representation, that extracts the three different data
    types from the file: reset time, the originating cnv file name, and the
    table with bottle IDs and corresponding data ranges.

    Parameters
    ----------
    path_to_file :
        path to the .bl file on disk
    create_dataframe : bool :
        whether to parse the table into a pandas DataFrame (self.df)
        instead of a plain list (self.data_list)
    """

    def __init__(self, path_to_file, create_dataframe=False):
        super().__init__(path_to_file)
        self.reset_time = self.obtaining_reset_time()
        # the first file line names the cnv file this log belongs to
        self.origin_cnv = self.raw_file_data[0].strip()
        self.data = self.data_whitespace_removal()

        if create_dataframe:
            self.original_df = self.create_dataframe()
            self.df = self.original_df
        else:
            self.data_list = self.create_list()

    def data_whitespace_removal(self) -> list:
        """Strips the input from whitespace characters, in this case especially
        newline characters.

        Returns
        -------
        list :
            the original data stripped off the whitespaces
        """
        return [line.strip() for line in self.raw_file_data[2:]]

    def obtaining_reset_time(self) -> datetime:
        """Reading reset time with small input check.

        Returns
        -------
        datetime.datetime :
            a datetime.datetime object of the device reset time

        Raises
        ------
        IOError :
            if the second file line carries no RESET timestamp
        """
        regex_check = re.search(
            r"RESET\s(\w{3}\s\d+\s\d{4}\s\d\d:\d\d:\d\d)",
            self.raw_file_data[1],
        )
        if regex_check:
            return datetime.strptime(regex_check.group(1), "%b %d %Y %H:%M:%S")
        error_message = """BottleLogFile is not formatted as expected:
            Reset time could not be extracted."""
        logger.error(error_message)
        raise IOError(error_message)

    def create_list(self) -> list:
        """Creates a list of usable data from the list specified in self.data.

        Each entry consists of: an array of ID's representing the bottles,
        the date and time of the data sample, and the lines of the cnv
        corresponding to the bottles.

        Returns
        -------
        list :
            a list representing the bl files table information
        """
        content_array = []
        for line in self.data:
            # split each line once instead of once per extracted field
            fields = line.split(",")
            bottles = [int(x) for x in fields[:2]]
            date = self.convert_date(fields[2])
            lines = tuple(int(x) for x in fields[3:])
            content_array.append([bottles, date, lines])
        return content_array

    def convert_date(self, date: str) -> str:
        """Converts the Dates of the .bl files to an ISO 8601 standard.

        Parameters
        ----------
        date : str :
            a date like 'Jan 05 2023 12:34:56'

        Returns
        -------
        str :
            a string with the date in the form of "yymmddThhmmss"
        """
        # strptime replaces the previous hand-rolled month lookup; it also
        # tolerates irregular whitespace (after normalization) and zero-pads
        # unpadded day numbers, which the old implementation passed through
        # unpadded, breaking the fixed-width 'yymmdd' part of the output
        normalized = " ".join(date.split())
        parsed = datetime.strptime(normalized, "%b %d %Y %H:%M:%S")
        return parsed.strftime("%y%m%dT%H%M%S")

    def create_dataframe(self) -> pd.DataFrame:
        """Creates a dataframe from the list specified in self.data.

        Returns
        -------
        pandas.DataFrame :
            a pandas.Dataframe representing the bl files table information
        """
        # dropping first column as its the index
        data_lists = [line.split(",")[1:] for line in self.data]
        df = pd.DataFrame(data_lists)
        df.columns = ["Bottle ID", "Datetime", "start_range", "end_range"]
        return df
@@ -0,0 +1,284 @@
1
+ from pathlib import Path
2
+ from datetime import datetime, timedelta
3
+ import pandas as pd
4
+ import numpy as np
5
+ import logging
6
+ from seabirdfilehandler import DataFile
7
+ from seabirdfilehandler.parameter import Parameters
8
+ from seabirdfilehandler.validation_modules import CnvValidationList
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class CnvFile(DataFile):
    """
    A representation of a cnv-file as used by SeaBird.

    This class intends to fully extract and organize the different types of
    data and metadata present inside of such a file. Downstream libraries shall
    be able to use this representation for all applications concerning cnv
    files, like data processing, transformation or visualization.

    To achieve that, the metadata header is organized by the grandparent-class,
    SeaBirdFile, while the data table is extracted by this class. The data
    representation of choice is a pandas Dataframe. Inside this class, there
    are methods to parse cnv data into dataframes, do the reverse of writing a
    dataframe into cnv compliant form and to manipulate the dataframe in
    various ways.

    Parameters
    ----------
    path_to_file: Path | str:
        the path to the file
    only_header: bool:
        whether to only parse the metadata header
    create_dataframe: bool:
        whether to build a pandas dataframe from the data table
    absolute_time_calculation: bool:
        whether to use a real timestamp instead of the second count
    event_log_column: bool:
        whether to add a station and device event column from DSHIP
    coordinate_columns: bool:
        whether to add longitude and latitude from the extra metadata header

    """

    def __init__(
        self,
        path_to_file: Path | str,
        only_header: bool = False,
        create_dataframe: bool = False,
        absolute_time_calculation: bool = False,
        event_log_column: bool = False,
        coordinate_columns: bool = False,
    ):
        super().__init__(path_to_file, only_header)
        self.validation_modules = self.obtaining_validation_modules()
        self.start_time = self.reading_start_time()
        self.parameters = Parameters(self.data, self.data_table_description)
        if create_dataframe:
            self.df = self.parameters.get_pandas_dataframe()
            # the remaining options all operate on the dataframe
            if absolute_time_calculation:
                self.absolute_time_calculation()
            if event_log_column:
                self.add_station_and_event_column()
            if coordinate_columns:
                self.add_position_columns()

    def reading_start_time(
        self,
        time_source: str = "System UTC",
    ) -> datetime | None:
        """
        Extracts the Cast start time from the metadata header.

        Parameters
        ----------
        time_source: str:
            the header line prefix to read the time from

        Returns
        -------
        the cast start time, or None if no matching line was found
        """
        for line in self.sbe9_data:
            if line.startswith(time_source):
                raw_time = line.split("=")[1]
                return datetime.strptime(raw_time, " %b %d %Y %H:%M:%S ")
        return None

    def absolute_time_calculation(self) -> bool:
        """
        Replaces the basic cnv time representation of counting relative to the
        casts start point, by real UTC timestamps.
        This operation will act directly on the dataframe.

        Returns
        -------
        whether a datetime column could be created
        """
        time_parameter = None
        for parameter in self.df.columns:
            if parameter.lower().startswith("time"):
                time_parameter = parameter
        if time_parameter and self.start_time:
            self.parameters.create_parameter(
                name="datetime",
                data=np.array(
                    [
                        # timeJ counts Julian days, the other time columns
                        # count seconds since cast start
                        timedelta(days=float(time)) + self.start_time
                        if time_parameter == "timeJ"
                        else timedelta(seconds=float(time)) + self.start_time
                        for time in self.df[time_parameter]
                    ]
                ),
            )
            return True
        return False

    def add_start_time(self) -> bool:
        """
        Adds the Cast start time to the dataframe.
        Necessary for joins on the time.

        Returns
        -------
        whether the start time column could be added
        """
        if self.start_time:
            self.parameters.create_parameter(
                name="start_time",
                data=str(self.start_time),
            )
            return True
        return False

    def obtaining_validation_modules(self) -> CnvValidationList:
        """
        Collects the individual validation modules and their respective
        information, usually present in key-value pairs.
        """
        validation_modules = self.processing_info
        return CnvValidationList(validation_modules)

    def df2cnv(self, df: pd.DataFrame | None = None) -> list:
        """
        Parses a pandas dataframe into a list that represents the lines inside
        of a cnv data table.

        Parameters
        ----------
        df: DataFrame to export, default is self.df

        Returns
        -------
        a list of lines in the cnv data table format

        """
        df = df if isinstance(df, pd.DataFrame) else self.df
        cnv_out = []
        for _, row in df.iterrows():
            # right-align every value in an 11-character field, as used by
            # the cnv fixed-width format
            cnv_like_row = "".join(f"{str(value):>11}" for value in row)
            cnv_out.append(cnv_like_row + "\n")
        return cnv_out

    def array2cnv(self) -> list:
        """Formats the raw parameter array as cnv data table lines."""
        result = []
        for row in self.parameters.full_data_array:
            formatted_row = "".join(f"{elem:11}" for elem in row)
            result.append(formatted_row + "\n")
        return result

    def to_cnv(
        self,
        file_name: Path | str | None = None,
        use_dataframe: bool = True,
    ):
        """
        Writes the values inside of this instance as a new cnv file to disc.

        Parameters
        ----------
        file_name: Path | str | None:
            the file name to write to; defaults to the original file path
        use_dataframe: bool:
            whether to export the pandas dataframe (True) or the raw
            parameter array (False) as the data table

        """
        file_name = self.path_to_file if file_name is None else file_name
        # content construction
        if use_dataframe:
            data = self.df2cnv()
        else:
            data = self.array2cnv()
        self._update_header()
        self.file_data = [*self.header, *data]
        # writing content out
        try:
            with open(file_name, "w", encoding="latin-1") as file:
                for line in self.file_data:
                    file.write(line)

        except IOError as error:
            logger.error(f"Could not write cnv file: {error}")

    def _update_header(self):
        """Re-creates the cnv header from the instance's current state."""
        self.data_table_description = self.parameters._form_data_table_info()
        self.header = [
            *[f"* {data}" for data in self.sbe9_data[:-1]],
            *[f"** {data}" for data in self.metadata_list],
            f"* {self.sbe9_data[-1]}",
            *[f"# {data}" for data in self.data_table_description],
            *[f"# {data}" for data in self.sensor_data],
            *[f"# {data}" for data in self.processing_info],
            "*END*\n",
        ]

    def add_processing_metadata(self, addition: str | list):
        """
        Adds new processing lines to the list of processing module information

        Parameters
        ----------
        addition: str | list:
            the new information line(s)

        """
        # TODO: use CnvprocessingList here
        if isinstance(addition, str):
            addition = [addition]
        for line in addition:
            self.file_data.append(line)
            # add the new info line *before* the 'file_type = ascii' line
            self.processing_info.insert(-1, line)

    def add_station_and_event_column(self) -> bool:
        """
        Adds a column with the DSHIP station and device event numbers to the
        dataframe. These must be present inside the extra metadata header.

        Returns
        -------
        whether the event column could be added
        """
        if "Station" in self.metadata:
            self.parameters.create_parameter(
                data=self.metadata["Station"],
                name="Event",
            )
            return True
        return False

    def add_position_columns(self) -> bool:
        """
        Adds a column with the longitude and latitude to the dataframe.
        These must be present inside the extra metadata header.

        Returns
        -------
        whether the position columns exist or could be added
        """
        existing = [k.lower() for k in self.parameters.keys()]
        # NOTE: the previous check used ("latitude" or "longitude"), which
        # evaluates to just "latitude" and never tested for longitude
        if "latitude" in existing or "longitude" in existing:
            return True
        # likewise, ("GPS_Lat" and "GPS_Lon") only ever tested "GPS_Lon"
        if "GPS_Lat" in self.metadata and "GPS_Lon" in self.metadata:
            self.parameters.create_parameter(
                data=self.metadata["GPS_Lat"],
                name="Latitude",
            )
            self.parameters.create_parameter(
                data=self.metadata["GPS_Lon"],
                name="Longitude",
            )
            return True
        return False

    def add_cast_number(self, number: int | None = None) -> bool:
        """
        Adds a column with the cast number to the dataframe.

        Parameters
        ----------
        number: int | None:
            the cast number of this files cast; read from the extra
            metadata header when not given

        Returns
        -------
        whether a cast number column could be added
        """
        # explicit None checks so a caller-supplied number always wins and a
        # cast number of 0 is not silently dropped
        if number is None and "Cast" in self.metadata:
            number = int(self.metadata["Cast"])
        if number is not None:
            self.parameters.create_parameter(
                data=number,
                name="Cast",
            )
            return True
        return False