seabirdfilehandler 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of seabirdfilehandler might be problematic — click the link for more details.

@@ -1,5 +1,7 @@
1
- from .seabirdfiles import *
2
- from .datatablefiles import *
1
+ from .datafiles import *
2
+ from .bottlefile import *
3
+ from .bottlelogfile import *
4
+ from .cnvfile import *
3
5
  from .xmlfiles import *
4
6
  from .validation_modules import *
5
7
  from .file_collection import *
@@ -0,0 +1,185 @@
1
+ from typing import Union
2
+ from datetime import datetime, time
3
+ import pandas as pd
4
+ import numpy as np
5
+ import logging
6
+ from seabirdfilehandler import DataFile
7
+ from seabirdfilehandler.dataframe_meta_accessor import (
8
+ SeriesMetaAccessor, # noqa: F401
9
+ DataFrameMetaAccessor, # noqa: F401
10
+ )
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class BottleFile(DataFile):
    """Representation of a Sea-Bird bottle (.btl) file.

    Organizes the file's table information into a pandas DataFrame. This
    allows the usage of this powerful library for statistics, visualization,
    data manipulation, export, etc.

    Parameters
    ----------
    path_to_file :
        path to the .btl file on disk
    """

    def __init__(self, path_to_file):
        super().__init__(path_to_file)
        self.original_df = self.create_dataframe()
        # NOTE(review): self.df aliases original_df, so the in-place dtype
        # and column edits below mutate both — confirm whether original_df
        # should be a pristine .copy().
        self.df = self.original_df
        self.setting_dataframe_dtypes()
        self.adding_timestamp_column()

    def create_dataframe(self) -> pd.DataFrame:
        """Create a DataFrame out of the .btl file.

        Manages the file's double data header correctly: the first header
        line holds the parameter names, the second line holds the
        continuation names (e.g. the time belonging to the date column).

        Returns
        -------
        pandas.DataFrame :
            the parsed bottle data, including a 'Statistic' column
        """
        # TODO: this needs to be broken down into smaller pieces...
        top_names, bottom_names = self.reading_data_header()
        # creating statistics column to store the row type information:
        # 4 rows per bottle, average, standard deviation, max value, min value
        top_names.append("Statistic")
        # TODO: nicer way to construct the dataframe than opening the file a
        # second time
        df: pd.DataFrame = pd.read_fwf(
            self.path_to_file,
            index_col=False,
            skiprows=len(self.header) + 2,
            header=None,
            names=top_names,
        )

        # handling the double row header
        rowtypes = df[df.columns[-1]].unique()

        def separate_double_header_row(df, column, length):
            """Expand one double-header column into two full columns.

            Every `length`-th row of `column` belongs to the column itself;
            the row directly below it carries the value for the matching
            bottom header name. Both are repeated to full column length.

            Parameters
            ----------
            df : pandas.DataFrame :
                frame to operate on
            column : str :
                name of the doubled column
            length : int :
                number of statistic rows per bottle

            Returns
            -------
            pandas.DataFrame :
                the frame with the additional column inserted
            """
            column_idx = df.columns.get_loc(column)
            old_column = df.iloc[::length, column_idx].reset_index(drop=True)
            new_column = df.iloc[1::length, column_idx].reset_index(drop=True)
            old_column_expanded = pd.Series(
                np.repeat(old_column, length)
            ).reset_index(drop=True)
            new_column_expanded = pd.Series(
                np.repeat(new_column, length)
            ).reset_index(drop=True)
            df[column] = old_column_expanded
            df.insert(
                column_idx + 1, bottom_names[column_idx], new_column_expanded
            )
            return df

        df = separate_double_header_row(df, "Date", len(rowtypes))
        df = separate_double_header_row(df, top_names[0], len(rowtypes))
        # remove brackets around statistics values
        df["Statistic"] = df["Statistic"].str.strip("()")
        df = df.rename(mapper={"Btl_ID": "Bottle_ID"}, axis=1)
        return df

    def adding_timestamp_column(self):
        """Insert a 'Timestamp' column that holds both Date and Time
        information."""
        timestamp = [
            datetime.combine(datepoint, time.fromisoformat(str(timepoint)))
            for datepoint, timepoint in zip(self.df.Date, self.df.Time)
        ]
        self.df.insert(2, "Timestamp", timestamp)
        self.df.Timestamp = pd.to_datetime(self.df.Timestamp)

    def setting_dataframe_dtypes(self):
        """Set the types for the column values in the dataframe."""
        # TODO: extend this to the other columns!
        self.df.Date = pd.to_datetime(self.df.Date)
        self.df.Bottle_ID = self.df.Bottle_ID.astype(int)

    def selecting_rows(
        self, df=None, statistic_of_interest: Union[list, str, None] = None
    ):
        """Reduce self.df to the rows with the given identifiers in the
        'Statistic' column. A single string or a list of strings can be
        processed.

        Parameters
        ----------
        df : pandas.DataFrame :
            the file's pandas representation (Default value = self.df)
        statistic_of_interest : list or str :
            values of the 'Statistic' column to keep
            (Default value = ['avg'])
        """
        # avoid a mutable default argument; None stands in for ['avg']
        if statistic_of_interest is None:
            statistic_of_interest = ["avg"]
        df = self.df if df is None else df
        # ensure that the input is a list, so that isin() can do its job
        if isinstance(statistic_of_interest, str):
            statistic_of_interest = [statistic_of_interest]
        self.df = df.loc[df["Statistic"].isin(statistic_of_interest)]

    def reading_data_header(self):
        """Identify and separately collect the rows that specify the data
        table's headers.

        Returns
        -------
        tuple of (list, list) :
            the first (top) and second (bottom) header names
        """
        n = 11  # fixed column width of a seabird btl file
        top_line = self.data[0]
        second_line = self.data[1]
        top_names = [
            top_line[i : i + n].split()[0]
            for i in range(0, len(top_line) - n, n)
        ]
        bottom_names = [
            second_line[i : i + n].split()[0] for i in range(0, 2 * n, n)
        ]
        return top_names, bottom_names

    def add_station_and_event_column(self):
        """Insert an 'Event' column filled with the station metadata value."""
        # NOTE(review): the column length is derived from self.data, which
        # may differ from len(self.df) — confirm the intended row count.
        event_list = [self.metadata["Station"] for _ in self.data]
        self.df.insert(0, "Event", pd.Series(event_list))

    def add_position_columns(self):
        """Insert 'Latitude'/'Longitude' columns from the extra metadata."""
        latitude_list = [self.metadata["GPS_Lat"] for _ in self.data]
        self.df.insert(1, "Latitude", pd.Series(latitude_list))
        longitude_list = [self.metadata["GPS_Lon"] for _ in self.data]
        self.df.insert(2, "Longitude", pd.Series(longitude_list))
@@ -0,0 +1,155 @@
1
+ from datetime import datetime
2
+ import re
3
+ import logging
4
+ import pandas as pd
5
+ from seabirdfilehandler import DataFile
6
+ from seabirdfilehandler.dataframe_meta_accessor import (
7
+ SeriesMetaAccessor, # noqa: F401
8
+ DataFrameMetaAccessor, # noqa: F401
9
+ )
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class BottleLogFile(DataFile):
    """Bottle log (.bl) file representation.

    Extracts the three different data types from the file: the device reset
    time, the originating cnv file name and the table with bottle IDs and
    corresponding data ranges.

    Parameters
    ----------
    path_to_file :
        path to the .bl file on disk
    create_dataframe : bool :
        whether to parse the table into a pandas DataFrame instead of a
        plain list (Default value = False)
    """

    def __init__(self, path_to_file, create_dataframe=False):
        super().__init__(path_to_file)
        self.reset_time = self.obtaining_reset_time()
        # first raw line names the cnv file this log belongs to
        self.origin_cnv = self.raw_file_data[0].strip()
        self.data = self.data_whitespace_removal()

        if create_dataframe:
            self.original_df = self.create_dataframe()
            self.df = self.original_df
        else:
            self.data_list = self.create_list()

    def data_whitespace_removal(self) -> list:
        """Strip the table lines of whitespace characters, in this case
        especially newline characters.

        Returns
        -------
        list :
            the original data lines stripped of surrounding whitespace
        """
        return [line.strip() for line in self.raw_file_data[2:]]

    def obtaining_reset_time(self) -> datetime:
        """Read the device reset time with a small input check.

        Returns
        -------
        datetime.datetime :
            the device reset time

        Raises
        ------
        IOError :
            if the reset time line is not formatted as expected
        """
        regex_check = re.search(
            r"RESET\s(\w{3}\s\d+\s\d{4}\s\d\d:\d\d:\d\d)",
            self.raw_file_data[1],
        )
        if regex_check:
            return datetime.strptime(regex_check.group(1), "%b %d %Y %H:%M:%S")
        error_message = """BottleLogFile is not formatted as expected:
            Reset time could not be extracted."""
        logger.error(error_message)
        raise IOError(error_message)

    def create_list(self) -> list:
        """Create a list of usable data from the lines in self.data.

        Each entry consists of: a list of the two bottle IDs, the date and
        time of the data sample, and the cnv line range corresponding to the
        bottle.

        Returns
        -------
        list :
            the bl file's table information
        """
        content_array = []
        for line in self.data:
            # split each line once instead of once per extracted field
            fields = line.split(",")
            bottles = [int(x) for x in fields[:2]]
            date = self.convert_date(fields[2])
            lines = tuple(int(x) for x in fields[3:])
            content_array.append([bottles, date, lines])
        return content_array

    def convert_date(self, date: str) -> str:
        """Convert a .bl file date to a compact ISO-8601-like string.

        Parameters
        ----------
        date : str :
            a date like 'Jan 05 2021 12:34:56'

        Returns
        -------
        str :
            the date in the form 'yymmddThhmmss'
        """
        # strptime replaces the hand-rolled month lookup and, unlike the
        # previous implementation, also zero-pads single-digit days so the
        # output really is fixed-width as documented.
        parsed = datetime.strptime(date.strip(), "%b %d %Y %H:%M:%S")
        return parsed.strftime("%y%m%dT%H%M%S")

    def create_dataframe(self) -> pd.DataFrame:
        """Create a DataFrame from the lines in self.data.

        Returns
        -------
        pandas.DataFrame :
            the bl file's table information
        """
        # dropping the first column as it is the index
        data_lists = [line.split(",")[1:] for line in self.data]
        df = pd.DataFrame(data_lists)
        df.columns = ["Bottle ID", "Datetime", "start_range", "end_range"]
        return df
@@ -0,0 +1,283 @@
1
+ from pathlib import Path
2
+ from datetime import datetime, timedelta
3
+ import pandas as pd
4
+ import numpy as np
5
+ import logging
6
+ from seabirdfilehandler import DataFile
7
+ from seabirdfilehandler.parameter import Parameters
8
+ from seabirdfilehandler.validation_modules import CnvValidationList
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class CnvFile(DataFile):
    """
    A representation of a cnv-file as used by SeaBird.

    This class intends to fully extract and organize the different types of
    data and metadata present inside of such a file. Downstream libraries
    shall be able to use this representation for all applications concerning
    cnv files, like data processing, transformation or visualization.

    To achieve that, the metadata header is organized by the
    grandparent-class, SeaBirdFile, while the data table is extracted by
    this class. The data representation of choice is a pandas Dataframe.
    Inside this class, there are methods to parse cnv data into dataframes,
    do the reverse of writing a dataframe into cnv compliant form and to
    manipulate the dataframe in various ways.

    Parameters
    ----------
    path_to_file : Path | str :
        the path to the file
    create_dataframe : bool :
        whether to build a pandas DataFrame from the data table
    absolute_time_calculation : bool :
        whether to use a real timestamp instead of the second count
    event_log_column : bool :
        whether to add a station and device event column from DSHIP
    coordinate_columns : bool :
        whether to add longitude and latitude from the extra metadata header

    """

    def __init__(
        self,
        path_to_file: Path | str,
        create_dataframe: bool = False,
        absolute_time_calculation: bool = False,
        event_log_column: bool = False,
        coordinate_columns: bool = False,
    ):
        super().__init__(path_to_file)
        self.validation_modules = self.obtaining_validation_modules()
        self.start_time = self.reading_start_time()
        self.parameters = Parameters(self.data, self.data_table_description)
        if create_dataframe:
            self.df = self.parameters.get_pandas_dataframe()
            if absolute_time_calculation:
                self.absolute_time_calculation()
            if event_log_column:
                self.add_station_and_event_column()
            if coordinate_columns:
                self.add_position_columns()

    def reading_start_time(
        self,
        time_source: str = "System UTC",
    ) -> datetime | None:
        """
        Extract the cast start time from the metadata header.

        Parameters
        ----------
        time_source : str :
            the header key that carries the timestamp
            (Default value = 'System UTC')

        Returns
        -------
        datetime | None :
            the parsed start time, or None if no such line is present
        """
        for line in self.sbe9_data:
            if line.startswith(time_source):
                start_time = line.split("=")[1]
                # the surrounding spaces are part of the raw header value
                return datetime.strptime(start_time, " %b %d %Y %H:%M:%S ")
        return None

    def absolute_time_calculation(self) -> bool:
        """
        Replace the basic cnv time representation of counting relative to
        the cast's start point by real UTC timestamps, stored in a new
        'datetime' parameter. This operation acts directly on the dataframe.

        Returns
        -------
        bool :
            True if both a time column and a start time were found
        """
        time_parameter = None
        for parameter in self.df.columns:
            if parameter.lower().startswith("time"):
                time_parameter = parameter
        if time_parameter and self.start_time:
            # 'timeJ' counts Julian days; other time columns count seconds
            self.parameters.create_parameter(
                name="datetime",
                data=np.array(
                    [
                        timedelta(days=float(time)) + self.start_time
                        if time_parameter == "timeJ"
                        else timedelta(seconds=float(time)) + self.start_time
                        for time in self.df[time_parameter]
                    ]
                ),
            )
            return True
        return False

    def add_start_time(self) -> bool:
        """
        Add the cast start time to the dataframe.
        Necessary for joins on the time.

        Returns
        -------
        bool :
            True if a start time was available, else False
        """
        if self.start_time:
            self.parameters.create_parameter(
                name="start_time",
                data=str(self.start_time),
            )
            return True
        return False

    def obtaining_validation_modules(self) -> CnvValidationList:
        """
        Collect the individual validation modules and their respective
        information, usually present in key-value pairs.
        """
        validation_modules = self.processing_info
        return CnvValidationList(validation_modules)

    def df2cnv(self, df: pd.DataFrame | None = None) -> list:
        """
        Parse a pandas dataframe into a list that represents the lines
        inside of a cnv data table.

        Parameters
        ----------
        df : DataFrame to export, default is self.df

        Returns
        -------
        a list of lines in the cnv data table format
        """
        df = df if isinstance(df, pd.DataFrame) else self.df
        cnv_out = []
        for _, row in df.iterrows():
            # cnv data columns are right-aligned, 11 characters wide;
            # (dropped a pointless immediately-invoked lambda here)
            cnv_like_row = "".join(f"{str(value):>11}" for value in row)
            cnv_out.append(cnv_like_row + "\n")
        return cnv_out

    def array2cnv(self) -> list:
        """
        Parse the raw parameter array into cnv data table lines.

        Returns
        -------
        a list of lines in the cnv data table format
        """
        result = []
        for row in self.parameters.full_data_array:
            formatted_row = "".join(f"{elem:11}" for elem in row)
            result.append(formatted_row + "\n")
        return result

    def to_cnv(
        self,
        file_name: Path | str | None = None,
        use_dataframe: bool = True,
    ):
        """
        Write the values inside of this instance as a new cnv file to disc.

        Parameters
        ----------
        file_name : Path | str | None :
            the file name to write to (Default value = self.path_to_file)
        use_dataframe : bool :
            whether to export the data table from the dataframe (True) or
            from the raw parameter array (False)

        """
        file_name = self.path_to_file if file_name is None else file_name
        # content construction
        data = self.df2cnv() if use_dataframe else self.array2cnv()
        self._update_header()
        self.file_data = [*self.header, *data]
        # writing content out
        try:
            with open(file_name, "w", encoding="latin-1") as file:
                file.writelines(self.file_data)
        except IOError as error:
            logger.error(f"Could not write cnv file: {error}")

    def _update_header(self):
        """Re-create the cnv header from the instance's current state."""
        self.data_table_description = self.parameters._form_data_table_info()
        self.header = [
            *[f"* {data}" for data in self.sbe9_data[:-1]],
            *[f"** {data}" for data in self.metadata_list],
            f"* {self.sbe9_data[-1]}",
            *[f"# {data}" for data in self.data_table_description],
            *[f"# {data}" for data in self.sensor_data],
            *[f"# {data}" for data in self.processing_info],
            "*END*\n",
        ]

    def add_processing_metadata(self, addition: str | list):
        """
        Add new processing lines to the list of processing module
        information.

        Parameters
        ----------
        addition : str | list :
            the new information line(s)

        """
        # TODO: use CnvprocessingList here
        if isinstance(addition, str):
            addition = [addition]
        for line in addition:
            self.file_data.append(line)
            # add the new info line *before* the 'file_type = ascii' line
            self.processing_info.insert(-1, line)

    def add_station_and_event_column(self) -> bool:
        """
        Add a column with the DSHIP station and device event numbers to the
        dataframe. These must be present inside the extra metadata header.

        Returns
        -------
        bool :
            True if the 'Station' metadata entry was present, else False
        """
        if "Station" in self.metadata:
            self.parameters.create_parameter(
                data=self.metadata["Station"],
                name="Event",
            )
            return True
        return False

    def add_position_columns(self) -> bool:
        """
        Add a column with the longitude and latitude to the dataframe.
        These must be present inside the extra metadata header.

        Returns
        -------
        bool :
            True if the columns already exist or were added, else False
        """
        present_parameters = [k.lower() for k in self.parameters.keys()]
        # fix: `("latitude" or "longitude") in xs` short-circuits to
        # "latitude" and never tested for longitude
        if "latitude" in present_parameters or (
            "longitude" in present_parameters
        ):
            return True
        # fix: `("GPS_Lat" and "GPS_Lon") in m` only tested for "GPS_Lon"
        if "GPS_Lat" in self.metadata and "GPS_Lon" in self.metadata:
            self.parameters.create_parameter(
                data=self.metadata["GPS_Lat"],
                name="Latitude",
            )
            self.parameters.create_parameter(
                data=self.metadata["GPS_Lon"],
                name="Longitude",
            )
            return True
        return False

    def add_cast_number(self, number: int | None = None) -> bool:
        """
        Add a column with the cast number to the dataframe.

        Parameters
        ----------
        number : int | None :
            the cast number of this file's cast; falls back to the 'Cast'
            metadata entry when not given

        Returns
        -------
        bool :
            True if a cast number was available and added, else False
        """
        # fix: explicit None checks so a legitimate cast number of 0 is
        # neither overridden by the metadata nor silently dropped
        if number is None and "Cast" in self.metadata:
            number = int(self.metadata["Cast"])
        if number is not None:
            self.parameters.create_parameter(
                data=number,
                name="Cast",
            )
            return True
        return False