seabirdfilehandler 0.5.3__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of seabirdfilehandler might be problematic. Click here for more details.
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/PKG-INFO +3 -2
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/pyproject.toml +3 -2
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/src/seabirdfilehandler/__init__.py +3 -1
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/src/seabirdfilehandler/bottlefile.py +8 -6
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/src/seabirdfilehandler/cnvfile.py +11 -2
- seabirdfilehandler-0.6.0/src/seabirdfilehandler/file_collection.py +492 -0
- seabirdfilehandler-0.6.0/src/seabirdfilehandler/hexfile.py +71 -0
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/src/seabirdfilehandler/parameter.py +4 -2
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/src/seabirdfilehandler/xmlfiles.py +17 -0
- seabirdfilehandler-0.5.3/src/seabirdfilehandler/file_collection.py +0 -258
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/LICENSE +0 -0
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/README.md +0 -0
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/src/seabirdfilehandler/bottlelogfile.py +0 -0
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/src/seabirdfilehandler/datafiles.py +0 -0
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/src/seabirdfilehandler/geomar_ctd_file_parser.py +0 -0
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/src/seabirdfilehandler/utils.py +0 -0
- {seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/src/seabirdfilehandler/validation_modules.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: seabirdfilehandler
|
|
3
|
-
Version: 0.5.3
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Library of parsers to interact with SeaBird CTD files.
|
|
5
5
|
Keywords: CTD,parser,seabird,data
|
|
6
6
|
Author: Emil Michels
|
|
@@ -16,7 +16,8 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.13
|
|
17
17
|
Requires-Dist: pandas (>=2.2.1)
|
|
18
18
|
Requires-Dist: xmltodict (>=0.13.0)
|
|
19
|
-
Project-URL:
|
|
19
|
+
Project-URL: Documentation, https://ctd-software.pages.io-warnemuende.de/seabirdfilehandler
|
|
20
|
+
Project-URL: Homepage, https://ctd-software.pages.io-warnemuende.de/seabirdfilehandler
|
|
20
21
|
Project-URL: Repository, https://git.io-warnemuende.de/CTD-Software/SeabirdFileHandler
|
|
21
22
|
Description-Content-Type: text/markdown
|
|
22
23
|
|
|
@@ -16,10 +16,11 @@ classifiers = [
|
|
|
16
16
|
"Programming Language :: Python :: 3.12",
|
|
17
17
|
"Programming Language :: Python :: 3.13",
|
|
18
18
|
]
|
|
19
|
-
urls.homepage = "https://
|
|
19
|
+
urls.homepage = "https://ctd-software.pages.io-warnemuende.de/seabirdfilehandler"
|
|
20
20
|
urls.repository = "https://git.io-warnemuende.de/CTD-Software/SeabirdFileHandler"
|
|
21
|
+
urls.documentation = "https://ctd-software.pages.io-warnemuende.de/seabirdfilehandler"
|
|
21
22
|
dynamic = []
|
|
22
|
-
version = "0.5.3"
|
|
23
|
+
version = "0.6.0"
|
|
23
24
|
|
|
24
25
|
[tool.poetry]
|
|
25
26
|
|
|
@@ -3,6 +3,8 @@ from .bottlefile import *
|
|
|
3
3
|
from .bottlelogfile import *
|
|
4
4
|
from .cnvfile import *
|
|
5
5
|
from .xmlfiles import *
|
|
6
|
+
from .hexfile import *
|
|
6
7
|
from .validation_modules import *
|
|
7
|
-
from .file_collection import *
|
|
8
8
|
from .geomar_ctd_file_parser import *
|
|
9
|
+
from .parameter import *
|
|
10
|
+
from .file_collection import *
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from pathlib import Path
|
|
1
2
|
from typing import Union
|
|
2
3
|
from datetime import datetime, time
|
|
3
4
|
import pandas as pd
|
|
@@ -22,12 +23,13 @@ class BottleFile(DataFile):
|
|
|
22
23
|
|
|
23
24
|
"""
|
|
24
25
|
|
|
25
|
-
def __init__(self, path_to_file):
|
|
26
|
-
super().__init__(path_to_file)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
26
|
+
def __init__(self, path_to_file: Path | str, only_header: bool = False):
|
|
27
|
+
super().__init__(path_to_file, only_header)
|
|
28
|
+
if not only_header:
|
|
29
|
+
self.original_df = self.create_dataframe()
|
|
30
|
+
self.df = self.original_df
|
|
31
|
+
self.setting_dataframe_dtypes()
|
|
32
|
+
self.adding_timestamp_column()
|
|
31
33
|
|
|
32
34
|
def create_dataframe(self):
|
|
33
35
|
"""Creates a dataframe out of the btl file. Manages the double data
|
|
@@ -60,9 +60,11 @@ class CnvFile(DataFile):
|
|
|
60
60
|
super().__init__(path_to_file, only_header)
|
|
61
61
|
self.validation_modules = self.obtaining_validation_modules()
|
|
62
62
|
self.start_time = self.reading_start_time()
|
|
63
|
-
self.parameters = Parameters(
|
|
63
|
+
self.parameters = Parameters(
|
|
64
|
+
self.data, self.data_table_description, only_header
|
|
65
|
+
)
|
|
64
66
|
if create_dataframe:
|
|
65
|
-
self.df = self.
|
|
67
|
+
self.df = self.create_dataframe()
|
|
66
68
|
if absolute_time_calculation:
|
|
67
69
|
self.absolute_time_calculation()
|
|
68
70
|
if event_log_column:
|
|
@@ -70,6 +72,13 @@ class CnvFile(DataFile):
|
|
|
70
72
|
if coordinate_columns:
|
|
71
73
|
self.add_position_columns()
|
|
72
74
|
|
|
75
|
+
def create_dataframe(self) -> pd.DataFrame:
|
|
76
|
+
"""
|
|
77
|
+
Plain dataframe creator.
|
|
78
|
+
"""
|
|
79
|
+
self.df = self.parameters.get_pandas_dataframe()
|
|
80
|
+
return self.df
|
|
81
|
+
|
|
73
82
|
def reading_start_time(
|
|
74
83
|
self,
|
|
75
84
|
time_source: str = "System UTC",
|
|
@@ -0,0 +1,492 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import logging
|
|
4
|
+
from collections import UserList
|
|
5
|
+
from typing import Callable, Type
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import numpy as np
|
|
8
|
+
from seabirdfilehandler import (
|
|
9
|
+
CnvFile,
|
|
10
|
+
BottleFile,
|
|
11
|
+
BottleLogFile,
|
|
12
|
+
DataFile,
|
|
13
|
+
HexFile,
|
|
14
|
+
)
|
|
15
|
+
from seabirdfilehandler.utils import get_unique_sensor_data
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_collection(
|
|
21
|
+
path_to_files: Path | str,
|
|
22
|
+
file_suffix: str = "cnv",
|
|
23
|
+
only_metadata: bool = False,
|
|
24
|
+
pattern: str = "",
|
|
25
|
+
sorting_key: Callable | None = None,
|
|
26
|
+
) -> Type[FileCollection]:
|
|
27
|
+
"""
|
|
28
|
+
Factory to create instances of FileCollection, depending on input type.
|
|
29
|
+
|
|
30
|
+
Parameters
|
|
31
|
+
----------
|
|
32
|
+
path_to_files : Path | str :
|
|
33
|
+
The path to the directory to search for files.
|
|
34
|
+
file_suffix : str :
|
|
35
|
+
The suffix to search for. (Default value = "cnv")
|
|
36
|
+
only_metadata : bool :
|
|
37
|
+
Whether to read only metadata. (Default value = False)
|
|
38
|
+
pattern: str
|
|
39
|
+
A filter for file selection. (Default value = '')
|
|
40
|
+
sorting_key : Callable | None :
|
|
41
|
+
A callable that returns the filename-part to use to sort the collection. (Default value = None)
|
|
42
|
+
Returns
|
|
43
|
+
-------
|
|
44
|
+
An instance of FileCollection or one of its children.
|
|
45
|
+
|
|
46
|
+
"""
|
|
47
|
+
mapping_suffix_to_type = {
|
|
48
|
+
"cnv": CnvCollection,
|
|
49
|
+
"btl": FileCollection,
|
|
50
|
+
"bl": FileCollection,
|
|
51
|
+
"hex": HexCollection,
|
|
52
|
+
}
|
|
53
|
+
file_suffix = file_suffix.strip(".")
|
|
54
|
+
try:
|
|
55
|
+
collection = mapping_suffix_to_type[file_suffix](
|
|
56
|
+
path_to_files, file_suffix, only_metadata, pattern, sorting_key
|
|
57
|
+
)
|
|
58
|
+
except KeyError:
|
|
59
|
+
raise ValueError(f"Unknown input file type: {file_suffix}, aborting.")
|
|
60
|
+
else:
|
|
61
|
+
return collection
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class FileCollection(UserList):
|
|
65
|
+
"""
|
|
66
|
+
A representation of multiple files of the same kind. These files share
|
|
67
|
+
the same suffix and are otherwise closely connected to each other. A common
|
|
68
|
+
use case would be the collection of CNVs to allow for easier processing or
|
|
69
|
+
integration of field calibration measurements.
|
|
70
|
+
|
|
71
|
+
Parameters
|
|
72
|
+
----------
|
|
73
|
+
path_to_files : Path | str :
|
|
74
|
+
The path to the directory to search for files.
|
|
75
|
+
file_suffix : str :
|
|
76
|
+
The suffix to search for. (Default value = "cnv")
|
|
77
|
+
only_metadata : bool :
|
|
78
|
+
Whether to read only metadata. (Default value = False)
|
|
79
|
+
pattern: str
|
|
80
|
+
A filter for file selection. (Default value = '')
|
|
81
|
+
sorting_key : Callable | None :
|
|
82
|
+
A callable that returns the filename-part to use to sort the collection. (Default value = None)
|
|
83
|
+
"""
|
|
84
|
+
|
|
85
|
+
def __init__(
|
|
86
|
+
self,
|
|
87
|
+
path_to_files: str | Path,
|
|
88
|
+
file_suffix: str,
|
|
89
|
+
only_metadata: bool = False,
|
|
90
|
+
pattern: str = "",
|
|
91
|
+
sorting_key: Callable | None = None,
|
|
92
|
+
):
|
|
93
|
+
super().__init__()
|
|
94
|
+
self.path_to_files = Path(path_to_files)
|
|
95
|
+
self.file_suffix = file_suffix.strip(".")
|
|
96
|
+
self.pattern = pattern
|
|
97
|
+
self.sorting_key = sorting_key
|
|
98
|
+
self.file_type = self.extract_file_type(self.file_suffix)
|
|
99
|
+
self.individual_file_paths = self.collect_files(
|
|
100
|
+
pattern=pattern,
|
|
101
|
+
sorting_key=sorting_key,
|
|
102
|
+
)
|
|
103
|
+
self.data = self.load_files(only_metadata)
|
|
104
|
+
if not only_metadata:
|
|
105
|
+
self.df_list = self.get_dataframes()
|
|
106
|
+
self.df = self.get_collection_dataframe(self.df_list)
|
|
107
|
+
|
|
108
|
+
def __str__(self):
|
|
109
|
+
return "/n".join(self.data)
|
|
110
|
+
|
|
111
|
+
def extract_file_type(self, suffix: str) -> Type[DataFile]:
|
|
112
|
+
"""
|
|
113
|
+
Determines the file type using the input suffix.
|
|
114
|
+
|
|
115
|
+
Parameters
|
|
116
|
+
----------
|
|
117
|
+
suffix : str :
|
|
118
|
+
The file suffix.
|
|
119
|
+
Returns
|
|
120
|
+
-------
|
|
121
|
+
An object corresponding to the given suffix.
|
|
122
|
+
"""
|
|
123
|
+
mapping_suffix_to_type = {
|
|
124
|
+
"cnv": CnvFile,
|
|
125
|
+
"btl": BottleFile,
|
|
126
|
+
"bl": BottleLogFile,
|
|
127
|
+
"hex": HexFile,
|
|
128
|
+
}
|
|
129
|
+
file_type = DataFile
|
|
130
|
+
for key, value in mapping_suffix_to_type.items():
|
|
131
|
+
if key == suffix:
|
|
132
|
+
file_type = value
|
|
133
|
+
break
|
|
134
|
+
return file_type
|
|
135
|
+
|
|
136
|
+
def collect_files(
|
|
137
|
+
self,
|
|
138
|
+
pattern: str = "",
|
|
139
|
+
sorting_key: Callable | None = lambda file: int(
|
|
140
|
+
file.stem.split("_")[3]
|
|
141
|
+
),
|
|
142
|
+
) -> list[Path]:
|
|
143
|
+
"""
|
|
144
|
+
Creates a list of target files, recursively from the given directory.
|
|
145
|
+
These can be sorted with the help of the sorting_key parameter, which
|
|
146
|
+
is a Callable that identifies the part of the filename that shall be
|
|
147
|
+
used for sorting.
|
|
148
|
+
|
|
149
|
+
Parameters
|
|
150
|
+
----------
|
|
151
|
+
pattern: str
|
|
152
|
+
A filter for file selection. Is given to rglob. (Default value = '')
|
|
153
|
+
sorting_key : Callable | None :
|
|
154
|
+
The part of the filename to use in sorting. (Default value = lambda file: int(file.stem.split("_")[3]))
|
|
155
|
+
Returns
|
|
156
|
+
-------
|
|
157
|
+
A list of all paths found.
|
|
158
|
+
"""
|
|
159
|
+
return sorted(
|
|
160
|
+
self.path_to_files.rglob(f"*{pattern}*{self.file_suffix}"),
|
|
161
|
+
key=sorting_key,
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
def load_files(self, only_metadata: bool = False) -> list[DataFile]:
|
|
165
|
+
"""
|
|
166
|
+
Creates python instances of each file.
|
|
167
|
+
|
|
168
|
+
Parameters
|
|
169
|
+
----------
|
|
170
|
+
only_metadata : bool :
|
|
171
|
+
Whether to load only file metadata. (Default value = False)
|
|
172
|
+
Returns
|
|
173
|
+
-------
|
|
174
|
+
A list of all instances.
|
|
175
|
+
"""
|
|
176
|
+
data = []
|
|
177
|
+
for file in self.individual_file_paths:
|
|
178
|
+
try:
|
|
179
|
+
data.append(
|
|
180
|
+
self.file_type(
|
|
181
|
+
path_to_file=file,
|
|
182
|
+
only_header=only_metadata,
|
|
183
|
+
)
|
|
184
|
+
)
|
|
185
|
+
except TypeError:
|
|
186
|
+
logger.error(
|
|
187
|
+
f"Could not open file {file} with the type "
|
|
188
|
+
f"{self.file_type}."
|
|
189
|
+
)
|
|
190
|
+
continue
|
|
191
|
+
return data
|
|
192
|
+
|
|
193
|
+
def get_dataframes(
|
|
194
|
+
self,
|
|
195
|
+
event_log: bool = False,
|
|
196
|
+
coordinates: bool = False,
|
|
197
|
+
time_correction: bool = False,
|
|
198
|
+
cast_identifier: bool = False,
|
|
199
|
+
) -> list[pd.DataFrame]:
|
|
200
|
+
"""
|
|
201
|
+
Collects all individual dataframes and allows additional column
|
|
202
|
+
creation.
|
|
203
|
+
|
|
204
|
+
Parameters
|
|
205
|
+
----------
|
|
206
|
+
event_log : bool :
|
|
207
|
+
(Default value = False)
|
|
208
|
+
coordinates : bool :
|
|
209
|
+
(Default value = False)
|
|
210
|
+
time_correction : bool :
|
|
211
|
+
(Default value = False)
|
|
212
|
+
cast_identifier : bool :
|
|
213
|
+
(Default value = False)
|
|
214
|
+
|
|
215
|
+
Returns
|
|
216
|
+
-------
|
|
217
|
+
A list of the individual pandas DataFrames.
|
|
218
|
+
"""
|
|
219
|
+
for index, file in enumerate(self.data):
|
|
220
|
+
if event_log:
|
|
221
|
+
file.add_station_and_event_column()
|
|
222
|
+
if coordinates:
|
|
223
|
+
file.add_position_columns()
|
|
224
|
+
if time_correction:
|
|
225
|
+
file.absolute_time_calculation()
|
|
226
|
+
file.add_start_time()
|
|
227
|
+
if cast_identifier:
|
|
228
|
+
file.add_cast_number(index + 1)
|
|
229
|
+
return [file.df for file in self.data]
|
|
230
|
+
|
|
231
|
+
def get_collection_dataframe(
|
|
232
|
+
self, list_of_dfs: list[pd.DataFrame] | None = None
|
|
233
|
+
) -> pd.DataFrame:
|
|
234
|
+
"""
|
|
235
|
+
Creates one DataFrame from the individual ones, by concatenation.
|
|
236
|
+
|
|
237
|
+
Parameters
|
|
238
|
+
----------
|
|
239
|
+
list_of_dfs : list[pd.DataFrame] | None :
|
|
240
|
+
A list of the individual DataFrames. (Default value = None)
|
|
241
|
+
Returns
|
|
242
|
+
-------
|
|
243
|
+
A pandas DataFrame representing the whole dataset.
|
|
244
|
+
"""
|
|
245
|
+
if not list_of_dfs:
|
|
246
|
+
list_of_dfs = self.get_dataframes()
|
|
247
|
+
if not list_of_dfs:
|
|
248
|
+
raise ValueError("No dataframes to concatenate.")
|
|
249
|
+
df = pd.concat(list_of_dfs, ignore_index=True)
|
|
250
|
+
self.df = df
|
|
251
|
+
return df
|
|
252
|
+
|
|
253
|
+
def tidy_collection_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
254
|
+
"""
|
|
255
|
+
Apply the different dataframe edits to the given dataframe.
|
|
256
|
+
|
|
257
|
+
Parameters
|
|
258
|
+
----------
|
|
259
|
+
df : pd.DataFrame :
|
|
260
|
+
A DataFrame to edit.
|
|
261
|
+
Returns
|
|
262
|
+
-------
|
|
263
|
+
The tidied dataframe.
|
|
264
|
+
"""
|
|
265
|
+
df = self.use_bad_flag_for_nan(df)
|
|
266
|
+
df = self.set_dtype_to_float(df)
|
|
267
|
+
return self.select_real_scan_data(df)
|
|
268
|
+
|
|
269
|
+
def use_bad_flag_for_nan(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
270
|
+
"""
|
|
271
|
+
Replace all occurrences of the bad flag value, defined inside the files, by NaN.
|
|
272
|
+
|
|
273
|
+
Parameters
|
|
274
|
+
----------
|
|
275
|
+
df : pd.DataFrame :
|
|
276
|
+
The dataframe to edit.
|
|
277
|
+
Returns
|
|
278
|
+
-------
|
|
279
|
+
The edited DataFrame.
|
|
280
|
+
"""
|
|
281
|
+
bad_flags = set()
|
|
282
|
+
for file in self.data:
|
|
283
|
+
for line in file.data_table_description:
|
|
284
|
+
if line.startswith("bad_flag"):
|
|
285
|
+
flag = line.split("=")[1].strip()
|
|
286
|
+
bad_flags.add(flag)
|
|
287
|
+
for flag in bad_flags:
|
|
288
|
+
df.replace(to_replace=flag, value=np.nan, inplace=True)
|
|
289
|
+
return df
|
|
290
|
+
|
|
291
|
+
def set_dtype_to_float(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
292
|
+
"""
|
|
293
|
+
Use the float-dtype for all DataFrame columns.
|
|
294
|
+
|
|
295
|
+
Parameters
|
|
296
|
+
----------
|
|
297
|
+
df : pd.DataFrame :
|
|
298
|
+
The dataframe to edit.
|
|
299
|
+
Returns
|
|
300
|
+
-------
|
|
301
|
+
The edited DataFrame.
|
|
302
|
+
"""
|
|
303
|
+
for parameter in df.columns:
|
|
304
|
+
if parameter in ["datetime"]:
|
|
305
|
+
continue
|
|
306
|
+
try:
|
|
307
|
+
df[parameter] = df[parameter].astype("float")
|
|
308
|
+
finally:
|
|
309
|
+
continue
|
|
310
|
+
return df
|
|
311
|
+
|
|
312
|
+
def select_real_scan_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
313
|
+
"""
|
|
314
|
+
Drop data rows that have no 'Scan' value, if that column exists.
|
|
315
|
+
|
|
316
|
+
Parameters
|
|
317
|
+
----------
|
|
318
|
+
df : pd.DataFrame :
|
|
319
|
+
The dataframe to edit.
|
|
320
|
+
Returns
|
|
321
|
+
-------
|
|
322
|
+
The edited DataFrame.
|
|
323
|
+
"""
|
|
324
|
+
try:
|
|
325
|
+
scan_column = [
|
|
326
|
+
c for c in df.columns if c.lower().startswith("scan")
|
|
327
|
+
][0]
|
|
328
|
+
except IndexError:
|
|
329
|
+
return df
|
|
330
|
+
else:
|
|
331
|
+
df = df.loc[df[scan_column].notna()]
|
|
332
|
+
return df
|
|
333
|
+
|
|
334
|
+
def to_csv(self, file_name):
|
|
335
|
+
"""
|
|
336
|
+
Writes a csv file with the given filename.
|
|
337
|
+
|
|
338
|
+
Parameters
|
|
339
|
+
----------
|
|
340
|
+
file_name :
|
|
341
|
+
The new csv file name.
|
|
342
|
+
"""
|
|
343
|
+
self.df.to_csv(file_name)
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
class CnvCollection(FileCollection):
|
|
347
|
+
"""
|
|
348
|
+
Specific methods to work with collections of .cnv files.
|
|
349
|
+
"""
|
|
350
|
+
|
|
351
|
+
def __init__(
|
|
352
|
+
self,
|
|
353
|
+
*args,
|
|
354
|
+
**kwargs,
|
|
355
|
+
):
|
|
356
|
+
super().__init__(*args, **kwargs)
|
|
357
|
+
self.data_meta_info = self.get_data_table_meta_info()
|
|
358
|
+
self.sensor_data = get_unique_sensor_data(
|
|
359
|
+
[file.sensors for file in self.data]
|
|
360
|
+
)
|
|
361
|
+
self.array = self.get_array()
|
|
362
|
+
|
|
363
|
+
def get_dataframes(
|
|
364
|
+
self,
|
|
365
|
+
event_log: bool = False,
|
|
366
|
+
coordinates: bool = False,
|
|
367
|
+
time_correction: bool = False,
|
|
368
|
+
cast_identifier: bool = False,
|
|
369
|
+
) -> list[pd.DataFrame]:
|
|
370
|
+
"""
|
|
371
|
+
Collects all individual dataframes and allows additional column
|
|
372
|
+
creation.
|
|
373
|
+
|
|
374
|
+
Parameters
|
|
375
|
+
----------
|
|
376
|
+
event_log : bool :
|
|
377
|
+
(Default value = False)
|
|
378
|
+
coordinates : bool :
|
|
379
|
+
(Default value = False)
|
|
380
|
+
time_correction : bool :
|
|
381
|
+
(Default value = False)
|
|
382
|
+
cast_identifier : bool :
|
|
383
|
+
(Default value = False)
|
|
384
|
+
Returns
|
|
385
|
+
-------
|
|
386
|
+
A list of the individual pandas DataFrames.
|
|
387
|
+
"""
|
|
388
|
+
for index, file in enumerate(self.data):
|
|
389
|
+
if event_log:
|
|
390
|
+
file.add_station_and_event_column()
|
|
391
|
+
if coordinates:
|
|
392
|
+
file.add_position_columns()
|
|
393
|
+
if time_correction:
|
|
394
|
+
file.absolute_time_calculation()
|
|
395
|
+
file.add_start_time()
|
|
396
|
+
if cast_identifier:
|
|
397
|
+
file.add_cast_number(index + 1)
|
|
398
|
+
return [file.create_dataframe() for file in self.data]
|
|
399
|
+
|
|
400
|
+
def get_data_table_meta_info(self) -> list[dict]:
|
|
401
|
+
"""
|
|
402
|
+
Ensures the same data description in all input cnv files and returns
|
|
403
|
+
it.
|
|
404
|
+
Acts as an early alarm when working on different kinds of files, which
|
|
405
|
+
cannot be concatenated together.
|
|
406
|
+
|
|
407
|
+
Returns
|
|
408
|
+
-------
|
|
409
|
+
A list of dictionaries that represent the data column information.
|
|
410
|
+
"""
|
|
411
|
+
all_column_descriptions = [
|
|
412
|
+
file.parameters.metadata for file in self.data
|
|
413
|
+
]
|
|
414
|
+
for info in all_column_descriptions:
|
|
415
|
+
if all_column_descriptions[0] != info:
|
|
416
|
+
raise AssertionError(
|
|
417
|
+
"Acting on differently formed data files, aborting"
|
|
418
|
+
)
|
|
419
|
+
return all_column_descriptions[0]
|
|
420
|
+
|
|
421
|
+
def get_array(self) -> np.ndarray:
|
|
422
|
+
"""
|
|
423
|
+
Creates a collection array of all individual file arrays.
|
|
424
|
+
|
|
425
|
+
Returns
|
|
426
|
+
-------
|
|
427
|
+
A numpy array, representing the data of all input files.
|
|
428
|
+
"""
|
|
429
|
+
return np.concatenate(
|
|
430
|
+
[file.parameters.create_full_ndarray() for file in self.data]
|
|
431
|
+
)
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
class HexCollection(FileCollection):
|
|
435
|
+
"""
|
|
436
|
+
Specific methods to work with collections of .hex files.
|
|
437
|
+
|
|
438
|
+
Especially concerned with the detection of corresponding .XMLCON files.
|
|
439
|
+
"""
|
|
440
|
+
|
|
441
|
+
def __init__(
|
|
442
|
+
self,
|
|
443
|
+
*args,
|
|
444
|
+
xmlcon_pattern: str = "",
|
|
445
|
+
path_to_xmlcons: Path | str = "",
|
|
446
|
+
**kwargs,
|
|
447
|
+
):
|
|
448
|
+
# force only_metadata, as the hex data cannot be put into a DataFrame
|
|
449
|
+
kwargs["only_metadata"] = True
|
|
450
|
+
super().__init__(*args, **kwargs)
|
|
451
|
+
if not xmlcon_pattern:
|
|
452
|
+
xmlcon_pattern = self.pattern
|
|
453
|
+
self.xmlcon_pattern = xmlcon_pattern
|
|
454
|
+
self.path_to_xmlcons = (
|
|
455
|
+
Path(path_to_xmlcons)
|
|
456
|
+
if path_to_xmlcons
|
|
457
|
+
else self.path_to_files.parent
|
|
458
|
+
)
|
|
459
|
+
self.xmlcons = self.get_xmlcons()
|
|
460
|
+
|
|
461
|
+
def get_xmlcons(self) -> list[str]:
|
|
462
|
+
"""
|
|
463
|
+
Returns all .xmlcon files found inside the root directory and its
|
|
464
|
+
children, matching a given pattern.
|
|
465
|
+
|
|
466
|
+
Does use the global sorting_key to attempt to also sort the xmlcons the
|
|
467
|
+
same way.
|
|
468
|
+
This is meant to be used in the future for a more specific hex-xmlcon
|
|
469
|
+
matching.
|
|
470
|
+
|
|
471
|
+
Returns
|
|
472
|
+
-------
|
|
473
|
+
A list of the found xmlcon filenames.
|
|
474
|
+
"""
|
|
475
|
+
try:
|
|
476
|
+
xmlcons = [
|
|
477
|
+
Path(xmlcon_path).stem
|
|
478
|
+
for xmlcon_path in sorted(
|
|
479
|
+
self.path_to_xmlcons.rglob(
|
|
480
|
+
f"*{self.xmlcon_pattern}*.XMLCON"
|
|
481
|
+
),
|
|
482
|
+
key=self.sorting_key,
|
|
483
|
+
)
|
|
484
|
+
]
|
|
485
|
+
except (KeyError, IndexError):
|
|
486
|
+
xmlcons = [
|
|
487
|
+
Path(xmlcon_path).stem
|
|
488
|
+
for xmlcon_path in self.path_to_xmlcons.rglob(
|
|
489
|
+
f"*{self.xmlcon_pattern}*.XMLCON"
|
|
490
|
+
)
|
|
491
|
+
]
|
|
492
|
+
return xmlcons
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from seabirdfilehandler import DataFile, XMLCONFile
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class HexFile(DataFile):
|
|
6
|
+
"""
|
|
7
|
+
A representation of a .hex file as used by SeaBird.
|
|
8
|
+
|
|
9
|
+
Parameters
|
|
10
|
+
----------
|
|
11
|
+
path_to_file: Path | str:
|
|
12
|
+
the path to the file
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
def __init__(
|
|
16
|
+
self,
|
|
17
|
+
path_to_file: Path | str,
|
|
18
|
+
path_to_xmlcon: Path | str = "",
|
|
19
|
+
*args,
|
|
20
|
+
**kwargs,
|
|
21
|
+
):
|
|
22
|
+
# force loading only metadata
|
|
23
|
+
super().__init__(path_to_file, True)
|
|
24
|
+
self.xmlcon = self.get_corresponding_xmlcon(path_to_xmlcon)
|
|
25
|
+
|
|
26
|
+
def get_corresponding_xmlcon(
|
|
27
|
+
self,
|
|
28
|
+
path_to_xmlcon: Path | str = "",
|
|
29
|
+
) -> XMLCONFile | None:
|
|
30
|
+
"""
|
|
31
|
+
Finds the best matching .xmlcon file inside the same directory.
|
|
32
|
+
|
|
33
|
+
Parameters
|
|
34
|
+
----------
|
|
35
|
+
path_to_xmlcon: Path | str:
|
|
36
|
+
A fixed path to a xmlcon file. Will be checked.
|
|
37
|
+
"""
|
|
38
|
+
# xmlcon path given, test and use it
|
|
39
|
+
if isinstance(path_to_xmlcon, str):
|
|
40
|
+
if path_to_xmlcon:
|
|
41
|
+
return XMLCONFile(path_to_xmlcon)
|
|
42
|
+
else:
|
|
43
|
+
if path_to_xmlcon.exists():
|
|
44
|
+
return XMLCONFile(path_to_xmlcon)
|
|
45
|
+
# no xmlcon path, lets find one in the same dir
|
|
46
|
+
# get all xmlcons in the dir
|
|
47
|
+
xmlcons = [
|
|
48
|
+
xmlcon.stem
|
|
49
|
+
for xmlcon in self.file_dir.glob("*.XMLCON", case_sensitive=False)
|
|
50
|
+
]
|
|
51
|
+
if not xmlcons:
|
|
52
|
+
return None
|
|
53
|
+
# find exact match
|
|
54
|
+
if self.file_name in xmlcons:
|
|
55
|
+
return XMLCONFile(
|
|
56
|
+
self.file_dir.joinpath(self.file_name + ".XMLCON")
|
|
57
|
+
)
|
|
58
|
+
# TODO:
|
|
59
|
+
# otherwise, take the last fitting one in a sorted xmlcon list, or,
|
|
60
|
+
# the one sharing the same prefix
|
|
61
|
+
else:
|
|
62
|
+
same_prefix_xmlcons = [
|
|
63
|
+
xmlcon
|
|
64
|
+
for xmlcon in xmlcons
|
|
65
|
+
if self.file_name.startswith(xmlcon[:5])
|
|
66
|
+
]
|
|
67
|
+
if not same_prefix_xmlcons:
|
|
68
|
+
return None
|
|
69
|
+
return XMLCONFile(
|
|
70
|
+
self.file_dir.joinpath(same_prefix_xmlcons[0] + ".XMLCON")
|
|
71
|
+
)
|
|
@@ -32,15 +32,17 @@ class Parameters(UserDict):
|
|
|
32
32
|
self,
|
|
33
33
|
data: list,
|
|
34
34
|
metadata: list,
|
|
35
|
+
only_header: bool = False,
|
|
35
36
|
):
|
|
36
37
|
self.raw_input_data = data
|
|
37
38
|
self.raw_metadata = metadata
|
|
38
|
-
self.full_data_array = self.create_full_ndarray()
|
|
39
39
|
self.differentiate_table_description()
|
|
40
40
|
self.metadata, self.duplicate_columns = self.reading_data_header(
|
|
41
41
|
metadata
|
|
42
42
|
)
|
|
43
|
-
|
|
43
|
+
if not only_header:
|
|
44
|
+
self.full_data_array = self.create_full_ndarray()
|
|
45
|
+
self.data = self.create_parameter_instances()
|
|
44
46
|
|
|
45
47
|
def get_parameter_names(self) -> list[str]:
|
|
46
48
|
return [parameter["name"] for parameter in self.metadata.values()]
|
|
@@ -33,6 +33,23 @@ class XMLFile(UserDict):
|
|
|
33
33
|
self.xml_tree = ET.fromstring(self.input)
|
|
34
34
|
self.data = xmltodict.parse(self.input)
|
|
35
35
|
|
|
36
|
+
def __eq__(self, other) -> bool:
|
|
37
|
+
"""
|
|
38
|
+
Allows comparison of two instances of this class.
|
|
39
|
+
|
|
40
|
+
Uses the parsed xml information to determine equality.
|
|
41
|
+
|
|
42
|
+
Parameters
|
|
43
|
+
----------
|
|
44
|
+
other: XMLFile
|
|
45
|
+
An instance of this class.
|
|
46
|
+
|
|
47
|
+
Returns
|
|
48
|
+
-------
|
|
49
|
+
Whether the given instance and this one are equal.
|
|
50
|
+
"""
|
|
51
|
+
return self.data == other.data
|
|
52
|
+
|
|
36
53
|
def to_xml(self, file_name=None, file_path=None):
|
|
37
54
|
"""
|
|
38
55
|
Writes the dictionary to xml.
|
|
@@ -1,258 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
import logging
|
|
3
|
-
from collections import UserList
|
|
4
|
-
from typing import Callable, Type
|
|
5
|
-
import pandas as pd
|
|
6
|
-
import numpy as np
|
|
7
|
-
from seabirdfilehandler import (
|
|
8
|
-
CnvFile,
|
|
9
|
-
BottleFile,
|
|
10
|
-
BottleLogFile,
|
|
11
|
-
)
|
|
12
|
-
from seabirdfilehandler import DataFile
|
|
13
|
-
from seabirdfilehandler.utils import get_unique_sensor_data
|
|
14
|
-
|
|
15
|
-
logger = logging.getLogger(__name__)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class FileCollection(UserList):
|
|
19
|
-
"""A representation of multiple files of the same kind. These files share
|
|
20
|
-
the same suffix and are otherwise closely connected to each other. A common
|
|
21
|
-
use case would be the collection of CNVs to allow for easier processing or
|
|
22
|
-
integration of field calibration measurements.
|
|
23
|
-
|
|
24
|
-
Parameters
|
|
25
|
-
----------
|
|
26
|
-
|
|
27
|
-
Returns
|
|
28
|
-
-------
|
|
29
|
-
|
|
30
|
-
"""
|
|
31
|
-
|
|
32
|
-
def __init__(
|
|
33
|
-
self,
|
|
34
|
-
path_to_files: str | Path,
|
|
35
|
-
file_suffix: str,
|
|
36
|
-
only_metadata: bool = False,
|
|
37
|
-
sorting_key: Callable | None = None,
|
|
38
|
-
):
|
|
39
|
-
super().__init__()
|
|
40
|
-
self.path_to_files = Path(path_to_files)
|
|
41
|
-
self.file_suffix = file_suffix.strip(".")
|
|
42
|
-
self.file_type: Type[DataFile]
|
|
43
|
-
self.extract_file_type()
|
|
44
|
-
self.individual_file_paths = []
|
|
45
|
-
self.collect_files(sorting_key=sorting_key)
|
|
46
|
-
self.load_files(only_metadata)
|
|
47
|
-
if not only_metadata:
|
|
48
|
-
if self.file_type == DataFile:
|
|
49
|
-
self.df_list = self.get_dataframes()
|
|
50
|
-
self.df = self.get_collection_dataframe(self.df_list)
|
|
51
|
-
if self.file_type == CnvFile:
|
|
52
|
-
self.data_meta_info = self.get_data_table_meta_info()
|
|
53
|
-
self.sensor_data = get_unique_sensor_data(
|
|
54
|
-
[file.sensors for file in self.data]
|
|
55
|
-
)
|
|
56
|
-
|
|
57
|
-
def __str__(self):
|
|
58
|
-
return "/n".join(self.data)
|
|
59
|
-
|
|
60
|
-
def extract_file_type(self):
|
|
61
|
-
""" """
|
|
62
|
-
mapping_suffix_to_type = {
|
|
63
|
-
"cnv": CnvFile,
|
|
64
|
-
"btl": BottleFile,
|
|
65
|
-
"bl": BottleLogFile,
|
|
66
|
-
}
|
|
67
|
-
for key, value in mapping_suffix_to_type.items():
|
|
68
|
-
if key == self.file_suffix:
|
|
69
|
-
self.file_type = value
|
|
70
|
-
break
|
|
71
|
-
else:
|
|
72
|
-
self.file_type = DataFile
|
|
73
|
-
|
|
74
|
-
def collect_files(
|
|
75
|
-
self,
|
|
76
|
-
sorting_key: Callable | None = lambda file: int(
|
|
77
|
-
file.stem.split("_")[3]
|
|
78
|
-
),
|
|
79
|
-
):
|
|
80
|
-
""" """
|
|
81
|
-
self.individual_file_paths = sorted(
|
|
82
|
-
self.path_to_files.rglob(f"*{self.file_suffix}"),
|
|
83
|
-
key=sorting_key,
|
|
84
|
-
)
|
|
85
|
-
|
|
86
|
-
def load_files(self, only_metadata: bool = False):
|
|
87
|
-
""" """
|
|
88
|
-
for file in self.individual_file_paths:
|
|
89
|
-
try:
|
|
90
|
-
self.data.append(self.file_type(file))
|
|
91
|
-
except TypeError:
|
|
92
|
-
logger.error(
|
|
93
|
-
f"Could not open file {file} with the type "
|
|
94
|
-
f"{self.file_type}."
|
|
95
|
-
)
|
|
96
|
-
continue
|
|
97
|
-
|
|
98
|
-
def get_dataframes(
|
|
99
|
-
self,
|
|
100
|
-
event_log: bool = False,
|
|
101
|
-
coordinates: bool = False,
|
|
102
|
-
time_correction: bool = False,
|
|
103
|
-
cast_identifier: bool = False,
|
|
104
|
-
long_header_names: bool = False,
|
|
105
|
-
full_data_header: bool = True,
|
|
106
|
-
) -> list[pd.DataFrame]:
|
|
107
|
-
"""
|
|
108
|
-
|
|
109
|
-
Parameters
|
|
110
|
-
----------
|
|
111
|
-
event_log: bool :
|
|
112
|
-
(Default value = False)
|
|
113
|
-
coordinates: bool :
|
|
114
|
-
(Default value = False)
|
|
115
|
-
time_correction: bool :
|
|
116
|
-
(Default value = False)
|
|
117
|
-
cast_identifier: bool :
|
|
118
|
-
(Default value = False)
|
|
119
|
-
long_header_names: bool :
|
|
120
|
-
(Default value = False)
|
|
121
|
-
full_data_header: bool :
|
|
122
|
-
(Default value = True)
|
|
123
|
-
|
|
124
|
-
Returns
|
|
125
|
-
-------
|
|
126
|
-
|
|
127
|
-
"""
|
|
128
|
-
for index, file in enumerate(self.data):
|
|
129
|
-
if full_data_header:
|
|
130
|
-
file.rename_dataframe_header(header_detail_level="longinfo")
|
|
131
|
-
elif long_header_names:
|
|
132
|
-
file.rename_dataframe_header(header_detail_level="name")
|
|
133
|
-
if event_log:
|
|
134
|
-
file.add_station_and_event_column()
|
|
135
|
-
if coordinates:
|
|
136
|
-
file.add_position_columns()
|
|
137
|
-
if time_correction:
|
|
138
|
-
file.absolute_time_calculation()
|
|
139
|
-
file.add_start_time()
|
|
140
|
-
if cast_identifier:
|
|
141
|
-
file.add_cast_number(index + 1)
|
|
142
|
-
return [file.df for file in self.data]
|
|
143
|
-
|
|
144
|
-
def get_collection_dataframe(
|
|
145
|
-
self, list_of_dfs: list[pd.DataFrame] | None = None
|
|
146
|
-
) -> pd.DataFrame:
|
|
147
|
-
"""
|
|
148
|
-
|
|
149
|
-
Parameters
|
|
150
|
-
----------
|
|
151
|
-
list_of_dfs: list[pd.DataFrame] | None :
|
|
152
|
-
(Default value = None)
|
|
153
|
-
|
|
154
|
-
Returns
|
|
155
|
-
-------
|
|
156
|
-
|
|
157
|
-
"""
|
|
158
|
-
if not list_of_dfs:
|
|
159
|
-
list_of_dfs = self.get_dataframes()
|
|
160
|
-
df = pd.concat(list_of_dfs, ignore_index=True)
|
|
161
|
-
# df.meta.metadata = list_of_dfs[0].meta.metadata
|
|
162
|
-
return df
|
|
163
|
-
|
|
164
|
-
def tidy_collection_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
165
|
-
"""
|
|
166
|
-
|
|
167
|
-
Parameters
|
|
168
|
-
----------
|
|
169
|
-
df: pd.DataFrame :
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
Returns
|
|
173
|
-
-------
|
|
174
|
-
|
|
175
|
-
"""
|
|
176
|
-
df = self.use_bad_flag_for_nan(df)
|
|
177
|
-
df = self.set_dtype_to_float(df)
|
|
178
|
-
return self.select_real_scan_data(df)
|
|
179
|
-
|
|
180
|
-
def use_bad_flag_for_nan(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
181
|
-
"""
|
|
182
|
-
|
|
183
|
-
Parameters
|
|
184
|
-
----------
|
|
185
|
-
df: pd.DataFrame :
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
Returns
|
|
189
|
-
-------
|
|
190
|
-
|
|
191
|
-
"""
|
|
192
|
-
bad_flags = set()
|
|
193
|
-
for file in self.data:
|
|
194
|
-
for line in file.data_table_description:
|
|
195
|
-
if line.startswith("bad_flag"):
|
|
196
|
-
flag = line.split("=")[1].strip()
|
|
197
|
-
bad_flags.add(flag)
|
|
198
|
-
for flag in bad_flags:
|
|
199
|
-
df.replace(to_replace=flag, value=np.nan, inplace=True)
|
|
200
|
-
return df
|
|
201
|
-
|
|
202
|
-
def set_dtype_to_float(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
203
|
-
"""
|
|
204
|
-
|
|
205
|
-
Parameters
|
|
206
|
-
----------
|
|
207
|
-
df: pd.DataFrame :
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
Returns
|
|
211
|
-
-------
|
|
212
|
-
|
|
213
|
-
"""
|
|
214
|
-
for parameter in df.columns:
|
|
215
|
-
if parameter in ["datetime"]:
|
|
216
|
-
continue
|
|
217
|
-
try:
|
|
218
|
-
df[parameter] = df[parameter].astype("float")
|
|
219
|
-
finally:
|
|
220
|
-
continue
|
|
221
|
-
return df
|
|
222
|
-
|
|
223
|
-
def select_real_scan_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
224
|
-
"""
|
|
225
|
-
|
|
226
|
-
Parameters
|
|
227
|
-
----------
|
|
228
|
-
df: pd.DataFrame :
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
Returns
|
|
232
|
-
-------
|
|
233
|
-
|
|
234
|
-
"""
|
|
235
|
-
# TODO: fix this hardcoded name
|
|
236
|
-
try:
|
|
237
|
-
df = df.loc[df["Scan Count"].notna()]
|
|
238
|
-
finally:
|
|
239
|
-
pass
|
|
240
|
-
return df
|
|
241
|
-
|
|
242
|
-
def to_csv(self, file_name):
|
|
243
|
-
"""
|
|
244
|
-
|
|
245
|
-
Parameters
|
|
246
|
-
----------
|
|
247
|
-
file_name :
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
Returns
|
|
251
|
-
-------
|
|
252
|
-
|
|
253
|
-
"""
|
|
254
|
-
self.get_collection_dataframe().to_csv(file_name)
|
|
255
|
-
|
|
256
|
-
def get_data_table_meta_info(self) -> list[list[dict]]:
|
|
257
|
-
""" """
|
|
258
|
-
return [file.parameters.metadata for file in self.data]
|
|
File without changes
|
|
File without changes
|
{seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/src/seabirdfilehandler/bottlelogfile.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{seabirdfilehandler-0.5.3 → seabirdfilehandler-0.6.0}/src/seabirdfilehandler/validation_modules.py
RENAMED
|
File without changes
|