seabirdfilehandler 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of seabirdfilehandler might be problematic.
- seabirdfilehandler/datatablefiles.py +160 -124
- seabirdfilehandler/file_collection.py +140 -46
- seabirdfilehandler/parameter.py +37 -13
- seabirdfilehandler/seabirdfiles.py +16 -6
- seabirdfilehandler/utils.py +53 -0
- seabirdfilehandler/validation_modules.py +18 -15
- seabirdfilehandler/xmlfiles.py +54 -0
- {seabirdfilehandler-0.4.1.dist-info → seabirdfilehandler-0.4.3.dist-info}/METADATA +4 -3
- seabirdfilehandler-0.4.3.dist-info/RECORD +14 -0
- seabirdfilehandler-0.4.1.dist-info/RECORD +0 -13
- {seabirdfilehandler-0.4.1.dist-info → seabirdfilehandler-0.4.3.dist-info}/LICENSE +0 -0
- {seabirdfilehandler-0.4.1.dist-info → seabirdfilehandler-0.4.3.dist-info}/WHEEL +0 -0
seabirdfilehandler/datatablefiles.py
CHANGED
@@ -9,7 +9,10 @@ import warnings
 from seabirdfilehandler.parameter import Parameters
 from seabirdfilehandler.validation_modules import CnvValidationList
 from seabirdfilehandler.seabirdfiles import SeaBirdFile
-from seabirdfilehandler.dataframe_meta_accessor import SeriesMetaAccessor, DataFrameMetaAccessor  # noqa: F401
+from seabirdfilehandler.dataframe_meta_accessor import (
+    SeriesMetaAccessor,  # noqa: F401
+    DataFrameMetaAccessor,  # noqa: F401
+)
 
 logger = logging.getLogger(__name__)
 
@@ -54,8 +57,8 @@ class DataTableFile(SeaBirdFile):
         """
         file_path = self.file_dir if file_path is None else file_path
         file_name = self.file_name if file_name is None else file_name
-        if file_type[0] != '.':
-            file_type = '.' + file_type
+        if file_type[0] != ".":
+            file_type = "." + file_type
         return Path(file_path).joinpath(file_name).with_suffix(file_type)
 
     def to_csv(
@@ -85,18 +88,21 @@ class DataTableFile(SeaBirdFile):
         -------
 
         """
-        selected_columns = self.df.columns if selected_columns is None else selected_columns
+        selected_columns = (
+            self.df.columns if selected_columns is None else selected_columns
+        )
         df = self.df[selected_columns].reset_index(drop=True)
         new_file_path = self.define_output_path(
-            output_file_path, output_file_name)
+            output_file_path, output_file_name
+        )
         if with_header:
-            with open(new_file_path, 'w') as file:
+            with open(new_file_path, "w") as file:
                 for line in self.header:
                     file.write(line)
-            df.to_csv(new_file_path, index=False, mode='a')
+            df.to_csv(new_file_path, index=False, mode="a")
         else:
-            df.to_csv(new_file_path, index=False, mode='w')
-        logger.info(f'Wrote file {self.path_to_file} to {new_file_path}.')
+            df.to_csv(new_file_path, index=False, mode="w")
+        logger.info(f"Wrote file {self.path_to_file} to {new_file_path}.")
 
     def selecting_columns(
         self,
@@ -159,12 +165,17 @@ class BottleFile(DataTableFile):
         top_names, bottom_names = self.reading_data_header()
         # creating statistics column to store the row type information:
         # 4 rows per bottle, average, standard deviation, max value, min value
-        top_names.append('Statistic')
+        top_names.append("Statistic")
         # TODO: sexier way to construct dataframe than opening the file a
         # second time
         # # df = pd.DataFrame(self.data, index=None, columns=top_names)
-        df: pd.DataFrame = pd.read_fwf(
-            self.path_to_file, index_col=False, skiprows=len(self.header) + 2, header=None, names=top_names)
+        df: pd.DataFrame = pd.read_fwf(
+            self.path_to_file,
+            index_col=False,
+            skiprows=len(self.header) + 2,
+            header=None,
+            names=top_names,
+        )
 
         # handling the double row header
         rowtypes = df[df.columns[-1]].unique()
@@ -184,22 +195,25 @@ class BottleFile(DataTableFile):
 
             """
             column_idx = df.columns.get_loc(column)
-            old_column = df.iloc[::length, column_idx].reset_index(drop=True)
-            new_column = df.iloc[1::length, column_idx].reset_index(drop=True)
+            old_column = df.iloc[::length, column_idx].reset_index(drop=True)
+            new_column = df.iloc[1::length, column_idx].reset_index(drop=True)
             old_column_expanded = pd.Series(
-                np.repeat(old_column, length)).reset_index(drop=True)
+                np.repeat(old_column, length)
+            ).reset_index(drop=True)
             new_column_expanded = pd.Series(
-                np.repeat(new_column, length)).reset_index(drop=True)
+                np.repeat(new_column, length)
+            ).reset_index(drop=True)
             df[column] = old_column_expanded
-            df.insert(
-                column_idx + 1, bottom_names[column_idx], new_column_expanded)
+            df.insert(
+                column_idx + 1, bottom_names[column_idx], new_column_expanded
+            )
             return df
 
-        df = separate_double_header_row(df, 'Date', len(rowtypes))
+        df = separate_double_header_row(df, "Date", len(rowtypes))
         df = separate_double_header_row(df, top_names[0], len(rowtypes))
         # remove brackets around statistics values
-        df["Statistic"] = df["Statistic"].str.strip('()')
-        df = df.rename(mapper={'Btl_ID': 'Bottle_ID'}, axis=1)
+        df["Statistic"] = df["Statistic"].str.strip("()")
+        df = df.rename(mapper={"Btl_ID": "Bottle_ID"}, axis=1)
         return df
 
     def adding_timestamp_column(self):
@@ -216,9 +230,10 @@ class BottleFile(DataTableFile):
         # constructing timestamp column
         timestamp = []
         for datepoint, timepoint in zip(self.df.Date, self.df.Time):
-            timestamp.append(datetime.combine(
-                datepoint, time.fromisoformat(str(timepoint))))
-        self.df.insert(2, 'Timestamp', timestamp)
+            timestamp.append(
+                datetime.combine(datepoint, time.fromisoformat(str(timepoint)))
+            )
+        self.df.insert(2, "Timestamp", timestamp)
         self.df.Timestamp = pd.to_datetime(self.df.Timestamp)
 
     def setting_dataframe_dtypes(self):
@@ -228,9 +243,9 @@ class BottleFile(DataTableFile):
         self.df.Date = pd.to_datetime(self.df.Date)
         self.df.Bottle_ID = self.df.Bottle_ID.astype(int)
 
-    def selecting_rows(self,
-                       df=None,
-                       statistic_of_interest: Union[list, str] = ['avg']):
+    def selecting_rows(
+        self, df=None, statistic_of_interest: Union[list, str] = ["avg"]
+    ):
         """Creates a dataframe with the given row identifier, using the
         statistics column. A single string or a list of strings can be
         processed.
@@ -251,7 +266,7 @@ class BottleFile(DataTableFile):
         # ensure that the input is a list, so that isin() can do its job
         if isinstance(statistic_of_interest, str):
             statistic_of_interest = [statistic_of_interest]
-        self.df = df.loc[df['Statistic'].isin(statistic_of_interest)]
+        self.df = df.loc[df["Statistic"].isin(statistic_of_interest)]
 
     def reading_data_header(self):
         """Identifies and separatly collects the rows that specify the data
@@ -267,21 +282,24 @@ class BottleFile(DataTableFile):
         n = 11  # fix column width of a seabird btl file
         top_line = self.data[0]
         second_line = self.data[1]
-        top_names = [top_line[i:i + n].split()[0]
-                     for i in range(0, len(top_line) - n, n)]
-        bottom_names = [second_line[i:i + n].split()[0]
-                        for i in range(0, 2 * n, n)]
+        top_names = [
+            top_line[i : i + n].split()[0]
+            for i in range(0, len(top_line) - n, n)
+        ]
+        bottom_names = [
+            second_line[i : i + n].split()[0] for i in range(0, 2 * n, n)
+        ]
         return top_names, bottom_names
 
     def add_station_and_event_column(self):
-        event_list = [self.metadata['Station'] for _ in self.data]
-        self.df.insert(0, 'Event', pd.Series(event_list))
+        event_list = [self.metadata["Station"] for _ in self.data]
+        self.df.insert(0, "Event", pd.Series(event_list))
 
     def add_position_columns(self):
-        latitude_list = [self.metadata['GPS_Lat'] for _ in self.data]
-        self.df.insert(1, 'Latitude', pd.Series(latitude_list))
-        longitude_list = [self.metadata['GPS_Lon'] for _ in self.data]
-        self.df.insert(2, 'Longitude', pd.Series(longitude_list))
+        latitude_list = [self.metadata["GPS_Lat"] for _ in self.data]
+        self.df.insert(1, "Latitude", pd.Series(latitude_list))
+        longitude_list = [self.metadata["GPS_Lon"] for _ in self.data]
+        self.df.insert(2, "Longitude", pd.Series(longitude_list))
 
 
 class CnvFile(DataTableFile):
@@ -331,12 +349,13 @@ class CnvFile(DataTableFile):
         self.start_time = self.reading_start_time()
         if create_dataframe:
             warnings.warn(
-                "The default of constructing a pandas Dataframe will soon be replaced by using the Parameters class that works on numpy arrays.",
+                "The default of constructing a pandas Dataframe will soon be replaced by using the Parameters class that works on numpy arrays.",
                 DeprecationWarning,
-                stacklevel=2  # Ensures the warning points to the caller's line
+                stacklevel=2,  # Ensures the warning points to the caller's line
+            )
+            self.data_header_meta_info, self.duplicate_columns = (
+                self.reading_data_header(self.data_table_description)
             )
-            self.data_header_meta_info, self.duplicate_columns = self.reading_data_header(
-                self.data_table_description)
             self.original_df = self.create_dataframe(data_table_info_level)
             self.df = self.original_df
             if absolute_time_calculation:
@@ -347,9 +366,12 @@ class CnvFile(DataTableFile):
                 self.add_position_columns()
         else:
             self.parameters = Parameters(
-                self.data, self.data_table_description)
+                self.data, self.data_table_description
+            )
 
-    def reading_data_header(self, header_info: list = []) -> Tuple[dict[str, dict], list[int]]:
+    def reading_data_header(
+        self, header_info: list = []
+    ) -> Tuple[dict[str, dict], list[int]]:
         """Reads the tables header data from the header.
 
         Parameters
@@ -362,7 +384,7 @@ class CnvFile(DataTableFile):
         a list of dictionaries, that organize the table header information
 
         """
-        if header_info
+        if header_info == []:
             header_info = self.data_table_description
         table_header = {}
         duplicate_columns = []
@@ -380,16 +402,20 @@ class CnvFile(DataTableFile):
             if shortname in list(table_header.keys()):
                 try:
                     duplicate_columns.append(
-                        int(line.split("=")[0].strip().split()[1]))
-                except IndexError as error:
+                        int(line.split("=")[0].strip().split()[1])
+                    )
+                except IndexError as error:
                     logger.error(
                         f"Could not resolve duplicate column: {
-                            shortname}, {error}")
+                            shortname
+                        }, {error}"
+                    )
             else:
                 header_meta_info["shortname"] = shortname
                 header_meta_info["longinfo"] = longinfo.strip()
                 metainfo = self._extract_data_header_meta_info(
-                    longinfo.strip())
+                    longinfo.strip()
+                )
                 header_meta_info = {**header_meta_info, **metainfo}
                 table_header[shortname.strip()] = header_meta_info
         return table_header, duplicate_columns
@@ -408,22 +434,24 @@ class CnvFile(DataTableFile):
 
         """
         regex_string = r"(?:(?P<name0>.+),\s(?P<metainfo0>.+)\s\[(?P<unit0>.+)\]|(?P<name2>.+)\s\[(?P<unit2>.+)\]|(?P<name3>.+),\s(?P<metainfo2>.[^\s]+)|(?P<name4>.+))"
-        regex_check = re.search(
-            regex_string, line, flags=re.IGNORECASE)
+        regex_check = re.search(regex_string, line, flags=re.IGNORECASE)
        if regex_check:
             regex_info = dict(regex_check.groupdict())
-            regex_info = {key[:-1]: value
-                          for key, value in regex_info.items() if value is not None}
+            regex_info = {
+                key[:-1]: value
+                for key, value in regex_info.items()
+                if value is not None
+            }
             if len(regex_info) > 2:
                 # check for second sensors and adjust their names
-                if regex_info["metainfo"][-1] == '2':
+                if regex_info["metainfo"][-1] == "2":
                     regex_info["name"] = regex_info["name"] + " 2"
-                    regex_info["metainfo"] = regex_info['metainfo'][:-1]
+                    regex_info["metainfo"] = regex_info["metainfo"][:-1]
                 if len(regex_info["metainfo"]) == 0:
                     regex_info.pop("metainfo")
-            if regex_info['name'] == 'flag':
-                regex_info["metainfo"] = regex_info['name']
-                regex_info["unit"] = regex_info['name']
+            if regex_info["name"] == "flag":
+                regex_info["metainfo"] = regex_info["name"]
+                regex_info["unit"] = regex_info["name"]
             return regex_info
         return {}
 
@@ -450,7 +478,10 @@ class CnvFile(DataTableFile):
         row_list = []
         for line in self.data:
             row_list.append(
-                [line[i:i + n].split()[0] for i in range(0, len(line) - n, n)]
+                [
+                    line[i : i + n].split()[0]
+                    for i in range(0, len(line) - n, n)
+                ]
             )
         df = pd.DataFrame(row_list, dtype=float)
         header_names = [
@@ -463,8 +494,9 @@ class CnvFile(DataTableFile):
         try:
             df.columns = header_names
         except ValueError as error:
-            logger.error(
-                f"Could not set dataframe header for {self.file_name}: {error}")
+            logger.error(
+                f"Could not set dataframe header for {self.file_name}: {error}"
+            )
             logger.error(header_names)
         else:
             df.meta.metadata = self.data_header_meta_info
@@ -472,26 +504,27 @@ class CnvFile(DataTableFile):
         return df
 
     def rename_dataframe_header(
-            self,
-            df: pd.DataFrame | None = None,
-            header_detail_level: str = 'shortname',
+        self,
+        df: pd.DataFrame | None = None,
+        header_detail_level: str = "shortname",
     ) -> list:
         df = self.df if df is None else df
         df.meta.rename(header_detail_level)
         return [column for column in df.columns]
 
     def reading_start_time(
-            self,
-            time_source: str = "System UTC",
+        self,
+        time_source: str = "System UTC",
     ) -> datetime | None:
         """
         Extracts the Cast start time from the metadata header.
         """
         for line in self.sbe9_data:
             if line.startswith(time_source):
-                start_time = line.split('=')[1]
+                start_time = line.split("=")[1]
                 start_time = datetime.strptime(
-                    start_time, ' %b %d %Y %H:%M:%S ')
+                    start_time, " %b %d %Y %H:%M:%S "
+                )
                 return start_time
         return None
 
@@ -504,11 +537,11 @@ class CnvFile(DataTableFile):
         """
         time_parameter = None
         for parameter in self.df.columns:
-            if parameter.lower().startswith('time'):
+            if parameter.lower().startswith("time"):
                 time_parameter = parameter
         if time_parameter and self.start_time:
             self.df.meta.add_column(
-                name='datetime',
+                name="datetime",
                 data=[
                     timedelta(days=float(time)) + self.start_time
                     if time_parameter == "timeJ"
@@ -526,7 +559,7 @@ class CnvFile(DataTableFile):
         """
         if self.start_time:
             self.df.meta.add_column(
-                name='start_time',
+                name="start_time",
                 data=pd.Series([self.start_time for _ in self.data]),
             )
             return True
@@ -541,9 +574,9 @@ class CnvFile(DataTableFile):
         return CnvValidationList(validation_modules)
 
     def df2cnv(
-            self,
-            header_names: list | None = None,
-            header_detail_level: str | None = None,
+        self,
+        header_names: list | None = None,
+        header_detail_level: str | None = None,
     ) -> list:
         """
         Parses a pandas dataframe into a list that represents the lines inside
@@ -577,8 +610,9 @@ class CnvFile(DataTableFile):
         )
         cnv_out = []
         for _, row in df.iterrows():
-            cnv_like_row = ''.join(
-                (lambda column: f"{str(column):>11}")(value) for value in row)
+            cnv_like_row = "".join(
+                (lambda column: f"{str(column):>11}")(value) for value in row
+            )
             cnv_out.append(cnv_like_row + "\n")
         return cnv_out
 
@@ -620,25 +654,24 @@ class CnvFile(DataTableFile):
         self.file_data = [*self.header, *data]
         # writing content out
         try:
-            with open(file_name, 'w', encoding='latin-1') as file:
+            with open(file_name, "w", encoding="latin-1") as file:
                 for line in self.file_data:
                     file.write(line)
-            logger.info(f'Wrote cnv {self.path_to_file} to {file_name}.')
 
         except IOError as error:
-            logger.error(f'Could not write cnv file: {error}')
+            logger.error(f"Could not write cnv file: {error}")
 
     def _update_header(self):
         """Re-creates the cnv header."""
         self.data_table_description = self._form_data_table_info()
         self.header = [
-            *[f'* {data}' for data in self.sbe9_data[:-1]],
-            *[f'** {data}' for data in self.metadata_list],
-            f'* {self.sbe9_data[-1]}',
-            *[f'# {data}' for data in self.data_table_description],
-            *[f'# {data}' for data in self.sensor_data],
-            *[f'# {data}' for data in self.processing_info],
-            '*END*\n',
+            *[f"* {data}" for data in self.sbe9_data[:-1]],
+            *[f"** {data}" for data in self.metadata_list],
+            f"* {self.sbe9_data[-1]}",
+            *[f"# {data}" for data in self.data_table_description],
+            *[f"# {data}" for data in self.sensor_data],
+            *[f"# {data}" for data in self.processing_info],
+            "*END*\n",
         ]
 
     def _form_data_table_info(self) -> list:
@@ -680,14 +713,12 @@ class CnvFile(DataTableFile):
 
         """
         try:
-            event_list = [self.metadata['Station'] for _ in self.data]
+            event_list = [self.metadata["Station"] for _ in self.data]
         except KeyError:
             return False
         else:
             self.df.meta.add_column(
-                name='Event',
-                data=pd.Series(event_list),
-                location=0
+                name="Event", data=pd.Series(event_list), location=0
             )
             return True
 
@@ -697,26 +728,22 @@ class CnvFile(DataTableFile):
         These must be present inside the extra metadata header.
 
         """
-        if ('latitude' or 'longitude') in [
-                column['shortname']
+        if ("latitude" or "longitude") in [
+            column["shortname"]
             for column in list(self.df.meta.metadata.values())
         ]:
             return True
         try:
-            latitude_list = [self.metadata['GPS_Lat'] for _ in self.data]
-            longitude_list = [self.metadata['GPS_Lon'] for _ in self.data]
+            latitude_list = [self.metadata["GPS_Lat"] for _ in self.data]
+            longitude_list = [self.metadata["GPS_Lon"] for _ in self.data]
         except KeyError:
             return False
         else:
             self.df.meta.add_column(
-                name='Latitude',
-                data=pd.Series(latitude_list),
-                location=1
+                name="Latitude", data=pd.Series(latitude_list), location=1
             )
             self.df.meta.add_column(
-                name='Longitude',
-                data=pd.Series(longitude_list),
-                location=2
+                name="Longitude", data=pd.Series(longitude_list), location=2
             )
             return True
 
@@ -730,13 +757,13 @@ class CnvFile(DataTableFile):
             the cast number of this files cast
 
         """
-        if ('Cast' in self.metadata.keys()) and (not number):
-            number = int(self.metadata['Cast'])
+        if ("Cast" in self.metadata.keys()) and (not number):
+            number = int(self.metadata["Cast"])
         try:
             self.df.meta.add_column(
-                name='Cast',
+                name="Cast",
                 data=pd.Series([number for _ in self.data]),
-                location=0
+                location=0,
             )
         except ValueError:
             # Cast is already part of the dataframe, so nothing left to do
@@ -758,7 +785,7 @@ class BottleLogFile(DataTableFile):
 
     """
 
-    def __init__(self, path_to_file, create_dataframe=True):
+    def __init__(self, path_to_file, create_dataframe=False):
         super().__init__(path_to_file)
         self.reset_time = self.obtaining_reset_time()
         self.origin_cnv = self.raw_file_data[0].strip()
@@ -769,8 +796,6 @@ class BottleLogFile(DataTableFile):
             self.df = self.original_df
         else:
             self.data_list = self.create_list()
-
-
 
     def data_whitespace_removal(self) -> list:
         """Strips the input from whitespace characters, in this case especially
@@ -801,13 +826,15 @@ class BottleLogFile(DataTableFile):
 
         """
 
-        regex_check = re.search(
-            r"RESET\s(\w{3}\s\d+\s\d{4}\s\d\d:\d\d:\d\d)", self.raw_file_data[1])
+        regex_check = re.search(
+            r"RESET\s(\w{3}\s\d+\s\d{4}\s\d\d:\d\d:\d\d)",
+            self.raw_file_data[1],
+        )
         if regex_check:
-            return datetime.strptime(regex_check.group(1), '%b %d %Y %H:%M:%S')
+            return datetime.strptime(regex_check.group(1), "%b %d %Y %H:%M:%S")
         else:
-            error_message = '''BottleLogFile is not formatted as expected:
-                Reset time could not be extracted.'''
+            error_message = """BottleLogFile is not formatted as expected:
+                Reset time could not be extracted."""
             logger.error(error_message)
             raise IOError(error_message)
 
@@ -828,13 +855,12 @@ class BottleLogFile(DataTableFile):
             bottles = [int(x) for x in self.data[i].split(",")[:2]]
             date = self.convert_date(self.data[i].split(",")[2])
             lines = tuple([int(x) for x in self.data[i].split(",")[3:]])
-
+
             content_array.append([bottles, date, lines])
-
+
         return content_array
-
 
-    def convert_date(self, date):
+    def convert_date(self, date: str):
         """Converts the Dates of the .bl files to an ISO 8601 standard
 
         Parameters
@@ -845,8 +871,21 @@ class BottleLogFile(DataTableFile):
         a string with the date in the form of "yymmddThhmmss"
         """
         date = date.strip()
-        month_list = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
-                      'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
+        month_list = [
+            "Jan",
+            "Feb",
+            "Mar",
+            "Apr",
+            "May",
+            "Jun",
+            "Jul",
+            "Aug",
+            "Sep",
+            "Oct",
+            "Nov",
+            "Dec",
+        ]
+
         month_ind = month_list.index(date.split(" ")[0]) + 1
         if month_ind < 10:
             month = "0" + str(month_ind)
@@ -856,9 +895,7 @@ class BottleLogFile(DataTableFile):
         year = (date.split(" ")[2])[2:]
         time = date.split(" ")[3].replace(":", "")
         return year + month + day + "T" + time
-
 
-
     def create_dataframe(self) -> pd.DataFrame:
         """Creates a dataframe from the list specified in self.data.
 
@@ -871,16 +908,15 @@ class BottleLogFile(DataTableFile):
         """
         data_lists = []
         for line in self.data:
-            inner_list = line.split(',')
+            inner_list = line.split(",")
             # dropping first column as its the index
             data_lists.append(inner_list[1:])
         df = pd.DataFrame(data_lists)
-        df.columns = ['Bottle ID', 'Datetime', 'start_range', 'end_range']
+        df.columns = ["Bottle ID", "Datetime", "start_range", "end_range"]
         return df
 
 
 class FieldCalibrationFile(DataTableFile):
-
     def __init__(self, path_to_file):
         super().__init__(path_to_file)
         self.original_df = self.create_dataframe()
@@ -890,5 +926,5 @@ class FieldCalibrationFile(DataTableFile):
         try:
             return pd.read_csv(self.path_to_file, skiprows=len(self.header))
         except IOError as error:
-            logger.error(f'Could not read field calibration file: {error}.')
+            logger.error(f"Could not read field calibration file: {error}.")
             return pd.DataFrame()
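The deprecation warning introduced above changes how CnvFile data is meant to be consumed. A minimal sketch of both paths, assuming the constructor flags shown in this diff are keyword arguments (the file name is hypothetical):

from seabirdfilehandler import CnvFile

# legacy path: builds a pandas DataFrame and now emits a DeprecationWarning
cnv = CnvFile("cast001.cnv", create_dataframe=True)  # hypothetical file name
print(cnv.df.head())

# new path: the numpy-backed Parameters container
cnv = CnvFile("cast001.cnv", create_dataframe=False)
df = cnv.parameters.get_pandas_dataframe()  # a DataFrame is still available on demand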
seabirdfilehandler/file_collection.py
CHANGED
@@ -4,8 +4,14 @@ from collections import UserList
 from typing import Type
 import pandas as pd
 import numpy as np
-from seabirdfilehandler import SeaBirdFile, CnvFile, BottleFile, BottleLogFile
+from seabirdfilehandler import (
+    SeaBirdFile,
+    CnvFile,
+    BottleFile,
+    BottleLogFile,
+)
 from seabirdfilehandler.datatablefiles import DataTableFile
+from seabirdfilehandler.utils import get_unique_sensor_data
 
 logger = logging.getLogger(__name__)
 
@@ -14,17 +20,26 @@ class FileCollection(UserList):
     """A representation of multiple files of the same kind. These files share
     the same suffix and are otherwise closely connected to each other. A common
     use case would be the collection of CNVs to allow for easier processing or
-    integration of field calibration measurements."""
+    integration of field calibration measurements.
+
+    Parameters
+    ----------
+
+    Returns
+    -------
+
+    """
 
     def __init__(
         self,
         path_to_files: str | Path,
         file_suffix: str,
-        pattern: str | None = None
+        pattern: str | None = None,
+        only_metadata: bool = False,
     ):
         super().__init__()
         self.path_to_files = Path(path_to_files)
-        self.file_suffix = file_suffix.strip('.')
+        self.file_suffix = file_suffix.strip(".")
         self.file_type: Type[SeaBirdFile]
         self.extract_file_type()
         self.individual_file_paths = []
@@ -33,21 +48,27 @@ class FileCollection(UserList):
             # TODO: implement pattern handling
             self.pattern = pattern
         else:
-            self.load_files()
-            if self.file_type == DataTableFile:
-                self.df_list = self.get_dataframes()
-                self.df = self.get_collection_dataframe(self.df_list)
-            if self.file_type == CnvFile:
-                self.data_meta_info = self.get_data_table_meta_info()
-                self.sensor_data = self.get_sensor_data()
+            self.load_files(only_metadata)
+            if not only_metadata:
+                if self.file_type == DataTableFile:
+                    self.df_list = self.get_dataframes()
+                    self.df = self.get_collection_dataframe(self.df_list)
+                if self.file_type == CnvFile:
+                    self.data_meta_info = self.get_data_table_meta_info()
+                    self.sensor_data = get_unique_sensor_data(
+                        [file.sensors for file in self.data]
+                    )
 
     def __str__(self):
-        return '/n'.join(self.data)
+        return "/n".join(self.data)
 
     def extract_file_type(self):
-        mapping_suffix_to_type = {'cnv': CnvFile,
-                                  'btl': BottleFile,
-                                  'bl': BottleLogFile}
+        """ """
+        mapping_suffix_to_type = {
+            "cnv": CnvFile,
+            "btl": BottleFile,
+            "bl": BottleLogFile,
+        }
         for key, value in mapping_suffix_to_type.items():
             if key == self.file_suffix:
                 self.file_type = value
@@ -56,17 +77,21 @@ class FileCollection(UserList):
             self.file_type = SeaBirdFile
 
     def collect_files(self):
-        for path in self.path_to_files.rglob(f'*{self.file_suffix}'):
+        """ """
+        for path in self.path_to_files.rglob(f"*{self.file_suffix}"):
             self.individual_file_paths.append(path)
         self.individual_file_paths.sort()
 
-    def load_files(self):
+    def load_files(self, only_metadata: bool = False):
+        """ """
         for file in self.individual_file_paths:
             try:
                 self.data.append(self.file_type(file))
             except TypeError:
-                logger.error(
-                    f"Could not open file {file} with the type {self.file_type}.")
+                logger.error(
+                    f"Could not open file {file} with the type "
+                    f"{self.file_type}."
+                )
                 continue
 
     def get_dataframes(
@@ -78,6 +103,27 @@ class FileCollection(UserList):
         long_header_names: bool = False,
         full_data_header: bool = True,
     ) -> list[pd.DataFrame]:
+        """
+
+        Parameters
+        ----------
+        event_log: bool :
+             (Default value = False)
+        coordinates: bool :
+             (Default value = False)
+        time_correction: bool :
+             (Default value = False)
+        cast_identifier: bool :
+             (Default value = False)
+        long_header_names: bool :
+             (Default value = False)
+        full_data_header: bool :
+             (Default value = True)
+
+        Returns
+        -------
+
+        """
         for index, file in enumerate(self.data):
             if full_data_header:
                 file.rename_dataframe_header(header_detail_level="longinfo")
@@ -91,73 +137,121 @@ class FileCollection(UserList):
                 file.absolute_time_calculation()
                 file.add_start_time()
             if cast_identifier:
-                file.add_cast_number(index+1)
+                file.add_cast_number(index + 1)
         return [file.df for file in self.data]
 
     def get_collection_dataframe(
-        self,
-        list_of_dfs: list[pd.DataFrame] | None = None
+        self, list_of_dfs: list[pd.DataFrame] | None = None
    ) -> pd.DataFrame:
+        """
+
+        Parameters
+        ----------
+        list_of_dfs: list[pd.DataFrame] | None :
+             (Default value = None)
+
+        Returns
+        -------
+
+        """
         if not list_of_dfs:
             list_of_dfs = self.get_dataframes()
         df = pd.concat(list_of_dfs, ignore_index=True)
-        df.meta.metadata = list_of_dfs[0].meta.metadata
+        # df.meta.metadata = list_of_dfs[0].meta.metadata
         return df
 
     def tidy_collection_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+
+        Parameters
+        ----------
+        df: pd.DataFrame :
+
+
+        Returns
+        -------
+
+        """
         df = self.use_bad_flag_for_nan(df)
         df = self.set_dtype_to_float(df)
         return self.select_real_scan_data(df)
 
     def use_bad_flag_for_nan(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+
+        Parameters
+        ----------
+        df: pd.DataFrame :
+
+
+        Returns
+        -------
+
+        """
         bad_flags = set()
         for file in self.data:
             for line in file.data_table_description:
-                if line.startswith('bad_flag'):
-                    flag = line.split('=')[1].strip()
+                if line.startswith("bad_flag"):
+                    flag = line.split("=")[1].strip()
                     bad_flags.add(flag)
         for flag in bad_flags:
             df.replace(to_replace=flag, value=np.nan, inplace=True)
         return df
 
     def set_dtype_to_float(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+
+        Parameters
+        ----------
+        df: pd.DataFrame :
+
+
+        Returns
+        -------
+
+        """
         for parameter in df.columns:
-            if parameter in ['datetime']:
+            if parameter in ["datetime"]:
                 continue
             try:
-                df[parameter] = df[parameter].astype('float')
+                df[parameter] = df[parameter].astype("float")
             finally:
                 continue
         return df
 
     def select_real_scan_data(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+
+        Parameters
+        ----------
+        df: pd.DataFrame :
+
+
+        Returns
+        -------
+
+        """
         # TODO: fix this hardcoded name
         try:
-            df = df.loc[df['Scan Count'].notna()]
+            df = df.loc[df["Scan Count"].notna()]
         finally:
             pass
         return df
 
     def to_csv(self, file_name):
+        """
+
+        Parameters
+        ----------
+        file_name :
+
+
+        Returns
+        -------
+
+        """
         self.get_collection_dataframe().to_csv(file_name)
 
     def get_data_table_meta_info(self) -> list[list[dict]]:
+        """ """
         return [file.data_header_meta_info for file in self.data]
-
-    def get_sensor_data(self) -> list[tuple[list[dict]]]:
-        unique = []
-        last_unique = None
-        for file in [file for file in self.data]:
-            cast_sensors = file.sensors
-            if last_unique is None:
-                unique.append((file.file_name, cast_sensors))
-            else:
-                differing_dicts = [
-                    current_dict
-                    for last_dict, current_dict in zip(last_unique, cast_sensors)
-                    if current_dict != last_dict
-                ]
-                if differing_dicts:
-                    unique.append((file.file_name, differing_dicts))
-            last_unique = cast_sensors
-        return unique
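For orientation, a short sketch of the collection API after these changes; the directory path is made up, while only_metadata, df, and tidy_collection_dataframe come from the code above:

from seabirdfilehandler.file_collection import FileCollection

# header-only pass: only_metadata=True now skips dataframe construction
headers_only = FileCollection("cruise_data/", "cnv", only_metadata=True)

# full pass: per-file dataframes plus one concatenated collection dataframe
collection = FileCollection("cruise_data/", "cnv")
tidy = collection.tidy_collection_dataframe(collection.df)  # bad flags to NaN, float dtypes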
seabirdfilehandler/parameter.py
CHANGED
@@ -37,7 +37,9 @@ class Parameters(UserDict):
         self.raw_metadata = metadata
         self.full_data_array = self.create_full_ndarray()
         self.differentiate_table_description()
-        self.metadata, self.duplicate_columns = self.reading_data_header(metadata)
+        self.metadata, self.duplicate_columns = self.reading_data_header(
+            metadata
+        )
         self.data = self.create_parameter_instances()
 
     def get_parameter_list(self) -> list[Parameter]:
@@ -59,12 +61,17 @@ class Parameters(UserDict):
         A numpy array of the same shape as the cnv files data table
 
         """
-        data_table = self.raw_input_data if len(data_table) == 0 else data_table
+        data_table = (
+            self.raw_input_data if len(data_table) == 0 else data_table
+        )
         n = 11
         row_list = []
         for line in data_table:
             row_list.append(
-                [line[i:i + n].split()[0] for i in range(0, len(line) - n, n)]
+                [
+                    line[i : i + n].split()[0]
+                    for i in range(0, len(line) - n, n)
+                ]
             )
         return np.array(row_list, dtype=float)
 
@@ -87,17 +94,23 @@ class Parameters(UserDict):
         A dictionary of parameter instances
 
         """
-        metadata = self.metadata if len(list(metadata.keys())) == 0 else metadata
+        metadata = (
+            self.metadata if len(list(metadata.keys())) == 0 else metadata
+        )
         parameter_dict = {}
         list_of_metadata_shortnames = list(metadata.keys())
         if self.full_data_array.shape[1] != len(list_of_metadata_shortnames):
             raise AssertionError(
-                f"Array column width {self.full_data_array.shape[1]} does not fit metadata length {len(metadata)}"
+                f"Array column width {
+                    self.full_data_array.shape[1]
+                } does not fit metadata length {len(metadata)}"
             )
         for i in range(self.full_data_array.shape[1]):
             column_data = self.full_data_array[:, i]
             key = list_of_metadata_shortnames[i]
-            parameter_dict[key] = Parameter(data=column_data, metadata=metadata[key])
+            parameter_dict[key] = Parameter(
+                data=column_data, metadata=metadata[key]
+            )
         return parameter_dict
 
     def differentiate_table_description(self):
@@ -124,10 +137,12 @@ class Parameters(UserDict):
             post.append(line)
         assert len(column_names) == len(column_value_spans)
         self.data_table_stats = {
-            line.split("=")[0].strip(): line.split("=")[1].strip() for line in pre
+            line.split("=")[0].strip(): line.split("=")[1].strip()
+            for line in pre
         }
         self.data_table_names_and_spans = [
-            (name, span) for name, span in zip(column_names, column_value_spans)
+            (name, span)
+            for name, span in zip(column_names, column_value_spans)
         ]
         self.data_table_misc = post
 
@@ -175,9 +190,13 @@ class Parameters(UserDict):
         """
         if len(metadata) < 5:
             if len(name) > 0:
-                metadata = self.add_default_metadata(name=name, metadata=metadata)
+                metadata = self.add_default_metadata(
+                    name=name, metadata=metadata
+                )
             else:
-                raise ValueError("Please specify either a name or sufficient metadata")
+                raise ValueError(
+                    "Please specify either a name or sufficient metadata"
+                )
         if not isinstance(data, np.ndarray):
             data = np.full(
                 fill_value=data,
@@ -239,7 +258,9 @@ class Parameters(UserDict):
 
     def get_pandas_dataframe(self) -> pd.DataFrame:
         """Returns a pandas DataFrame of the current parameter data."""
-        data = np.array([parameter.data for parameter in self.get_parameter_list()]).T
+        data = np.array(
+            [parameter.data for parameter in self.get_parameter_list()]
+        ).T
         columns = [parameter.name for parameter in self.get_parameter_list()]
         assert data.shape[1] == len(columns)
         return pd.DataFrame(data=data, columns=columns, dtype=float)
@@ -298,12 +319,15 @@ class Parameters(UserDict):
             except IndexError as error:
                 logger.error(
                     f"Could not resolve duplicate column: {
-                        shortname}, {error}"
+                        shortname
+                    }, {error}"
                 )
             else:
                 header_meta_info["shortname"] = shortname
                 header_meta_info["longinfo"] = longinfo.strip()
-                metainfo = self._extract_data_header_meta_info(longinfo.strip())
+                metainfo = self._extract_data_header_meta_info(
+                    longinfo.strip()
+                )
                 header_meta_info = {**header_meta_info, **metainfo}
                 table_header[shortname.strip()] = header_meta_info
         return table_header, duplicate_columns
seabirdfilehandler/seabirdfiles.py
CHANGED
@@ -30,7 +30,7 @@ class SeaBirdFile:
     ):
         self.path_to_file = Path(path_to_file)
         self.file_name = self.path_to_file.stem
-        self.file_dir = self.path_to_file.parents[0]
+        self.file_dir = self.path_to_file.parent
         self.timestamp = datetime.now(timezone.utc)
         self.raw_file_data = []  # the text file input
         self.header = []  # the full file header
@@ -53,7 +53,9 @@ class SeaBirdFile:
                 break
         self.extract_file_information(only_header)
         if len(self.sensor_data) > 0:
-            self.sensors = self.sensor_xml_to_flattened_dict("".join(self.sensor_data))
+            self.sensors = self.sensor_xml_to_flattened_dict(
+                "".join(self.sensor_data)
+            )
 
     def __str__(self) -> str:
         return "/n".join(self.file_data)
@@ -114,6 +116,7 @@ class SeaBirdFile:
         post = []
         for line in self.data_table_description:
             if line.startswith("name"):
+                # TODO: cuts off lines containing multiple '=' symbols
                 column_names.append(line.split("=")[1].strip())
             elif line.startswith("span"):
                 past_spans = True
@@ -125,14 +128,21 @@ class SeaBirdFile:
             post.append(line)
         assert len(column_names) == len(column_value_spans)
         self.data_table_stats = {
-            line.split("=")[0].strip(): line.split("=")[1].strip() for line in pre
+            line.split("=")[0].strip(): line.split("=")[1].strip()
+            for line in pre
         }
         self.data_table_names_and_spans = [
-            (name, span) for name, span in zip(column_names, column_value_spans)
+            (name, span)
+            for name, span in zip(column_names, column_value_spans)
         ]
-        self.data_table_misc = {line.split("=")[0].strip(): line.split("=")[1].strip() for line in post}
+        self.data_table_misc = {
+            line.split("=")[0].strip(): line.split("=")[1].strip()
+            for line in post
+        }
 
-    def sensor_xml_to_flattened_dict(self, sensor_data: str) -> list[dict] | dict:
+    def sensor_xml_to_flattened_dict(
+        self, sensor_data: str
+    ) -> list[dict] | dict:
         """Reads the pure xml sensor input and creates a multilevel dictionary,
         dropping the first two dictionaries, as they are single entry only
 
seabirdfilehandler/utils.py
ADDED
@@ -0,0 +1,53 @@
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def get_unique_sensor_data(
+    sensor_data: list[list[dict]],
+) -> list[tuple[list[dict]]]:
+    """
+    Returns all the unique sensors and their configuration used in the given
+    collection of sensor data. These will typically be parsed from xml inside
+    .cnv or .xmlcon files.
+    If, for example, the first oxygen sensor has been replaced after the 8th
+    cast, then we will see that in the output structure by a second tuple,
+    with the number 8 and the individual sensor information for that sensor.
+
+    Parameters
+    ----------
+    sensor_data:
+        The structure of xml-parsed dicts inside two organizing lists.
+
+    Returns
+    -------
+    The input structure stripped down to unique sensor data and appended by
+    the index, at which this new sensor appeared the first time.
+
+    """
+    unique = []
+    last_unique = None
+    for index, individual_sensor_data in enumerate(
+        [file for file in sensor_data]
+    ):
+        if last_unique is None:
+            unique.append((index, individual_sensor_data))
+        else:
+            differing_dicts = [
+                current_dict
+                for last_dict, current_dict in zip(
+                    last_unique, individual_sensor_data
+                )
+                if current_dict != last_dict
+            ]
+            if differing_dicts:
+                unique.append((index, differing_dicts))
+        last_unique = individual_sensor_data
+    return unique
+
+
+class UnexpectedFileFormat(Exception):
+    def __init__(self, file_type: str, error: str) -> None:
+        message = f"{file_type} is not formatted as expected: {error}"
+        logger.error(message)
+        super().__init__(message)
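To illustrate the docstring above, a self-contained sketch with invented sensor dictionaries; only get_unique_sensor_data is taken from this diff:

from seabirdfilehandler.utils import get_unique_sensor_data

casts = [
    [{"SensorName": "Oxygen", "SerialNumber": "1111"}],  # cast 0
    [{"SensorName": "Oxygen", "SerialNumber": "1111"}],  # cast 1, unchanged
    [{"SensorName": "Oxygen", "SerialNumber": "2222"}],  # cast 2, sensor swapped
]
print(get_unique_sensor_data(casts))
# per the docstring's dedup behavior:
# [(0, [{'SensorName': 'Oxygen', 'SerialNumber': '1111'}]),
#  (2, [{'SensorName': 'Oxygen', 'SerialNumber': '2222'}])]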
seabirdfilehandler/validation_modules.py
CHANGED
@@ -27,8 +27,10 @@ class CnvValidationList(UserDict):
         """ """
         module_list = []
         for line in self.cnv_header_val_modules:
-            module = line.split('_')[0]
-            if (module not in module_list) and (line.split()[0] != 'file_type'):
+            module = line.split("_")[0]
+            if (module not in module_list) and (
+                line.split()[0] != "file_type"
+            ):
                 module_list.append(module)
         return module_list
 
@@ -50,41 +52,42 @@ class CnvValidationList(UserDict):
         action_dict_present = False
         # extract lines corresponding to the module
         for line in self.cnv_header_val_modules:
-            if module == line.split('_')[0]:
+            if module == line.split("_")[0]:
                 # removing the module names from the lines
                 shifting_index = len(module) + 1
                 line_content = line[shifting_index:]
                 # handle the case of the validation methods keyword being
                 # 'action', which corresponds to an entire dict of values
-                if line_content[:6] == 'action':
+                if line_content[:6] == "action":
                     action_dict_present = True
                     inner_action_dict = self.module_dict_feeder(
-                        line_content[6:], inner_action_dict)
+                        line_content[6:], inner_action_dict
+                    )
                 else:
                     # handle the cases where after some date value, another value
                     # is printed inside of [] brackets
-                    double_value_list = line_content.split('[')
+                    double_value_list = line_content.split("[")
                     if len(double_value_list) > 1:
                         out_dict = self.module_dict_feeder(
-                            double_value_list[1][shifting_index:-2], out_dict)
+                            double_value_list[1][shifting_index:-2], out_dict
+                        )
                         line_content = double_value_list[0]
-                    if line_content[:11] == 'surface_bin':
+                    if line_content[:11] == "surface_bin":
                         surface_bin_dict = {}
-                        for line in line_content.split(','):
+                        for line in line_content.split(","):
                             self.module_dict_feeder(line, surface_bin_dict)
-                        out_dict['surface_bin'] = surface_bin_dict
+                        out_dict["surface_bin"] = surface_bin_dict
                         continue
                     # usual behavior, for 99% cases:
                     # assigning key and value to the module dict
                     out_dict = self.module_dict_feeder(line_content, out_dict)
         if action_dict_present:
-            out_dict['action'] = inner_action_dict
+            out_dict["action"] = inner_action_dict
         return out_dict
 
-    def module_dict_feeder(self,
-                           line: str,
-                           dictionary: dict,
-                           split_value: str = '='):
+    def module_dict_feeder(
+        self, line: str, dictionary: dict, split_value: str = "="
+    ):
         """
 
         Parameters
seabirdfilehandler/xmlfiles.py
CHANGED
@@ -4,6 +4,8 @@ import xml.etree.ElementTree as ET
 import json
 import xmltodict
 
+from seabirdfilehandler.utils import UnexpectedFileFormat
+
 
 class XMLFile(UserDict):
     """
@@ -78,6 +80,58 @@ class XMLCONFile(XMLFile):
 
     def __init__(self, path_to_file):
         super().__init__(path_to_file)
+        self.sensor_info = self.get_sensor_info()
+
+    def get_sensor_info(self) -> list[dict]:
+        """
+        Creates a multilevel dictionary, dropping the first four dictionaries,
+        to retrieve pure sensor information.
+
+        Returns
+        -------
+        A list of all the individual sensor information, stored in dictionaries
+
+        """
+        try:
+            sensors = self.data["SBE_InstrumentConfiguration"]["Instrument"][
+                "SensorArray"
+            ]["Sensor"]
+        except KeyError as error:
+            raise UnexpectedFileFormat("XMLCON", error)
+        else:
+            # create a tidied version of the xml-parsed sensor dict
+            sensor_names = []
+            tidied_sensor_list = []
+            for entry in sensors:
+                sensor_key = list(entry.keys())[-1]
+                if not sensor_key.endswith(("Sensor", "Meter")):
+                    continue
+                sensor_name = sensor_key.removesuffix("Sensor")
+                # the wetlab sensors feature a suffix _Sensor
+                sensor_name = sensor_name.removesuffix("_")
+                # assuming, that the first sensor in the xmlcon is also on the
+                # first sensor strand, the second occurrence of the name is
+                # suffixed with '2'
+                if sensor_name in sensor_names:
+                    sensor_name += "2"
+                sensor_names.append(sensor_name)
+                # move the calibration info one dictionary level up
+                calibration_info = entry[sensor_key]
+                # build the new dictionary
+                try:
+                    new_dict = {
+                        "Channel": str(int(entry["@index"]) + 1),
+                        "SensorName": sensor_name,
+                        **calibration_info,
+                    }
+                except TypeError:
+                    new_dict = {
+                        "Channel": entry["@Channel"],
+                        "SensorName": sensor_name,
+                        "Info": calibration_info,
+                    }
+                tidied_sensor_list.append(new_dict)
+            return tidied_sensor_list
 
 
 class PsaFile(XMLFile):
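A usage sketch for the new XMLCON sensor parsing; the file name is hypothetical, while sensor_info and its "Channel"/"SensorName" keys come from get_sensor_info above:

from seabirdfilehandler.xmlfiles import XMLCONFile

xmlcon = XMLCONFile("station_007.xmlcon")  # hypothetical file name
for sensor in xmlcon.sensor_info:
    # every tidied entry carries at least the channel number and sensor name
    print(sensor["Channel"], sensor["SensorName"])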
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: seabirdfilehandler
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.3
|
|
4
4
|
Summary: Library of parsers to interact with SeaBird CTD files.
|
|
5
5
|
Keywords: CTD,parser,seabird,data
|
|
6
6
|
Author: Emil Michels
|
|
7
7
|
Author-email: <emil.michels@io-warnemuende.de>
|
|
8
|
-
Requires-Python: >=3.12
|
|
8
|
+
Requires-Python: >=3.12
|
|
9
9
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
10
10
|
Classifier: Development Status :: 3 - Alpha
|
|
11
11
|
Classifier: Operating System :: OS Independent
|
|
@@ -13,7 +13,8 @@ Classifier: Intended Audience :: Science/Research
|
|
|
13
13
|
Classifier: Topic :: Scientific/Engineering :: Oceanography
|
|
14
14
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
-
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Requires-Dist: pandas (>=2.2.1)
|
|
17
18
|
Requires-Dist: xmltodict (>=0.13.0)
|
|
18
19
|
Project-URL: Homepage, https://git.io-warnemuende.de/CTD-Software/SeabirdFileHandler
|
|
19
20
|
Project-URL: Repository, https://git.io-warnemuende.de/CTD-Software/SeabirdFileHandler
|
|
seabirdfilehandler-0.4.3.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
+seabirdfilehandler/__init__.py,sha256=8Vk2TURWXv2_NG_U_fR0fIVDFymkFkBipvmlS5ucB3M,147
+seabirdfilehandler/dataframe_meta_accessor.py,sha256=x4mSEN49us6Ezzjdt41fl5Ry8IJR09ORrZ1roOIJbyc,6439
+seabirdfilehandler/datatablefiles.py,sha256=yzTAzsWcdrHYaa2QaR6OpdEs-cu2py8V5o79s2Uz7MM,31646
+seabirdfilehandler/file_collection.py,sha256=nWyi5FToCV9-e_zcaLhRb4oOt8KAmyHC-SBGLJO9KQ4,6909
+seabirdfilehandler/logging.yaml,sha256=mXxbhJPio3OGaukTpc3rLGA8Ywq1DNqp0Vn5YCbH6jY,459
+seabirdfilehandler/parameter.py,sha256=UyKb_HGQ57pETdhSfR5FbJ60aOj8_d3_Tgw_akth0TY,13283
+seabirdfilehandler/seabirdfiles.py,sha256=BKLyk5gUMkt1CG4ljDXlCqcr5zej0-9PjPS0sX2E4n8,7449
+seabirdfilehandler/utils.py,sha256=5KXdB8Hdv65dv5tPyXxNMct1mCEOyA3S8XP54AFAnx0,1745
+seabirdfilehandler/validation_modules.py,sha256=eZ6x0giftUtlxnRMOnK_vCkgccdwUXPrDjajFa-E6n0,4698
+seabirdfilehandler/xmlfiles.py,sha256=L_puQf8eg0ojv85AyEMID4jnwkOlV_fgZP3W5yeSUBY,4668
+seabirdfilehandler-0.4.3.dist-info/LICENSE,sha256=Ifd1VPmYv32oJd2QVh3wIQP9X05vYJlcY6kONz360ws,34603
+seabirdfilehandler-0.4.3.dist-info/METADATA,sha256=ODlAzixojiAr_KtwyXVmYlWDn0UJTzFlp_RBAWzLxug,1289
+seabirdfilehandler-0.4.3.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+seabirdfilehandler-0.4.3.dist-info/RECORD,,
seabirdfilehandler-0.4.1.dist-info/RECORD
REMOVED
@@ -1,13 +0,0 @@
-seabirdfilehandler/__init__.py,sha256=8Vk2TURWXv2_NG_U_fR0fIVDFymkFkBipvmlS5ucB3M,147
-seabirdfilehandler/dataframe_meta_accessor.py,sha256=x4mSEN49us6Ezzjdt41fl5Ry8IJR09ORrZ1roOIJbyc,6439
-seabirdfilehandler/datatablefiles.py,sha256=yphs3cFs636-dQ0iPNjonm9-8gNRHvtkBSgVtw-APm4,31307
-seabirdfilehandler/file_collection.py,sha256=andpNZwoVd8gFo4umOBzPn2gR3OQnh6d4_ou0r6zEpM,5889
-seabirdfilehandler/logging.yaml,sha256=mXxbhJPio3OGaukTpc3rLGA8Ywq1DNqp0Vn5YCbH6jY,459
-seabirdfilehandler/parameter.py,sha256=hd7dG0aAhI21GUvW2NioGpeDGV7moRsm_R3mQeAmdmg,12835
-seabirdfilehandler/seabirdfiles.py,sha256=EH23sa4U_pytl3XPcumHU6g-NOzuLSX2KtDIhG7wFsI,7280
-seabirdfilehandler/validation_modules.py,sha256=nwiwfkDLltE3S_aQDlqZrMeiNlGVJP9XNNw0abxgjfc,4691
-seabirdfilehandler/xmlfiles.py,sha256=Es0pUCoB0af-meCH-75h-s-r0mEfrthl-8QDjqNBWPk,2441
-seabirdfilehandler-0.4.1.dist-info/LICENSE,sha256=Ifd1VPmYv32oJd2QVh3wIQP9X05vYJlcY6kONz360ws,34603
-seabirdfilehandler-0.4.1.dist-info/METADATA,sha256=RBi3HJEgQJh68-f_ElkPxfp2N-pekeR-yLsQsO_SYGU,1249
-seabirdfilehandler-0.4.1.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-seabirdfilehandler-0.4.1.dist-info/RECORD,,
{seabirdfilehandler-0.4.1.dist-info → seabirdfilehandler-0.4.3.dist-info}/LICENSE
File without changes
{seabirdfilehandler-0.4.1.dist-info → seabirdfilehandler-0.4.3.dist-info}/WHEEL
File without changes