seabirdfilehandler 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,886 @@
+ from pathlib import Path
+ from typing import Union, Tuple
+ from datetime import datetime, time, timedelta
+ import re
+ import pandas as pd
+ import numpy as np
+ import logging
+ import warnings
+ from seabirdfilehandler.parameter import Parameters
+ from seabirdfilehandler.validation_modules import CnvValidationList
+ from seabirdfilehandler.seabirdfiles import SeaBirdFile
+ from seabirdfilehandler.dataframe_meta_accessor import (
+     SeriesMetaAccessor,
+     DataFrameMetaAccessor,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class DataTableFile(SeaBirdFile):
+     """Collection of methods for SeaBird files that feature some kind of
+     data table, represented as a pandas DataFrame.
+     """
+
+     def __init__(self, path_to_file):
+         super().__init__(path_to_file)
+         self.original_df: pd.DataFrame
+         self.df: pd.DataFrame
+
+     def define_output_path(
+         self,
+         file_path: Path | str | None = None,
+         file_name: str | None = None,
+         file_type: str = ".csv",
+     ) -> Path:
+         """Creates a Path object holding the desired output path.
+
+         Parameters
+         ----------
+         file_path : Path :
+             the directory the file sits in (Default value = self.file_dir)
+         file_name : str :
+             the original file name (Default value = self.file_name)
+         file_type : str :
+             the output file type (Default value = '.csv')
+
+         Returns
+         -------
+         a Path object consisting of the full path of the new file
+
+         """
+         file_path = self.file_dir if file_path is None else file_path
+         file_name = self.file_name if file_name is None else file_name
+         if not file_type.startswith('.'):
+             file_type = '.' + file_type
+         return Path(file_path).joinpath(file_name).with_suffix(file_type)
+
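+     # A minimal usage sketch (illustrative; the path is invented). Since
+     # Path.with_suffix() replaces an existing extension, a source file
+     # 'data/cast01.cnv' maps to 'data/cast01.csv':
+     #
+     # >>> f = DataTableFile('data/cast01.cnv')
+     # >>> f.define_output_path(file_type='csv')
+     # PosixPath('data/cast01.csv')
+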
+     def to_csv(
+         self,
+         selected_columns: list | None = None,
+         with_header: bool = True,
+         output_file_path: Path | str | None = None,
+         output_file_name: str | None = None,
+     ):
+         """Writes a csv from the current dataframe. Takes a list of columns
+         to use, a boolean that controls writing of the header, and the
+         output file parameters.
+
+         Parameters
+         ----------
+         selected_columns : list :
+             a list of columns to include in the csv
+             (Default value = self.df.columns)
+         with_header : bool :
+             whether the header shall appear in the output
+             (Default value = True)
+         output_file_path : Path :
+             the target file directory (Default value = None)
+         output_file_name : str :
+             the target file name (Default value = None)
+
+         """
+         if selected_columns is None:
+             selected_columns = self.df.columns
+         df = self.df[selected_columns].reset_index(drop=True)
+         new_file_path = self.define_output_path(
+             output_file_path, output_file_name)
+         if with_header:
+             with open(new_file_path, 'w') as file:
+                 for line in self.header:
+                     file.write(line)
+             df.to_csv(new_file_path, index=False, mode='a')
+         else:
+             df.to_csv(new_file_path, index=False, mode='w')
+         logger.info(f'Wrote file {self.path_to_file} to {new_file_path}.')
+
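+     # Usage sketch (illustrative; the column names are invented):
+     #
+     # >>> f.to_csv(selected_columns=['PrDM', 'T090C'], with_header=False)
+     # writes a csv next to the source file, containing only the two
+     # selected columns and no metadata header.
+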
+     def selecting_columns(
+         self,
+         list_of_columns: list | str,
+         df: pd.DataFrame | None = None,
+     ):
+         """Alters the dataframe to only hold the given columns.
+
+         Parameters
+         ----------
+         list_of_columns : list or str :
+             a collection of columns to keep
+         df : pandas.DataFrame :
+             the dataframe to operate on (Default value = self.df)
+
+         """
+         df = self.df if df is None else df
+         # ensure that the input is a list, so that the selection returns
+         # a dataframe
+         if isinstance(list_of_columns, str):
+             list_of_columns = [list_of_columns]
+         if isinstance(df, pd.DataFrame):
+             self.df = df[list_of_columns].reset_index(drop=True)
+
+
+ class BottleFile(DataTableFile):
+     """Class that represents a SeaBird bottle file. Organizes the file's
+     table information into a pandas dataframe. This allows the usage of
+     this powerful library for statistics, visualization, data manipulation,
+     export, etc.
+     """
+
+     def __init__(self, path_to_file):
+         super().__init__(path_to_file)
+         self.original_df = self.create_dataframe()
+         self.df = self.original_df
+         self.setting_dataframe_dtypes()
+         self.adding_timestamp_column()
+
+     def create_dataframe(self):
+         """Creates a dataframe out of the btl file. Manages the double data
+         header correctly.
+         """
+         # TODO: this needs to be broken down into smaller pieces...
+         top_names, bottom_names = self.reading_data_header()
+         # creating a statistics column to store the row type information:
+         # 4 rows per bottle: average, standard deviation, max value, min
+         # value
+         top_names.append('Statistic')
+         # TODO: sexier way to construct the dataframe than opening the
+         # file a second time
+         # df = pd.DataFrame(self.data, index=None, columns=top_names)
+         df: pd.DataFrame = pd.read_fwf(
+             self.path_to_file,
+             index_col=False,
+             skiprows=len(self.header) + 2,
+             header=None,
+             names=top_names,
+         )
+
+         # handling the double row header
+         rowtypes = df[df.columns[-1]].unique()
+
+         # TODO: can this be made a little prettier?
+         def separate_double_header_row(df, column, length):
+             """Splits the interleaved two-row header values of the given
+             column into two aligned columns, repeating each value over
+             `length` rows."""
+             column_idx = df.columns.get_loc(column)
+             old_column = df.iloc[::length, column_idx].reset_index(drop=True)
+             new_column = df.iloc[1::length, column_idx].reset_index(drop=True)
+             old_column_expanded = pd.Series(
+                 np.repeat(old_column, length)).reset_index(drop=True)
+             new_column_expanded = pd.Series(
+                 np.repeat(new_column, length)).reset_index(drop=True)
+             df[column] = old_column_expanded
+             df.insert(column_idx + 1,
+                       bottom_names[column_idx], new_column_expanded)
+             return df
+
+         df = separate_double_header_row(df, 'Date', len(rowtypes))
+         df = separate_double_header_row(df, top_names[0], len(rowtypes))
+         # remove brackets around statistics values
+         df["Statistic"] = df["Statistic"].str.strip('()')
+         df = df.rename(mapper={'Btl_ID': 'Bottle_ID'}, axis=1)
+         return df
+
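+     # Layout sketch (values invented): each bottle contributes one row per
+     # statistic (avg, sdev, max, min), and the 'Date' cells interleave the
+     # date (first row of a block) with the time (second row);
+     # separate_double_header_row() pulls such interleaved pairs apart into
+     # two aligned, fully populated columns.
+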
+     def adding_timestamp_column(self):
+         """Creates a timestamp column that holds both date and time
+         information.
+         """
+         # constructing the timestamp column
+         timestamp = []
+         for datepoint, timepoint in zip(self.df.Date, self.df.Time):
+             timestamp.append(datetime.combine(
+                 datepoint, time.fromisoformat(str(timepoint))))
+         self.df.insert(2, 'Timestamp', timestamp)
+         self.df.Timestamp = pd.to_datetime(self.df.Timestamp)
+
+     def setting_dataframe_dtypes(self):
+         """Sets the types for the column values in the dataframe."""
+         # TODO: extend this to the other columns!
+         self.df.Date = pd.to_datetime(self.df.Date)
+         self.df.Bottle_ID = self.df.Bottle_ID.astype(int)
+
+     def selecting_rows(
+         self,
+         df: pd.DataFrame | None = None,
+         statistic_of_interest: Union[list, str] = 'avg',
+     ):
+         """Creates a dataframe with the given row identifier, using the
+         statistics column. A single string or a list of strings can be
+         processed.
+
+         Parameters
+         ----------
+         df : pandas.DataFrame :
+             the file's pandas representation (Default value = self.df)
+         statistic_of_interest : list or str :
+             values of the 'Statistic' column in self.df to keep
+             (Default value = 'avg')
+
+         """
+         df = self.df if df is None else df
+         # ensure that the input is a list, so that isin() can do its job
+         if isinstance(statistic_of_interest, str):
+             statistic_of_interest = [statistic_of_interest]
+         self.df = df.loc[df['Statistic'].isin(statistic_of_interest)]
+
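+     # Usage sketch (illustrative): keep only the per-bottle averages and
+     # standard deviations, where 'sdev' is assumed to be the tag of the
+     # standard-deviation rows:
+     # >>> btl.selecting_rows(statistic_of_interest=['avg', 'sdev'])
+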
+     def reading_data_header(self):
+         """Identifies and separately collects the rows that specify the
+         data table's headers.
+         """
+         n = 11  # fixed column width of a SeaBird btl file
+         top_line = self.data[0]
+         second_line = self.data[1]
+         top_names = [top_line[i:i+n].split()[0]
+                      for i in range(0, len(top_line)-n, n)]
+         bottom_names = [second_line[i:i+n].split()[0]
+                         for i in range(0, 2*n, n)]
+         return top_names, bottom_names
+
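+     # How the fixed-width split works (sketch; the header text below is
+     # invented): with n = 11, each 11-character window contributes its
+     # first whitespace-delimited token as a column name:
+     # >>> line = 'Bottle     Date       Sal00      Sbeox0ML/L '
+     # >>> [line[i:i+11].split()[0] for i in range(0, len(line)-11, 11)]
+     # ['Bottle', 'Date', 'Sal00']
+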
+     def add_station_and_event_column(self):
+         event_list = [self.metadata['Station'] for _ in self.data]
+         self.df.insert(0, 'Event', pd.Series(event_list))
+
+     def add_position_columns(self):
+         latitude_list = [self.metadata['GPS_Lat'] for _ in self.data]
+         self.df.insert(1, 'Latitude', pd.Series(latitude_list))
+         longitude_list = [self.metadata['GPS_Lon'] for _ in self.data]
+         self.df.insert(2, 'Longitude', pd.Series(longitude_list))
+
+
+ class CnvFile(DataTableFile):
+     """
+     A representation of a cnv file as used by SeaBird.
+
+     This class intends to fully extract and organize the different types of
+     data and metadata present inside of such a file. Downstream libraries
+     shall be able to use this representation for all applications
+     concerning cnv files, like data processing, transformation or
+     visualization.
+
+     To achieve that, the metadata header is organized by the grandparent
+     class, SeaBirdFile, while the data table is extracted by this class.
+     The data representation of choice is a pandas DataFrame. Inside this
+     class, there are methods to parse cnv data into dataframes, to do the
+     reverse of writing a dataframe into cnv compliant form, and to
+     manipulate the dataframe in various ways.
+
+     Parameters
+     ----------
+     path_to_file: Path | str:
+         the path to the file
+     create_dataframe: bool:
+         whether to build a pandas dataframe from the data table
+     absolute_time_calculation: bool:
+         whether to use a real timestamp instead of the second count
+     event_log_column: bool:
+         whether to add a station and device event column from DSHIP
+     coordinate_columns: bool:
+         whether to add longitude and latitude from the extra metadata
+         header
+     data_table_info_level: str:
+         the level of detail to use for the dataframe column names
+
+     """
+
+     def __init__(
+         self,
+         path_to_file: Path | str,
+         create_dataframe: bool = True,
+         absolute_time_calculation: bool = False,
+         event_log_column: bool = False,
+         coordinate_columns: bool = False,
+         data_table_info_level: str = "shortname",
+     ):
+         super().__init__(path_to_file)
+         self.validation_modules = self.obtaining_validation_modules()
+         self.start_time = self.reading_start_time()
+         if create_dataframe:
+             warnings.warn(
+                 'The default of constructing a pandas DataFrame will soon '
+                 'be replaced by using the Parameters class that works on '
+                 'numpy arrays.',
+                 DeprecationWarning,
+                 stacklevel=2,  # ensures the warning points to the caller
+             )
+             self.data_header_meta_info, self.duplicate_columns = \
+                 self.reading_data_header(self.data_table_description)
+             self.original_df = self.create_dataframe(data_table_info_level)
+             self.df = self.original_df
+             if absolute_time_calculation:
+                 self.absolute_time_calculation()
+             if event_log_column:
+                 self.add_station_and_event_column()
+             if coordinate_columns:
+                 self.add_position_columns()
+         else:
+             self.parameters = Parameters(
+                 self.data, self.data_table_description)
+
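+     # Construction sketch (illustrative; the path is invented):
+     #
+     # >>> cnv = CnvFile('data/cast01.cnv')  # dataframe available as cnv.df
+     # >>> cnv = CnvFile('data/cast01.cnv', create_dataframe=False)
+     # the latter skips the deprecated dataframe construction and exposes
+     # the parsed table as cnv.parameters instead.
+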
+     def reading_data_header(
+         self,
+         header_info: list | None = None,
+     ) -> Tuple[dict[str, dict], list[int]]:
+         """Reads the table's header data from the file header.
+
+         Parameters
+         ----------
+         header_info: list:
+             the header values from the file
+
+         Returns
+         -------
+         a dictionary that organizes the table header information per
+         column shortname, and a list of duplicate column indices
+
+         """
+         if header_info is None:
+             header_info = self.data_table_description
+         table_header = {}
+         duplicate_columns = []
+         for line in header_info:
+             if line.startswith("name"):
+                 header_meta_info = {}
+                 # get the basic shortname and the full, non-differentiated
+                 # info
+                 shortname = longinfo = line_info = line.split("=")[1].strip()
+                 try:
+                     shortname, longinfo = line_info.split(":")
+                 except ValueError:
+                     pass
+                 finally:
+                     shortname = shortname.strip()
+                 if shortname in table_header:
+                     try:
+                         duplicate_columns.append(
+                             int(line.split("=")[0].strip().split()[1]))
+                     except (IndexError, ValueError) as error:
+                         logger.error(
+                             "Could not resolve duplicate column: "
+                             f"{shortname}, {error}")
+                 else:
+                     header_meta_info["shortname"] = shortname
+                     header_meta_info["longinfo"] = longinfo.strip()
+                     metainfo = self._extract_data_header_meta_info(
+                         longinfo.strip())
+                     header_meta_info = {**header_meta_info, **metainfo}
+                     table_header[shortname] = header_meta_info
+         return table_header, duplicate_columns
+
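+     # Sketch of the header lines this consumes (typical SeaBird content,
+     # invented here):
+     #     'name 0 = prDM: Pressure, Digiquartz [db]'
+     #     'name 1 = t090C: Temperature [ITS-90, deg C]'
+     # each becomes a table_header entry keyed by shortname ('prDM',
+     # 't090C') with the parsed long info attached.
+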
+     def _extract_data_header_meta_info(self, line: str) -> dict:
+         """Extracts the individual information bits inside of the header
+         lines.
+
+         Parameters
+         ----------
+         line: str:
+             one header line, trimmed of the 'name =' prefix and the
+             shortname
+
+         Returns
+         -------
+         a dictionary with the extracted information
+
+         """
+         regex_string = (
+             r"(?:(?P<name0>.+),\s(?P<metainfo0>.+)\s\[(?P<unit0>.+)\]"
+             r"|(?P<name2>.+)\s\[(?P<unit2>.+)\]"
+             r"|(?P<name3>.+),\s(?P<metainfo2>.[^\s]+)"
+             r"|(?P<name4>.+))"
+         )
+         regex_check = re.search(regex_string, line, flags=re.IGNORECASE)
+         if regex_check:
+             regex_info = dict(regex_check.groupdict())
+             # drop the group-number suffixes and the unmatched groups
+             regex_info = {key[:-1]: value
+                           for key, value in regex_info.items()
+                           if value is not None}
+             if len(regex_info) > 2:
+                 # check for second sensors and adjust their names
+                 if regex_info["metainfo"][-1] == '2':
+                     regex_info["name"] = regex_info["name"] + " 2"
+                     regex_info["metainfo"] = regex_info['metainfo'][:-1]
+                     if len(regex_info["metainfo"]) == 0:
+                         regex_info.pop("metainfo")
+             if regex_info['name'] == 'flag':
+                 regex_info["metainfo"] = regex_info['name']
+                 regex_info["unit"] = regex_info['name']
+             return regex_info
+         return {}
+
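+     # Parsing sketch (typical cnv long-info strings, invented here):
+     #
+     # >>> cnv._extract_data_header_meta_info('Temperature [ITS-90, deg C]')
+     # {'name': 'Temperature', 'unit': 'ITS-90, deg C'}
+     # >>> cnv._extract_data_header_meta_info('Salinity, Practical [PSU]')
+     # {'name': 'Salinity', 'metainfo': 'Practical', 'unit': 'PSU'}
+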
+     def create_dataframe(
+         self,
+         header_info_detail_level: str = "shortname",
+     ) -> pd.DataFrame:
+         """Creates a pandas dataframe by splitting each data line every 11
+         characters, as SeaBird defines its tables this way.
+
+         Parameters
+         ----------
+         header_info_detail_level: str:
+             the header meta info key to use for the dataframe column
+             names, e.g. 'shortname' or 'longinfo'
+
+         Returns
+         -------
+         a pandas.DataFrame that represents the data values inside the cnv
+         file
+
+         """
+         n = 11
+         row_list = []
+         for line in self.data:
+             row_list.append([line[i:i+n].split()[0]
+                              for i in range(0, len(line)-n, n)])
+         # TODO: force float dtype here and handle that in rest of code
+         df = pd.DataFrame(row_list)
+         header_names = [
+             metainfo[header_info_detail_level]
+             for metainfo in list(self.data_header_meta_info.values())
+         ]
+         # remove duplicate columns
+         df.drop(labels=self.duplicate_columns, axis=1, inplace=True)
+         self.duplicate_columns = []
+         try:
+             df.columns = header_names
+         except ValueError as error:
+             logger.error(
+                 f"Could not set dataframe header for {self.file_name}: "
+                 f"{error}")
+             logger.error(header_names)
+         else:
+             df.meta.metadata = self.data_header_meta_info
+             # df.meta.propagate_metadata_to_series()
+         return df
+
+     def rename_dataframe_header(
+         self,
+         df: pd.DataFrame | None = None,
+         header_detail_level: str = "shortname",
+     ) -> list:
+         df = self.df if df is None else df
+         df.meta.rename(header_detail_level)
+         return list(df.columns)
+
+     def reading_start_time(
+         self,
+         time_source: str = "System UTC",
+     ) -> datetime | None:
+         """
+         Extracts the cast start time from the metadata header.
+         """
+         for line in self.sbe9_data:
+             if line.startswith(time_source):
+                 start_time = line.split('=')[1]
+                 start_time = datetime.strptime(
+                     start_time, ' %b %d %Y %H:%M:%S ')
+                 return start_time
+         return None
+
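+     # Sketch of the header line this parses (values invented):
+     #     'System UTC = Aug 01 2023 10:15:30'
+     # which yields datetime(2023, 8, 1, 10, 15, 30).
+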
+     def absolute_time_calculation(self) -> bool:
+         """
+         Replaces the basic cnv time representation, which counts relative
+         to the cast's start point, with real UTC timestamps.
+         This operation acts directly on the dataframe.
+         """
+         time_parameter = None
+         for parameter in self.df.columns:
+             if parameter.lower().startswith('time'):
+                 time_parameter = parameter
+         if time_parameter and self.start_time:
+             self.df.meta.add_column(
+                 name='datetime',
+                 data=[
+                     timedelta(days=float(time)) + self.start_time
+                     if time_parameter == "timeJ"
+                     else timedelta(seconds=float(time)) + self.start_time
+                     for time in self.df[time_parameter]
+                 ],
+             )
+             return True
+         return False
+
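+     # Conversion sketch (values invented): with a start time of
+     # 2023-08-01 10:15:30, a 'timeS' value of 120.0 (elapsed seconds)
+     # becomes 2023-08-01 10:17:30, while 'timeJ' values are added as
+     # fractional days via timedelta(days=...).
+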
+     def add_start_time(self) -> bool:
+         """
+         Adds the cast start time to the dataframe.
+         Necessary for joins on the time.
+         """
+         if self.start_time:
+             self.df.meta.add_column(
+                 name='start_time',
+                 data=pd.Series([self.start_time for _ in self.data]),
+             )
+             return True
+         return False
+
+     def obtaining_validation_modules(self) -> CnvValidationList:
+         """
+         Collects the individual validation modules and their respective
+         information, usually present in key-value pairs.
+         """
+         validation_modules = self.processing_info
+         return CnvValidationList(validation_modules)
+
+     def df2cnv(
+         self,
+         header_names: list | None = None,
+         header_detail_level: str | None = None,
+     ) -> list:
+         """
+         Parses a pandas dataframe into a list that represents the lines
+         inside of a cnv data table.
+
+         Parameters
+         ----------
+         header_names: list:
+             a list of dataframe columns that will be parsed
+         header_detail_level: str:
+             the header meta info key that the current column names use
+
+         Returns
+         -------
+         a list of lines in the cnv data table format
+
+         """
+         if not header_detail_level:
+             header_detail_level = self.df.meta.header_detail
+         if not header_names:
+             header_names = [
+                 header[header_detail_level]
+                 for header in list(self.data_header_meta_info.values())
+             ]
+         df = self.df.drop(
+             labels=[
+                 column
+                 for column in list(self.df.meta.metadata.keys())
+                 if column not in header_names
+             ],
+             axis=1,
+             errors="ignore",
+         )
+         cnv_out = []
+         for _, row in df.iterrows():
+             cnv_like_row = ''.join(f"{str(value):>11}" for value in row)
+             cnv_out.append(cnv_like_row + "\n")
+         return cnv_out
+
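+     # Formatting sketch: every cell is right-justified in an 11-character
+     # field, mirroring SeaBird's fixed-width tables:
+     # >>> ''.join(f"{str(v):>11}" for v in (24.7065, 3.2))
+     # '    24.7065        3.2'
+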
+     def to_cnv(
+         self,
+         file_name: Path | str | None = None,
+         use_current_df: bool = True,
+         use_current_processing_header: bool = False,
+         header_list: list | None = None,
+     ):
+         """
+         Writes the values inside of this instance as a new cnv file to
+         disk.
+
+         Parameters
+         ----------
+         file_name: Path:
+             the new file name to use for writing
+         use_current_df: bool:
+             whether to use the current dataframe as data table
+         use_current_processing_header: bool:
+             whether to use the current processing module list
+         header_list: list:
+             the data columns to use for the export
+
+         """
+         file_name = self.path_to_file if file_name is None else file_name
+         # content construction
+         if use_current_df:
+             self.data = self.df2cnv(header_list)
+         if use_current_processing_header:
+             self._update_header()
+         self.file_data = [*self.header, *self.data]
+         # writing content out
+         try:
+             with open(file_name, 'w', encoding='latin-1') as file:
+                 for line in self.file_data:
+                     file.write(line)
+             logger.info(f'Wrote cnv {self.path_to_file} to {file_name}.')
+         except IOError as error:
+             logger.error(f'Could not write cnv file: {error}')
+
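+     # Round-trip sketch (illustrative; path and column name are invented):
+     #
+     # >>> cnv = CnvFile('data/cast01.cnv')
+     # >>> cnv.df = cnv.df[cnv.df['prDM'].astype(float) > 10.0]
+     # >>> cnv.to_cnv('data/cast01_trimmed.cnv')
+     # re-exports the filtered dataframe under the original header.
+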
+     def _update_header(self):
+         """Re-creates the cnv header."""
+         self.data_table_description = self._form_data_table_info()
+         self.header = [
+             *[f'* {data}' for data in self.sbe9_data[:-1]],
+             *[f'** {data}' for data in self.metadata_list],
+             f'* {self.sbe9_data[-1]}',
+             *[f'# {data}' for data in self.data_table_description],
+             *[f'# {data}' for data in self.sensor_data],
+             *[f'# {data}' for data in self.processing_info],
+             '*END*\n',
+         ]
+
+     def _form_data_table_info(self) -> list:
+         """Re-creates the data table descriptions, like column names and
+         spans, from the structured dictionaries these values were stored
+         in."""
+         new_table_info = []
+         for key, value in self.data_table_stats.items():
+             new_table_info.append(f"{key} = {value}\n")
+         for index, (name, _) in enumerate(self.data_table_names_and_spans):
+             new_table_info.append(f"name {index} = {name}\n")
+         for index, (_, span) in enumerate(self.data_table_names_and_spans):
+             new_table_info.append(f"span {index} = {span}\n")
+         new_table_info = [*new_table_info, *self.data_table_misc]
+         return new_table_info
+
+     def add_processing_metadata(self, addition: str | list):
+         """
+         Adds new processing lines to the list of processing module
+         information.
+
+         Parameters
+         ----------
+         addition: str | list:
+             the new information line(s)
+
+         """
+         # TODO: use CnvprocessingList here
+         if isinstance(addition, str):
+             addition = [addition]
+         for line in addition:
+             self.file_data.append(line)
+             # add the new info line *before* the 'file_type = ascii' line
+             self.processing_info.insert(-1, line)
+
+     def add_station_and_event_column(self) -> bool:
+         """
+         Adds a column with the DSHIP station and device event numbers to
+         the dataframe. These must be present inside the extra metadata
+         header.
+         """
+         try:
+             event_list = [self.metadata['Station'] for _ in self.data]
+         except KeyError:
+             return False
+         else:
+             self.df.meta.add_column(
+                 name='Event',
+                 data=pd.Series(event_list),
+                 location=0,
+             )
+             return True
+
+     def add_position_columns(self) -> bool:
+         """
+         Adds columns with the longitude and latitude to the dataframe.
+         These must be present inside the extra metadata header.
+         """
+         present_shortnames = [
+             column['shortname']
+             for column in list(self.df.meta.metadata.values())
+         ]
+         if any(name in present_shortnames
+                for name in ('latitude', 'longitude')):
+             return True
+         try:
+             latitude_list = [self.metadata['GPS_Lat'] for _ in self.data]
+             longitude_list = [self.metadata['GPS_Lon'] for _ in self.data]
+         except KeyError:
+             return False
+         else:
+             self.df.meta.add_column(
+                 name='Latitude',
+                 data=pd.Series(latitude_list),
+                 location=1,
+             )
+             self.df.meta.add_column(
+                 name='Longitude',
+                 data=pd.Series(longitude_list),
+                 location=2,
+             )
+             return True
+
+     def add_cast_number(self, number: int | None = None) -> bool:
+         """
+         Adds a column with the cast number to the dataframe.
+
+         Parameters
+         ----------
+         number: int:
+             the cast number of this file's cast
+
+         """
+         if ('Cast' in self.metadata.keys()) and (not number):
+             number = int(self.metadata['Cast'])
+         try:
+             self.df.meta.add_column(
+                 name='Cast',
+                 data=pd.Series([number for _ in self.data]),
+                 location=0,
+             )
+         except ValueError:
+             # Cast is already part of the dataframe, so nothing left to do
+             return False
+         else:
+             return True
+
+
+ class BottleLogFile(DataTableFile):
+     """Bottle log file representation that extracts the three different
+     data types from the file: the reset time, the originating cnv file
+     name, and the table with bottle IDs and corresponding data ranges.
+     """
+
+     def __init__(self, path_to_file, create_dataframe=False):
+         super().__init__(path_to_file)
+         self.reset_time = self.obtaining_reset_time()
+         self.origin_cnv = self.raw_file_data[0].strip()
+         self.data = self.data_whitespace_removal()
+         if create_dataframe:
+             self.original_df = self.create_dataframe()
+             self.df = self.original_df
+         else:
+             self.data_list = self.create_list()
+
+     def data_whitespace_removal(self) -> list:
+         """Strips the input of whitespace characters, in this case
+         especially newline characters.
+
+         Returns
+         -------
+         the original data stripped of whitespace
+
+         """
+         temp_data = []
+         for line in self.raw_file_data[2:]:
+             temp_data.append(line.strip())
+         return temp_data
+
+     def obtaining_reset_time(self) -> datetime:
+         """Reads the reset time, with a small input check.
+
+         Returns
+         -------
+         a datetime.datetime object of the device reset time
+
+         """
+         regex_check = re.search(
+             r'RESET\s(\w{3}\s\d+\s\d{4}\s\d\d:\d\d:\d\d)',
+             self.raw_file_data[1])
+         if regex_check:
+             return datetime.strptime(
+                 regex_check.group(1), '%b %d %Y %H:%M:%S')
+         else:
+             error_message = ('BottleLogFile is not formatted as expected: '
+                              'reset time could not be extracted.')
+             logger.error(error_message)
+             raise IOError(error_message)
+
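+     # Input sketch (values invented): the second raw file line looks like
+     #     'RESET Mar 30 2021 09:44:33'
+     # which parses to datetime(2021, 3, 30, 9, 44, 33).
+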
+     def create_list(self) -> list:
+         """Creates a list of usable data from the lines in self.data.
+         Each entry consists of: a pair of IDs representing the bottle, the
+         date and time of the data sample, and the line range of the cnv
+         file corresponding to the bottle.
+
+         Returns
+         -------
+         a list representing the bl file's table information
+
+         """
+         content_array = []
+         for i in range(len(self.data)):
+             bottles = [int(x) for x in self.data[i].split(",")[:2]]
+             date = self.convert_date(self.data[i].split(",")[2])
+             lines = tuple(int(x) for x in self.data[i].split(",")[3:])
+             content_array.append([bottles, date, lines])
+         return content_array
+
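+     # Parsing sketch for one .bl table line (values invented):
+     #     '1, 1, Mar 30 2021 09:44:33, 488, 512'
+     # becomes [[1, 1], '210330T094433', (488, 512)].
+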
+     def convert_date(self, date: str):
+         """Converts the dates of the .bl files to a compact ISO 8601-like
+         form.
+
+         Parameters
+         ----------
+         date: str:
+             a date string such as 'Mar 30 2021 09:44:33'
+
+         Returns
+         -------
+         a string with the date in the form of 'yymmddThhmmss'
+
+         """
+         date = date.strip()
+         month_list = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
+                       "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
+         month_ind = month_list.index(date.split(" ")[0]) + 1
+         if month_ind < 10:
+             month = "0" + str(month_ind)
+         else:
+             month = str(month_ind)
+         day = date.split(" ")[1]
+         year = (date.split(" ")[2])[2:]
+         time_part = date.split(" ")[3].replace(":", "")
+         return year + month + day + "T" + time_part
+
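+     # An equivalent stdlib one-liner (sketch):
+     # >>> datetime.strptime('Mar 30 2021 09:44:33',
+     #                       '%b %d %Y %H:%M:%S').strftime('%y%m%dT%H%M%S')
+     # '210330T094433'
+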
+     def create_dataframe(self) -> pd.DataFrame:
+         """Creates a dataframe from the list specified in self.data.
+
+         Returns
+         -------
+         a pandas.DataFrame representing the bl file's table information
+
+         """
+         data_lists = []
+         for line in self.data:
+             inner_list = line.split(',')
+             # dropping the first column as it is the index
+             data_lists.append(inner_list[1:])
+         df = pd.DataFrame(data_lists)
+         df.columns = ['Bottle ID', 'Datetime', 'start_range', 'end_range']
+         return df
+
+
+ class FieldCalibrationFile(DataTableFile):
+
+     def __init__(self, path_to_file):
+         super().__init__(path_to_file)
+         self.original_df = self.create_dataframe()
+         self.df = self.original_df
+
+     def create_dataframe(self) -> pd.DataFrame:
+         try:
+             return pd.read_csv(self.path_to_file, skiprows=len(self.header))
+         except IOError as error:
+             logger.error(f'Could not read field calibration file: {error}.')
+             return pd.DataFrame()