seabirdfilehandler 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of seabirdfilehandler might be problematic.

@@ -1,930 +0,0 @@
1
- from pathlib import Path
2
- from typing import Union, Tuple
3
- from datetime import datetime, time, timedelta
4
- import re
5
- import pandas as pd
6
- import numpy as np
7
- import logging
8
- import warnings
9
- from seabirdfilehandler.parameter import Parameters
10
- from seabirdfilehandler.validation_modules import CnvValidationList
11
- from seabirdfilehandler.seabirdfiles import SeaBirdFile
12
- from seabirdfilehandler.dataframe_meta_accessor import (
13
- SeriesMetaAccessor,
14
- DataFrameMetaAccessor,
15
- )
16
-
17
- logger = logging.getLogger(__name__)
18
-
19
-
20
- class DataTableFile(SeaBirdFile):
21
- """Collection of methods for the SeaBird files that feature some kind of
22
- data table that is represented in a pandas dataframe.
23
-
24
- Parameters
25
- ----------
26
-
27
- Returns
28
- -------
29
-
30
- """
31
-
32
- def __init__(self, path_to_file):
33
- super().__init__(path_to_file)
34
- self.original_df: pd.DataFrame
35
- self.df: pd.DataFrame
36
-
37
- def define_output_path(
38
- self,
39
- file_path: Path | str | None = None,
40
- file_name: str | None = None,
41
- file_type: str = ".csv",
42
- ) -> Path:
43
- """Creates a Path object holding the desired output path.
44
-
45
- Parameters
46
- ----------
47
- file_path : Path :
48
- directory the file sits in (Default value = self.file_dir)
49
- file_name : str :
50
- the original file name (Default value = self.file_name)
51
- file_type : str :
52
- the output file type (Default = '.csv')
53
- Returns
54
- -------
55
- a Path object consisting of the full path of the new file
56
-
57
- """
58
- file_path = self.file_dir if file_path is None else file_path
59
- file_name = self.file_name if file_name is None else file_name
60
- if file_type[0] != ".":
61
- file_type = "." + file_type
62
- return Path(file_path).joinpath(file_name).with_suffix(file_type)
63
-
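# A minimal, standalone sketch of the path composition performed by
# define_output_path above; the directory and file name are made-up values.
from pathlib import Path

out = Path("/tmp/ctd").joinpath("station_0001").with_suffix(".csv")
print(out)  # /tmp/ctd/station_0001.csv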
64
- def to_csv(
65
- self,
66
- selected_columns: list | None = None,
67
- with_header: bool = True,
68
- output_file_path: Path | str | None = None,
69
- output_file_name: str | None = None,
70
- ):
71
- """Writes a csv from the current dataframe. Takes a list of columns to
72
- use, a boolean for writing the header and the output file parameters.
73
-
74
- Parameters
75
- ----------
76
- selected_columns : list :
77
- a list of columns to include in the csv
78
- (Default value = self.df.columns)
79
- with_header : boolean :
80
- indicating whether the header shall appear in the output
81
- (Default value = True)
82
- output_file_path : Path :
83
- file directory (Default value = None)
84
- output_file_name : str :
85
- original file name (Default value = None)
86
-
87
- Returns
88
- -------
89
-
90
- """
91
- selected_columns = (
92
- self.df.columns if selected_columns is None else selected_columns
93
- )
94
- df = self.df[selected_columns].reset_index(drop=True)
95
- new_file_path = self.define_output_path(
96
- output_file_path, output_file_name
97
- )
98
- if with_header:
99
- with open(new_file_path, "w") as file:
100
- for line in self.header:
101
- file.write(line)
102
- df.to_csv(new_file_path, index=False, mode="a")
103
- else:
104
- df.to_csv(new_file_path, index=False, mode="w")
105
- logger.info(f"Wrote file {self.path_to_file} to {new_file_path}.")
106
-
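# A hedged usage sketch for to_csv on a DataTableFile subclass (BottleFile is
# defined below); "example.btl" and the selected columns are hypothetical.
from pathlib import Path

if Path("example.btl").exists():
    bottles = BottleFile("example.btl")
    bottles.to_csv(selected_columns=["Bottle_ID", "Timestamp"], with_header=False)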
107
- def selecting_columns(
108
- self,
109
- list_of_columns: list | str,
110
- df: pd.DataFrame | None = None,
111
- ):
112
- """Alters the dataframe to only hold the given columns.
113
-
114
- Parameters
115
- ----------
116
- list_of_columns: list or str : a collection of columns
117
- df : pandas.Dataframe :
118
- Dataframe (Default value = None)
119
-
120
- Returns
121
- -------
122
-
123
- """
124
- df = self.df if df is None else df
125
- # ensure that the input is a list, so that isin() can do its job
126
- if isinstance(list_of_columns, str):
127
- list_of_columns = [list_of_columns]
128
- if isinstance(df, pd.DataFrame):
129
- self.df = df[list_of_columns].reset_index(drop=True)
130
-
131
-
132
- class BottleFile(DataTableFile):
133
- """Class that represents a SeaBird Bottle File. Organizes the files table
134
- information into a pandas dataframe. This allows the usage of this
135
- powerful library for statistics, visualization, data manipulation, export,
136
- etc.
137
-
138
- Parameters
139
- ----------
140
-
141
- Returns
142
- -------
143
-
144
- """
145
-
146
- def __init__(self, path_to_file):
147
- super().__init__(path_to_file)
148
- self.original_df = self.create_dataframe()
149
- self.df = self.original_df
150
- self.setting_dataframe_dtypes()
151
- self.adding_timestamp_column()
152
-
153
- def create_dataframe(self):
154
- """Creates a dataframe out of the btl file. Manages the double data
155
- header correctly.
156
-
157
- Parameters
158
- ----------
159
-
160
- Returns
161
- -------
162
-
163
- """
164
- # TODO: this needs to be broken down into smaller pieces...
165
- top_names, bottom_names = self.reading_data_header()
166
- # creating statistics column to store the row type information:
167
- # 4 rows per bottle, average, standard deviation, max value, min value
168
- top_names.append("Statistic")
169
- # TODO: sexier way to construct dataframe than opening the file a
170
- # second time
171
- # # df = pd.DataFrame(self.data, index=None, columns=top_names)
172
- df: pd.DataFrame = pd.read_fwf(
173
- self.path_to_file,
174
- index_col=False,
175
- skiprows=len(self.header) + 2,
176
- header=None,
177
- names=top_names,
178
- )
179
-
180
- # handling the double row header
181
- rowtypes = df[df.columns[-1]].unique()
182
-
183
- # TODO: can this be made a little prettier?
184
- def separate_double_header_row(df, column, length):
185
- """
186
-
187
- Parameters
188
- ----------
189
- df :
190
- column :
191
- length :
192
-
193
- Returns
194
- -------
195
-
196
- """
197
- column_idx = df.columns.get_loc(column)
198
- old_column = df.iloc[::length, column_idx].reset_index(drop=True)
199
- new_column = df.iloc[1::length, column_idx].reset_index(drop=True)
200
- old_column_expanded = pd.Series(
201
- np.repeat(old_column, length)
202
- ).reset_index(drop=True)
203
- new_column_expanded = pd.Series(
204
- np.repeat(new_column, length)
205
- ).reset_index(drop=True)
206
- df[column] = old_column_expanded
207
- df.insert(
208
- column_idx + 1, bottom_names[column_idx], new_column_expanded
209
- )
210
- return df
211
-
212
- df = separate_double_header_row(df, "Date", len(rowtypes))
213
- df = separate_double_header_row(df, top_names[0], len(rowtypes))
214
- # remove brackets around statistics values
215
- df["Statistic"] = df["Statistic"].str.strip("()")
216
- df = df.rename(mapper={"Btl_ID": "Bottle_ID"}, axis=1)
217
- return df
218
-
219
- def adding_timestamp_column(self):
220
- """Creates a timestamp column that holds both, Date and Time
221
- information.
222
-
223
- Parameters
224
- ----------
225
-
226
- Returns
227
- -------
228
-
229
- """
230
- # constructing timestamp column
231
- timestamp = []
232
- for datepoint, timepoint in zip(self.df.Date, self.df.Time):
233
- timestamp.append(
234
- datetime.combine(datepoint, time.fromisoformat(str(timepoint)))
235
- )
236
- self.df.insert(2, "Timestamp", timestamp)
237
- self.df.Timestamp = pd.to_datetime(self.df.Timestamp)
238
-
239
- def setting_dataframe_dtypes(self):
240
- """Sets the types for the column values in the dataframe."""
241
- # setting dtypes
242
- # TODO: extending this to the other columns!
243
- self.df.Date = pd.to_datetime(self.df.Date)
244
- self.df.Bottle_ID = self.df.Bottle_ID.astype(int)
245
-
246
- def selecting_rows(
247
- self, df=None, statistic_of_interest: Union[list, str] = ["avg"]
248
- ):
249
- """Creates a dataframe with the given row identifier, using the
250
- statistics column. A single string or a list of strings can be
251
- processed.
252
-
253
- Parameters
254
- ----------
255
- df : pandas.Dataframe :
256
- the files Pandas representation (Default value = self.df)
257
- statistic_of_interest: list or str :
258
- collection of values of the 'statistics' column in self.df
259
- (Default value = ['avg'])
260
-
261
- Returns
262
- -------
263
-
264
- """
265
- df = self.df if df is None else df
266
- # ensure that the input is a list, so that isin() can do its job
267
- if isinstance(statistic_of_interest, str):
268
- statistic_of_interest = [statistic_of_interest]
269
- self.df = df.loc[df["Statistic"].isin(statistic_of_interest)]
270
-
271
- def reading_data_header(self):
272
- """Identifies and separatly collects the rows that specify the data
273
- table's headers.
274
-
275
- Parameters
276
- ----------
277
-
278
- Returns
279
- -------
280
-
281
- """
282
- n = 11  # fixed column width of a seabird btl file
283
- top_line = self.data[0]
284
- second_line = self.data[1]
285
- top_names = [
286
- top_line[i : i + n].split()[0]
287
- for i in range(0, len(top_line) - n, n)
288
- ]
289
- bottom_names = [
290
- second_line[i : i + n].split()[0] for i in range(0, 2 * n, n)
291
- ]
292
- return top_names, bottom_names
293
-
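# A standalone sketch of the 11-character fixed-width slicing used by
# reading_data_header above; the header line is a made-up example. The loop
# deliberately stops one column early, mirroring range(0, len(line) - n, n).
n = 11
top_line = "".join(f"{name:>11}" for name in ["Bottle", "Date", "Sal00", "Sal11", "Scan"])
top_names = [top_line[i : i + n].split()[0] for i in range(0, len(top_line) - n, n)]
print(top_names)  # ['Bottle', 'Date', 'Sal00', 'Sal11']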
294
- def add_station_and_event_column(self):
295
- event_list = [self.metadata["Station"] for _ in self.data]
296
- self.df.insert(0, "Event", pd.Series(event_list))
297
-
298
- def add_position_columns(self):
299
- latitude_list = [self.metadata["GPS_Lat"] for _ in self.data]
300
- self.df.insert(1, "Latitude", pd.Series(latitude_list))
301
- longitude_list = [self.metadata["GPS_Lon"] for _ in self.data]
302
- self.df.insert(2, "Longitude", pd.Series(longitude_list))
303
-
304
-
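# A hedged usage sketch for BottleFile; "example.btl" is a hypothetical path
# and the available columns depend on the instrument configuration.
from pathlib import Path

if Path("example.btl").exists():
    btl = BottleFile("example.btl")
    btl.selecting_rows(statistic_of_interest="avg")  # keep only the per-bottle averages
    print(btl.df[["Bottle_ID", "Timestamp", "Statistic"]].head())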
305
- class CnvFile(DataTableFile):
306
- """
307
- A representation of a cnv-file as used by SeaBird.
308
-
309
- This class intends to fully extract and organize the different types of
310
- data and metadata present inside such a file. Downstream libraries shall
311
- be able to use this representation for all applications concerning cnv
312
- files, like data processing, transformation or visualization.
313
-
314
- To achieve that, the metadata header is organized by the grandparent-class,
315
- SeaBirdFile, while the data table is extracted by this class. The data
316
- representation of choice is a pandas Dataframe. Inside this class, there
317
- are methods to parse cnv data into dataframes, do the reverse of writing a
318
- dataframe into cnv compliant form and to manipulate the dataframe in
319
- various ways.
320
-
321
- Parameters
322
- ----------
323
- path_to_file: Path | str:
324
- the path to the file
325
- create_dataframe: bool:
326
- whether to parse the data table into a pandas dataframe on creation
327
- data_table_info_level: str:
328
- the header detail level to use for the dataframe column names
329
- absolute_time_calculation: bool:
330
- whether to use a real timestamp instead of the second count
331
- event_log_column: bool:
332
- whether to add a station and device event column from DSHIP
333
- coordinate_columns: bool:
334
- whether to add longitude and latitude from the extra metadata header
335
-
336
- """
337
-
338
- def __init__(
339
- self,
340
- path_to_file: Path | str,
341
- create_dataframe: bool = True,
342
- absolute_time_calculation: bool = False,
343
- event_log_column: bool = False,
344
- coordinate_columns: bool = False,
345
- data_table_info_level: str = "shortname",
346
- ):
347
- super().__init__(path_to_file)
348
- self.validation_modules = self.obtaining_validation_modules()
349
- self.start_time = self.reading_start_time()
350
- if create_dataframe:
351
- warnings.warn(
352
- "The default of constructing a pandas Dataframe will soon be replaced by using the Parameters class that works on numpy arrays.",
353
- DeprecationWarning,
354
- stacklevel=2, # Ensures the warning points to the caller's line
355
- )
356
- self.data_header_meta_info, self.duplicate_columns = (
357
- self.reading_data_header(self.data_table_description)
358
- )
359
- self.original_df = self.create_dataframe(data_table_info_level)
360
- self.df = self.original_df
361
- if absolute_time_calculation:
362
- self.absolute_time_calculation()
363
- if event_log_column:
364
- self.add_station_and_event_column()
365
- if coordinate_columns:
366
- self.add_position_columns()
367
- else:
368
- self.parameters = Parameters(
369
- self.data, self.data_table_description
370
- )
371
-
372
- def reading_data_header(
373
- self, header_info: list = []
374
- ) -> Tuple[dict[str, dict], list[int]]:
375
- """Reads the tables header data from the header.
376
-
377
- Parameters
378
- ----------
379
- header_info: list:
380
- the header values from the file
381
-
382
- Returns
383
- -------
384
- a dictionary of header metadata dictionaries and a list of duplicate column indices
385
-
386
- """
387
- if not header_info:
388
- header_info = self.data_table_description
389
- table_header = {}
390
- duplicate_columns = []
391
- for line in header_info:
392
- if line.startswith("name"):
393
- header_meta_info = {}
394
- # get basic shortname and the full, non-differentiated info
395
- shortname = longinfo = line_info = line.split("=")[1].strip()
396
- try:
397
- shortname, longinfo = line_info.split(":")
398
- except ValueError:
399
- pass
400
- finally:
401
- shortname = shortname.strip()
402
- if shortname in list(table_header.keys()):
403
- try:
404
- duplicate_columns.append(
405
- int(line.split("=")[0].strip().split()[1])
406
- )
407
- except IndexError as error:
408
- logger.error(
409
- f"Could not resolve duplicate column: {
410
- f"{shortname}, "
411
- }, {error}"
412
- )
413
- else:
414
- header_meta_info["shortname"] = shortname
415
- header_meta_info["longinfo"] = longinfo.strip()
416
- metainfo = self._extract_data_header_meta_info(
417
- longinfo.strip()
418
- )
419
- header_meta_info = {**header_meta_info, **metainfo}
420
- table_header[shortname.strip()] = header_meta_info
421
- return table_header, duplicate_columns
422
-
423
- def _extract_data_header_meta_info(self, line: str) -> dict:
424
- """Extracts the individual information bits inside of the header lines
425
-
426
- Parameters
427
- ----------
428
- line: str:
429
- one header line, trimmed by the 'name =' prefix and the shortname
430
-
431
- Returns
432
- -------
433
- a dictionary with the information stored
434
-
435
- """
436
- regex_string = r"(?:(?P<name0>.+),\s(?P<metainfo0>.+)\s\[(?P<unit0>.+)\]|(?P<name2>.+)\s\[(?P<unit2>.+)\]|(?P<name3>.+),\s(?P<metainfo2>.[^\s]+)|(?P<name4>.+))"
437
- regex_check = re.search(regex_string, line, flags=re.IGNORECASE)
438
- if regex_check:
439
- regex_info = dict(regex_check.groupdict())
440
- regex_info = {
441
- key[:-1]: value
442
- for key, value in regex_info.items()
443
- if value is not None
444
- }
445
- if len(regex_info) > 2:
446
- # check for second sensors and adjust their names
447
- if regex_info["metainfo"][-1] == "2":
448
- regex_info["name"] = regex_info["name"] + " 2"
449
- regex_info["metainfo"] = regex_info["metainfo"][:-1]
450
- if len(regex_info["metainfo"]) == 0:
451
- regex_info.pop("metainfo")
452
- if regex_info["name"] == "flag":
453
- regex_info["metainfo"] = regex_info["name"]
454
- regex_info["unit"] = regex_info["name"]
455
- return regex_info
456
- return {}
457
-
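# A standalone sketch of the header-line regex used in
# _extract_data_header_meta_info above; the sample description is a typical
# Sea-Bird style string, not taken from a specific file.
import re

regex_string = (
    r"(?:(?P<name0>.+),\s(?P<metainfo0>.+)\s\[(?P<unit0>.+)\]"
    r"|(?P<name2>.+)\s\[(?P<unit2>.+)\]"
    r"|(?P<name3>.+),\s(?P<metainfo2>.[^\s]+)"
    r"|(?P<name4>.+))"
)
match = re.search(regex_string, "Pressure, Digiquartz [db]", flags=re.IGNORECASE)
info = {key[:-1]: value for key, value in match.groupdict().items() if value is not None}
print(info)  # {'name': 'Pressure', 'metainfo': 'Digiquartz', 'unit': 'db'}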
458
- def create_dataframe(
459
- self,
460
- header_info_detail_level: str = "shortname",
461
- ) -> pd.DataFrame:
462
- """Creates a pandas dataframe by splitting each dataline every 11
463
- characters, as SeaBird defines its tables this way.
464
-
465
- Parameters
466
- ----------
467
- header_info_detail_level: str:
468
- the data header detail level to use for the dataframe column
469
- names; one of the keys in the parsed header metadata, such as
470
- 'shortname' or 'longinfo' (Default value = 'shortname')
471
-
472
- Returns
473
- -------
474
- a pandas.Dataframe that represents the data values inside the cnv file
475
-
476
- """
477
- n = 11
478
- row_list = []
479
- for line in self.data:
480
- row_list.append(
481
- [
482
- line[i : i + n].split()[0]
483
- for i in range(0, len(line) - n, n)
484
- ]
485
- )
486
- df = pd.DataFrame(row_list, dtype=float)
487
- header_names = [
488
- metainfo[header_info_detail_level]
489
- for metainfo in list(self.data_header_meta_info.values())
490
- ]
491
- # remove duplicate columns
492
- df.drop(labels=self.duplicate_columns, axis=1, inplace=True)
493
- self.duplicate_columns = []
494
- try:
495
- df.columns = header_names
496
- except ValueError as error:
497
- logger.error(
498
- f"Could not set dataframe header for {self.file_name}: {error}"
499
- )
500
- logger.error(header_names)
501
- else:
502
- df.meta.metadata = self.data_header_meta_info
503
- # df.meta.propagate_metadata_to_series()
504
- return df
505
-
506
- def rename_dataframe_header(
507
- self,
508
- df: pd.DataFrame | None = None,
509
- header_detail_level: str = "shortname",
510
- ) -> list:
511
- df = self.df if df is None else df
512
- df.meta.rename(header_detail_level)
513
- return [column for column in df.columns]
514
-
515
- def reading_start_time(
516
- self,
517
- time_source: str = "System UTC",
518
- ) -> datetime | None:
519
- """
520
- Extracts the Cast start time from the metadata header.
521
- """
522
- for line in self.sbe9_data:
523
- if line.startswith(time_source):
524
- start_time = line.split("=")[1]
525
- start_time = datetime.strptime(
526
- start_time, " %b %d %Y %H:%M:%S "
527
- )
528
- return start_time
529
- return None
530
-
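# A standalone sketch of the timestamp parsing in reading_start_time above;
# the header line is a made-up example of a "System UTC = ..." entry.
from datetime import datetime

line = "System UTC = May 04 2024 10:15:30 "
print(datetime.strptime(line.split("=")[1], " %b %d %Y %H:%M:%S "))
# 2024-05-04 10:15:30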
531
- def absolute_time_calculation(self) -> bool:
532
- """
533
- Replaces the basic cnv time representation, which counts relative to the
534
- cast's start point, with real UTC timestamps.
535
- This operation will act directly on the dataframe.
536
-
537
- """
538
- time_parameter = None
539
- for parameter in self.df.columns:
540
- if parameter.lower().startswith("time"):
541
- time_parameter = parameter
542
- if time_parameter and self.start_time:
543
- self.df.meta.add_column(
544
- name="datetime",
545
- data=[
546
- timedelta(days=float(time)) + self.start_time
547
- if time_parameter == "timeJ"
548
- else timedelta(seconds=float(time)) + self.start_time
549
- for time in self.df[time_parameter]
550
- ],
551
- )
552
- return True
553
- return False
554
-
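# A standalone sketch of the relative-to-absolute conversion performed by
# absolute_time_calculation above: timeJ values are added as day offsets,
# all other time columns as second offsets; the start time is made up.
from datetime import datetime, timedelta

start_time = datetime(2024, 5, 4, 10, 15, 30)
print(timedelta(seconds=120.5) + start_time)  # e.g. a timeS value
print(timedelta(days=0.25) + start_time)      # e.g. a timeJ value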
555
- def add_start_time(self) -> bool:
556
- """
557
- Adds the Cast start time to the dataframe.
558
- Necessary for joins on the time.
559
- """
560
- if self.start_time:
561
- self.df.meta.add_column(
562
- name="start_time",
563
- data=pd.Series([self.start_time for _ in self.data]),
564
- )
565
- return True
566
- return False
567
-
568
- def obtaining_validation_modules(self) -> CnvValidationList:
569
- """
570
- Collects the individual validation modules and their respective
571
- information, usually present in key-value pairs.
572
- """
573
- validation_modules = self.processing_info
574
- return CnvValidationList(validation_modules)
575
-
576
- def df2cnv(
577
- self,
578
- header_names: list | None = None,
579
- header_detail_level: str | None = None,
580
- ) -> list:
581
- """
582
- Parses a pandas dataframe into a list that represents the lines inside
583
- of a cnv data table.
584
-
585
- Parameters
586
- ----------
587
- header_names: list:
588
- a list of dataframe columns that will be parsed
589
-
590
- Returns
591
- -------
592
- a list of lines in the cnv data table format
593
-
594
- """
595
- if not header_detail_level:
596
- header_detail_level = self.df.meta.header_detail
597
- if not header_names:
598
- header_names = [
599
- header[header_detail_level]
600
- for header in list(self.data_header_meta_info.values())
601
- ]
602
- df = self.df.drop(
603
- labels=[
604
- column
605
- for column in list(self.df.meta.metadata.keys())
606
- if column not in header_names
607
- ],
608
- axis=1,
609
- errors="ignore",
610
- )
611
- cnv_out = []
612
- for _, row in df.iterrows():
613
- cnv_like_row = "".join(
614
- (lambda column: f"{str(column):>11}")(value) for value in row
615
- )
616
- cnv_out.append(cnv_like_row + "\n")
617
- return cnv_out
618
-
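# A standalone sketch of the fixed-width row formatting in df2cnv above:
# every value is rendered right-aligned in an 11-character field; the row
# values are made-up numbers.
row = [3.2856, 25.1034, 4.9812]
cnv_like_row = "".join(f"{str(value):>11}" for value in row)
print(repr(cnv_like_row + "\n"))  # '     3.2856    25.1034     4.9812\n'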
619
- def array2cnv(self) -> list:
620
- result = []
621
- for row in self.parameters.full_data_array:
622
- formatted_row = "".join(f"{elem:11}" for elem in row)
623
- result.append(formatted_row + "\n")
624
- return result
625
-
626
- def to_cnv(
627
- self,
628
- file_name: Path | str | None = None,
629
- use_dataframe: bool = True,
630
- header_list: list | None = None,
631
- ):
632
- """
633
- Writes the values inside this instance as a new cnv file to disk.
634
-
635
- Parameters
636
- ----------
637
- file_name: Path:
638
- the new file name to use for writing
639
- use_dataframe: bool:
640
- whether to use the current dataframe as data table
641
- (if False, the data table is written from the Parameters
642
- array via array2cnv instead)
643
- header_list: list:
644
- the data columns to use for the export
645
-
646
- """
647
- file_name = self.path_to_file if file_name is None else file_name
648
- # content construction
649
- if use_dataframe:
650
- data = self.df2cnv(header_list)
651
- else:
652
- data = self.array2cnv()
653
- self._update_header()
654
- self.file_data = [*self.header, *data]
655
- # writing content out
656
- try:
657
- with open(file_name, "w", encoding="latin-1") as file:
658
- for line in self.file_data:
659
- file.write(line)
660
-
661
- except IOError as error:
662
- logger.error(f"Could not write cnv file: {error}")
663
-
664
- def _update_header(self):
665
- """Re-creates the cnv header."""
666
- self.data_table_description = self._form_data_table_info()
667
- self.header = [
668
- *[f"* {data}" for data in self.sbe9_data[:-1]],
669
- *[f"** {data}" for data in self.metadata_list],
670
- f"* {self.sbe9_data[-1]}",
671
- *[f"# {data}" for data in self.data_table_description],
672
- *[f"# {data}" for data in self.sensor_data],
673
- *[f"# {data}" for data in self.processing_info],
674
- "*END*\n",
675
- ]
676
-
677
- def _form_data_table_info(self) -> list:
678
- """Recreates the data table descriptions, like column names and spans
679
- from the structured dictionaries in which these values were stored."""
680
- new_table_info = []
681
- for key, value in self.data_table_stats.items():
682
- new_table_info.append(f"{key} = {value}\n")
683
- for index, (name, _) in enumerate(self.data_table_names_and_spans):
684
- new_table_info.append(f"name {index} = {name}\n")
685
- for index, (_, span) in enumerate(self.data_table_names_and_spans):
686
- new_table_info.append(f"span {index} = {span}\n")
687
- for key, value in self.data_table_misc.items():
688
- new_table_info.append(f"{key} = {value}\n")
689
- return new_table_info
690
-
691
- def add_processing_metadata(self, addition: str | list):
692
- """
693
- Adds new processing lines to the list of processing module information
694
-
695
- Parameters
696
- ----------
697
- addition: str | list:
698
- the new information line
699
-
700
- """
701
- # TODO: use CnvprocessingList here
702
- if isinstance(addition, str):
703
- addition = [addition]
704
- for line in addition:
705
- self.file_data.append(line)
706
- # add the new info line *before* the 'file_type = ascii' line
707
- self.processing_info.insert(-1, line)
708
-
709
- def add_station_and_event_column(self) -> bool:
710
- """
711
- Adds a column with the DSHIP station and device event numbers to the
712
- dataframe. These must be present inside the extra metadata header.
713
-
714
- """
715
- try:
716
- event_list = [self.metadata["Station"] for _ in self.data]
717
- except KeyError:
718
- return False
719
- else:
720
- self.df.meta.add_column(
721
- name="Event", data=pd.Series(event_list), location=0
722
- )
723
- return True
724
-
725
- def add_position_columns(self) -> bool:
726
- """
727
- Adds columns with the longitude and latitude to the dataframe.
728
- These must be present inside the extra metadata header.
729
-
730
- """
731
- if ("latitude" or "longitude") in [
732
- column["shortname"]
733
- for column in list(self.df.meta.metadata.values())
734
- }:
735
- return True
736
- try:
737
- latitude_list = [self.metadata["GPS_Lat"] for _ in self.data]
738
- longitude_list = [self.metadata["GPS_Lon"] for _ in self.data]
739
- except KeyError:
740
- return False
741
- else:
742
- self.df.meta.add_column(
743
- name="Latitude", data=pd.Series(latitude_list), location=1
744
- )
745
- self.df.meta.add_column(
746
- name="Longitude", data=pd.Series(longitude_list), location=2
747
- )
748
- return True
749
-
750
- def add_cast_number(self, number: int | None = None) -> bool:
751
- """
752
- Adds a column with the cast number to the dataframe.
753
-
754
- Parameters
755
- ----------
756
- number: int:
757
- the cast number of this file's cast
758
-
759
- """
760
- if ("Cast" in self.metadata.keys()) and (not number):
761
- number = int(self.metadata["Cast"])
762
- try:
763
- self.df.meta.add_column(
764
- name="Cast",
765
- data=pd.Series([number for _ in self.data]),
766
- location=0,
767
- )
768
- except ValueError:
769
- # Cast is already part of the dataframe, so nothing left to do
770
- return False
771
- else:
772
- return True
773
-
774
-
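# A hedged usage sketch for CnvFile; "cast.cnv" is a hypothetical path and the
# available columns depend on the processed file.
from pathlib import Path

if Path("cast.cnv").exists():
    cnv = CnvFile("cast.cnv", absolute_time_calculation=True)
    print(cnv.start_time)
    print(list(cnv.df.columns))
    cnv.to_cnv("cast_copy.cnv")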
775
- class BottleLogFile(DataTableFile):
776
- """Bottle Log file representation, that extracts the three different data
777
- types from the file: the device reset time, the originating cnv file,
778
- and the table with bottle IDs and corresponding data ranges.
779
-
780
- Parameters
781
- ----------
782
-
783
- Returns
784
- -------
785
-
786
- """
787
-
788
- def __init__(self, path_to_file, create_dataframe=False):
789
- super().__init__(path_to_file)
790
- self.reset_time = self.obtaining_reset_time()
791
- self.origin_cnv = self.raw_file_data[0].strip()
792
- self.data = self.data_whitespace_removal()
793
-
794
- if create_dataframe:
795
- self.original_df = self.create_dataframe()
796
- self.df = self.original_df
797
- else:
798
- self.data_list = self.create_list()
799
-
800
- def data_whitespace_removal(self) -> list:
801
- """Strips the input from whitespace characters, in this case especially
802
- newline characters.
803
-
804
- Parameters
805
- ----------
806
-
807
- Returns
808
- -------
809
- the original data stripped of whitespace
810
-
811
- """
812
- temp_data = []
813
- for line in self.raw_file_data[2:]:
814
- temp_data.append(line.strip())
815
- return temp_data
816
-
817
- def obtaining_reset_time(self) -> datetime:
818
- """Reading reset time with small input check.
819
-
820
- Parameters
821
- ----------
822
-
823
- Returns
824
- -------
825
- a datetime.datetime object of the device reset time
826
-
827
- """
828
-
829
- regex_check = re.search(
830
- r"RESET\s(\w{3}\s\d+\s\d{4}\s\d\d:\d\d:\d\d)",
831
- self.raw_file_data[1],
832
- )
833
- if regex_check:
834
- return datetime.strptime(regex_check.group(1), "%b %d %Y %H:%M:%S")
835
- else:
836
- error_message = """BottleLogFile is not formatted as expected:
837
- Reset time could not be extracted."""
838
- logger.error(error_message)
839
- raise IOError(error_message)
840
-
841
- def create_list(self) -> list:
842
- """Creates a list of usable data from the list specified in self.data.
843
- The list consists of: the IDs representing the bottles, the date and time of the data sample,
844
- and the lines of the cnv corresponding to the bottles
845
-
846
- Parameters
847
- ----------
848
-
849
- Returns
850
- -------
851
- a list representing the bl files table information
852
- """
853
- content_array = []
854
- for i in range(len(self.data)):
855
- bottles = [int(x) for x in self.data[i].split(",")[:2]]
856
- date = self.convert_date(self.data[i].split(",")[2])
857
- lines = tuple([int(x) for x in self.data[i].split(",")[3:]])
858
-
859
- content_array.append([bottles, date, lines])
860
-
861
- return content_array
862
-
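# A standalone sketch of how create_list above splits one .bl table line;
# the line itself is a made-up example.
line = "2, 2, Jan 05 2021 08:20:11, 3056, 3086"
bottles = [int(x) for x in line.split(",")[:2]]
lines = tuple([int(x) for x in line.split(",")[3:]])
print(bottles, lines)  # [2, 2] (3056, 3086)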
863
- def convert_date(self, date: str):
864
- """Converts the Dates of the .bl files to an ISO 8601 standard
865
-
866
- Parameters
867
- ----------
868
-
869
- Returns
870
- -------
871
- a string with the date in the form of "yymmddThhmmss"
872
- """
873
- date = date.strip()
874
- month_list = [
875
- "Jan",
876
- "Feb",
877
- "Mar",
878
- "Apr",
879
- "May",
880
- "Jun",
881
- "Jul",
882
- "Aug",
883
- "Sep",
884
- "Oct",
885
- "Nov",
886
- "Dec",
887
- ]
888
-
889
- month_ind = month_list.index(date.split(" ")[0]) + 1
890
- if month_ind < 10:
891
- month = "0" + str(month_ind)
892
- else:
893
- month = str(month_ind)
894
- day = date.split(" ")[1]
895
- year = (date.split(" ")[2])[2:]
896
- time = date.split(" ")[3].replace(":", "")
897
- return year + month + day + "T" + time
898
-
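# A standalone sketch of the conversion performed by convert_date above,
# expressed with datetime for brevity; the input string is a made-up example.
from datetime import datetime

raw = "Jan 05 2021 08:15:30"
print(datetime.strptime(raw, "%b %d %Y %H:%M:%S").strftime("%y%m%dT%H%M%S"))
# 210105T081530, matching convert_date's "yymmddThhmmss" output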
899
- def create_dataframe(self) -> pd.DataFrame:
900
- """Creates a dataframe from the list specified in self.data.
901
-
902
- Parameters
903
- ----------
904
-
905
- Returns
906
- -------
907
- a pandas.Dataframe representing the bl files table information
908
- """
909
- data_lists = []
910
- for line in self.data:
911
- inner_list = line.split(",")
912
- # dropping first column as it's the index
913
- data_lists.append(inner_list[1:])
914
- df = pd.DataFrame(data_lists)
915
- df.columns = ["Bottle ID", "Datetime", "start_range", "end_range"]
916
- return df
917
-
918
-
919
- class FieldCalibrationFile(DataTableFile):
920
- def __init__(self, path_to_file):
921
- super().__init__(path_to_file)
922
- self.original_df = self.create_dataframe()
923
- self.df = self.original_df
924
-
925
- def create_dataframe(self) -> pd.DataFrame:
926
- try:
927
- return pd.read_csv(self.path_to_file, skiprows=len(self.header))
928
- except IOError as error:
929
- logger.error(f"Could not read field calibration file: {error}.")
930
- return pd.DataFrame()
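# A hedged usage sketch for FieldCalibrationFile; "calibration.csv" is a
# hypothetical path and the resulting columns depend on the file's layout.
from pathlib import Path

if Path("calibration.csv").exists():
    calibration = FieldCalibrationFile("calibration.csv")
    print(calibration.df.head())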