seabirdfilehandler 0.5.2__tar.gz → 0.5.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of seabirdfilehandler might be problematic. Click here for more details.

@@ -0,0 +1,53 @@
1
+ Metadata-Version: 2.3
2
+ Name: seabirdfilehandler
3
+ Version: 0.5.4
4
+ Summary: Library of parsers to interact with SeaBird CTD files.
5
+ Keywords: CTD,parser,seabird,data
6
+ Author: Emil Michels
7
+ Author-email: <emil.michels@io-warnemuende.de>
8
+ Requires-Python: >=3.12
9
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Topic :: Scientific/Engineering :: Oceanography
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Requires-Dist: pandas (>=2.2.1)
18
+ Requires-Dist: xmltodict (>=0.13.0)
19
+ Project-URL: Documentation, https://ctd-software.pages.io-warnemuende.de/seabirdfilehandler
20
+ Project-URL: Homepage, https://ctd-software.pages.io-warnemuende.de/seabirdfilehandler
21
+ Project-URL: Repository, https://git.io-warnemuende.de/CTD-Software/SeabirdFileHandler
22
+ Description-Content-Type: text/markdown
23
+
24
+ # Intro
25
+
26
+ This is a library for handling the different SeaBird file types. Each file is
27
+ meant to be represented by one object that stores all of its information in a
28
+ structured way. Through the grouping of different data types, more complex
29
+ calculations, visualisations and output forms are possible inside of those
30
+ objects.
31
+
32
+ By being able to parse edited data and metadata back to the original file
33
+ format, this package can be used to process data using custom ideas, while
34
+ staying compatible with the original SeaBird software packages. This way, one can
35
+ create new workflows that interchangeably use old and new processing modules.
36
+ One implementation of this idea is the [ctd-processing python package](https://ctd-software.pages.io-warnemuende.de/processing/), also developed at the IOW.
37
+
38
+ The structured metadata does provide the possibility to leverage the vast
39
+ amounts of information stored inside the extensive metadata header. Sensor data
40
+ and processing information are readily available in intuitive dictionaries.
41
+
42
+ ## Development roadmap
43
+
44
+ ### misc improvements
45
+
46
+ - refactor processing module handling
47
+ - extend individual parameter information
48
+ - handle duplicate input columns
49
+
50
+ ### visualisation
51
+
52
+ - write an intuitive visualisation module
53
+
@@ -0,0 +1,29 @@
1
+ # Intro
2
+
3
+ This is a library for handling the different SeaBird file types. Each file is
4
+ meant to be represented by one object that stores all of its information in a
5
+ structured way. Through the grouping of different data types, more complex
6
+ calculations, visualisations and output forms are possible inside of those
7
+ objects.
8
+
9
+ By being able to parse edited data and metadata back to the original file
10
+ format, this package can be used to process data using custom ideas, while
11
+ staying compatible with the original SeaBird software packages. This way, one can
12
+ create new workflows that interchangeably use old and new processing modules.
13
+ One implementation of this idea is the [ctd-processing python package](https://ctd-software.pages.io-warnemuende.de/processing/), also developed at the IOW.
14
+
15
+ The structured metadata does provide the possibility to leverage the vast
16
+ amounts of information stored inside the extensive metadata header. Sensor data
17
+ and processing information are readily available in intuitive dictionaries.
18
+
19
+ ## Development roadmap
20
+
21
+ ### misc improvements
22
+
23
+ - refactor processing module handling
24
+ - extend individual parameter information
25
+ - handle duplicate input columns
26
+
27
+ ### visualisation
28
+
29
+ - write an intuitive visualisation module
@@ -16,10 +16,11 @@ classifiers = [
16
16
  "Programming Language :: Python :: 3.12",
17
17
  "Programming Language :: Python :: 3.13",
18
18
  ]
19
- urls.homepage = "https://git.io-warnemuende.de/CTD-Software/SeabirdFileHandler"
19
+ urls.homepage = "https://ctd-software.pages.io-warnemuende.de/seabirdfilehandler"
20
20
  urls.repository = "https://git.io-warnemuende.de/CTD-Software/SeabirdFileHandler"
21
+ urls.documentation = "https://ctd-software.pages.io-warnemuende.de/seabirdfilehandler"
21
22
  dynamic = []
22
- version = "0.5.2"
23
+ version = "0.5.4"
23
24
 
24
25
  [tool.poetry]
25
26
 
@@ -43,6 +44,7 @@ pyment = ">=0.3.3"
43
44
  pylint = ">=3.0.2"
44
45
  pre-commit = ">=3.6.2"
45
46
  tomlkit = ">=0.13.2"
47
+ myst-parser = "^4.0.1"
46
48
 
47
49
  [tool.pytest.ini_options]
48
50
  pythonpath = [".", "src", "src/seabirdfilehandler"]
@@ -1,3 +1,4 @@
1
+ from pathlib import Path
1
2
  from typing import Union
2
3
  from datetime import datetime, time
3
4
  import pandas as pd
@@ -22,12 +23,13 @@ class BottleFile(DataFile):
22
23
 
23
24
  """
24
25
 
25
- def __init__(self, path_to_file):
26
- super().__init__(path_to_file)
27
- self.original_df = self.create_dataframe()
28
- self.df = self.original_df
29
- self.setting_dataframe_dtypes()
30
- self.adding_timestamp_column()
26
+ def __init__(self, path_to_file: Path | str, only_header: bool = False):
27
+ super().__init__(path_to_file, only_header)
28
+ if not only_header:
29
+ self.original_df = self.create_dataframe()
30
+ self.df = self.original_df
31
+ self.setting_dataframe_dtypes()
32
+ self.adding_timestamp_column()
31
33
 
32
34
  def create_dataframe(self):
33
35
  """Creates a dataframe out of the btl file. Manages the double data
@@ -19,21 +19,26 @@ class CnvFile(DataFile):
19
19
  be able to use this representation for all applications concerning cnv
20
20
  files, like data processing, transformation or visualization.
21
21
 
22
- To achieve that, the metadata header is organized by the grandparent-class,
23
- SeaBirdFile, while the data table is extracted by this class. The data
24
- representation of choice is a pandas Dataframe. Inside this class, there
25
- are methods to parse cnv data into dataframes, do the reverse of writing a
26
- dataframe into cnv compliant form and to manipulate the dataframe in
27
- various ways.
22
+ To achieve that, the metadata header is organized by the parent-class,
23
+ DataFile, while the data table is extracted by this class. The data
24
+ representation can be a numpy array or pandas dataframe. The handling of
25
+ the data is mostly done inside parameters, a representation of the
26
+ individual measurement parameter data and metadata.
27
+
28
+ This class is also able to parse the edited data and metadata back to the
29
+ original .cnv file format, allowing for custom data processing using this
30
+ representation, while still being able to use Sea-Bird's original software
31
+ on that output. It also allows staying comparable with other parsers or
32
+ methods in general.
28
33
 
29
34
  Parameters
30
35
  ----------
31
36
  path_to_file: Path | str:
32
37
  the path to the file
33
- full_data_header: bool:
34
- whether to use the full data column descriptions for the dataframe
35
- long_header_names: bool:
36
- whether to use long header names in the dateframe
38
+ only_header: bool :
39
+ Whether to stop reading the file after the metadata header.
40
+ create_dataframe: bool :
41
+ Whether to create a pandas DataFrame from the data table.
37
42
  absolute_time_calculation: bool:
38
43
  whether to use a real timestamp instead of the second count
39
44
  event_log_column: bool:
@@ -55,9 +60,11 @@ class CnvFile(DataFile):
55
60
  super().__init__(path_to_file, only_header)
56
61
  self.validation_modules = self.obtaining_validation_modules()
57
62
  self.start_time = self.reading_start_time()
58
- self.parameters = Parameters(self.data, self.data_table_description)
63
+ self.parameters = Parameters(
64
+ self.data, self.data_table_description, only_header
65
+ )
59
66
  if create_dataframe:
60
- self.df = self.parameters.get_pandas_dataframe()
67
+ self.df = self.create_dataframe()
61
68
  if absolute_time_calculation:
62
69
  self.absolute_time_calculation()
63
70
  if event_log_column:
@@ -65,6 +72,13 @@ class CnvFile(DataFile):
65
72
  if coordinate_columns:
66
73
  self.add_position_columns()
67
74
 
75
+ def create_dataframe(self) -> pd.DataFrame:
76
+ """
77
+ Plain dataframe creator.
78
+ """
79
+ self.df = self.parameters.get_pandas_dataframe()
80
+ return self.df
81
+
68
82
  def reading_start_time(
69
83
  self,
70
84
  time_source: str = "System UTC",
@@ -18,15 +18,21 @@ logger = logging.getLogger(__name__)
18
18
 
19
19
 
20
20
  class DataFile:
21
- """Collection of methods for the SeaBird files that feature some kind of
22
- data table that is represented in a pandas dataframe.
21
+ """
22
+ The base class for all Sea-Bird data files, which are .cnv, .btl, and .bl.
23
+ One instance of this class, or its children, represents one data text file.
24
+ The different information bits of such a file are structured into individual
25
+ lists or dictionaries. The data table will be loaded as numpy array and
26
+ can be converted to a pandas DataFrame. Datatype-specific behavior is
27
+ implemented in the subclasses.
28
+
23
29
 
24
30
  Parameters
25
31
  ----------
26
-
27
- Returns
28
- -------
29
-
32
+ path_to_file: Path | str :
33
+ The path to the data file.
34
+ only_header: bool :
35
+ Whether to stop reading the file after the metadata header.
30
36
  """
31
37
 
32
38
  def __init__(
@@ -66,16 +72,10 @@ class DataFile:
66
72
  return self.file_data == other.file_data
67
73
 
68
74
  def read_file(self):
69
- """Reads and structures all the different information present in the
75
+ """
76
+ Reads and structures all the different information present in the
70
77
  file. Lists and Dictionaries are the data structures of choice. Uses
71
78
  basic prefix checking to distinguish different header information.
72
-
73
- Parameters
74
- ----------
75
-
76
- Returns
77
- -------
78
-
79
79
  """
80
80
  past_sensors = False
81
81
  with self.path_to_file.open("r", encoding="latin-1") as file:
@@ -109,14 +109,18 @@ class DataFile:
109
109
  def sensor_xml_to_flattened_dict(
110
110
  self, sensor_data: str
111
111
  ) -> list[dict] | dict:
112
- """Reads the pure xml sensor input and creates a multilevel dictionary,
112
+ """
113
+ Reads the pure xml sensor input and creates a multilevel dictionary,
113
114
  dropping the first two dictionaries, as they are single entry only
114
115
 
115
116
  Parameters
116
117
  ----------
118
+ sensor_data: str:
119
+ The raw xml sensor data.
117
120
 
118
121
  Returns
119
122
  -------
123
+ A list of sensor information, which is a structured dict.
120
124
 
121
125
  """
122
126
  full_sensor_dict = xmltodict.parse(sensor_data, process_comments=True)
@@ -153,8 +157,9 @@ class DataFile:
153
157
  return tidied_sensor_list
154
158
 
155
159
  def structure_metadata(self, metadata_list: list) -> dict:
156
- """Creates a dictionary to store the metadata that is added by using
157
- werums dship API.
160
+ """
161
+ Creates a dictionary to store custom metadata, of which Sea-Bird allows
162
+ 12 lines in each file.
158
163
 
159
164
  Parameters
160
165
  ----------
@@ -181,7 +186,8 @@ class DataFile:
181
186
  file_name: str | None = None,
182
187
  file_type: str = ".csv",
183
188
  ) -> Path:
184
- """Creates a Path object holding the desired output path.
189
+ """
190
+ Creates a Path object holding the desired output path.
185
191
 
186
192
  Parameters
187
193
  ----------
@@ -209,14 +215,13 @@ class DataFile:
209
215
  output_file_path: Path | str | None = None,
210
216
  output_file_name: str | None = None,
211
217
  ):
212
- """Writes a csv from the current dataframe. Takes a list of columns to
213
- use, a boolean for writing the header and the output file parameters.
218
+ """
219
+ Writes a csv from the given data.
214
220
 
215
221
  Parameters
216
222
  ----------
217
- selected_columns : list :
218
- a list of columns to include in the csv
219
- (Default value = self.df.columns)
223
+ data: pd.DataFrame | np.ndarray :
224
+ The source data to use.
220
225
  with_header : boolean :
221
226
  indicating whether the header shall appear in the output
222
227
  (Default value = True)
@@ -246,7 +251,8 @@ class DataFile:
246
251
  list_of_columns: list | str,
247
252
  df: pd.DataFrame,
248
253
  ):
249
- """Alters the dataframe to only hold the given columns.
254
+ """
255
+ Alters the dataframe to only hold the given columns.
250
256
 
251
257
  Parameters
252
258
  ----------
@@ -0,0 +1,411 @@
1
+ from __future__ import annotations
2
+ from pathlib import Path
3
+ import logging
4
+ from collections import UserList
5
+ from typing import Callable, Type
6
+ import pandas as pd
7
+ import numpy as np
8
+ from seabirdfilehandler import (
9
+ CnvFile,
10
+ BottleFile,
11
+ BottleLogFile,
12
+ )
13
+ from seabirdfilehandler import DataFile
14
+ from seabirdfilehandler.utils import get_unique_sensor_data
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
def get_collection(
    path_to_files: Path | str,
    file_suffix: str = "cnv",
    only_metadata: bool = False,
    sorting_key: Callable | None = None,
) -> FileCollection:
    """
    Factory to create instances of FileCollection, depending on input type.

    Parameters
    ----------
    path_to_files : Path | str :
        The path to the directory to search for files.
    file_suffix : str :
        The suffix to search for, with or without leading/trailing dots.
        (Default value = "cnv")
    only_metadata : bool :
        Whether to read only metadata. (Default value = False)
    sorting_key : Callable | None :
        A callable that returns the filename-part to use to sort the
        collection. (Default value = None)

    Returns
    -------
    An instance of FileCollection or one of its children.

    Raises
    ------
    ValueError
        If the suffix does not map to a known collection type.
    """
    mapping_suffix_to_type = {
        "cnv": CnvCollection,
        "btl": FileCollection,
        "bl": FileCollection,
    }
    file_suffix = file_suffix.strip(".")
    # Only the lookup is guarded: a failed dict access raises KeyError, and
    # errors raised while building the collection must not be re-labelled
    # as an unknown file type.
    try:
        collection_type = mapping_suffix_to_type[file_suffix]
    except KeyError:
        raise ValueError(
            f"Unknown input file type: {file_suffix}, aborting."
        ) from None
    return collection_type(
        path_to_files, file_suffix, only_metadata, sorting_key
    )
57
+
58
+
59
class FileCollection(UserList):
    """
    A representation of multiple files of the same kind. These files share
    the same suffix and are otherwise closely connected to each other. A common
    use case would be the collection of CNVs to allow for easier processing or
    integration of field calibration measurements.

    Parameters
    ----------
    path_to_files : Path | str :
        The path to the directory to search for files.
    file_suffix : str :
        The suffix to search for.
    only_metadata : bool :
        Whether to read only metadata. (Default value = False)
    sorting_key : Callable | None :
        A callable that returns the filename-part to use to sort the
        collection. (Default value = None)
    """

    def __init__(
        self,
        path_to_files: str | Path,
        file_suffix: str,
        only_metadata: bool = False,
        sorting_key: Callable | None = None,
    ):
        super().__init__()
        self.path_to_files = Path(path_to_files)
        self.file_suffix = file_suffix.strip(".")
        self.file_type = self.extract_file_type(self.file_suffix)
        self.individual_file_paths = self.collect_files(
            sorting_key=sorting_key
        )
        self.data = self.load_files(only_metadata)
        # The dataframe attributes only exist when the data tables were read.
        if not only_metadata:
            self.df_list = self.get_dataframes()
            self.df = self.get_collection_dataframe(self.df_list)

    def __str__(self):
        # The collection holds file objects, not strings, so each entry is
        # converted before joining with a real newline.
        return "\n".join(str(file) for file in self.data)

    def extract_file_type(self, suffix: str) -> Type[DataFile]:
        """
        Determines the file type using the input suffix.

        Parameters
        ----------
        suffix : str :
            The file suffix, without a leading dot.

        Returns
        -------
        The class corresponding to the given suffix, DataFile if the suffix
        is not known.
        """
        mapping_suffix_to_type = {
            "cnv": CnvFile,
            "btl": BottleFile,
            "bl": BottleLogFile,
        }
        return mapping_suffix_to_type.get(suffix, DataFile)

    def collect_files(
        self,
        sorting_key: Callable | None = lambda file: int(
            file.stem.split("_")[3]
        ),
    ) -> list[Path]:
        """
        Creates a list of target files, recursively from the given directory.
        These can be sorted with the help of the sorting_key parameter, which
        is a Callable that identifies the part of the filename that shall be
        used for sorting.

        Parameters
        ----------
        sorting_key : Callable | None :
            The part of the filename to use in sorting. With None, the paths
            are sorted by their natural order.
            (Default value = lambda file: int(file.stem.split("_")[3]))

        Returns
        -------
        A sorted list of all paths found.
        """
        return sorted(
            self.path_to_files.rglob(f"*{self.file_suffix}"),
            key=sorting_key,
        )

    def load_files(self, only_metadata: bool = False) -> list[DataFile]:
        """
        Creates python instances of each file.

        Files whose constructor raises a TypeError are logged and skipped.

        Parameters
        ----------
        only_metadata : bool :
            Whether to load only file metadata. (Default value = False)

        Returns
        -------
        A list of all instances.
        """
        data = []
        for file in self.individual_file_paths:
            try:
                data.append(self.file_type(file, only_metadata))
            except TypeError:
                logger.error(
                    f"Could not open file {file} with the type "
                    f"{self.file_type}."
                )
                continue
        return data

    def get_dataframes(
        self,
        event_log: bool = False,
        coordinates: bool = False,
        time_correction: bool = False,
        cast_identifier: bool = False,
    ) -> list[pd.DataFrame]:
        """
        Collects all individual dataframes and allows additional column
        creation.

        Parameters
        ----------
        event_log : bool :
            Call add_station_and_event_column() on each file.
            (Default value = False)
        coordinates : bool :
            Call add_position_columns() on each file.
            (Default value = False)
        time_correction : bool :
            Call absolute_time_calculation() and add_start_time() on each
            file. (Default value = False)
        cast_identifier : bool :
            Call add_cast_number() on each file with its 1-based position
            in the collection. (Default value = False)

        Returns
        -------
        A list of the individual pandas DataFrames.
        """
        for index, file in enumerate(self.data):
            if event_log:
                file.add_station_and_event_column()
            if coordinates:
                file.add_position_columns()
            if time_correction:
                file.absolute_time_calculation()
                file.add_start_time()
            if cast_identifier:
                file.add_cast_number(index + 1)
        return [file.df for file in self.data]

    def get_collection_dataframe(
        self, list_of_dfs: list[pd.DataFrame] | None = None
    ) -> pd.DataFrame:
        """
        Creates one DataFrame from the individual ones, by concatenation.
        The result is also stored in self.df.

        Parameters
        ----------
        list_of_dfs : list[pd.DataFrame] | None :
            A list of the individual DataFrames. When empty or None, the
            dataframes of the collection are used. (Default value = None)

        Returns
        -------
        A pandas DataFrame representing the whole dataset.

        Raises
        ------
        ValueError
            If there is no dataframe to concatenate.
        """
        if not list_of_dfs:
            list_of_dfs = self.get_dataframes()
        if not list_of_dfs:
            raise ValueError("No dataframes to concatenate.")
        df = pd.concat(list_of_dfs, ignore_index=True)
        self.df = df
        return df

    def tidy_collection_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Apply the different dataframe edits to the given dataframe.

        Parameters
        ----------
        df : pd.DataFrame :
            A DataFrame to edit.

        Returns
        -------
        The tidied dataframe.
        """
        df = self.use_bad_flag_for_nan(df)
        df = self.set_dtype_to_float(df)
        return self.select_real_scan_data(df)

    def use_bad_flag_for_nan(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Replace all bad flag values, as defined inside the files, by NaN.

        The flag is matched both in its textual form and, when it parses as
        a number, in its numeric form, so that numeric columns are covered
        as well. The given dataframe is edited in place.

        Parameters
        ----------
        df : pd.DataFrame :
            The dataframe to edit.

        Returns
        -------
        The edited DataFrame.
        """
        bad_flags = set()
        for file in self.data:
            for line in file.data_table_description:
                if not line.startswith("bad_flag"):
                    continue
                flag = line.partition("=")[2].strip()
                if flag:
                    bad_flags.add(flag)
        for flag in bad_flags:
            df.replace(to_replace=flag, value=np.nan, inplace=True)
            try:
                numeric_flag = float(flag)
            except ValueError:
                # not a numeric flag, the textual replacement is all we can do
                continue
            df.replace(to_replace=numeric_flag, value=np.nan, inplace=True)
        return df

    def set_dtype_to_float(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Use the float-dtype for all DataFrame columns that can be converted.
        The 'datetime' column and columns that cannot be converted are left
        untouched.

        Parameters
        ----------
        df : pd.DataFrame :
            The dataframe to edit.

        Returns
        -------
        The edited DataFrame.
        """
        for parameter in df.columns:
            if parameter in ["datetime"]:
                continue
            try:
                df[parameter] = df[parameter].astype("float")
            except (ValueError, TypeError):
                # best effort: keep non-numeric columns as they are
                continue
        return df

    def select_real_scan_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Drop data rows that have no 'Scan' value, if that column exists.

        Parameters
        ----------
        df : pd.DataFrame :
            The dataframe to edit.

        Returns
        -------
        The edited DataFrame.
        """
        # the first column whose name starts with 'scan', case-insensitive
        scan_column = next(
            (c for c in df.columns if str(c).lower().startswith("scan")),
            None,
        )
        if scan_column is None:
            return df
        return df.loc[df[scan_column].notna()]

    def to_csv(self, file_name):
        """
        Writes the collection dataframe to a csv file with the given
        filename.

        Parameters
        ----------
        file_name :
            The new csv file name.
        """
        self.df.to_csv(file_name)
324
+
325
+
326
class CnvCollection(FileCollection):
    """
    A FileCollection with additional behavior for collections of .cnv files.

    On top of the parent attributes, an instance stores the shared data
    column description (data_meta_info), the result of
    get_unique_sensor_data over all files (sensor_data) and the concatenated
    data array of all files (array).
    """

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.data_meta_info = self.get_data_table_meta_info()
        sensors_per_file = [cnv.sensors for cnv in self.data]
        self.sensor_data = get_unique_sensor_data(sensors_per_file)
        self.array = self.get_array()

    def get_dataframes(
        self,
        event_log: bool = False,
        coordinates: bool = False,
        time_correction: bool = False,
        cast_identifier: bool = False,
    ) -> list[pd.DataFrame]:
        """
        Builds the dataframe of every file in the collection, optionally
        running the column-adding methods of each file first.

        Parameters
        ----------
        event_log : bool :
            Call add_station_and_event_column() on each file.
            (Default value = False)
        coordinates : bool :
            Call add_position_columns() on each file.
            (Default value = False)
        time_correction : bool :
            Call absolute_time_calculation() and add_start_time() on each
            file. (Default value = False)
        cast_identifier : bool :
            Call add_cast_number() on each file with its 1-based position
            in the collection. (Default value = False)

        Returns
        -------
        A list of the individual pandas DataFrames, as returned by each
        file's create_dataframe().
        """
        for cast_number, cnv in enumerate(self.data, start=1):
            if event_log:
                cnv.add_station_and_event_column()
            if coordinates:
                cnv.add_position_columns()
            if time_correction:
                cnv.absolute_time_calculation()
                cnv.add_start_time()
            if cast_identifier:
                cnv.add_cast_number(cast_number)
        dataframes = []
        for cnv in self.data:
            dataframes.append(cnv.create_dataframe())
        return dataframes

    def get_data_table_meta_info(self) -> list[dict]:
        """
        Checks that all cnv files of the collection carry the same data
        column description and returns that description.
        This is an early alarm for differently formed files, which cannot
        be concatenated together.

        Returns
        -------
        The parameters metadata of the first file, which equals the one of
        every other file.

        Raises
        ------
        AssertionError
            If any file's description differs from the first one.
        """
        descriptions = [cnv.parameters.metadata for cnv in self.data]
        differs = any(
            descriptions[0] != description for description in descriptions
        )
        if differs:
            raise AssertionError(
                "Acting on differently formed data files, aborting"
            )
        return descriptions[0]

    def get_array(self) -> np.ndarray:
        """
        Concatenates the full data arrays of all individual files.

        Returns
        -------
        A numpy array, representing the data of all input files.
        """
        arrays = [cnv.parameters.create_full_ndarray() for cnv in self.data]
        return np.concatenate(arrays)
@@ -18,10 +18,10 @@ class Parameters(UserDict):
18
18
 
19
19
  Parameters
20
20
  ----------
21
- data: list:
22
- The raw data as extraced by SeaBirdFile
23
- metadata: list,
24
- The raw metadata as extraced by SeaBirdFile
21
+ data: list
22
+ The raw data as extracted by DataFile
23
+ metadata: list
24
+ The raw metadata as extracted by DataFile
25
25
 
26
26
  Returns
27
27
  -------
@@ -32,15 +32,20 @@ class Parameters(UserDict):
32
32
  self,
33
33
  data: list,
34
34
  metadata: list,
35
+ only_header: bool = False,
35
36
  ):
36
37
  self.raw_input_data = data
37
38
  self.raw_metadata = metadata
38
- self.full_data_array = self.create_full_ndarray()
39
39
  self.differentiate_table_description()
40
40
  self.metadata, self.duplicate_columns = self.reading_data_header(
41
41
  metadata
42
42
  )
43
- self.data = self.create_parameter_instances()
43
+ if not only_header:
44
+ self.full_data_array = self.create_full_ndarray()
45
+ self.data = self.create_parameter_instances()
46
+
47
+ def get_parameter_names(self) -> list[str]:
48
+ return [parameter["name"] for parameter in self.metadata.values()]
44
49
 
45
50
  def get_parameter_list(self) -> list[Parameter]:
46
51
  """ """
@@ -1,28 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: seabirdfilehandler
3
- Version: 0.5.2
4
- Summary: Library of parsers to interact with SeaBird CTD files.
5
- Keywords: CTD,parser,seabird,data
6
- Author: Emil Michels
7
- Author-email: <emil.michels@io-warnemuende.de>
8
- Requires-Python: >=3.12
9
- Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
10
- Classifier: Development Status :: 3 - Alpha
11
- Classifier: Operating System :: OS Independent
12
- Classifier: Intended Audience :: Science/Research
13
- Classifier: Topic :: Scientific/Engineering :: Oceanography
14
- Classifier: Programming Language :: Python :: 3 :: Only
15
- Classifier: Programming Language :: Python :: 3.12
16
- Classifier: Programming Language :: Python :: 3.13
17
- Requires-Dist: pandas (>=2.2.1)
18
- Requires-Dist: xmltodict (>=0.13.0)
19
- Project-URL: Homepage, https://git.io-warnemuende.de/CTD-Software/SeabirdFileHandler
20
- Project-URL: Repository, https://git.io-warnemuende.de/CTD-Software/SeabirdFileHandler
21
- Description-Content-Type: text/markdown
22
-
23
- This is a library for handling the different SeaBird file types. Each file is
24
- meant to be represented by one object that stores all of its information in a
25
- structured way. Through the grouping of different data types, more complex
26
- calculations, visualisations and output forms will be possible inside of those
27
- objects.
28
-
@@ -1,5 +0,0 @@
1
- This is a library for handling the different SeaBird file types. Each file is
2
- meant to be represented by one object that stores all of its information in a
3
- structured way. Through the grouping of different data types, more complex
4
- calculations, visualisations and output forms will be possible inside of those
5
- objects.
@@ -1,258 +0,0 @@
1
- from pathlib import Path
2
- import logging
3
- from collections import UserList
4
- from typing import Callable, Type
5
- import pandas as pd
6
- import numpy as np
7
- from seabirdfilehandler import (
8
- CnvFile,
9
- BottleFile,
10
- BottleLogFile,
11
- )
12
- from seabirdfilehandler import DataFile
13
- from seabirdfilehandler.utils import get_unique_sensor_data
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
- class FileCollection(UserList):
19
- """A representation of multiple files of the same kind. These files share
20
- the same suffix and are otherwise closely connected to each other. A common
21
- use case would be the collection of CNVs to allow for easier processing or
22
- integration of field calibration measurements.
23
-
24
- Parameters
25
- ----------
26
-
27
- Returns
28
- -------
29
-
30
- """
31
-
32
- def __init__(
33
- self,
34
- path_to_files: str | Path,
35
- file_suffix: str,
36
- only_metadata: bool = False,
37
- sorting_key: Callable | None = None,
38
- ):
39
- super().__init__()
40
- self.path_to_files = Path(path_to_files)
41
- self.file_suffix = file_suffix.strip(".")
42
- self.file_type: Type[DataFile]
43
- self.extract_file_type()
44
- self.individual_file_paths = []
45
- self.collect_files(sorting_key=sorting_key)
46
- self.load_files(only_metadata)
47
- if not only_metadata:
48
- if self.file_type == DataFile:
49
- self.df_list = self.get_dataframes()
50
- self.df = self.get_collection_dataframe(self.df_list)
51
- if self.file_type == CnvFile:
52
- self.data_meta_info = self.get_data_table_meta_info()
53
- self.sensor_data = get_unique_sensor_data(
54
- [file.sensors for file in self.data]
55
- )
56
-
57
- def __str__(self):
58
- return "/n".join(self.data)
59
-
60
- def extract_file_type(self):
61
- """ """
62
- mapping_suffix_to_type = {
63
- "cnv": CnvFile,
64
- "btl": BottleFile,
65
- "bl": BottleLogFile,
66
- }
67
- for key, value in mapping_suffix_to_type.items():
68
- if key == self.file_suffix:
69
- self.file_type = value
70
- break
71
- else:
72
- self.file_type = DataFile
73
-
74
- def collect_files(
75
- self,
76
- sorting_key: Callable | None = lambda file: int(
77
- file.stem.split("_")[3]
78
- ),
79
- ):
80
- """ """
81
- self.individual_file_paths = sorted(
82
- self.path_to_files.rglob(f"*{self.file_suffix}"),
83
- key=sorting_key,
84
- )
85
-
86
- def load_files(self, only_metadata: bool = False):
87
- """ """
88
- for file in self.individual_file_paths:
89
- try:
90
- self.data.append(self.file_type(file))
91
- except TypeError:
92
- logger.error(
93
- f"Could not open file {file} with the type "
94
- f"{self.file_type}."
95
- )
96
- continue
97
-
98
- def get_dataframes(
99
- self,
100
- event_log: bool = False,
101
- coordinates: bool = False,
102
- time_correction: bool = False,
103
- cast_identifier: bool = False,
104
- long_header_names: bool = False,
105
- full_data_header: bool = True,
106
- ) -> list[pd.DataFrame]:
107
- """
108
-
109
- Parameters
110
- ----------
111
- event_log: bool :
112
- (Default value = False)
113
- coordinates: bool :
114
- (Default value = False)
115
- time_correction: bool :
116
- (Default value = False)
117
- cast_identifier: bool :
118
- (Default value = False)
119
- long_header_names: bool :
120
- (Default value = False)
121
- full_data_header: bool :
122
- (Default value = True)
123
-
124
- Returns
125
- -------
126
-
127
- """
128
- for index, file in enumerate(self.data):
129
- if full_data_header:
130
- file.rename_dataframe_header(header_detail_level="longinfo")
131
- elif long_header_names:
132
- file.rename_dataframe_header(header_detail_level="name")
133
- if event_log:
134
- file.add_station_and_event_column()
135
- if coordinates:
136
- file.add_position_columns()
137
- if time_correction:
138
- file.absolute_time_calculation()
139
- file.add_start_time()
140
- if cast_identifier:
141
- file.add_cast_number(index + 1)
142
- return [file.df for file in self.data]
143
-
144
- def get_collection_dataframe(
145
- self, list_of_dfs: list[pd.DataFrame] | None = None
146
- ) -> pd.DataFrame:
147
- """
148
-
149
- Parameters
150
- ----------
151
- list_of_dfs: list[pd.DataFrame] | None :
152
- (Default value = None)
153
-
154
- Returns
155
- -------
156
-
157
- """
158
- if not list_of_dfs:
159
- list_of_dfs = self.get_dataframes()
160
- df = pd.concat(list_of_dfs, ignore_index=True)
161
- # df.meta.metadata = list_of_dfs[0].meta.metadata
162
- return df
163
-
164
- def tidy_collection_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
165
- """
166
-
167
- Parameters
168
- ----------
169
- df: pd.DataFrame :
170
-
171
-
172
- Returns
173
- -------
174
-
175
- """
176
- df = self.use_bad_flag_for_nan(df)
177
- df = self.set_dtype_to_float(df)
178
- return self.select_real_scan_data(df)
179
-
180
- def use_bad_flag_for_nan(self, df: pd.DataFrame) -> pd.DataFrame:
181
- """
182
-
183
- Parameters
184
- ----------
185
- df: pd.DataFrame :
186
-
187
-
188
- Returns
189
- -------
190
-
191
- """
192
- bad_flags = set()
193
- for file in self.data:
194
- for line in file.data_table_description:
195
- if line.startswith("bad_flag"):
196
- flag = line.split("=")[1].strip()
197
- bad_flags.add(flag)
198
- for flag in bad_flags:
199
- df.replace(to_replace=flag, value=np.nan, inplace=True)
200
- return df
201
-
202
- def set_dtype_to_float(self, df: pd.DataFrame) -> pd.DataFrame:
203
- """
204
-
205
- Parameters
206
- ----------
207
- df: pd.DataFrame :
208
-
209
-
210
- Returns
211
- -------
212
-
213
- """
214
- for parameter in df.columns:
215
- if parameter in ["datetime"]:
216
- continue
217
- try:
218
- df[parameter] = df[parameter].astype("float")
219
- finally:
220
- continue
221
- return df
222
-
223
- def select_real_scan_data(self, df: pd.DataFrame) -> pd.DataFrame:
224
- """
225
-
226
- Parameters
227
- ----------
228
- df: pd.DataFrame :
229
-
230
-
231
- Returns
232
- -------
233
-
234
- """
235
- # TODO: fix this hardcoded name
236
- try:
237
- df = df.loc[df["Scan Count"].notna()]
238
- finally:
239
- pass
240
- return df
241
-
242
- def to_csv(self, file_name):
243
- """
244
-
245
- Parameters
246
- ----------
247
- file_name :
248
-
249
-
250
- Returns
251
- -------
252
-
253
- """
254
- self.get_collection_dataframe().to_csv(file_name)
255
-
256
- def get_data_table_meta_info(self) -> list[list[dict]]:
257
- """ """
258
- return [file.parameters.metadata for file in self.data]