seabirdfilehandler-0.5.3-py3-none-any.whl → seabirdfilehandler-0.5.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of seabirdfilehandler might be problematic. Click here for more details.

@@ -1,3 +1,4 @@
1
+ from pathlib import Path
1
2
  from typing import Union
2
3
  from datetime import datetime, time
3
4
  import pandas as pd
@@ -22,12 +23,13 @@ class BottleFile(DataFile):
22
23
 
23
24
  """
24
25
 
25
- def __init__(self, path_to_file):
26
- super().__init__(path_to_file)
27
- self.original_df = self.create_dataframe()
28
- self.df = self.original_df
29
- self.setting_dataframe_dtypes()
30
- self.adding_timestamp_column()
26
+ def __init__(self, path_to_file: Path | str, only_header: bool = False):
27
+ super().__init__(path_to_file, only_header)
28
+ if not only_header:
29
+ self.original_df = self.create_dataframe()
30
+ self.df = self.original_df
31
+ self.setting_dataframe_dtypes()
32
+ self.adding_timestamp_column()
31
33
 
32
34
  def create_dataframe(self):
33
35
  """Creates a dataframe out of the btl file. Manages the double data
@@ -60,9 +60,11 @@ class CnvFile(DataFile):
60
60
  super().__init__(path_to_file, only_header)
61
61
  self.validation_modules = self.obtaining_validation_modules()
62
62
  self.start_time = self.reading_start_time()
63
- self.parameters = Parameters(self.data, self.data_table_description)
63
+ self.parameters = Parameters(
64
+ self.data, self.data_table_description, only_header
65
+ )
64
66
  if create_dataframe:
65
- self.df = self.parameters.get_pandas_dataframe()
67
+ self.df = self.create_dataframe()
66
68
  if absolute_time_calculation:
67
69
  self.absolute_time_calculation()
68
70
  if event_log_column:
@@ -70,6 +72,13 @@ class CnvFile(DataFile):
70
72
  if coordinate_columns:
71
73
  self.add_position_columns()
72
74
 
75
+ def create_dataframe(self) -> pd.DataFrame:
76
+ """
77
+ Plain dataframe creator.
78
+ """
79
+ self.df = self.parameters.get_pandas_dataframe()
80
+ return self.df
81
+
73
82
  def reading_start_time(
74
83
  self,
75
84
  time_source: str = "System UTC",
@@ -1,3 +1,4 @@
1
+ from __future__ import annotations
1
2
  from pathlib import Path
2
3
  import logging
3
4
  from collections import UserList
@@ -15,18 +16,63 @@ from seabirdfilehandler.utils import get_unique_sensor_data
15
16
  logger = logging.getLogger(__name__)
16
17
 
17
18
 
19
+ def get_collection(
20
+ path_to_files: Path | str,
21
+ file_suffix: str = "cnv",
22
+ only_metadata: bool = False,
23
+ sorting_key: Callable | None = None,
24
+ ) -> Type[FileCollection]:
25
+ """
26
+ Factory to create instances of FileCollection, depending on input type.
27
+
28
+ Parameters
29
+ ----------
30
+ path_to_files : Path | str :
31
+ The path to the directory to search for files.
32
+ file_suffix : str :
33
+ The suffix to search for. (Default value = "cnv")
34
+ only_metadata : bool :
35
+ Whether to read only metadata. (Default value = False)
36
+ sorting_key : Callable | None :
37
+ A callable that returns the filename-part to use to sort the collection. (Default value = None)
38
+ Returns
39
+ -------
40
+ An instance of FileCollection or one of its children.
41
+
42
+ """
43
+ mapping_suffix_to_type = {
44
+ "cnv": CnvCollection,
45
+ "btl": FileCollection,
46
+ "bl": FileCollection,
47
+ }
48
+ file_suffix = file_suffix.strip(".")
49
+ try:
50
+ collection = mapping_suffix_to_type[file_suffix](
51
+ path_to_files, file_suffix, only_metadata, sorting_key
52
+ )
53
+ except ValueError:
54
+ raise ValueError(f"Unknown input file type: {file_suffix}, aborting.")
55
+ else:
56
+ return collection
57
+
58
+
18
59
  class FileCollection(UserList):
19
- """A representation of multiple files of the same kind. These files share
60
+ """
61
+ A representation of multiple files of the same kind. These files share
20
62
  the same suffix and are otherwise closely connected to each other. A common
21
63
  use case would be the collection of CNVs to allow for easier processing or
22
64
  integration of field calibration measurements.
23
65
 
24
66
  Parameters
25
67
  ----------
26
-
27
- Returns
28
- -------
29
-
68
+ path_to_files : Path | str :
69
+ The path to the directory to search for files.
70
+ file_suffix : str :
71
+ The suffix to search for. (Default value = "cnv")
72
+ only_metadata : bool :
73
+ Whether to read only metadata. (Default value = False)
74
+ sorting_key : Callable | None :
75
+ A callable that returns the filename-part to use to sort the collection. (Default value = None)
30
76
  """
31
77
 
32
78
  def __init__(
@@ -39,61 +85,90 @@ class FileCollection(UserList):
39
85
  super().__init__()
40
86
  self.path_to_files = Path(path_to_files)
41
87
  self.file_suffix = file_suffix.strip(".")
42
- self.file_type: Type[DataFile]
43
- self.extract_file_type()
44
- self.individual_file_paths = []
45
- self.collect_files(sorting_key=sorting_key)
46
- self.load_files(only_metadata)
88
+ self.file_type = self.extract_file_type(self.file_suffix)
89
+ self.individual_file_paths = self.collect_files(
90
+ sorting_key=sorting_key
91
+ )
92
+ self.data = self.load_files(only_metadata)
47
93
  if not only_metadata:
48
- if self.file_type == DataFile:
49
- self.df_list = self.get_dataframes()
50
- self.df = self.get_collection_dataframe(self.df_list)
51
- if self.file_type == CnvFile:
52
- self.data_meta_info = self.get_data_table_meta_info()
53
- self.sensor_data = get_unique_sensor_data(
54
- [file.sensors for file in self.data]
55
- )
94
+ self.df_list = self.get_dataframes()
95
+ self.df = self.get_collection_dataframe(self.df_list)
56
96
 
57
97
  def __str__(self):
58
98
  return "/n".join(self.data)
59
99
 
60
- def extract_file_type(self):
61
- """ """
100
+ def extract_file_type(self, suffix: str) -> Type[DataFile]:
101
+ """
102
+ Determines the file type using the input suffix.
103
+
104
+ Parameters
105
+ ----------
106
+ suffix : str :
107
+ The file suffix.
108
+ Returns
109
+ -------
110
+ An object corresponding to the given suffix.
111
+ """
62
112
  mapping_suffix_to_type = {
63
113
  "cnv": CnvFile,
64
114
  "btl": BottleFile,
65
115
  "bl": BottleLogFile,
66
116
  }
117
+ file_type = DataFile
67
118
  for key, value in mapping_suffix_to_type.items():
68
- if key == self.file_suffix:
69
- self.file_type = value
119
+ if key == suffix:
120
+ file_type = value
70
121
  break
71
- else:
72
- self.file_type = DataFile
122
+ return file_type
73
123
 
74
124
  def collect_files(
75
125
  self,
76
126
  sorting_key: Callable | None = lambda file: int(
77
127
  file.stem.split("_")[3]
78
128
  ),
79
- ):
80
- """ """
81
- self.individual_file_paths = sorted(
129
+ ) -> list[Path]:
130
+ """
131
+ Creates a list of target files, recursively from the given directory.
132
+ These can be sorted with the help of the sorting_key parameter, which
133
+ is a Callable that identifies the part of the filename that shall be
134
+ used for sorting.
135
+
136
+ Parameters
137
+ ----------
138
+ sorting_key : Callable | None :
139
+ The part of the filename to use in sorting. (Default value = lambda file: int(file.stem.split("_")[3]))
140
+ Returns
141
+ -------
142
+ A list of all paths found.
143
+ """
144
+ return sorted(
82
145
  self.path_to_files.rglob(f"*{self.file_suffix}"),
83
146
  key=sorting_key,
84
147
  )
85
148
 
86
- def load_files(self, only_metadata: bool = False):
87
- """ """
149
+ def load_files(self, only_metadata: bool = False) -> list[DataFile]:
150
+ """
151
+ Creates python instances of each file.
152
+
153
+ Parameters
154
+ ----------
155
+ only_metadata : bool :
156
+ Whether to load only file metadata. (Default value = False)
157
+ Returns
158
+ -------
159
+ A list of all instances.
160
+ """
161
+ data = []
88
162
  for file in self.individual_file_paths:
89
163
  try:
90
- self.data.append(self.file_type(file))
164
+ data.append(self.file_type(file, only_metadata))
91
165
  except TypeError:
92
166
  logger.error(
93
167
  f"Could not open file {file} with the type "
94
168
  f"{self.file_type}."
95
169
  )
96
170
  continue
171
+ return data
97
172
 
98
173
  def get_dataframes(
99
174
  self,
@@ -101,35 +176,27 @@ class FileCollection(UserList):
101
176
  coordinates: bool = False,
102
177
  time_correction: bool = False,
103
178
  cast_identifier: bool = False,
104
- long_header_names: bool = False,
105
- full_data_header: bool = True,
106
179
  ) -> list[pd.DataFrame]:
107
180
  """
181
+ Collects all individual dataframes and allows additional column
182
+ creation.
108
183
 
109
184
  Parameters
110
185
  ----------
111
- event_log: bool :
112
- (Default value = False)
113
- coordinates: bool :
114
- (Default value = False)
115
- time_correction: bool :
116
- (Default value = False)
117
- cast_identifier: bool :
118
- (Default value = False)
119
- long_header_names: bool :
120
- (Default value = False)
121
- full_data_header: bool :
122
- (Default value = True)
186
+ event_log : bool :
187
+ (Default value = False)
188
+ coordinates : bool :
189
+ (Default value = False)
190
+ time_correction : bool :
191
+ (Default value = False)
192
+ cast_identifier : bool :
193
+ (Default value = False)
123
194
 
124
195
  Returns
125
196
  -------
126
-
197
+ A list of the individual pandas DataFrames.
127
198
  """
128
199
  for index, file in enumerate(self.data):
129
- if full_data_header:
130
- file.rename_dataframe_header(header_detail_level="longinfo")
131
- elif long_header_names:
132
- file.rename_dataframe_header(header_detail_level="name")
133
200
  if event_log:
134
201
  file.add_station_and_event_column()
135
202
  if coordinates:
@@ -145,33 +212,35 @@ class FileCollection(UserList):
145
212
  self, list_of_dfs: list[pd.DataFrame] | None = None
146
213
  ) -> pd.DataFrame:
147
214
  """
215
+ Creates one DataFrame from the individual ones, by concatenation.
148
216
 
149
217
  Parameters
150
218
  ----------
151
- list_of_dfs: list[pd.DataFrame] | None :
152
- (Default value = None)
153
-
219
+ list_of_dfs : list[pd.DataFrame] | None :
220
+ A list of the individual DataFrames. (Default value = None)
154
221
  Returns
155
222
  -------
156
-
223
+ A pandas DataFrame representing the whole dataset.
157
224
  """
158
225
  if not list_of_dfs:
159
226
  list_of_dfs = self.get_dataframes()
227
+ if not list_of_dfs:
228
+ raise ValueError("No dataframes to concatenate.")
160
229
  df = pd.concat(list_of_dfs, ignore_index=True)
161
- # df.meta.metadata = list_of_dfs[0].meta.metadata
230
+ self.df = df
162
231
  return df
163
232
 
164
233
  def tidy_collection_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
165
234
  """
235
+ Apply the different dataframe edits to the given dataframe.
166
236
 
167
237
  Parameters
168
238
  ----------
169
- df: pd.DataFrame :
170
-
171
-
239
+ df : pd.DataFrame :
240
+ A DataFrame to edit.
172
241
  Returns
173
242
  -------
174
-
243
+ The tidied dataframe.
175
244
  """
176
245
  df = self.use_bad_flag_for_nan(df)
177
246
  df = self.set_dtype_to_float(df)
@@ -179,15 +248,15 @@ class FileCollection(UserList):
179
248
 
180
249
  def use_bad_flag_for_nan(self, df: pd.DataFrame) -> pd.DataFrame:
181
250
  """
251
+ Replace all NaN values with the bad flag value, defined inside the files.
182
252
 
183
253
  Parameters
184
254
  ----------
185
- df: pd.DataFrame :
186
-
187
-
255
+ df : pd.DataFrame :
256
+ The dataframe to edit.
188
257
  Returns
189
258
  -------
190
-
259
+ The edited DataFrame.
191
260
  """
192
261
  bad_flags = set()
193
262
  for file in self.data:
@@ -201,15 +270,15 @@ class FileCollection(UserList):
201
270
 
202
271
  def set_dtype_to_float(self, df: pd.DataFrame) -> pd.DataFrame:
203
272
  """
273
+ Use the float-dtype for all DataFrame columns.
204
274
 
205
275
  Parameters
206
276
  ----------
207
- df: pd.DataFrame :
208
-
209
-
277
+ df : pd.DataFrame :
278
+ The dataframe to edit.
210
279
  Returns
211
280
  -------
212
-
281
+ The edited DataFrame.
213
282
  """
214
283
  for parameter in df.columns:
215
284
  if parameter in ["datetime"]:
@@ -222,37 +291,121 @@ class FileCollection(UserList):
222
291
 
223
292
  def select_real_scan_data(self, df: pd.DataFrame) -> pd.DataFrame:
224
293
  """
294
+ Drop data rows that have no 'Scan' value, if that column exists.
225
295
 
226
296
  Parameters
227
297
  ----------
228
- df: pd.DataFrame :
229
-
230
-
298
+ df : pd.DataFrame :
299
+ The dataframe to edit.
231
300
  Returns
232
301
  -------
233
-
302
+ The edited DataFrame.
234
303
  """
235
- # TODO: fix this hardcoded name
236
304
  try:
237
- df = df.loc[df["Scan Count"].notna()]
238
- finally:
239
- pass
305
+ scan_column = [
306
+ c for c in df.columns if c.lower().startswith("scan")
307
+ ][0]
308
+ except IndexError:
309
+ return df
310
+ else:
311
+ df = df.loc[df[scan_column].notna()]
240
312
  return df
241
313
 
242
314
  def to_csv(self, file_name):
243
315
  """
316
+ Writes a csv file with the given filename.
244
317
 
245
318
  Parameters
246
319
  ----------
247
320
  file_name :
321
+ The new csv file name.
322
+ """
323
+ self.df.to_csv(file_name)
324
+
325
+
326
+ class CnvCollection(FileCollection):
327
+ """
328
+ Specific methods to work with collections of .cnv files.
329
+ """
330
+
331
+ def __init__(
332
+ self,
333
+ *args,
334
+ **kwargs,
335
+ ):
336
+ super().__init__(*args, **kwargs)
337
+ self.data_meta_info = self.get_data_table_meta_info()
338
+ self.sensor_data = get_unique_sensor_data(
339
+ [file.sensors for file in self.data]
340
+ )
341
+ self.array = self.get_array()
342
+
343
+ def get_dataframes(
344
+ self,
345
+ event_log: bool = False,
346
+ coordinates: bool = False,
347
+ time_correction: bool = False,
348
+ cast_identifier: bool = False,
349
+ ) -> list[pd.DataFrame]:
350
+ """
351
+ Collects all individual dataframes and allows additional column
352
+ creation.
353
+
354
+ Parameters
355
+ ----------
356
+ event_log : bool :
357
+ (Default value = False)
358
+ coordinates : bool :
359
+ (Default value = False)
360
+ time_correction : bool :
361
+ (Default value = False)
362
+ cast_identifier : bool :
363
+ (Default value = False)
364
+ Returns
365
+ -------
366
+ A list of the individual pandas DataFrames.
367
+ """
368
+ for index, file in enumerate(self.data):
369
+ if event_log:
370
+ file.add_station_and_event_column()
371
+ if coordinates:
372
+ file.add_position_columns()
373
+ if time_correction:
374
+ file.absolute_time_calculation()
375
+ file.add_start_time()
376
+ if cast_identifier:
377
+ file.add_cast_number(index + 1)
378
+ return [file.create_dataframe() for file in self.data]
248
379
 
380
+ def get_data_table_meta_info(self) -> list[dict]:
381
+ """
382
+ Ensures the same data description in all input cnv files and returns
383
+ it.
384
+ Acts as an early alarm when working on different kinds of files, which
385
+ cannot be concatenated together.
249
386
 
250
387
  Returns
251
388
  -------
389
+ A list of dictionaries that represent the data column information.
390
+ """
391
+ all_column_descriptions = [
392
+ file.parameters.metadata for file in self.data
393
+ ]
394
+ for info in all_column_descriptions:
395
+ if all_column_descriptions[0] != info:
396
+ raise AssertionError(
397
+ "Acting on differently formed data files, aborting"
398
+ )
399
+ return all_column_descriptions[0]
252
400
 
401
+ def get_array(self) -> np.ndarray:
253
402
  """
254
- self.get_collection_dataframe().to_csv(file_name)
403
+ Creates a collection array of all individual file arrays.
255
404
 
256
- def get_data_table_meta_info(self) -> list[list[dict]]:
257
- """ """
258
- return [file.parameters.metadata for file in self.data]
405
+ Returns
406
+ -------
407
+ A numpy array, representing the data of all input files.
408
+ """
409
+ return np.concatenate(
410
+ [file.parameters.create_full_ndarray() for file in self.data]
411
+ )
@@ -32,15 +32,17 @@ class Parameters(UserDict):
32
32
  self,
33
33
  data: list,
34
34
  metadata: list,
35
+ only_header: bool = False,
35
36
  ):
36
37
  self.raw_input_data = data
37
38
  self.raw_metadata = metadata
38
- self.full_data_array = self.create_full_ndarray()
39
39
  self.differentiate_table_description()
40
40
  self.metadata, self.duplicate_columns = self.reading_data_header(
41
41
  metadata
42
42
  )
43
- self.data = self.create_parameter_instances()
43
+ if not only_header:
44
+ self.full_data_array = self.create_full_ndarray()
45
+ self.data = self.create_parameter_instances()
44
46
 
45
47
  def get_parameter_names(self) -> list[str]:
46
48
  return [parameter["name"] for parameter in self.metadata.values()]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: seabirdfilehandler
3
- Version: 0.5.3
3
+ Version: 0.5.4
4
4
  Summary: Library of parsers to interact with SeaBird CTD files.
5
5
  Keywords: CTD,parser,seabird,data
6
6
  Author: Emil Michels
@@ -16,7 +16,8 @@ Classifier: Programming Language :: Python :: 3.12
16
16
  Classifier: Programming Language :: Python :: 3.13
17
17
  Requires-Dist: pandas (>=2.2.1)
18
18
  Requires-Dist: xmltodict (>=0.13.0)
19
- Project-URL: Homepage, https://git.io-warnemuende.de/CTD-Software/SeabirdFileHandler
19
+ Project-URL: Documentation, https://ctd-software.pages.io-warnemuende.de/seabirdfilehandler
20
+ Project-URL: Homepage, https://ctd-software.pages.io-warnemuende.de/seabirdfilehandler
20
21
  Project-URL: Repository, https://git.io-warnemuende.de/CTD-Software/SeabirdFileHandler
21
22
  Description-Content-Type: text/markdown
22
23
 
@@ -1,15 +1,15 @@
1
1
  seabirdfilehandler/__init__.py,sha256=rS1NfxKVWgOntk5NL-ndZyDt3LHW_tKr_F3iW_QbYvg,230
2
- seabirdfilehandler/bottlefile.py,sha256=nnfoDczPMG_ge40dT2rHNhifR7-NRgnZNFrfPM_9OSQ,5925
2
+ seabirdfilehandler/bottlefile.py,sha256=qCh506J3MWZXM11243aw_oJRocVB0ZIipXQLEgkD5M0,6046
3
3
  seabirdfilehandler/bottlelogfile.py,sha256=MtMmEebdAktO3mk6KbmJC7dfx9sRLbV5qqDQt2qtpJE,4310
4
- seabirdfilehandler/cnvfile.py,sha256=0fDovJ6WJsxkbtonbuKIPDY2E5Ex1k-1WD5nkA2WRyg,9990
4
+ seabirdfilehandler/cnvfile.py,sha256=j7IR3EgCrGD3riKOzFSKFNW6lkuzZYGYkxI_czcD8XU,10196
5
5
  seabirdfilehandler/datafiles.py,sha256=9r0Mh3zPYJJ3CoybgOBH4Dsq43kLDnca9m8s_V0cYU8,9378
6
- seabirdfilehandler/file_collection.py,sha256=b5iJaP4F34Vq7-FiJOlPvfS4IePGWsYx20XwWbZQw1A,6882
6
+ seabirdfilehandler/file_collection.py,sha256=IXbNTpplF-BQmjDSPh6Cj_f5-mS5C-biBLDRnqaFhOo,12531
7
7
  seabirdfilehandler/geomar_ctd_file_parser.py,sha256=4eCnkE0mvPKC8Dic8sXP4xpfwnk3K2MQcGFBf6loT8k,2655
8
- seabirdfilehandler/parameter.py,sha256=wBBSe85-Eqc3YmntuZmwvny1l2exXATri-oak8xEKX0,14490
8
+ seabirdfilehandler/parameter.py,sha256=CjC8T5wfbThryqVNjTcAolug9gi_BnInfZzkEfmKm5E,14561
9
9
  seabirdfilehandler/utils.py,sha256=5KXdB8Hdv65dv5tPyXxNMct1mCEOyA3S8XP54AFAnx0,1745
10
10
  seabirdfilehandler/validation_modules.py,sha256=eZ6x0giftUtlxnRMOnK_vCkgccdwUXPrDjajFa-E6n0,4698
11
11
  seabirdfilehandler/xmlfiles.py,sha256=L_puQf8eg0ojv85AyEMID4jnwkOlV_fgZP3W5yeSUBY,4668
12
- seabirdfilehandler-0.5.3.dist-info/LICENSE,sha256=Ifd1VPmYv32oJd2QVh3wIQP9X05vYJlcY6kONz360ws,34603
13
- seabirdfilehandler-0.5.3.dist-info/METADATA,sha256=7LNvhfKbt5_sjuLNhqO8t1h2iKKZyS7sJZhT0nh2c4w,2213
14
- seabirdfilehandler-0.5.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
15
- seabirdfilehandler-0.5.3.dist-info/RECORD,,
12
+ seabirdfilehandler-0.5.4.dist-info/LICENSE,sha256=Ifd1VPmYv32oJd2QVh3wIQP9X05vYJlcY6kONz360ws,34603
13
+ seabirdfilehandler-0.5.4.dist-info/METADATA,sha256=yt3BkhGRUOlsQ8tLPjziKPmobuYTDNUybjAARMJrcZw,2307
14
+ seabirdfilehandler-0.5.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
15
+ seabirdfilehandler-0.5.4.dist-info/RECORD,,