seabirdfilehandler 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of seabirdfilehandler has been flagged as a potentially problematic release.

@@ -0,0 +1,265 @@
+ from pathlib import Path
+ import xmltodict
+ import pandas as pd
+ import numpy as np
+ import logging
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(name)s - [%(levelname)s] - %(message)s",
+     datefmt="%Y-%m-%d %H:%M:%S",
+     handlers=[
+         logging.FileHandler("filehandler.log"),
+         logging.StreamHandler(),
+     ],
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class DataFile:
+     """Collection of methods for the SeaBird files that feature some kind of
+     data table that is represented in a pandas dataframe.
+
+     Parameters
+     ----------
+
+     Returns
+     -------
+
+     """
+
+     def __init__(
+         self,
+         path_to_file: Path | str,
+         only_header: bool = False,
+     ):
+         self.path_to_file = Path(path_to_file)
+         self.file_name = self.path_to_file.stem
+         self.file_dir = self.path_to_file.parent
+         self.only_header = only_header
+         self.raw_file_data = []  # the text file input
+         self.header = []  # the full file header
+         self.sbe9_data = []  # device specific information
+         self.metadata = {}  # non-SeaBird metadata
+         self.metadata_list = []  # unstructured metadata for easier export
+         self.data_table_description = []  # the column names and other info
+         self.sensor_data = []
+         self.sensors = {}  # xml-parsed sensor data
+         self.processing_info = []  # everything after the sensor data
+         self.data = []  # the data table
+         self.file_data = self.raw_file_data  # variable file information
+         self.read_file()
+         self.metadata = self.structure_metadata(self.metadata_list)
+         if len(self.sensor_data) > 0:
+             self.sensors = self.sensor_xml_to_flattened_dict(
+                 "".join(self.sensor_data)
+             )
+
+     def __str__(self) -> str:
+         return "\n".join(self.file_data)
+
+     def __repr__(self) -> str:
+         return str(self.path_to_file.absolute())
+
+     def __eq__(self, other) -> bool:
+         return self.file_data == other.file_data
+
+     def read_file(self):
+         """Reads and structures all the different information present in the
+         file. Lists and Dictionaries are the data structures of choice. Uses
+         basic prefix checking to distinguish different header information.
+
+         Parameters
+         ----------
+
+         Returns
+         -------
+
+         """
+         past_sensors = False
+         with self.path_to_file.open("r", encoding="latin-1") as file:
+             for line in file:
+                 self.raw_file_data.append(line)
+                 line_prefix = line[:2]
+                 if line_prefix == "* ":
+                     self.header.append(line)
+                     self.sbe9_data.append(line[2:])
+                 elif line_prefix == "**":
+                     self.header.append(line)
+                     self.metadata_list.append(line[3:])
+                 elif line_prefix == "# ":
+                     self.header.append(line)
+                     if line[2:].strip()[0] == "<":
+                         self.sensor_data.append(line[2:])
+                         past_sensors = True
+                     else:
+                         if past_sensors:
+                             self.processing_info.append(line[2:])
+                         else:
+                             self.data_table_description.append(line[2:])
+                 else:
+                     if line.startswith("*END*"):
+                         self.header.append(line)
+                         if self.only_header:
+                             break
+                     else:
+                         self.data.append(line)
+
+     def sensor_xml_to_flattened_dict(
+         self, sensor_data: str
+     ) -> list[dict] | dict:
+         """Reads the pure xml sensor input and creates a multilevel dictionary,
+         dropping the first two dictionaries, as they are single entry only
+
+         Parameters
+         ----------
+
+         Returns
+         -------
+
+         """
+         full_sensor_dict = xmltodict.parse(sensor_data, process_comments=True)
+         try:
+             sensors = full_sensor_dict["Sensors"]["sensor"]
+         except KeyError as error:
+             logger.error(f"XML is not formatted as expected: {error}")
+             return full_sensor_dict
+         else:
+             # create a tidied version of the xml-parsed sensor dict
+             tidied_sensor_list = []
+             for entry in sensors:
+                 # use comment value as type descriptor
+                 comment = entry["#comment"]
+                 split_comment = comment.split(",")
+                 new_entry = split_comment[1].strip()
+                 if split_comment[-1] == " 2":
+                     new_entry += " 2"
+                 # remove second-level dict
+                 calibration_info = list(entry.values())[-1]
+                 try:
+                     new_dict = {
+                         "Channel": entry["@Channel"],
+                         "SensorName": new_entry,
+                         **calibration_info,
+                     }
+                 except TypeError:
+                     new_dict = {
+                         "Channel": entry["@Channel"],
+                         "SensorName": new_entry,
+                         "Info": calibration_info,
+                     }
+                 tidied_sensor_list.append(new_dict)
+             return tidied_sensor_list
+
+     def structure_metadata(self, metadata_list: list) -> dict:
+         """Creates a dictionary to store the metadata that is added by using
+         Werum's DSHIP API.
+
+         Parameters
+         ----------
+         metadata_list : list :
+             a list of the individual lines of metadata found in the file
+
+         Returns
+         -------
+         a dictionary of the lines of metadata divided into key-value pairs
+         """
+         out_dict = {}
+         for line in metadata_list:
+             try:
+                 (key, val) = line.split("=")
+             except ValueError:
+                 out_dict["text"] = line
+             else:
+                 out_dict[key.strip()] = val.strip()
+         return out_dict
+
+     def define_output_path(
+         self,
+         file_path: Path | str | None = None,
+         file_name: str | None = None,
+         file_type: str = ".csv",
+     ) -> Path:
+         """Creates a Path object holding the desired output path.
+
+         Parameters
+         ----------
+         file_path : Path :
+             directory the file sits in (Default value = self.file_dir)
+         file_name : str :
+             the original file name (Default value = self.file_name)
+         file_type : str :
+             the output file type (Default = '.csv')
+
+         Returns
+         -------
+         a Path object consisting of the full path of the new file
+
+         """
+         file_path = self.file_dir if file_path is None else file_path
+         file_name = self.file_name if file_name is None else file_name
+         if file_type[0] != ".":
+             file_type = "." + file_type
+         return Path(file_path).joinpath(file_name).with_suffix(file_type)
+
+     def to_csv(
+         self,
+         data: pd.DataFrame | np.ndarray,
+         with_header: bool = True,
+         output_file_path: Path | str | None = None,
+         output_file_name: str | None = None,
+     ):
+         """Writes a csv from the given data table. Takes the data, a boolean
+         for writing the header and the output file parameters.
+
+         Parameters
+         ----------
+         data : pd.DataFrame | np.ndarray :
+             the data table to write to disk
+         with_header : boolean :
+             indicating whether the header shall appear in the output
+             (Default value = True)
+         output_file_path : Path :
+             file directory (Default value = None)
+         output_file_name : str :
+             original file name (Default value = None)
+
+         Returns
+         -------
+
+         """
+         new_file_path = self.define_output_path(
+             output_file_path, output_file_name
+         )
+         if with_header:
+             with open(new_file_path, "w") as file:
+                 for line in self.header:
+                     file.write(line)
+         if isinstance(data, pd.DataFrame):
+             data.to_csv(new_file_path, index=False, mode="a")
+         else:
+             np.savetxt(new_file_path, data, delimiter=",")
+
+     def selecting_columns(
+         self,
+         list_of_columns: list | str,
+         df: pd.DataFrame,
+     ):
+         """Alters the dataframe to only hold the given columns.
+
+         Parameters
+         ----------
+         list_of_columns : list or str :
+             a collection of columns
+         df : pandas.DataFrame :
+             Dataframe (Default value = None)
+
+         Returns
+         -------
+
+         """
+         # ensure that the input is a list, so that isin() can do its job
+         if isinstance(list_of_columns, str):
+             list_of_columns = [list_of_columns]
+         if isinstance(df, pd.DataFrame):
+             self.df = df[list_of_columns].reset_index(drop=True)
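
The DataFile class added above routes each header line by its prefix: "* " lines into sbe9_data, "**" lines into the metadata list, "# " lines into sensor XML, processing info or the data table description, and everything after *END* into the data table. A minimal usage sketch (the file name is hypothetical):

    from seabirdfilehandler import DataFile

    # parse header, metadata, sensor XML and data rows in one pass
    ctd = DataFile("station_007.cnv")
    print(ctd.metadata)  # '**'-prefixed key-value metadata as a dict
    print(ctd.sensors)   # flattened sensor entries parsed from the XML block
    # define_output_path resolves to <file_dir>/<file_name>.csv by default
    print(ctd.define_output_path(file_type="csv"))
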
@@ -1,16 +1,15 @@
  from pathlib import Path
  import logging
  from collections import UserList
- from typing import Type
+ from typing import Callable, Type
  import pandas as pd
  import numpy as np
  from seabirdfilehandler import (
-     SeaBirdFile,
      CnvFile,
      BottleFile,
      BottleLogFile,
  )
- from seabirdfilehandler.datatablefiles import DataTableFile
+ from seabirdfilehandler import DataFile
  from seabirdfilehandler.utils import get_unique_sensor_data

  logger = logging.getLogger(__name__)
@@ -34,23 +33,19 @@ class FileCollection(UserList):
          self,
          path_to_files: str | Path,
          file_suffix: str,
-         pattern: str | None = None,
          only_metadata: bool = False,
+         sorting_key: Callable | None = None,
      ):
          super().__init__()
          self.path_to_files = Path(path_to_files)
          self.file_suffix = file_suffix.strip(".")
-         self.file_type: Type[SeaBirdFile]
+         self.file_type: Type[DataFile]
          self.extract_file_type()
          self.individual_file_paths = []
-         self.collect_files()
-         if pattern:
-             # TODO: implement pattern handling
-             self.pattern = pattern
-         else:
-             self.load_files(only_metadata)
+         self.collect_files(sorting_key=sorting_key)
+         self.load_files(only_metadata)
          if not only_metadata:
-             if self.file_type == DataTableFile:
+             if self.file_type == DataFile:
                  self.df_list = self.get_dataframes()
                  self.df = self.get_collection_dataframe(self.df_list)
              if self.file_type == CnvFile:
@@ -74,13 +69,19 @@ class FileCollection(UserList):
                  self.file_type = value
                  break
          else:
-             self.file_type = SeaBirdFile
+             self.file_type = DataFile

-     def collect_files(self):
+     def collect_files(
+         self,
+         sorting_key: Callable | None = lambda file: int(
+             file.stem.split("_")[3]
+         ),
+     ):
          """ """
-         for path in self.path_to_files.rglob(f"*{self.file_suffix}"):
-             self.individual_file_paths.append(path)
-         self.individual_file_paths.sort()
+         self.individual_file_paths = sorted(
+             self.path_to_files.rglob(f"*{self.file_suffix}"),
+             key=sorting_key,
+         )

      def load_files(self, only_metadata: bool = False):
          """ """
@@ -254,4 +255,4 @@ class FileCollection(UserList):

      def get_data_table_meta_info(self) -> list[list[dict]]:
          """ """
-         return [file.data_header_meta_info for file in self.data]
+         return [file.parameters.metadata for file in self.data]
@@ -113,6 +113,20 @@ class Parameters(UserDict):
          )
          return parameter_dict

+     def _form_data_table_info(self) -> list:
+         """Recreates the data table descriptions, like column names and spans
+         from the structured dictionaries these values were stored in."""
+         new_table_info = []
+         for key, value in self.data_table_stats.items():
+             new_table_info.append(f"{key} = {value}\n")
+         for index, (name, _) in enumerate(self.data_table_names_and_spans):
+             new_table_info.append(f"name {index} = {name}\n")
+         for index, (_, span) in enumerate(self.data_table_names_and_spans):
+             new_table_info.append(f"span {index} = {span}\n")
+         for key, value in self.data_table_misc.items():
+             new_table_info.append(f"{key} = {value}\n")
+         return new_table_info
+
      def differentiate_table_description(self):
          """
          The original method that structures data table metadata.
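
The new _form_data_table_info inverts the parsing done by differentiate_table_description, serializing the structured dictionaries back into header-style lines. Under the usual CNV header conventions the regenerated list would contain lines like the following (values hypothetical, and the exact span formatting depends on how the spans were stored):

    nquan = 3
    name 0 = prDM: Pressure, Digiquartz [db]
    span 0 = (0.000, 2011.000)
    bad_flag = -9.990e-29
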
@@ -144,7 +158,10 @@ class Parameters(UserDict):
              (name, span)
              for name, span in zip(column_names, column_value_spans)
          ]
-         self.data_table_misc = post
+         self.data_table_misc = {
+             line.split("=")[0].strip(): line.split("=")[1].strip()
+             for line in post
+         }

      def add_parameter(self, parameter: Parameter):
          """
@@ -201,7 +218,6 @@ class Parameters(UserDict):
          data = np.full(
              fill_value=data,
              shape=self.full_data_array.shape[0],
-             dtype=type(data),
          )
          parameter = Parameter(data=data, metadata=metadata)
          self.add_parameter(parameter)
@@ -263,7 +279,17 @@ class Parameters(UserDict):
          ).T
          columns = [parameter.name for parameter in self.get_parameter_list()]
          assert data.shape[1] == len(columns)
-         return pd.DataFrame(data=data, columns=columns, dtype=float)
+         df = pd.DataFrame(data=data, columns=columns)
+         for column in df.columns:
+             if column.lower() not in [
+                 "latitude",
+                 "longitude",
+                 "event",
+                 "cast",
+                 "flag",
+             ]:
+                 df[column] = df[column].astype("float64")
+         return df

      def with_name_type(self, name_type: str = "shortname"):
          """
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: seabirdfilehandler
- Version: 0.4.3
+ Version: 0.5.1
  Summary: Library of parsers to interact with SeaBird CTD files.
  Keywords: CTD,parser,seabird,data
  Author: Emil Michels
@@ -0,0 +1,14 @@
+ seabirdfilehandler/__init__.py,sha256=5JTzYE3oRdrxkC9_etAnFQ1cy10PHtpmesdR6n5PoPQ,192
+ seabirdfilehandler/bottlefile.py,sha256=nnfoDczPMG_ge40dT2rHNhifR7-NRgnZNFrfPM_9OSQ,5925
+ seabirdfilehandler/bottlelogfile.py,sha256=MtMmEebdAktO3mk6KbmJC7dfx9sRLbV5qqDQt2qtpJE,4310
+ seabirdfilehandler/cnvfile.py,sha256=LXpJcC3ukiD-2b5vy4aKESCbIvwV12TwQy1G6Y25_GE,9709
+ seabirdfilehandler/datafiles.py,sha256=lqENvdGSwRKT6PyNFN2etaWKMA-4OONG0x-up1W5ezo,8991
+ seabirdfilehandler/file_collection.py,sha256=b5iJaP4F34Vq7-FiJOlPvfS4IePGWsYx20XwWbZQw1A,6882
+ seabirdfilehandler/parameter.py,sha256=UuwFzege94sqPt0kOjEqtMGGol4hjuFjj2_EH7o0pzA,14374
+ seabirdfilehandler/utils.py,sha256=5KXdB8Hdv65dv5tPyXxNMct1mCEOyA3S8XP54AFAnx0,1745
+ seabirdfilehandler/validation_modules.py,sha256=eZ6x0giftUtlxnRMOnK_vCkgccdwUXPrDjajFa-E6n0,4698
+ seabirdfilehandler/xmlfiles.py,sha256=L_puQf8eg0ojv85AyEMID4jnwkOlV_fgZP3W5yeSUBY,4668
+ seabirdfilehandler-0.5.1.dist-info/LICENSE,sha256=Ifd1VPmYv32oJd2QVh3wIQP9X05vYJlcY6kONz360ws,34603
+ seabirdfilehandler-0.5.1.dist-info/METADATA,sha256=2VrJmgeRr-Par2zU5A--xDS5r_7VsKzi-HLi8SlPUX4,1289
+ seabirdfilehandler-0.5.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ seabirdfilehandler-0.5.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.1.1
+ Generator: poetry-core 2.1.3
  Root-Is-Purelib: true
  Tag: py3-none-any
@@ -1,184 +0,0 @@
- import pandas as pd
- import logging
- from pandas.api.extensions import register_series_accessor
- from pandas.api.extensions import register_dataframe_accessor
- import warnings
-
-
- logger = logging.getLogger(__name__)
-
-
- class MetadataHandler:
-     """
-     The base class for the pandas series and dataframe accessors.
-     Offers very basic metadata handling, by using a dictionary as metadata
-     store. The accessors then allow access to this metadata store and
-     corresponding methods by calling 'df.meta' or 'series.meta', respectively.
-     Mainly targeted for usage with dataframes featuring data from CNV files,
-     it for example allows the attachment of parameter metadata found in the
-     CNV header to individual dataframe columns.
-
-     This approach was chosen over others, like directly subclassing the pandas
-     dataframe or series class, or a separate metadata storage, due to its
-     simplicity and ability to keep using the full, powerful pandas library
-     without the need to implement each and every transformation. Of course,
-     the 'attrs' attribute does offer a similar metadata storage. But at the
-     time of writing this, it is still in a very experimental condition and does
-     not propagate reliably.
-     """
-
-     def __init__(self, pandas_obj):
-         self._obj = pandas_obj
-         if not hasattr(self._obj, "_metadata_store"):
-             with warnings.catch_warnings():
-                 warnings.simplefilter("ignore")
-                 self._obj._metadata_store = {}
-
-     @property
-     def metadata(self):
-         return self._obj._metadata_store
-
-     @metadata.setter
-     def metadata(self, value):
-         self._obj._metadata_store = value
-
-     def get(self, key, default=None):
-         return self._obj._metadata_store.get(key, default)
-
-     def set(self, key, value):
-         self._obj._metadata_store[key] = value
-
-     def clear(self):
-         self._obj._metadata_store.clear()
-
-
- @register_series_accessor("meta")
- class SeriesMetaAccessor(MetadataHandler):
-     """
-     Series implementation of the Metadata Accessor.
-     Does not offer anything more than the base class at the moment.
-     """
-
-     def __init__(self, pandas_obj):
-         super().__init__(pandas_obj)
-
-
- @register_dataframe_accessor("meta")
- class DataFrameMetaAccessor(MetadataHandler):
-     """
-     DataFrame implementation of the Metadata Accessor.
-     Introduces another attribute, '_header_level_detail', that stores the
-     currently displayed metadata as column names. Additionally offers methods
-     to sync metadata between the dataframe and its series, and the handling of
-     common operations, like renaming or the addition of new columns.
-     """
-
-     def __init__(self, pandas_obj):
-         super().__init__(pandas_obj)
-         if not hasattr(self._obj, "_header_level_detail"):
-             self._obj._header_level_detail = "shortname"
-         # Initialize DataFrame metadata
-         self.aggregate_series_metadata()
-
-     @property
-     def header_detail(self):
-         return self._obj._header_level_detail
-
-     @header_detail.setter
-     def header_detail(self, value):
-         self._obj._header_level_detail = value
-
-     @property
-     def metadata(self):
-         return self._obj._metadata_store
-
-     @metadata.setter
-     def metadata(self, value):
-         meta_dict = {
-             shortname: self.add_default_metadata(shortname, metainfo)
-             for shortname, metainfo in value.items()
-         }
-         self._obj._metadata_store = meta_dict
-         self.propagate_metadata_to_series()
-
-     def aggregate_series_metadata(self):
-         """Aggregate metadata from Series within the DataFrame."""
-         for column in self._obj.columns:
-             if isinstance(self._obj[column], pd.Series) and hasattr(
-                 self._obj[column], "meta"
-             ):
-                 self.metadata[column] = self._obj[column].meta.metadata
-
-     def propagate_metadata_to_series(self):
-         """Propagate DataFrame-level metadata back to Series."""
-         for column in self._obj.columns:
-             if isinstance(self._obj[column], pd.Series) and hasattr(
-                 self._obj[column], "meta"
-             ):
-                 for key, value in self.metadata.items():
-                     if key == column:
-                         try:
-                             self._obj[column].meta.metadata = value
-                         except TypeError:
-                             logger.error(f"{column}: {value}")
-
-     def update_metadata_on_rename(self, rename_dict):
-         """Update metadata when columns are renamed."""
-         new_metadata = {}
-         for old_name, new_name in rename_dict.items():
-             for key, value in self.metadata.items():
-                 if key == old_name:
-                     new_metadata[new_name] = value
-         self.metadata = new_metadata
-         self.propagate_metadata_to_series()
-
-     def rename(self, rename_key):
-         """Rename the column names by using a metadata point."""
-         rename_dict = {
-             column: (
-                 self._obj[column].meta.get(rename_key)
-                 if rename_key in list(self._obj[column].meta.metadata.keys())
-                 else column
-             )
-             for column in self._obj.columns
-         }
-         self._obj.rename(columns=rename_dict, inplace=True)
-         self.header_detail = rename_key
-         self.update_metadata_on_rename(rename_dict)
-
-     def add_column(
-         self,
-         name: str,
-         data: pd.Series | list,
-         location: int | None = None,
-         metadata: dict = {},
-     ):
-         """Add a column and use or generate metadata for it."""
-         location = len(self._obj.columns) if location is None else location
-         self._obj.insert(
-             loc=location,
-             column=name,
-             value=data,
-             allow_duplicates=False,
-         )
-         self.metadata[name] = self.add_default_metadata(name, metadata)
-         self.propagate_metadata_to_series()
-
-     def add_default_metadata(
-         self,
-         name: str,
-         metadata: dict = {},
-         list_of_keys: list = [
-             "shortname",
-             "longinfo",
-             "name",
-             "metainfo",
-             "unit",
-         ],
-     ) -> dict:
-         """Fill up missing metadata points with a default value."""
-         default = {}
-         for key in list_of_keys:
-             if key not in list(metadata.keys()):
-                 default[key] = name
-         return {**metadata, **default}
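
The removed module used pandas' extension-accessor mechanism to hang a metadata dictionary off dataframes and series, as its docstring explains. The pattern itself, reduced to a minimal self-contained sketch (the accessor name "meta2" and the example column are hypothetical, chosen to avoid clashing with an already registered "meta" accessor):

    import warnings

    import pandas as pd
    from pandas.api.extensions import register_dataframe_accessor

    @register_dataframe_accessor("meta2")
    class MetaAccessor:
        def __init__(self, pandas_obj: pd.DataFrame):
            self._obj = pandas_obj
            if not hasattr(self._obj, "_metadata_store"):
                # plain dict as the metadata store; suppress the pandas
                # warning about setting a non-column attribute, as the
                # removed MetadataHandler did
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    self._obj._metadata_store = {}

        def set(self, key, value):
            self._obj._metadata_store[key] = value

        def get(self, key, default=None):
            return self._obj._metadata_store.get(key, default)

    df = pd.DataFrame({"temp": [3.4, 3.5]})
    df.meta2.set("temp", {"unit": "degC"})
    print(df.meta2.get("temp"))  # {'unit': 'degC'}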