seabirdfilehandler 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of seabirdfilehandler might be problematic — click the link for more details.

@@ -1,5 +1,7 @@
1
- from .seabirdfiles import *
2
- from .datatablefiles import *
1
+ from .datafiles import *
2
+ from .bottlefile import *
3
+ from .bottlelogfile import *
4
+ from .cnvfile import *
3
5
  from .xmlfiles import *
4
6
  from .validation_modules import *
5
7
  from .file_collection import *
@@ -0,0 +1,185 @@
1
+ from typing import Union
2
+ from datetime import datetime, time
3
+ import pandas as pd
4
+ import numpy as np
5
+ import logging
6
+ from seabirdfilehandler import DataFile
7
+ from seabirdfilehandler.dataframe_meta_accessor import (
8
+ SeriesMetaAccessor, # noqa: F401
9
+ DataFrameMetaAccessor, # noqa: F401
10
+ )
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class BottleFile(DataFile):
    """Representation of a Sea-Bird bottle (.btl) file.

    Organizes the file's table information into a pandas DataFrame. This
    allows the usage of this powerful library for statistics, visualization,
    data manipulation, export, etc.

    Parameters
    ----------
    path_to_file :
        path to the .btl file on disk
    """

    def __init__(self, path_to_file):
        super().__init__(path_to_file)
        self.original_df = self.create_dataframe()
        # NOTE(review): self.df aliases original_df, so the in-place dtype
        # and column edits below mutate both — confirm whether original_df
        # should be a pristine .copy().
        self.df = self.original_df
        self.setting_dataframe_dtypes()
        self.adding_timestamp_column()

    def create_dataframe(self) -> pd.DataFrame:
        """Create a DataFrame out of the .btl file.

        Manages the file's double data header correctly: the first header
        line holds the parameter names, the second line holds the
        continuation names (e.g. the time belonging to the date column).

        Returns
        -------
        pandas.DataFrame :
            the parsed bottle data, including a 'Statistic' column
        """
        # TODO: this needs to be broken down into smaller pieces...
        top_names, bottom_names = self.reading_data_header()
        # creating statistics column to store the row type information:
        # 4 rows per bottle, average, standard deviation, max value, min value
        top_names.append("Statistic")
        # TODO: nicer way to construct the dataframe than opening the file a
        # second time
        df: pd.DataFrame = pd.read_fwf(
            self.path_to_file,
            index_col=False,
            skiprows=len(self.header) + 2,
            header=None,
            names=top_names,
        )

        # handling the double row header
        rowtypes = df[df.columns[-1]].unique()

        def separate_double_header_row(df, column, length):
            """Expand one double-header column into two full columns.

            Every `length`-th row of `column` belongs to the column itself;
            the row directly below it carries the value for the matching
            bottom header name. Both are repeated to full column length.

            Parameters
            ----------
            df : pandas.DataFrame :
                frame to operate on
            column : str :
                name of the doubled column
            length : int :
                number of statistic rows per bottle

            Returns
            -------
            pandas.DataFrame :
                the frame with the additional column inserted
            """
            column_idx = df.columns.get_loc(column)
            old_column = df.iloc[::length, column_idx].reset_index(drop=True)
            new_column = df.iloc[1::length, column_idx].reset_index(drop=True)
            old_column_expanded = pd.Series(
                np.repeat(old_column, length)
            ).reset_index(drop=True)
            new_column_expanded = pd.Series(
                np.repeat(new_column, length)
            ).reset_index(drop=True)
            df[column] = old_column_expanded
            df.insert(
                column_idx + 1, bottom_names[column_idx], new_column_expanded
            )
            return df

        df = separate_double_header_row(df, "Date", len(rowtypes))
        df = separate_double_header_row(df, top_names[0], len(rowtypes))
        # remove brackets around statistics values
        df["Statistic"] = df["Statistic"].str.strip("()")
        df = df.rename(mapper={"Btl_ID": "Bottle_ID"}, axis=1)
        return df

    def adding_timestamp_column(self):
        """Insert a 'Timestamp' column that holds both Date and Time
        information."""
        timestamp = [
            datetime.combine(datepoint, time.fromisoformat(str(timepoint)))
            for datepoint, timepoint in zip(self.df.Date, self.df.Time)
        ]
        self.df.insert(2, "Timestamp", timestamp)
        self.df.Timestamp = pd.to_datetime(self.df.Timestamp)

    def setting_dataframe_dtypes(self):
        """Set the types for the column values in the dataframe."""
        # TODO: extend this to the other columns!
        self.df.Date = pd.to_datetime(self.df.Date)
        self.df.Bottle_ID = self.df.Bottle_ID.astype(int)

    def selecting_rows(
        self, df=None, statistic_of_interest: Union[list, str, None] = None
    ):
        """Reduce self.df to the rows with the given identifiers in the
        'Statistic' column. A single string or a list of strings can be
        processed.

        Parameters
        ----------
        df : pandas.DataFrame :
            the file's pandas representation (Default value = self.df)
        statistic_of_interest : list or str :
            values of the 'Statistic' column to keep
            (Default value = ['avg'])
        """
        # avoid a mutable default argument; None stands in for ['avg']
        if statistic_of_interest is None:
            statistic_of_interest = ["avg"]
        df = self.df if df is None else df
        # ensure that the input is a list, so that isin() can do its job
        if isinstance(statistic_of_interest, str):
            statistic_of_interest = [statistic_of_interest]
        self.df = df.loc[df["Statistic"].isin(statistic_of_interest)]

    def reading_data_header(self):
        """Identify and separately collect the rows that specify the data
        table's headers.

        Returns
        -------
        tuple of (list, list) :
            the first (top) and second (bottom) header names
        """
        n = 11  # fixed column width of a seabird btl file
        top_line = self.data[0]
        second_line = self.data[1]
        top_names = [
            top_line[i : i + n].split()[0]
            for i in range(0, len(top_line) - n, n)
        ]
        bottom_names = [
            second_line[i : i + n].split()[0] for i in range(0, 2 * n, n)
        ]
        return top_names, bottom_names

    def add_station_and_event_column(self):
        """Insert an 'Event' column filled with the station metadata value."""
        # NOTE(review): the column length is derived from self.data, which
        # may differ from len(self.df) — confirm the intended row count.
        event_list = [self.metadata["Station"] for _ in self.data]
        self.df.insert(0, "Event", pd.Series(event_list))

    def add_position_columns(self):
        """Insert 'Latitude'/'Longitude' columns from the extra metadata."""
        latitude_list = [self.metadata["GPS_Lat"] for _ in self.data]
        self.df.insert(1, "Latitude", pd.Series(latitude_list))
        longitude_list = [self.metadata["GPS_Lon"] for _ in self.data]
        self.df.insert(2, "Longitude", pd.Series(longitude_list))
@@ -0,0 +1,155 @@
1
+ from datetime import datetime
2
+ import re
3
+ import logging
4
+ import pandas as pd
5
+ from seabirdfilehandler import DataFile
6
+ from seabirdfilehandler.dataframe_meta_accessor import (
7
+ SeriesMetaAccessor, # noqa: F401
8
+ DataFrameMetaAccessor, # noqa: F401
9
+ )
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class BottleLogFile(DataFile):
    """Bottle log (.bl) file representation.

    Extracts the three different data types from the file: the device reset
    time, the originating cnv file name and the table with bottle IDs and
    corresponding data ranges.

    Parameters
    ----------
    path_to_file :
        path to the .bl file on disk
    create_dataframe : bool :
        whether to parse the table into a pandas DataFrame instead of a
        plain list (Default value = False)
    """

    def __init__(self, path_to_file, create_dataframe=False):
        super().__init__(path_to_file)
        self.reset_time = self.obtaining_reset_time()
        # first raw line names the cnv file this log belongs to
        self.origin_cnv = self.raw_file_data[0].strip()
        self.data = self.data_whitespace_removal()

        if create_dataframe:
            self.original_df = self.create_dataframe()
            self.df = self.original_df
        else:
            self.data_list = self.create_list()

    def data_whitespace_removal(self) -> list:
        """Strip the table lines of whitespace characters, in this case
        especially newline characters.

        Returns
        -------
        list :
            the original data lines stripped of surrounding whitespace
        """
        return [line.strip() for line in self.raw_file_data[2:]]

    def obtaining_reset_time(self) -> datetime:
        """Read the device reset time with a small input check.

        Returns
        -------
        datetime.datetime :
            the device reset time

        Raises
        ------
        IOError :
            if the reset time line is not formatted as expected
        """
        regex_check = re.search(
            r"RESET\s(\w{3}\s\d+\s\d{4}\s\d\d:\d\d:\d\d)",
            self.raw_file_data[1],
        )
        if regex_check:
            return datetime.strptime(regex_check.group(1), "%b %d %Y %H:%M:%S")
        error_message = """BottleLogFile is not formatted as expected:
            Reset time could not be extracted."""
        logger.error(error_message)
        raise IOError(error_message)

    def create_list(self) -> list:
        """Create a list of usable data from the lines in self.data.

        Each entry consists of: a list of the two bottle IDs, the date and
        time of the data sample, and the cnv line range corresponding to the
        bottle.

        Returns
        -------
        list :
            the bl file's table information
        """
        content_array = []
        for line in self.data:
            # split each line once instead of once per extracted field
            fields = line.split(",")
            bottles = [int(x) for x in fields[:2]]
            date = self.convert_date(fields[2])
            lines = tuple(int(x) for x in fields[3:])
            content_array.append([bottles, date, lines])
        return content_array

    def convert_date(self, date: str) -> str:
        """Convert a .bl file date to a compact ISO-8601-like string.

        Parameters
        ----------
        date : str :
            a date like 'Jan 05 2021 12:34:56'

        Returns
        -------
        str :
            the date in the form 'yymmddThhmmss'
        """
        # strptime replaces the hand-rolled month lookup and, unlike the
        # previous implementation, also zero-pads single-digit days so the
        # output really is fixed-width as documented.
        parsed = datetime.strptime(date.strip(), "%b %d %Y %H:%M:%S")
        return parsed.strftime("%y%m%dT%H%M%S")

    def create_dataframe(self) -> pd.DataFrame:
        """Create a DataFrame from the lines in self.data.

        Returns
        -------
        pandas.DataFrame :
            the bl file's table information
        """
        # dropping the first column as it is the index
        data_lists = [line.split(",")[1:] for line in self.data]
        df = pd.DataFrame(data_lists)
        df.columns = ["Bottle ID", "Datetime", "start_range", "end_range"]
        return df
@@ -0,0 +1,283 @@
1
+ from pathlib import Path
2
+ from datetime import datetime, timedelta
3
+ import pandas as pd
4
+ import numpy as np
5
+ import logging
6
+ from seabirdfilehandler import DataFile
7
+ from seabirdfilehandler.parameter import Parameters
8
+ from seabirdfilehandler.validation_modules import CnvValidationList
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class CnvFile(DataFile):
    """
    A representation of a cnv-file as used by SeaBird.

    This class intends to fully extract and organize the different types of
    data and metadata present inside of such a file. Downstream libraries
    shall be able to use this representation for all applications concerning
    cnv files, like data processing, transformation or visualization.

    To achieve that, the metadata header is organized by the
    grandparent-class, SeaBirdFile, while the data table is extracted by
    this class. The data representation of choice is a pandas Dataframe.
    Inside this class, there are methods to parse cnv data into dataframes,
    do the reverse of writing a dataframe into cnv compliant form and to
    manipulate the dataframe in various ways.

    Parameters
    ----------
    path_to_file : Path | str :
        the path to the file
    create_dataframe : bool :
        whether to build a pandas DataFrame from the data table
    absolute_time_calculation : bool :
        whether to use a real timestamp instead of the second count
    event_log_column : bool :
        whether to add a station and device event column from DSHIP
    coordinate_columns : bool :
        whether to add longitude and latitude from the extra metadata header

    """

    def __init__(
        self,
        path_to_file: Path | str,
        create_dataframe: bool = False,
        absolute_time_calculation: bool = False,
        event_log_column: bool = False,
        coordinate_columns: bool = False,
    ):
        super().__init__(path_to_file)
        self.validation_modules = self.obtaining_validation_modules()
        self.start_time = self.reading_start_time()
        self.parameters = Parameters(self.data, self.data_table_description)
        if create_dataframe:
            self.df = self.parameters.get_pandas_dataframe()
            if absolute_time_calculation:
                self.absolute_time_calculation()
            if event_log_column:
                self.add_station_and_event_column()
            if coordinate_columns:
                self.add_position_columns()

    def reading_start_time(
        self,
        time_source: str = "System UTC",
    ) -> datetime | None:
        """
        Extract the cast start time from the metadata header.

        Parameters
        ----------
        time_source : str :
            the header key that carries the timestamp
            (Default value = 'System UTC')

        Returns
        -------
        datetime | None :
            the parsed start time, or None if no such line is present
        """
        for line in self.sbe9_data:
            if line.startswith(time_source):
                start_time = line.split("=")[1]
                # the surrounding spaces are part of the raw header value
                return datetime.strptime(start_time, " %b %d %Y %H:%M:%S ")
        return None

    def absolute_time_calculation(self) -> bool:
        """
        Replace the basic cnv time representation of counting relative to
        the cast's start point by real UTC timestamps, stored in a new
        'datetime' parameter. This operation acts directly on the dataframe.

        Returns
        -------
        bool :
            True if both a time column and a start time were found
        """
        time_parameter = None
        for parameter in self.df.columns:
            if parameter.lower().startswith("time"):
                time_parameter = parameter
        if time_parameter and self.start_time:
            # 'timeJ' counts Julian days; other time columns count seconds
            self.parameters.create_parameter(
                name="datetime",
                data=np.array(
                    [
                        timedelta(days=float(time)) + self.start_time
                        if time_parameter == "timeJ"
                        else timedelta(seconds=float(time)) + self.start_time
                        for time in self.df[time_parameter]
                    ]
                ),
            )
            return True
        return False

    def add_start_time(self) -> bool:
        """
        Add the cast start time to the dataframe.
        Necessary for joins on the time.

        Returns
        -------
        bool :
            True if a start time was available, else False
        """
        if self.start_time:
            self.parameters.create_parameter(
                name="start_time",
                data=str(self.start_time),
            )
            return True
        return False

    def obtaining_validation_modules(self) -> CnvValidationList:
        """
        Collect the individual validation modules and their respective
        information, usually present in key-value pairs.
        """
        validation_modules = self.processing_info
        return CnvValidationList(validation_modules)

    def df2cnv(self, df: pd.DataFrame | None = None) -> list:
        """
        Parse a pandas dataframe into a list that represents the lines
        inside of a cnv data table.

        Parameters
        ----------
        df : DataFrame to export, default is self.df

        Returns
        -------
        a list of lines in the cnv data table format
        """
        df = df if isinstance(df, pd.DataFrame) else self.df
        cnv_out = []
        for _, row in df.iterrows():
            # cnv data columns are right-aligned, 11 characters wide;
            # (dropped a pointless immediately-invoked lambda here)
            cnv_like_row = "".join(f"{str(value):>11}" for value in row)
            cnv_out.append(cnv_like_row + "\n")
        return cnv_out

    def array2cnv(self) -> list:
        """
        Parse the raw parameter array into cnv data table lines.

        Returns
        -------
        a list of lines in the cnv data table format
        """
        result = []
        for row in self.parameters.full_data_array:
            formatted_row = "".join(f"{elem:11}" for elem in row)
            result.append(formatted_row + "\n")
        return result

    def to_cnv(
        self,
        file_name: Path | str | None = None,
        use_dataframe: bool = True,
    ):
        """
        Write the values inside of this instance as a new cnv file to disc.

        Parameters
        ----------
        file_name : Path | str | None :
            the file name to write to (Default value = self.path_to_file)
        use_dataframe : bool :
            whether to export the data table from the dataframe (True) or
            from the raw parameter array (False)

        """
        file_name = self.path_to_file if file_name is None else file_name
        # content construction
        data = self.df2cnv() if use_dataframe else self.array2cnv()
        self._update_header()
        self.file_data = [*self.header, *data]
        # writing content out
        try:
            with open(file_name, "w", encoding="latin-1") as file:
                file.writelines(self.file_data)
        except IOError as error:
            logger.error(f"Could not write cnv file: {error}")

    def _update_header(self):
        """Re-create the cnv header from the instance's current state."""
        self.data_table_description = self.parameters._form_data_table_info()
        self.header = [
            *[f"* {data}" for data in self.sbe9_data[:-1]],
            *[f"** {data}" for data in self.metadata_list],
            f"* {self.sbe9_data[-1]}",
            *[f"# {data}" for data in self.data_table_description],
            *[f"# {data}" for data in self.sensor_data],
            *[f"# {data}" for data in self.processing_info],
            "*END*\n",
        ]

    def add_processing_metadata(self, addition: str | list):
        """
        Add new processing lines to the list of processing module
        information.

        Parameters
        ----------
        addition : str | list :
            the new information line(s)

        """
        # TODO: use CnvprocessingList here
        if isinstance(addition, str):
            addition = [addition]
        for line in addition:
            self.file_data.append(line)
            # add the new info line *before* the 'file_type = ascii' line
            self.processing_info.insert(-1, line)

    def add_station_and_event_column(self) -> bool:
        """
        Add a column with the DSHIP station and device event numbers to the
        dataframe. These must be present inside the extra metadata header.

        Returns
        -------
        bool :
            True if the 'Station' metadata entry was present, else False
        """
        if "Station" in self.metadata:
            self.parameters.create_parameter(
                data=self.metadata["Station"],
                name="Event",
            )
            return True
        return False

    def add_position_columns(self) -> bool:
        """
        Add a column with the longitude and latitude to the dataframe.
        These must be present inside the extra metadata header.

        Returns
        -------
        bool :
            True if the columns already exist or were added, else False
        """
        present_parameters = [k.lower() for k in self.parameters.keys()]
        # fix: `("latitude" or "longitude") in xs` short-circuits to
        # "latitude" and never tested for longitude
        if "latitude" in present_parameters or (
            "longitude" in present_parameters
        ):
            return True
        # fix: `("GPS_Lat" and "GPS_Lon") in m` only tested for "GPS_Lon"
        if "GPS_Lat" in self.metadata and "GPS_Lon" in self.metadata:
            self.parameters.create_parameter(
                data=self.metadata["GPS_Lat"],
                name="Latitude",
            )
            self.parameters.create_parameter(
                data=self.metadata["GPS_Lon"],
                name="Longitude",
            )
            return True
        return False

    def add_cast_number(self, number: int | None = None) -> bool:
        """
        Add a column with the cast number to the dataframe.

        Parameters
        ----------
        number : int | None :
            the cast number of this file's cast; falls back to the 'Cast'
            metadata entry when not given

        Returns
        -------
        bool :
            True if a cast number was available and added, else False
        """
        # fix: explicit None checks so a legitimate cast number of 0 is
        # neither overridden by the metadata nor silently dropped
        if number is None and "Cast" in self.metadata:
            number = int(self.metadata["Cast"])
        if number is not None:
            self.parameters.create_parameter(
                data=number,
                name="Cast",
            )
            return True
        return False