PyPI - gensor - Versions diffs - 0.1.6__tar.gz → 0.2.1__tar.gz - Mend

gensor-0.1.6.tar.gz → gensor-0.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

{gensor-0.1.6 → gensor-0.2.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gensor
-Version: 0.1.6
+Version: 0.2.1
 Summary: Library for handling groundwater sensor data.
 Home-page: https://github.com/zawadzkim/gensor
 Author: Mateusz Zawadzki

gensor-0.2.1/gensor/__init__.py ADDED Viewed

@@ -0,0 +1,29 @@
+import logging
+from .core.dataset import Dataset
+from .core.timeseries import Timeseries
+from .io.read import read_from_csv, read_from_sql
+from .log import set_log_level
+from .processing.compensation import compensate
+__all__ = [
+    # basic data types
+    "Dataset",
+    "Timeseries",
+    "compensate",
+    # getters
+    "read_from_csv",
+    "read_from_sql",
+    "set_log_level",
+]
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+if not logger.hasHandlers():
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+    formatter = logging.Formatter("%(levelname)s: %(message)s")
+    console_handler.setFormatter(formatter)
+    logger.addHandler(console_handler)

{gensor-0.1.6 → gensor-0.2.1}/gensor/core/base.py RENAMED Viewed

@@ -6,6 +6,8 @@ import pandas as pd
 import pandera as pa
 import pydantic as pyd
 from matplotlib import pyplot as plt
+from matplotlib.axes import Axes
+from matplotlib.figure import Figure
 from sqlalchemy import Table
 from sqlalchemy.dialects.sqlite import insert as sqlite_insert
@@ -47,14 +49,14 @@ class BaseTimeseries(pyd.BaseModel):
         arbitrary_types_allowed=True, validate_assignment=True
     )
-    ts: pd.Series = pyd.Field(repr=False)
+    ts: pd.Series = pyd.Field(repr=False, exclude=True)
     variable: Literal[
         "temperature", "pressure", "conductivity", "flux", "head", "depth"
     ]
     unit: Literal["degc", "cmh2o", "ms/cm", "m/s", "m asl", "m"]
     location: str | None = None
-    outliers: pd.Series | None = pyd.Field(default=None, repr=False)
-    transformation: Any = pyd.Field(default=None, repr=False)
+    outliers: pd.Series | None = pyd.Field(default=None, repr=False, exclude=True)
+    transformation: Any = pyd.Field(default=None, repr=False, exclude=True)
     @pyd.computed_field()  # type: ignore[prop-decorator]
     @property
@@ -66,6 +68,11 @@ class BaseTimeseries(pyd.BaseModel):
     def end(self) -> pd.Timestamp | Any:
         return self.ts.index.max()
+    @pyd.field_serializer("start", "end")
+    def serialize_timestamps(self, value: pd.Timestamp | None) -> str | None:
+        """Serialize `pd.Timestamp` to ISO format."""
+        return value.strftime("%Y%m%d%H%M%S") if value is not None else None
     def __eq__(self, other: object) -> bool:
         """Check equality based on location, sensor, variable, unit and sensor_alt."""
         if not isinstance(other, BaseTimeseries):
@@ -85,6 +92,9 @@ class BaseTimeseries(pyd.BaseModel):
         if attr == "loc":
             return TimeseriesIndexer(self, self.ts.loc)
+        if attr == "iloc":
+            return TimeseriesIndexer(self, self.ts.iloc)
         error_message = f"'{self.__class__.__name__}' object has no attribute '{attr}'"
         if hasattr(self.ts, attr):
@@ -97,6 +107,7 @@ class BaseTimeseries(pyd.BaseModel):
                     # If the result is a Series, return a new Timeseries; otherwise, return the result
                     if isinstance(result, pd.Series):
                         return self.model_copy(update={"ts": result}, deep=True)
                     return result
                 return wrapper
@@ -256,19 +267,29 @@ class BaseTimeseries(pyd.BaseModel):
         `to_sql` method. Additionally, metadata about the timeseries is stored in the
         'timeseries_metadata' table.
-        Args:
+        Parameters:
             db (DatabaseConnection): The database connection object.
         Returns:
             str: A message indicating the number of rows inserted into the database.
         """
-        # Format the start timestamp as 'YYYYMMDDHHMMSS'
+        def separate_metadata() -> tuple:
+            _core_metadata_fields = {"location", "variable", "unit", "start", "end"}
+            core_metadata = self.model_dump(include=_core_metadata_fields)
+            core_metadata.update({
+                "cls": f"{self.__module__}.{self.__class__.__name__}"
+            })
+            extra_metadata = self.model_dump(exclude=_core_metadata_fields)
+            return core_metadata, extra_metadata
         timestamp_start_fmt = self.start.strftime("%Y%m%d%H%M%S")
+        timestamp_end_fmt = self.end.strftime("%Y%m%d%H%M%S")
-        # Construct the schema name using the location, sensor, variable, unit, and timestamp
-        schema_name = (
-            f"{self.location}_{self.variable}_{self.unit}_{timestamp_start_fmt}".lower()
-        )
+        schema_name = f"{self.location}_{self.variable}_{self.unit}".lower()
         # Ensure the index is a pandas DatetimeIndex
         if isinstance(self.ts.index, pd.DatetimeIndex):
@@ -281,66 +302,71 @@ class BaseTimeseries(pyd.BaseModel):
             message = "The index is not a DatetimeIndex and cannot be converted to UTC."
             raise TypeError(message)
-        # Prepare the timeseries data as records for insertion
         series_as_records = list(
             zip(utc_index.strftime("%Y-%m-%dT%H:%M:%S%z"), self.ts, strict=False)
         )
+        core_metadata, extra_metadata = separate_metadata()
+        metadata_entry = {
+            **core_metadata,
+            "extra": extra_metadata,
+            "table_name": schema_name,
+        }
         with db as con:
-            # Create the timeseries table if it doesn't exist
             schema = db.create_table(schema_name, self.variable)
-            # Ensure that the timeseries_metadata table exists
             metadata_schema = db.metadata.tables["__timeseries_metadata__"]
             if isinstance(schema, Table):
-                # Insert the timeseries data
                 stmt = sqlite_insert(schema).values(series_as_records)
                 stmt = stmt.on_conflict_do_nothing(index_elements=["timestamp"])
                 con.execute(stmt)
-                con.commit()
-                metadata_stmt = sqlite_insert(metadata_schema).values(
-                    table_name=schema_name,
-                    location=self.location,
-                    variable=self.variable,
-                    unit=self.unit,
-                    timestamp_start=timestamp_start_fmt,
-                    timestamp_end=self.end.strftime("%Y%m%d%H%M%S"),
-                )
+                metadata_stmt = sqlite_insert(metadata_schema).values(metadata_entry)
                 metadata_stmt = metadata_stmt.on_conflict_do_update(
                     index_elements=["table_name"],
                     set_={
-                        "timestamp_start": timestamp_start_fmt,
-                        "timestamp_end": self.end.strftime("%Y%m%d%H%M%S"),
+                        "start": timestamp_start_fmt,
+                        "end": timestamp_end_fmt,
                     },
                 )
                 con.execute(metadata_stmt)
-                con.commit()
+            # Commit all changes at once
+            con.commit()
         return f"{schema_name} table and metadata updated."
     def plot(
-        self: T, include_outliers: bool = False, ax: Any = None, **plot_kwargs: Any
-    ) -> tuple:
+        self: T,
+        include_outliers: bool = False,
+        ax: Axes | None = None,
+        plot_kwargs: dict[str, Any] | None = None,
+        legend_kwargs: dict[str, Any] | None = None,
+    ) -> tuple[Figure, Axes]:
         """Plots the timeseries data.
-        Args:
+        Parameters:
             include_outliers (bool): Whether to include outliers in the plot.
             ax (matplotlib.axes.Axes, optional): Matplotlib axes object to plot on.
                 If None, a new figure and axes are created.
-            **plot_kwargs: Additional keyword arguments passed to plt.plot.
+            plot_kwargs (dict[str, Any] | None): kwargs passed to matplotlib.axes.Axes.plot() method to customize the plot.
+            legend_kwargs (dict[str, Any] | None): kwargs passed to matplotlib.axes.Axes.legend() to customize the legend.
         Returns:
             (fig, ax): Matplotlib figure and axes to allow further customization.
         """
-        # Create new figure and axes if not provided
+        plot_kwargs = plot_kwargs or {}
+        legend_kwargs = legend_kwargs or {}
         if ax is None:
             fig, ax = plt.subplots(figsize=(10, 5))
         else:
-            fig = ax.get_figure()
+            # mypy complained that the get_figure() can return None, but there is no
+            # situation here in which this could be the case.
+            fig = ax.get_figure()  # type: ignore [assignment]
         ax.plot(
             self.ts.index,
@@ -353,11 +379,13 @@ class BaseTimeseries(pyd.BaseModel):
             ax.scatter(
                 self.outliers.index, self.outliers, color="red", label="Outliers"
             )
-        plt.xticks(rotation=45)
+        for label in ax.get_xticklabels():
+            label.set_rotation(45)
         ax.set_xlabel("Time")
         ax.set_ylabel(f"{self.variable} ({self.unit})")
         ax.set_title(f"{self.variable.capitalize()} at {self.location}")
-        ax.legend()
+        ax.legend(**legend_kwargs)
         return fig, ax

gensor-0.2.1/gensor/core/dataset.py ADDED Viewed

@@ -0,0 +1,203 @@
+from __future__ import annotations
+from collections import defaultdict
+from typing import Any, Generic
+import pydantic as pyd
+from matplotlib import pyplot as plt
+from matplotlib.axes import Axes
+from matplotlib.figure import Figure
+from gensor.core.base import BaseTimeseries, T
+from gensor.db import DatabaseConnection
+from gensor.exceptions import IndexOutOfRangeError
+class Dataset(pyd.BaseModel, Generic[T]):
+    """Store and operate on a collection of Timeseries.
+    Attributes:
+        timeseries (list[Timeseries]): A list of Timeseries objects.
+    """
+    timeseries: list[T | None] = pyd.Field(default_factory=list)
+    def __iter__(self) -> Any:
+        """Allows to iterate directly over the dataset."""
+        return iter(self.timeseries)
+    def __len__(self) -> int:
+        """Gives the number of timeseries in the Dataset."""
+        return len(self.timeseries)
+    def __repr__(self) -> str:
+        return f"Dataset({len(self)})"
+    def __getitem__(self, index: int) -> T | None:
+        """Retrieve a Timeseries object by its index in the dataset.
+        Parameters:
+            index (int): The index of the Timeseries to retrieve.
+        Returns:
+            Timeseries: The Timeseries object at the specified index.
+        Raises:
+            IndexError: If the index is out of range.
+        """
+        try:
+            return self.timeseries[index]
+        except IndexError:
+            raise IndexOutOfRangeError(index, len(self)) from None
+    def get_locations(self) -> list:
+        """List all unique locations in the dataset."""
+        return [ts.location for ts in self.timeseries if ts is not None]
+    def add(self, other: T | list[T] | Dataset) -> Dataset:
+        """Appends new Timeseries to the Dataset.
+        If an equal Timeseries already exists, merge the new data into the existing
+        Timeseries, dropping duplicate timestamps.
+        Parameters:
+            other (Timeseries): The Timeseries object to add.
+        """
+        # I need to check for BaseTimeseries instance in the add() method, but also
+        # type hint VarType T.
+        if isinstance(other, list | Dataset):
+            for ts in other:
+                if isinstance(ts, BaseTimeseries):
+                    self._add_single_timeseries(ts)  # type: ignore[arg-type]
+        elif isinstance(other, BaseTimeseries):
+            self._add_single_timeseries(other)
+        return self
+    def _add_single_timeseries(self, ts: T) -> None:
+        """Adds a single Timeseries to the Dataset or merges if an equal one exists."""
+        for i, existing_ts in enumerate(self.timeseries):
+            if existing_ts == ts:
+                self.timeseries[i] = existing_ts.concatenate(ts)
+                return
+        self.timeseries.append(ts)
+        return
+    def filter(
+        self,
+        location: str | list | None = None,
+        variable: str | list | None = None,
+        unit: str | list | None = None,
+        **kwargs: dict[str, str | list],
+    ) -> T | Dataset:
+        """Return a Timeseries or a new Dataset filtered by station, sensor,
+        and/or variable.
+        Parameters:
+            location (Optional[str]): The location name.
+            variable (Optional[str]): The variable being measured.
+            unit (Optional[str]): Unit of the measurement.
+            **kwargs (dict): Attributes of subclassed timeseries used for filtering
+                (e.g., sensor, method).
+        Returns:
+            Timeseries | Dataset: A single Timeseries if exactly one match is found,
+                                   or a new Dataset if multiple matches are found.
+        """
+        def matches(ts: T, attr: str, value: dict[str, str | list]) -> bool | None:
+            """Check if the Timeseries object has the attribute and if it matches the value."""
+            if not hasattr(ts, attr):
+                message = f"'{ts.__class__.__name__}' object has no attribute '{attr}'"
+                raise AttributeError(message)
+            return getattr(ts, attr) in value
+        if isinstance(location, str):
+            location = [location]
+        if isinstance(variable, str):
+            variable = [variable]
+        if isinstance(unit, str):
+            unit = [unit]
+        for key, value in kwargs.items():
+            if isinstance(value, str):
+                kwargs[key] = [value]
+        matching_timeseries = [
+            ts
+            for ts in self.timeseries
+            if ts is not None
+            and (location is None or ts.location in location)
+            and (variable is None or ts.variable in variable)
+            and (unit is None or ts.unit in unit)
+            and all(matches(ts, attr, value) for attr, value in kwargs.items())
+        ]
+        if not matching_timeseries:
+            return Dataset()
+        if len(matching_timeseries) == 1:
+            return matching_timeseries[0]
+        return self.model_copy(update={"timeseries": matching_timeseries})
+    def to_sql(self, db: DatabaseConnection) -> None:
+        """Save the entire timeseries to a SQLite database.
+        Parameters:
+            db (DatabaseConnection): SQLite database connection object.
+        """
+        for ts in self.timeseries:
+            if ts:
+                ts.to_sql(db)
+        return
+    def plot(
+        self,
+        include_outliers: bool = False,
+        plot_kwargs: dict[str, Any] | None = None,
+        legend_kwargs: dict[str, Any] | None = None,
+    ) -> tuple[Figure, Axes]:
+        """Plots the timeseries data, grouping by variable type.
+        Parameters:
+            include_outliers (bool): Whether to include outliers in the plot.
+            plot_kwargs (dict[str, Any] | None): kwargs passed to matplotlib.axes.Axes.plot() method to customize the plot.
+            legend_kwargs (dict[str, Any] | None): kwargs passed to matplotlib.axes.Axes.legend() to customize the legend.
+        Returns:
+            (fig, ax): Matplotlib figure and axes to allow further customization.
+        """
+        grouped_ts = defaultdict(list)
+        for ts in self.timeseries:
+            if ts:
+                grouped_ts[ts.variable].append(ts)
+        num_variables = len(grouped_ts)
+        fig, axes = plt.subplots(
+            num_variables, 1, figsize=(10, 5 * num_variables), sharex=True
+        )
+        if num_variables == 1:
+            axes = [axes]
+        for ax, (variable, ts_list) in zip(axes, grouped_ts.items(), strict=False):
+            for ts in ts_list:
+                ts.plot(
+                    include_outliers=include_outliers,
+                    ax=ax,
+                    plot_kwargs=plot_kwargs,
+                    legend_kwargs=legend_kwargs,
+                )
+            ax.set_title(f"Timeseries for {variable.capitalize()}")
+            ax.set_xlabel("Time")
+        fig.tight_layout()
+        return fig, axes

{gensor-0.1.6 → gensor-0.2.1}/gensor/core/indexer.py RENAMED Viewed

@@ -2,6 +2,7 @@ from __future__ import annotations
 from typing import Any
+import numpy as np
 import pandas as pd
@@ -23,5 +24,9 @@ class TimeseriesIndexer:
         if isinstance(result, pd.Series):
             return self.parent.model_copy(update={"ts": result}, deep=True)
+        if isinstance(result, (int | float | str | pd.Timestamp | np.float64)):
+            return result
         message = f"Expected pd.Series, but got {type(result)} instead."
         raise TypeError(message)

gensor-0.2.1/gensor/core/timeseries.py ADDED Viewed

@@ -0,0 +1,78 @@
+from __future__ import annotations
+from typing import Any
+import pandas as pd
+import pandera as pa
+import pydantic as pyd
+from matplotlib.axes import Axes
+from matplotlib.figure import Figure
+from gensor.core.base import BaseTimeseries
+ts_schema = pa.SeriesSchema(
+    float,
+    index=pa.Index(pd.DatetimeTZDtype(tz="UTC"), coerce=False),
+    coerce=True,
+)
+class Timeseries(BaseTimeseries):
+    """Timeseries of groundwater sensor data.
+    Attributes:
+        ts (pd.Series): The timeseries data.
+        variable (Literal['temperature', 'pressure', 'conductivity', 'flux']):
+            The type of the measurement.
+        unit (Literal['degC', 'mmH2O', 'mS/cm', 'm/s']): The unit of
+            the measurement.
+        sensor (str): The serial number of the sensor.
+        sensor_alt (float): Altitude of the sensor (ncessary to compute groundwater levels).
+    """
+    model_config = pyd.ConfigDict(
+        arbitrary_types_allowed=True, validate_assignment=True
+    )
+    sensor: str | None = None
+    sensor_alt: float | None = None
+    def __eq__(self, other: object) -> bool:
+        """Check equality based on location, sensor, variable, unit and sensor_alt."""
+        if not isinstance(other, Timeseries):
+            return NotImplemented
+        if not super().__eq__(other):
+            return False
+        return self.sensor == other.sensor and self.sensor_alt == other.sensor_alt
+    def plot(
+        self,
+        include_outliers: bool = False,
+        ax: Axes | None = None,
+        plot_kwargs: dict[str, Any] | None = None,
+        legend_kwargs: dict[str, Any] | None = None,
+    ) -> tuple[Figure, Axes]:
+        """Plots the timeseries data.
+        Parameters:
+            include_outliers (bool): Whether to include outliers in the plot.
+            ax (matplotlib.axes.Axes, optional): Matplotlib axes object to plot on.
+                If None, a new figure and axes are created.
+            plot_kwargs (dict[str, Any] | None): kwargs passed to matplotlib.axes.Axes.plot() method to customize the plot.
+            legend_kwargs (dict[str, Any] | None): kwargs passed to matplotlib.axes.Axes.legend() to customize the legend.
+        Returns:
+            (fig, ax): Matplotlib figure and axes to allow further customization.
+        """
+        fig, ax = super().plot(
+            include_outliers=include_outliers,
+            ax=ax,
+            plot_kwargs=plot_kwargs,
+            legend_kwargs=legend_kwargs,
+        )
+        ax.set_title(f"{self.variable.capitalize()} at {self.location} ({self.sensor})")
+        return fig, ax

{gensor-0.1.6 → gensor-0.2.1}/gensor/db/connection.py RENAMED Viewed

@@ -9,6 +9,7 @@ from typing import Any
 import pydantic as pyd
 from sqlalchemy import (
+    JSON,
     Column,
     Connection,
     Engine,
@@ -100,13 +101,12 @@ class DatabaseConnection(pyd.BaseModel):
             Column("id", Integer, primary_key=True),
             Column("table_name", String, unique=True),
             Column("location", String),
-            Column("sensor", String),
             Column("variable", String),
             Column("unit", String),
-            Column("logger_alt", Float, nullable=True),
-            Column("location_alt", Float, nullable=True),
-            Column("timestamp_start", String, nullable=True),
-            Column("timestamp_end", String, nullable=True),
+            Column("start", String, nullable=True),
+            Column("end", String, nullable=True),
+            Column("extra", JSON, nullable=True),
+            Column("cls", String, nullable=False),
         )
         if self.engine:

{gensor-0.1.6 → gensor-0.2.1}/gensor/io/read.py RENAMED Viewed

@@ -3,6 +3,8 @@
 TODO: Fix up the read_from_sql() function to actually work properly.
 """
+import logging
+from importlib import import_module
 from pathlib import Path
 from typing import Any, Literal
@@ -12,13 +14,14 @@ from sqlalchemy import select
 from ..core.dataset import Dataset
 from ..core.timeseries import Timeseries
 from ..db.connection import DatabaseConnection
-from ..exceptions import NoFilesToLoad
 from ..parse import parse_plain, parse_vanessen_csv
+logger = logging.getLogger(__name__)
 def read_from_csv(
     path: Path, file_format: Literal["vanessen", "plain"] = "vanessen", **kwargs: Any
-) -> Dataset:
+) -> Dataset | Timeseries:
     """Loads the data from csv files with given file_format and returns a list of Timeseries objects.
     Parameters:
@@ -44,7 +47,8 @@ def read_from_csv(
     if path.is_dir() and not any(
         file.is_file() and file.suffix.lower() == ".csv" for file in path.iterdir()
     ):
-        raise NoFilesToLoad()
+        logger.info("No CSV files found. Operation skipped.")
+        return Dataset()
     files = (
         [
@@ -58,24 +62,33 @@ def read_from_csv(
         else []
     )
+    if not files:
+        logger.info("No CSV files found. Operation skipped.")
+        return Dataset()
     parser = parsers[file_format]
-    ds = Dataset()
+    ds: Dataset = Dataset()
     for f in files:
-        print(f"Loading file: {f}")
+        logger.info(f"Loading file: {f}")
         ts_in_file = parser(f, **kwargs)
         ds.add(ts_in_file)
-    return ds
+    # If there is only one Timeseries in Dataset (as in the condition), ds[0] will always
+    # be a Timeseries; so the line below does not introduce potential None in the return
+    return ds[0] if len(ds) == 1 else ds  # type: ignore[return-value]
 def read_from_sql(
     db: DatabaseConnection,
     load_all: bool,
     location: str | None = None,
-    sensor: str | None = None,
     variable: str | None = None,
     unit: str | None = None,
     timestamp_start: pd.Timestamp | None = None,
+    timestamp_stop: pd.Timestamp | None = None,
+    **kwargs: dict,
 ) -> Timeseries | Dataset:
     """Returns the timeseries or a dataset from a SQL database.
@@ -83,7 +96,6 @@ def read_from_sql(
         db (DatabaseConnection): The database connection object.
         load_all (bool): Whether to load all timeseries from the database.
         location (str): The station name.
-        sensor (str): The sensor name.
         variable (str): The measurement type.
         unit (str): The unit of the measurement.
@@ -95,7 +107,7 @@ def read_from_sql(
         TypeError: If the retrieved data is not a DataFrame or is of incorrect type.
     """
-    def _read_from_sql(schema_name: str) -> Timeseries:
+    def _read_from_sql(schema_name: str) -> Any:
         with db as con:
             schema = db.metadata.tables[schema_name]
             metadata_table = db.metadata.tables["__timeseries_metadata__"]
@@ -122,21 +134,23 @@ def read_from_sql(
             message = f"No metadata found for table {schema_name}"
             raise ValueError(message)
-        location = metadata_result[2]
-        sensor = metadata_result[3]
-        variable = metadata_result[4]
-        unit = metadata_result[5]
-        sensor_alt = metadata_result[6]
-        # location_alt = metadata_result[7]
-        ts_object = Timeseries(
-            ts=ts,
-            variable=variable,
-            location=location,
-            sensor=sensor,
-            unit=unit,
-            sensor_alt=sensor_alt,
-        )
+        # Core metadata extraction
+        core_metadata = {
+            "location": metadata_result[2],
+            "variable": metadata_result[3],
+            "unit": metadata_result[4],
+        }
+        extra_metadata = metadata_result[7] or {}
+        cls = metadata_result[8]
+        metadata = {**core_metadata, **extra_metadata}
+        module_name, class_name = cls.rsplit(".", 1)
+        module = import_module(module_name)
+        TimeseriesClass = getattr(module, class_name)
+        ts_object = TimeseriesClass(ts=ts, **metadata)
         return ts_object
@@ -151,12 +165,12 @@ def read_from_sql(
         else:
             return Dataset()
     else:
-        if isinstance(timestamp_start, pd.Timestamp):
-            timestamp_start_fmt = timestamp_start.strftime("%Y%m%d%H%M%S")
         schema_name = (
-            f"{location}_{sensor}_{variable}_{unit}_{timestamp_start_fmt}".lower()
+            f"{location}_{variable}_{unit}".lower()
         )
-        return _read_from_sql(schema_name)
+        # This will always returm Timeseries or Dataset.
+        return _read_from_sql(schema_name)  # type: ignore[no-any-return]
 # fmt: on

gensor-0.2.1/gensor/log.py ADDED Viewed

@@ -0,0 +1,7 @@
+import logging
+def set_log_level(level: str) -> None:
+    """Set the logging level for the package."""
+    logger = logging.getLogger("gensor")
+    logger.setLevel(level.upper())

{gensor-0.1.6 → gensor-0.2.1}/gensor/parse/utils.py RENAMED Viewed

@@ -10,6 +10,18 @@ from pandas import DataFrame, read_csv, to_datetime
 def get_data(
     text: str, data_start: str, data_end: str, column_names: list
 ) -> DataFrame:
+    """Search for data in the file.
+    Parameters:
+        text (str): string obtained from the CSV file.
+        data_start (str): string at the first row of the data.
+        data_end (str): string at the last row of the data.
+        column_names (list): list of expected column names.
+    Returns:
+        pd.DataFrame
+    """
     data_io = StringIO(text[text.index(data_start) : text.index(data_end)])
     df = read_csv(
@@ -20,7 +32,15 @@ def get_data(
 def get_metadata(text: str, patterns: dict) -> dict:
-    """Search for metadata in the file header with given regex patterns."""
+    """Search for metadata in the file header with given regex patterns.
+    Parameters:
+        text (str): string obtained from the CSV file.
+        patterns (dict): regex patterns matching the location and sensor information.
+    Returns:
+        dict: metadata of the timeseries.
+    """
     metadata = {}
     for k, v in patterns.items():
@@ -36,7 +56,7 @@ def get_metadata(text: str, patterns: dict) -> dict:
 def detect_encoding(path: Path, num_bytes: int = 1024) -> str:
     """Detect the encoding of a file using chardet.
-    Args:
+    Parameters:
         path (Path): The path to the file.
         num_bytes (int): Number of bytes to read for encoding detection (default is 1024).
@@ -52,7 +72,7 @@ def detect_encoding(path: Path, num_bytes: int = 1024) -> str:
 def handle_timestamps(df: DataFrame, tz_string: str) -> DataFrame:
     """Converts timestamps in the dataframe to the specified timezone (e.g., 'UTC+1').
-    Args:
+    Parameters:
         df (pd.DataFrame): The dataframe with timestamps.
         tz_string (str): A timezone string like 'UTC+1' or 'UTC-5'.

{gensor-0.1.6 → gensor-0.2.1}/gensor/parse/vanessen.py RENAMED Viewed

@@ -1,5 +1,6 @@
 """Logic parsing CSV files from van Essen Instruments Divers."""
+import logging
 from pathlib import Path
 from typing import Any
@@ -7,6 +8,8 @@ from ..config import VARIABLE_TYPES_AND_UNITS
 from ..core.timeseries import Timeseries
 from .utils import detect_encoding, get_data, get_metadata, handle_timestamps
+logger = logging.getLogger(__name__)
 def parse_vanessen_csv(path: Path, **kwargs: Any) -> list[Timeseries]:
     """Parses a van Essen csv file and returns a list of Timeseries objects. At this point it
@@ -51,7 +54,7 @@ def parse_vanessen_csv(path: Path, **kwargs: Any) -> list[Timeseries]:
         metadata = get_metadata(text, patterns)
         if not metadata:
-            print(f"Skipping file {path} due to missing metadata.")
+            logger.info(f"Skipping file {path} due to missing metadata.")
             return []
         data_start = "Date/time"

{gensor-0.1.6 → gensor-0.2.1}/gensor/processing/transform.py RENAMED Viewed

@@ -93,7 +93,7 @@ class Transformation:
         Returns:
             pandas.Series: The Box-Cox transformed time series data.
         """
-        lmbda = kwargs.get("lmbda", None)
+        lmbda = kwargs.get("lmbda")
         if (self.data <= 0).any():
             message = (

{gensor-0.1.6 → gensor-0.2.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gensor"
-version = "0.1.6"
+version = "0.2.1"
 description = "Library for handling groundwater sensor data."
 authors = ["Mateusz Zawadzki <zawadzkimat@outlook.com>"]
 repository = "https://github.com/zawadzkim/gensor"

gensor-0.1.6/gensor/__init__.py DELETED Viewed

@@ -1,20 +0,0 @@
-from .analysis.outliers import OutlierDetection
-from .core.dataset import Dataset
-from .core.timeseries import Timeseries
-from .io.read import read_from_csv, read_from_sql
-from .processing.compensation import Compensator, compensate
-from .processing.transform import Transformation
-__all__ = [
-    # basic data types
-    "Dataset",
-    "Timeseries",
-    # data transformation
-    "OutlierDetection",
-    "Transformation",
-    "Compensator",
-    "compensate",
-    # getters
-    "read_from_csv",
-    "read_from_sql",
-]

gensor-0.1.6/gensor/core/dataset.py DELETED Viewed

@@ -1,174 +0,0 @@
-from __future__ import annotations
-from collections import defaultdict
-from typing import Any, Self
-import pydantic as pyd
-from matplotlib import pyplot as plt
-from gensor.core.timeseries import Timeseries
-from gensor.db import DatabaseConnection
-from gensor.exceptions import IndexOutOfRangeError, TimeseriesNotFound
-class Dataset(pyd.BaseModel):
-    """Class to store a collection of timeseries.
-    The Dataset class is used to store a collection of Timeseries objects. It
-    is meant to be created when the van Essen CSV file is parsed.
-    Attributes:
-        timeseries (list[Timeseries]): A list of Timeseries objects.
-    Methods:
-        __iter__: Returns timeseries when iterated over.
-        __len__: Gives the number of timeseries in the Dataset.
-        get_stations: List all unique locations in the dataset.
-        add: Appends a new series to the Dataset or merges series if
-            an equal one exists.
-        align: Aligns the timeseries to a common time axis.
-        plot: Plots the timeseries data.
-    """
-    timeseries: list[Timeseries | None] = pyd.Field(default_factory=list)
-    def __iter__(self) -> Any:
-        """Allows to iterate directly over the dataset."""
-        return iter(self.timeseries)
-    def __len__(self) -> int:
-        """Gives the number of timeseries in the Dataset."""
-        return len(self.timeseries)
-    def __repr__(self) -> str:
-        return f"Dataset({len(self)})"
-    def __getitem__(self, index: int) -> Timeseries | None:
-        """Retrieve a Timeseries object by its index in the dataset.
-        Parameters:
-            index (int): The index of the Timeseries to retrieve.
-        Returns:
-            Timeseries: The Timeseries object at the specified index.
-        Raises:
-            IndexError: If the index is out of range.
-        """
-        try:
-            return self.timeseries[index]
-        except IndexError:
-            raise IndexOutOfRangeError(index, len(self)) from None
-    def get_stations(self) -> list:
-        """List all unique locations in the dataset."""
-        return [ts.location for ts in self.timeseries if ts is not None]
-    def add(self, other: Timeseries | list[Timeseries] | Self) -> None:
-        """Appends a new series to the Dataset or merges series if an equal
-        one exists.
-        If a Timeseries with the same location, sensor, and variable already
-        exists, merge the new data into the existing Timeseries, dropping
-        duplicate timestamps.
-        Parameters:
-            other (Timeseries): The Timeseries object to add.
-        """
-        if isinstance(other, list):
-            for ts in other:
-                if isinstance(ts, Timeseries):
-                    self._add_single_timeseries(ts)
-        elif isinstance(other, Dataset):
-            for ts in other.timeseries:  # type: ignore[assignment]
-                if isinstance(ts, Timeseries):
-                    self._add_single_timeseries(ts)
-        elif isinstance(other, Timeseries):
-            self._add_single_timeseries(other)
-        return
-    def _add_single_timeseries(self, ts: Timeseries) -> None:
-        """Adds a single Timeseries to the Dataset or merges if an equal one exists."""
-        for i, existing_ts in enumerate(self.timeseries):
-            if existing_ts == ts:
-                self.timeseries[i] = existing_ts.concatenate(ts)
-                return
-        self.timeseries.append(ts)
-        return
-    def filter(
-        self,
-        stations: str | list | None = None,
-        sensors: str | list | None = None,
-        variables: str | list | None = None,
-    ) -> Timeseries | Dataset:
-        """Return a Timeseries or a new Dataset filtered by station, sensor,
-        and/or variable.
-        Parameters:
-            stations (Optional[str]): The location of the station.
-            sensors (Optional[str]): The sensor identifier.
-            variables (Optional[str]): The variable being measured.
-        Returns:
-            Timeseries or Dataset: A single Timeseries if exactly one match is found,
-                                   or a new Dataset if multiple matches are found.
-        """
-        if isinstance(stations, str):
-            stations = [stations]
-        if isinstance(sensors, str):
-            sensors = [sensors]
-        if isinstance(variables, str):
-            variables = [variables]
-        matching_timeseries = [
-            ts
-            for ts in self.timeseries
-            if ts is not None
-            if (stations is None or ts.location in stations)
-            and (sensors is None or ts.sensor in sensors)
-            and (variables is None or ts.variable in variables)
-        ]
-        if not matching_timeseries:
-            raise TimeseriesNotFound()
-        if len(matching_timeseries) == 1:
-            return matching_timeseries[0]
-        return self.model_copy(update={"timeseries": matching_timeseries})
-    def to_sql(self, db: DatabaseConnection) -> None:
-        for ts in self.timeseries:
-            if ts:
-                ts.to_sql(db)
-        return
-    def plot(self, include_outliers: bool = False) -> None:
-        """Plots the timeseries data, grouping by variable type.
-        Args:
-            include_outliers (bool): Whether to include outliers in the plot.
-        """
-        # Group timeseries by variable
-        grouped_ts = defaultdict(list)
-        for ts in self.timeseries:
-            if ts:
-                grouped_ts[ts.variable].append(ts)
-        # Create a plot for each group of timeseries with the same variable
-        for variable, ts_list in grouped_ts.items():
-            fig, ax = plt.subplots(figsize=(10, 5))
-            for ts in ts_list:
-                ts.plot(include_outliers=include_outliers, ax=ax)
-            ax.set_title(f"Timeseries for {variable.capitalize()}")
-            plt.show()
-        return

gensor-0.1.6/gensor/core/timeseries.py DELETED Viewed

@@ -1,169 +0,0 @@
-from __future__ import annotations
-from typing import Any
-import pandas as pd
-import pandera as pa
-import pydantic as pyd
-from matplotlib import pyplot as plt
-from sqlalchemy import Table
-from sqlalchemy.dialects.sqlite import insert as sqlite_insert
-from gensor.core.base import BaseTimeseries
-from gensor.db import DatabaseConnection
-ts_schema = pa.SeriesSchema(
-    float,
-    index=pa.Index(pd.DatetimeTZDtype(tz="UTC"), coerce=False),
-    coerce=True,
-)
-class Timeseries(BaseTimeseries):
-    """Timeseries for groundwater sensor data
-    Attributes:
-        ts (pd.Series): The timeseries data.
-        variable (Literal['temperature', 'pressure', 'conductivity', 'flux']):
-            The type of the measurement.
-        unit (Literal['degC', 'mmH2O', 'mS/cm', 'm/s']): The unit of
-            the measurement.
-        sensor (SensorInfo): The serial number of the sensor.
-    Methods:
-        validate_ts: if the pd.Series is not exactly what is required, coerce.
-    """
-    model_config = pyd.ConfigDict(
-        arbitrary_types_allowed=True, validate_assignment=True
-    )
-    sensor: str | None = None
-    sensor_alt: float | None = None
-    def __eq__(self, other: object) -> bool:
-        """Check equality based on location, sensor, variable, unit and sensor_alt."""
-        if not isinstance(other, Timeseries):
-            return NotImplemented
-        return (
-            self.variable == other.variable
-            and self.unit == other.unit
-            and self.location == other.location
-            and self.sensor == other.sensor
-            and self.sensor_alt == other.sensor_alt
-        )
-    def to_sql(self, db: DatabaseConnection) -> str:
-        """Converts the timeseries to a list of dictionaries and uploads it to the database.
-        The Timeseries data is uploaded to the SQL database by using the pandas
-        `to_sql` method. Additionally, metadata about the timeseries is stored in the
-        'timeseries_metadata' table.
-        Args:
-            db (DatabaseConnection): The database connection object.
-        Returns:
-            str: A message indicating the number of rows inserted into the database.
-        """
-        # Format the start timestamp as 'YYYYMMDDHHMMSS'
-        timestamp_start_fmt = self.start.strftime("%Y%m%d%H%M%S")
-        # Construct the schema name using the location, sensor, variable, unit, and timestamp
-        schema_name = f"{self.location}_{self.sensor}_{self.variable}_{self.unit}_{timestamp_start_fmt}".lower()
-        # Ensure the index is a pandas DatetimeIndex
-        if isinstance(self.ts.index, pd.DatetimeIndex):
-            utc_index = (
-                self.ts.index.tz_convert("UTC")
-                if self.ts.index.tz is not None
-                else self.ts.index
-            )
-        else:
-            message = "The index is not a DatetimeIndex and cannot be converted to UTC."
-            raise TypeError(message)
-        # Prepare the timeseries data as records for insertion
-        series_as_records = list(
-            zip(utc_index.strftime("%Y-%m-%dT%H:%M:%S%z"), self.ts, strict=False)
-        )
-        with db as con:
-            # Create the timeseries table if it doesn't exist
-            schema = db.create_table(schema_name, self.variable)
-            # Ensure that the timeseries_metadata table exists
-            metadata_schema = db.metadata.tables["__timeseries_metadata__"]
-            if isinstance(schema, Table):
-                # Insert the timeseries data
-                stmt = sqlite_insert(schema).values(series_as_records)
-                stmt = stmt.on_conflict_do_nothing(index_elements=["timestamp"])
-                con.execute(stmt)
-                con.commit()
-                metadata_stmt = sqlite_insert(metadata_schema).values(
-                    table_name=schema_name,
-                    location=self.location,
-                    sensor=self.sensor,
-                    variable=self.variable,
-                    unit=self.unit,
-                    logger_alt=self.sensor_alt,
-                    location_alt=self.sensor_alt,
-                    timestamp_start=timestamp_start_fmt,
-                    timestamp_end=self.end.strftime("%Y%m%d%H%M%S"),
-                )
-                metadata_stmt = metadata_stmt.on_conflict_do_update(
-                    index_elements=["table_name"],
-                    set_={
-                        "timestamp_start": timestamp_start_fmt,
-                        "timestamp_end": self.end.strftime("%Y%m%d%H%M%S"),
-                    },
-                )
-                con.execute(metadata_stmt)
-                con.commit()
-        return f"{schema_name} table and metadata updated."
-    def plot(
-        self, include_outliers: bool = False, ax: Any = None, **plot_kwargs: Any
-    ) -> tuple:
-        """Plots the timeseries data.
-        Args:
-            include_outliers (bool): Whether to include outliers in the plot.
-            ax (matplotlib.axes.Axes, optional): Matplotlib axes object to plot on.
-                If None, a new figure and axes are created.
-            **plot_kwargs: Additional keyword arguments passed to plt.plot.
-        Returns:
-            (fig, ax): Matplotlib figure and axes to allow further customization.
-        """
-        # Create new figure and axes if not provided
-        if ax is None:
-            fig, ax = plt.subplots(figsize=(10, 5))
-        else:
-            fig = ax.get_figure()
-        ax.plot(
-            self.ts.index,
-            self.ts,
-            label=f"{self.location} ({self.sensor})",
-            **plot_kwargs,
-        )
-        if include_outliers and self.outliers is not None:
-            ax.scatter(
-                self.outliers.index, self.outliers, color="red", label="Outliers"
-            )
-        plt.xticks(rotation=45)
-        ax.set_xlabel("Time")
-        ax.set_ylabel(f"{self.variable} ({self.unit})")
-        ax.set_title(f"{self.variable.capitalize()} at {self.location}")
-        ax.legend()
-        return fig, ax