avoca 0.12.0__tar.gz → 0.14.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. {avoca-0.12.0 → avoca-0.14.0}/PKG-INFO +1 -1
  2. {avoca-0.12.0 → avoca-0.14.0}/avoca/bindings/ebas.py +16 -3
  3. {avoca-0.12.0 → avoca-0.14.0}/avoca/bindings/ebas_flags.py +1 -1
  4. {avoca-0.12.0 → avoca-0.14.0}/avoca/plots.py +26 -2
  5. {avoca-0.12.0 → avoca-0.14.0}/avoca/qa_class/abstract.py +9 -0
  6. avoca-0.14.0/avoca/qa_class/rolling.py +133 -0
  7. {avoca-0.12.0 → avoca-0.14.0}/avoca/testing/df.py +1 -0
  8. avoca-0.14.0/avoca/testing/utils.py +9 -0
  9. {avoca-0.12.0 → avoca-0.14.0}/examples/data_qa.ipynb +16 -3
  10. {avoca-0.12.0 → avoca-0.14.0}/pyproject.toml +1 -1
  11. {avoca-0.12.0 → avoca-0.14.0}/tests/test_assigners.py +75 -14
  12. {avoca-0.12.0 → avoca-0.14.0}/.gitignore +0 -0
  13. {avoca-0.12.0 → avoca-0.14.0}/.gitlab-ci.yml +0 -0
  14. {avoca-0.12.0 → avoca-0.14.0}/.readthedocs.yaml +0 -0
  15. {avoca-0.12.0 → avoca-0.14.0}/.vscode/settings.json +0 -0
  16. {avoca-0.12.0 → avoca-0.14.0}/LICENCE.txt +0 -0
  17. {avoca-0.12.0 → avoca-0.14.0}/README.md +0 -0
  18. {avoca-0.12.0 → avoca-0.14.0}/avoca/__init__.py +0 -0
  19. {avoca-0.12.0 → avoca-0.14.0}/avoca/bindings/__init__.py +0 -0
  20. {avoca-0.12.0 → avoca-0.14.0}/avoca/bindings/gcwerks-report.conf +0 -0
  21. {avoca-0.12.0 → avoca-0.14.0}/avoca/bindings/gcwerks.py +0 -0
  22. {avoca-0.12.0 → avoca-0.14.0}/avoca/bindings/gcwerks_gui.py +0 -0
  23. {avoca-0.12.0 → avoca-0.14.0}/avoca/bindings/nabel.py +0 -0
  24. {avoca-0.12.0 → avoca-0.14.0}/avoca/bindings/qa_tool.py +0 -0
  25. {avoca-0.12.0 → avoca-0.14.0}/avoca/bindings/synspec.py +0 -0
  26. {avoca-0.12.0 → avoca-0.14.0}/avoca/export_nas.py +0 -0
  27. {avoca-0.12.0 → avoca-0.14.0}/avoca/flagging.py +0 -0
  28. {avoca-0.12.0 → avoca-0.14.0}/avoca/flags.py +0 -0
  29. {avoca-0.12.0 → avoca-0.14.0}/avoca/io.py +0 -0
  30. {avoca-0.12.0 → avoca-0.14.0}/avoca/logging.py +0 -0
  31. {avoca-0.12.0 → avoca-0.14.0}/avoca/manager.py +0 -0
  32. {avoca-0.12.0 → avoca-0.14.0}/avoca/qa_class/__init__.py +0 -0
  33. {avoca-0.12.0 → avoca-0.14.0}/avoca/qa_class/concs.py +0 -0
  34. {avoca-0.12.0 → avoca-0.14.0}/avoca/qa_class/generate_classes_doc.py +0 -0
  35. {avoca-0.12.0 → avoca-0.14.0}/avoca/qa_class/invalid.py +0 -0
  36. {avoca-0.12.0 → avoca-0.14.0}/avoca/qa_class/rt.py +0 -0
  37. {avoca-0.12.0 → avoca-0.14.0}/avoca/qa_class/test.py +0 -0
  38. {avoca-0.12.0 → avoca-0.14.0}/avoca/qa_class/zscore.py +0 -0
  39. {avoca-0.12.0 → avoca-0.14.0}/avoca/requirements.py +0 -0
  40. {avoca-0.12.0 → avoca-0.14.0}/avoca/settings.py +0 -0
  41. {avoca-0.12.0 → avoca-0.14.0}/avoca/testing/__init__.py +0 -0
  42. {avoca-0.12.0 → avoca-0.14.0}/avoca/utils/__init__.py +0 -0
  43. {avoca-0.12.0 → avoca-0.14.0}/avoca/utils/flags_doc.py +0 -0
  44. {avoca-0.12.0 → avoca-0.14.0}/avoca/utils/torch_models.py +0 -0
  45. {avoca-0.12.0 → avoca-0.14.0}/data/.avoca/config.yaml +0 -0
  46. {avoca-0.12.0 → avoca-0.14.0}/data/CH0001G.20240219123300.20240307132229.online_gc.NMHC.air.16d.61mn.CH01L_Agilent_GC-MS-MEDUSA_Medusa-12_JFJ.CH01L_gc_ms.lev0.nas +0 -0
  47. {avoca-0.12.0 → avoca-0.14.0}/data/tests/missing_area_cols.csv +0 -0
  48. {avoca-0.12.0 → avoca-0.14.0}/data/voc_jan2jun_2023.csv +0 -0
  49. {avoca-0.12.0 → avoca-0.14.0}/docs/Makefile +0 -0
  50. {avoca-0.12.0 → avoca-0.14.0}/docs/make.bat +0 -0
  51. {avoca-0.12.0 → avoca-0.14.0}/docs/source/bindings/ebas.md +0 -0
  52. {avoca-0.12.0 → avoca-0.14.0}/docs/source/bindings/gcwerks.md +0 -0
  53. {avoca-0.12.0 → avoca-0.14.0}/docs/source/bindings/index.rst +0 -0
  54. {avoca-0.12.0 → avoca-0.14.0}/docs/source/bindings/qa_tool.md +0 -0
  55. {avoca-0.12.0 → avoca-0.14.0}/docs/source/conf.py +0 -0
  56. {avoca-0.12.0 → avoca-0.14.0}/docs/source/index.rst +0 -0
  57. {avoca-0.12.0 → avoca-0.14.0}/docs/source/quickstart.ipynb +0 -0
  58. {avoca-0.12.0 → avoca-0.14.0}/examples/config.yaml +0 -0
  59. {avoca-0.12.0 → avoca-0.14.0}/examples/convert_synspec_to_gcwerks.py +0 -0
  60. {avoca-0.12.0 → avoca-0.14.0}/examples/data_qa_gcwerks.ipynb +0 -0
  61. {avoca-0.12.0 → avoca-0.14.0}/examples/export_gc_werks.py +0 -0
  62. {avoca-0.12.0 → avoca-0.14.0}/examples/export_gc_werks_secondary_peaks.py +0 -0
  63. {avoca-0.12.0 → avoca-0.14.0}/examples/get_tanks.ipynb +0 -0
  64. {avoca-0.12.0 → avoca-0.14.0}/examples/read_nas.ipynb +0 -0
  65. {avoca-0.12.0 → avoca-0.14.0}/tests/bindings/gcwerks.dat +0 -0
  66. {avoca-0.12.0 → avoca-0.14.0}/tests/bindings/test_gcwerks.py +0 -0
  67. {avoca-0.12.0 → avoca-0.14.0}/tests/bindings/test_qatool.py +0 -0
  68. {avoca-0.12.0 → avoca-0.14.0}/tests/test_flagging.py +0 -0
  69. {avoca-0.12.0 → avoca-0.14.0}/tests/test_io.py +0 -0
  70. {avoca-0.12.0 → avoca-0.14.0}/tests/test_manager.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: avoca
3
- Version: 0.12.0
3
+ Version: 0.14.0
4
4
  Summary: @voc@: Quality assessement of measurement data
5
5
  Project-URL: Homepage, https://gitlab.com/empa503/atmospheric-measurements/avoca
6
6
  Project-URL: Bug Tracker, https://gitlab.com/empa503/atmospheric-measurements/avoca/-/issues
@@ -171,7 +171,6 @@ def set_dataframe(
171
171
  )
172
172
  )
173
173
 
174
-
175
174
  this_nan_flags = nan_flags.copy()
176
175
 
177
176
  if data_level in concs_data_levels and invalidate_conc_calib:
@@ -188,7 +187,9 @@ def set_dataframe(
188
187
  )
189
188
  for flag in flag_col
190
189
  ]
191
- nan_flag = np.logical_or.reduce([flag_col & flag.value for flag in this_nan_flags])
190
+ nan_flag = np.logical_or.reduce(
191
+ [flag_col & flag.value for flag in this_nan_flags]
192
+ )
192
193
 
193
194
  for var in vars_to_export[data_level]:
194
195
  ebas_name = compounds[sub]
@@ -199,6 +200,16 @@ def set_dataframe(
199
200
  for val, isnan in zip(serie_to_export, nan_flag)
200
201
  ]
201
202
 
203
+ if var == "conc_calib":
204
+ # Invalidate calibration concentration for non-calibration samples
205
+ this_flags = [
206
+ flags_ebas
207
+ + ([] if (QA_Flag.CALIBRATION.value & flag_avoca) else [980])
208
+ for flags_ebas, flag_avoca in zip(flags, flag_col)
209
+ ]
210
+ else:
211
+ this_flags = flags
212
+
202
213
  metadata = DataObject()
203
214
  metadata.comp_name = (
204
215
  f"{ebas_name}_{ebas_compname_of_var[var]}"
@@ -214,7 +225,9 @@ def set_dataframe(
214
225
  metadata.matrix = "air"
215
226
  # add the variable
216
227
  nas.variables.append(
217
- DataObject(values_=values, flags=flags, flagcol=True, metadata=metadata)
228
+ DataObject(
229
+ values_=values, flags=this_flags, flagcol=True, metadata=metadata
230
+ )
218
231
  )
219
232
 
220
233
  if var == "conc_calib":
@@ -1,4 +1,4 @@
1
- # https://projects.nilu.no/ccc/flags/flags.html for more info on what ebas uses
1
+ # https://ebas-submit.nilu.no/templates/comments/fl_flag for more info on what ebas uses
2
2
  from avoca.flags import QA_Flag, nan_flags
3
3
 
4
4
  flags_to_ebas: dict[QA_Flag, int] = {
@@ -69,6 +69,8 @@ def plot_yearly_plotly(
69
69
  df: pd.DataFrame,
70
70
  compound: str,
71
71
  df_new: pd.DataFrame | None = None,
72
+ opacity: float = 0.5,
73
+ size: int = 6,
72
74
  ) -> "plotly.graph_objs._figure.Figure":
73
75
  """Plot yearly data using plotly."""
74
76
  import plotly.express as px
@@ -97,7 +99,28 @@ def plot_yearly_plotly(
97
99
  df_to_plot = df_to_plot.pivot_table(
98
100
  index=df_to_plot.index, columns="year", values="conc"
99
101
  )
100
- fig = px.scatter(df_to_plot)
102
+ fig = go.Figure()
103
+
104
+ hover_template = "Timestamp: %{text}<br>Conc: %{y:.2f} ppt"
105
+
106
+ kwargs = {
107
+ "mode": "markers",
108
+ "opacity": opacity,
109
+ "marker": dict(size=size),
110
+ "hovertemplate": hover_template,
111
+ }
112
+
113
+ for year in df_to_plot.columns:
114
+ fig.add_trace(
115
+ go.Scatter(
116
+ x=df_to_plot.index,
117
+ y=df_to_plot[year],
118
+ name=str(year),
119
+ zorder=-year,
120
+ text=dt[dt.dt.year == year].dt.strftime("%y%m%d.%H%M"),
121
+ **kwargs,
122
+ )
123
+ )
101
124
  x_values = pd.date_range(start="2024-01-01", end="2024-12-31", freq="MS")
102
125
 
103
126
  dt_new = df_new[dt_column]
@@ -105,8 +128,9 @@ def plot_yearly_plotly(
105
128
  go.Scatter(
106
129
  x=dt_new.dt.dayofyear + dt_new.dt.hour / 24.0,
107
130
  y=df_new[(compound, "conc")],
108
- mode="markers",
109
131
  name="New Data",
132
+ text=dt_new.dt.strftime("%y%m%d.%H%M"),
133
+ **kwargs,
110
134
  )
111
135
  )
112
136
  fig.update_layout(
@@ -49,6 +49,7 @@ class AbstractQA_Assigner(ABC):
49
49
  flag: QA_Flag
50
50
  runtypes: list[str] | None
51
51
  required_packages: list[PythonPackageRequirement] | None = None
52
+ require_datetime_index: bool = False
52
53
 
53
54
  # Options that can be set by the user
54
55
  name: str
@@ -142,6 +143,14 @@ class AbstractQA_Assigner(ABC):
142
143
  f"Please check the data and the settings for {self.name}"
143
144
  )
144
145
 
146
+ if self.require_datetime_index:
147
+ if not isinstance(df.index, pd.DatetimeIndex):
148
+ raise ValueError(
149
+ f"Assigner {self} requires a DatetimeIndex but the dataframe"
150
+ " does not have one. \n "
151
+ f"Please check the data and the settings for {self.name}"
152
+ )
153
+
145
154
  @abstractmethod
146
155
  def fit(self, df: pd.DataFrame):
147
156
  """Fit the QA assigner on some data.
@@ -0,0 +1,133 @@
1
+ """Quality assurance based on statistical methods."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import timedelta
6
+ from typing import TYPE_CHECKING
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+ from avoca.qa_class.zscore import ExtremeValues
12
+
13
+ if TYPE_CHECKING:
14
+
15
+ from avoca.utils.torch_models import MultipleRegressionModel
16
+
17
+
18
+ class RollingWindow(ExtremeValues):
19
+ """Detect outliers in rolling windows.
20
+
21
+ The method is based on outliers in a rolling window using the median and standard deviation.
22
+ The training is done directly on the fitted data.
23
+
24
+ :param variable: The variable to check for extreme values.
25
+ :param threshold: The threshold for the z-score. To flag values.
26
+ :param use_log_normal: If True, the log of the values will be used to calculate the z-score.
27
+ This can be useful if the values are log-normal distributed.
28
+ :param only_greater: If True, only values greater than the threshold will be flagged.
29
+ The values lower than the negative threshold will not be flagged.
30
+ By default, this is True if use_log_normal is True, and False otherwise.
31
+ """
32
+
33
+ require_datetime_index = True
34
+
35
+ rolling_window: timedelta
36
+
37
+ def __init__(
38
+ self,
39
+ *args,
40
+ rolling_window: timedelta = timedelta(days=7),
41
+ threshold: float = 1.5,
42
+ **kwargs,
43
+ ):
44
+ super().__init__(*args, threshold=threshold, **kwargs)
45
+ self.rolling_window = rolling_window
46
+
47
+ def fit(self, df: pd.DataFrame):
48
+
49
+ self.check_columns_or_raise(df, columns=self._stats_columns)
50
+
51
+ self.df_train = df[self._stats_columns]
52
+
53
+ def assign(self, df: pd.DataFrame) -> dict[str, pd.Index]:
54
+ df = df[self._stats_columns]
55
+ df = self._clean_data(df)
56
+ if self.use_log_normal:
57
+ # Replace <=0 with NaN
58
+ df = df.where(df > 0, np.nan)
59
+ df = df.map(lambda x: np.log(x))
60
+
61
+ rolling = df.rolling(window=self.rolling_window)
62
+ means = rolling.median()
63
+ stds = rolling.std()
64
+
65
+ self.rolling_median = means
66
+ self.rolling_std = stds
67
+
68
+ thresholds = means + stds * self.threshold
69
+
70
+ df_fail = df > thresholds
71
+ if not self.only_greater:
72
+ df_fail = df_fail | (df < (means - stds * self.threshold))
73
+
74
+ out_dict = {}
75
+ for compound in self.compounds:
76
+ col = (compound, self.variable)
77
+ this_c_fail = df_fail[col]
78
+ out_dict[compound] = this_c_fail.loc[this_c_fail].index
79
+
80
+ return out_dict
81
+
82
+ def plot(self):
83
+
84
+ import matplotlib.pyplot as plt
85
+
86
+ fig, axes = plt.subplots(
87
+ len(self.compounds), 1, figsize=(6, 3 * len(self.compounds)), sharex=True
88
+ )
89
+
90
+ x = self.dt if hasattr(self, "dt") else self.df_train.index
91
+ x = pd.Series(x, index=self.df_train.index)
92
+
93
+ outliers = self.assign(self.df_train)
94
+
95
+ for i, compound in enumerate(self.compounds):
96
+ ax = axes[i]
97
+ col = (compound, self.variable)
98
+ ax.scatter(
99
+ x,
100
+ self.df_train[col],
101
+ s=1,
102
+ label="darkblue",
103
+ )
104
+ median = self.rolling_median[col]
105
+ std = self.rolling_std[col]
106
+ top, bottom = median + std * self.threshold, median - std * self.threshold
107
+
108
+ ax.fill_between(
109
+ x,
110
+ top,
111
+ bottom,
112
+ color="lightgray",
113
+ label="Rolling threshold",
114
+ alpha=0.5,
115
+ )
116
+
117
+ outlier_indices = outliers[compound]
118
+ ax.scatter(
119
+ x.loc[outlier_indices],
120
+ self.df_train.loc[outlier_indices, col],
121
+ s=10,
122
+ marker="x",
123
+ color="red",
124
+ label="Extreme values",
125
+ )
126
+ ax.set_title(
127
+ f"{compound} +- {self.threshold} std",
128
+ # Under the top line
129
+ y=0.8,
130
+ )
131
+ ax.tick_params(axis="x", rotation=25)
132
+
133
+ return fig, axes
@@ -10,6 +10,7 @@ import numpy as np
10
10
  import pandas as pd
11
11
 
12
12
  empty_index = pd.Index([], dtype="int64")
13
+ empty_index_dt = pd.DatetimeIndex([])
13
14
 
14
15
  simple_df = pd.DataFrame(
15
16
  np.ones((2, 4)),
@@ -0,0 +1,9 @@
1
+ import pandas as pd
2
+
3
+
4
+ def make_dt_index(df: pd.DataFrame | pd.Index) -> pd.DataFrame | pd.Index:
5
+ """Create a datetime index for the dataframe."""
6
+ index = pd.date_range(start="2023-01-01", periods=len(df), freq="h")
7
+ if isinstance(df, pd.Index):
8
+ return index
9
+ return df.set_index(index)
@@ -137,6 +137,7 @@
137
137
  "source": [
138
138
  "from avoca.qa_class.concs import ExtremeConcentrations\n",
139
139
  "from avoca.qa_class.zscore import XY_Correlations, Multiple_XY_Correlations\n",
140
+ "from avoca.qa_class.rolling import RollingWindow\n",
140
141
  "\n",
141
142
  "\n",
142
143
  "# Create assigners for each compound\n",
@@ -145,10 +146,13 @@
145
146
  " XY_Correlations(\n",
146
147
  " compounds=[\"ethane\", \"propane\", \"n-butane\"], variable=\"C\", threshold=4.0\n",
147
148
  " ),\n",
148
- " # xy_benzene_toluene := XY_Correlations(compounds=[\"benzene\", \"toluene\"], variable=\"C\"),\n",
149
- " multiple_assigner := Multiple_XY_Correlations(\n",
150
- " number_of_regression=3, compounds=[\"benzene\", \"toluene\"], variable=\"C\"\n",
149
+ " rolling_window := RollingWindow(\n",
150
+ " compounds=compounds, variable=\"C\"\n",
151
151
  " ),\n",
152
+ " # xy_benzene_toluene := XY_Correlations(compounds=[\"benzene\", \"toluene\"], variable=\"C\"),\n",
153
+ " # multiple_assigner := Multiple_XY_Correlations(\n",
154
+ " # number_of_regression=3, compounds=[\"benzene\", \"toluene\"], variable=\"C\"\n",
155
+ " # ),\n",
152
156
  "]"
153
157
  ]
154
158
  },
@@ -329,6 +333,15 @@
329
333
  "fig.patch.set_alpha(0)\n"
330
334
  ]
331
335
  },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": null,
339
+ "metadata": {},
340
+ "outputs": [],
341
+ "source": [
342
+ "rolling_window.plot()"
343
+ ]
344
+ },
332
345
  {
333
346
  "cell_type": "code",
334
347
  "execution_count": null,
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
5
5
 
6
6
  [project]
7
7
  name = "avoca"
8
- version = "0.12.0"
8
+ version = "0.14.0"
9
9
  authors = [
10
10
  { name="Lionel Constantin", email="lionel.constantin@empa.ch" },
11
11
  ]
@@ -6,18 +6,14 @@ import pytest
6
6
  from avoca.qa_class.abstract import AbstractQA_Assigner
7
7
  from avoca.qa_class.invalid import InvalidValues
8
8
  from avoca.qa_class.zscore import ExtremeValues, XY_Correlations
9
- from avoca.testing.df import (
10
- df_around_zero,
11
- df_full_nan,
12
- df_nan_training,
13
- df_one_extreme,
14
- df_regular,
15
- df_with_inf,
16
- empty_index,
17
- )
9
+ from avoca.qa_class.rolling import RollingWindow
10
+ import avoca.testing.df as df_test
11
+ from avoca.testing.utils import make_dt_index
18
12
 
19
13
  index_all_3 = pd.Index([0, 1, 2], dtype="int64")
14
+ index_all_3_dt = make_dt_index(index_all_3)
20
15
  index_2 = pd.Index([2], dtype="int64")
16
+ index_2_dt = index_all_3_dt[index_2]
21
17
 
22
18
 
23
19
  @pytest.fixture(
@@ -36,6 +32,8 @@ index_2 = pd.Index([2], dtype="int64")
36
32
  "name": "invalid_zeros_and_negative",
37
33
  },
38
34
  ),
35
+ (RollingWindow, {"rolling_window": pd.Timedelta(days=3)}),
36
+ (RollingWindow, {}),
39
37
  ]
40
38
  )
41
39
  def assigner(
@@ -47,26 +45,59 @@ def assigner(
47
45
 
48
46
 
49
47
  def test_simple(assigner: AbstractQA_Assigner):
48
+
49
+ df_one_extreme = df_test.df_one_extreme
50
+ df_regular = df_test.df_regular
51
+ if assigner.require_datetime_index:
52
+ df_one_extreme = make_dt_index(df_one_extreme)
53
+ df_regular = make_dt_index(df_regular)
54
+
50
55
  assigner.fit(df_regular)
51
56
  flagged = assigner.assign(df_one_extreme)
52
57
 
58
+ empty_index = (
59
+ df_test.empty_index
60
+ if not assigner.require_datetime_index
61
+ else df_test.empty_index_dt
62
+ )
63
+
53
64
  comparison_output_a = {
54
65
  InvalidValues: empty_index,
66
+ RollingWindow: index_2_dt,
55
67
  }
56
68
  comparison_output_b = {
57
- ExtremeValues: empty_index,
58
69
  # Also b is outside of the correlation cloud
59
70
  XY_Correlations: index_2,
60
- InvalidValues: empty_index,
61
71
  }
62
72
 
63
73
  pd.testing.assert_index_equal(
64
74
  flagged["compA"], comparison_output_a.get(type(assigner), index_2)
65
75
  )
66
- pd.testing.assert_index_equal(flagged["compB"], comparison_output_b[type(assigner)])
76
+ pd.testing.assert_index_equal(
77
+ flagged["compB"], comparison_output_b.get(type(assigner), empty_index)
78
+ )
79
+
80
+
81
+ def test_input_dataframe_dt_index(assigner: AbstractQA_Assigner):
82
+
83
+ df_regular = df_test.df_regular
84
+
85
+ if assigner.require_datetime_index:
86
+ with pytest.raises(ValueError, match="requires a DatetimeIndex"):
87
+ assigner.fit(df_regular)
67
88
 
68
89
 
69
90
  def test_nan_values_given_fit(assigner: AbstractQA_Assigner):
91
+
92
+ df_nan_training = df_test.df_nan_training
93
+ df_regular = df_test.df_regular
94
+ empty_index = df_test.empty_index
95
+
96
+ if assigner.require_datetime_index:
97
+ df_nan_training = make_dt_index(df_nan_training)
98
+ df_regular = make_dt_index(df_regular)
99
+ empty_index = df_test.empty_index_dt
100
+
70
101
  assigner.fit(df_nan_training)
71
102
  flagged = assigner.assign(df_regular)
72
103
 
@@ -76,6 +107,15 @@ def test_nan_values_given_fit(assigner: AbstractQA_Assigner):
76
107
 
77
108
 
78
109
  def test_only_nan_values_given_fit(assigner: AbstractQA_Assigner):
110
+
111
+ df_full_nan = df_test.df_full_nan
112
+ df_regular = df_test.df_regular
113
+ empty_index = df_test.empty_index
114
+ if assigner.require_datetime_index:
115
+ df_full_nan = make_dt_index(df_full_nan)
116
+ df_regular = make_dt_index(df_regular)
117
+ empty_index = df_test.empty_index_dt
118
+
79
119
  assigner.fit(df_full_nan)
80
120
  flagged = assigner.assign(df_regular)
81
121
 
@@ -85,11 +125,19 @@ def test_only_nan_values_given_fit(assigner: AbstractQA_Assigner):
85
125
 
86
126
 
87
127
  def test_fitting_nans(assigner: AbstractQA_Assigner):
128
+ df_regular = df_test.df_regular
129
+ df_nan_training = df_test.df_nan_training
130
+ df_full_nan = df_test.df_full_nan
131
+ empty_index = df_test.empty_index
132
+ if assigner.require_datetime_index:
133
+ df_regular = make_dt_index(df_regular)
134
+ df_nan_training = make_dt_index(df_nan_training)
135
+ df_full_nan = make_dt_index(df_full_nan)
136
+ empty_index = df_test.empty_index_dt
88
137
  assigner.fit(df_regular)
89
138
 
90
139
  flagged = assigner.assign(df_nan_training)
91
140
  flagged_allnans = assigner.assign(df_full_nan)
92
-
93
141
  comparison_output_a_one_nan = {
94
142
  InvalidValues: index_2,
95
143
  }
@@ -99,7 +147,8 @@ def test_fitting_nans(assigner: AbstractQA_Assigner):
99
147
 
100
148
  # Nothing should be flagged
101
149
  pd.testing.assert_index_equal(
102
- flagged["compA"], comparison_output_a_one_nan.get(type(assigner), empty_index)
150
+ flagged["compA"],
151
+ comparison_output_a_one_nan.get(type(assigner), empty_index),
103
152
  )
104
153
  pd.testing.assert_index_equal(flagged["compB"], empty_index)
105
154
  pd.testing.assert_index_equal(
@@ -112,6 +161,12 @@ def test_fitting_nans(assigner: AbstractQA_Assigner):
112
161
  def test_zero_values(assigner: AbstractQA_Assigner):
113
162
  """Test that zero values are not flagged."""
114
163
 
164
+ df_around_zero = df_test.df_around_zero
165
+ empty_index = df_test.empty_index
166
+ if assigner.require_datetime_index:
167
+ df_around_zero = make_dt_index(df_around_zero)
168
+ empty_index = df_test.empty_index_dt
169
+
115
170
  assigner.fit(df_around_zero)
116
171
  flagged = assigner.assign(df_around_zero)
117
172
 
@@ -138,6 +193,12 @@ def test_zero_values(assigner: AbstractQA_Assigner):
138
193
  def test_inf_values(assigner: AbstractQA_Assigner):
139
194
  """Test that inf values are flagged."""
140
195
 
196
+ df_with_inf = df_test.df_with_inf
197
+ empty_index = df_test.empty_index
198
+ if assigner.require_datetime_index:
199
+ df_with_inf = make_dt_index(df_with_inf)
200
+ empty_index = df_test.empty_index_dt
201
+
141
202
  assigner.fit(df_with_inf)
142
203
  flagged = assigner.assign(df_with_inf)
143
204
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes