avoca 0.12.0__tar.gz → 0.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. {avoca-0.12.0 → avoca-0.15.0}/.gitignore +1 -0
  2. {avoca-0.12.0 → avoca-0.15.0}/PKG-INFO +1 -1
  3. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/ebas.py +16 -3
  4. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/ebas_flags.py +1 -1
  5. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/qa_tool.py +51 -10
  6. {avoca-0.12.0 → avoca-0.15.0}/avoca/manager.py +1 -0
  7. {avoca-0.12.0 → avoca-0.15.0}/avoca/plots.py +53 -25
  8. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/abstract.py +9 -0
  9. avoca-0.15.0/avoca/qa_class/rolling.py +136 -0
  10. {avoca-0.12.0 → avoca-0.15.0}/avoca/testing/df.py +1 -0
  11. avoca-0.15.0/avoca/testing/utils.py +9 -0
  12. {avoca-0.12.0 → avoca-0.15.0}/avoca/utils/__init__.py +1 -1
  13. {avoca-0.12.0 → avoca-0.15.0}/avoca/utils/flags_doc.py +3 -0
  14. {avoca-0.12.0 → avoca-0.15.0}/examples/data_qa.ipynb +16 -3
  15. {avoca-0.12.0 → avoca-0.15.0}/pyproject.toml +1 -1
  16. avoca-0.15.0/tests/bindings/test_qatool.py +216 -0
  17. {avoca-0.12.0 → avoca-0.15.0}/tests/test_assigners.py +82 -14
  18. avoca-0.12.0/tests/bindings/test_qatool.py +0 -49
  19. {avoca-0.12.0 → avoca-0.15.0}/.gitlab-ci.yml +0 -0
  20. {avoca-0.12.0 → avoca-0.15.0}/.readthedocs.yaml +0 -0
  21. {avoca-0.12.0 → avoca-0.15.0}/.vscode/settings.json +0 -0
  22. {avoca-0.12.0 → avoca-0.15.0}/LICENCE.txt +0 -0
  23. {avoca-0.12.0 → avoca-0.15.0}/README.md +0 -0
  24. {avoca-0.12.0 → avoca-0.15.0}/avoca/__init__.py +0 -0
  25. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/__init__.py +0 -0
  26. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/gcwerks-report.conf +0 -0
  27. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/gcwerks.py +0 -0
  28. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/gcwerks_gui.py +0 -0
  29. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/nabel.py +0 -0
  30. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/synspec.py +0 -0
  31. {avoca-0.12.0 → avoca-0.15.0}/avoca/export_nas.py +0 -0
  32. {avoca-0.12.0 → avoca-0.15.0}/avoca/flagging.py +0 -0
  33. {avoca-0.12.0 → avoca-0.15.0}/avoca/flags.py +0 -0
  34. {avoca-0.12.0 → avoca-0.15.0}/avoca/io.py +0 -0
  35. {avoca-0.12.0 → avoca-0.15.0}/avoca/logging.py +0 -0
  36. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/__init__.py +0 -0
  37. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/concs.py +0 -0
  38. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/generate_classes_doc.py +0 -0
  39. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/invalid.py +0 -0
  40. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/rt.py +0 -0
  41. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/test.py +0 -0
  42. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/zscore.py +0 -0
  43. {avoca-0.12.0 → avoca-0.15.0}/avoca/requirements.py +0 -0
  44. {avoca-0.12.0 → avoca-0.15.0}/avoca/settings.py +0 -0
  45. {avoca-0.12.0 → avoca-0.15.0}/avoca/testing/__init__.py +0 -0
  46. {avoca-0.12.0 → avoca-0.15.0}/avoca/utils/torch_models.py +0 -0
  47. {avoca-0.12.0 → avoca-0.15.0}/data/.avoca/config.yaml +0 -0
  48. {avoca-0.12.0 → avoca-0.15.0}/data/CH0001G.20240219123300.20240307132229.online_gc.NMHC.air.16d.61mn.CH01L_Agilent_GC-MS-MEDUSA_Medusa-12_JFJ.CH01L_gc_ms.lev0.nas +0 -0
  49. {avoca-0.12.0 → avoca-0.15.0}/data/tests/missing_area_cols.csv +0 -0
  50. {avoca-0.12.0 → avoca-0.15.0}/data/voc_jan2jun_2023.csv +0 -0
  51. {avoca-0.12.0 → avoca-0.15.0}/docs/Makefile +0 -0
  52. {avoca-0.12.0 → avoca-0.15.0}/docs/make.bat +0 -0
  53. {avoca-0.12.0 → avoca-0.15.0}/docs/source/bindings/ebas.md +0 -0
  54. {avoca-0.12.0 → avoca-0.15.0}/docs/source/bindings/gcwerks.md +0 -0
  55. {avoca-0.12.0 → avoca-0.15.0}/docs/source/bindings/index.rst +0 -0
  56. {avoca-0.12.0 → avoca-0.15.0}/docs/source/bindings/qa_tool.md +0 -0
  57. {avoca-0.12.0 → avoca-0.15.0}/docs/source/conf.py +0 -0
  58. {avoca-0.12.0 → avoca-0.15.0}/docs/source/index.rst +0 -0
  59. {avoca-0.12.0 → avoca-0.15.0}/docs/source/quickstart.ipynb +0 -0
  60. {avoca-0.12.0 → avoca-0.15.0}/examples/config.yaml +0 -0
  61. {avoca-0.12.0 → avoca-0.15.0}/examples/convert_synspec_to_gcwerks.py +0 -0
  62. {avoca-0.12.0 → avoca-0.15.0}/examples/data_qa_gcwerks.ipynb +0 -0
  63. {avoca-0.12.0 → avoca-0.15.0}/examples/export_gc_werks.py +0 -0
  64. {avoca-0.12.0 → avoca-0.15.0}/examples/export_gc_werks_secondary_peaks.py +0 -0
  65. {avoca-0.12.0 → avoca-0.15.0}/examples/get_tanks.ipynb +0 -0
  66. {avoca-0.12.0 → avoca-0.15.0}/examples/read_nas.ipynb +0 -0
  67. {avoca-0.12.0 → avoca-0.15.0}/tests/bindings/gcwerks.dat +0 -0
  68. {avoca-0.12.0 → avoca-0.15.0}/tests/bindings/test_gcwerks.py +0 -0
  69. {avoca-0.12.0 → avoca-0.15.0}/tests/test_flagging.py +0 -0
  70. {avoca-0.12.0 → avoca-0.15.0}/tests/test_io.py +0 -0
  71. {avoca-0.12.0 → avoca-0.15.0}/tests/test_manager.py +0 -0
@@ -13,3 +13,4 @@ dist/
 
 # Generated by pytests
 simple_df.csv
+data/tests/export_empa_qa_tool/*.csv
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: avoca
-Version: 0.12.0
+Version: 0.15.0
 Summary: @voc@: Quality assessement of measurement data
 Project-URL: Homepage, https://gitlab.com/empa503/atmospheric-measurements/avoca
 Project-URL: Bug Tracker, https://gitlab.com/empa503/atmospheric-measurements/avoca/-/issues
@@ -171,7 +171,6 @@ def set_dataframe(
             )
         )
 
-
         this_nan_flags = nan_flags.copy()
 
         if data_level in concs_data_levels and invalidate_conc_calib:
@@ -188,7 +187,9 @@ def set_dataframe(
             )
             for flag in flag_col
         ]
-        nan_flag = np.logical_or.reduce([flag_col & flag.value for flag in this_nan_flags])
+        nan_flag = np.logical_or.reduce(
+            [flag_col & flag.value for flag in this_nan_flags]
+        )
 
         for var in vars_to_export[data_level]:
             ebas_name = compounds[sub]
@@ -199,6 +200,16 @@ def set_dataframe(
                 for val, isnan in zip(serie_to_export, nan_flag)
             ]
 
+            if var == "conc_calib":
+                # Invalidate calibration concentration for non-calibration samples
+                this_flags = [
+                    flags_ebas
+                    + ([] if (QA_Flag.CALIBRATION.value & flag_avoca) else [980])
+                    for flags_ebas, flag_avoca in zip(flags, flag_col)
+                ]
+            else:
+                this_flags = flags
+
             metadata = DataObject()
             metadata.comp_name = (
                 f"{ebas_name}_{ebas_compname_of_var[var]}"
@@ -214,7 +225,9 @@ def set_dataframe(
             metadata.matrix = "air"
             # add the variable
             nas.variables.append(
-                DataObject(
-                DataObject(values_=values, flags=flags, flagcol=True, metadata=metadata)
+                DataObject(
+                    values_=values, flags=this_flags, flagcol=True, metadata=metadata
+                )
             )
 
             if var == "conc_calib":
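The new conc_calib branch appends EBAS flag 980 to every sample whose avoca bitmask flag lacks the CALIBRATION bit. A standalone sketch of that bitmask logic (the flag value and the sample data below are invented for illustration; the real enum lives in avoca.flags):

from enum import Flag

class QA_Flag(Flag):  # minimal stand-in for avoca.flags.QA_Flag
    CALIBRATION = 0x1

flags = [[], [], []]        # per-sample lists of numeric EBAS flags
flag_col = [0x1, 0x0, 0x1]  # per-sample avoca bitmask flags

# Append 980 wherever the CALIBRATION bit is not set
this_flags = [
    flags_ebas + ([] if (QA_Flag.CALIBRATION.value & flag_avoca) else [980])
    for flags_ebas, flag_avoca in zip(flags, flag_col)
]
print(this_flags)  # [[], [980], []]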
@@ -1,4 +1,4 @@
-# https://projects.nilu.no/ccc/flags/flags.html for more info on what ebas uses
+# https://ebas-submit.nilu.no/templates/comments/fl_flag for more info on what ebas uses
 from avoca.flags import QA_Flag, nan_flags
 
 flags_to_ebas: dict[QA_Flag, int] = {
@@ -37,10 +37,11 @@ def export_EmpaQATool(
     station: str = "XXX",
     revision_date: datetime | None = None,
     dataset: datetime | str | None = None,
-    export_names: dict[str, str] = {},
+    export_names: dict[str, str] | None = None,
     datetime_offsets: tuple[timedelta, timedelta] | None = None,
     substances: list[str] = [],
     rounding_decimals: int = 4,
+    df_substances: pd.DataFrame | None = None,
 ) -> Path:
     """Export to the EmpaQATool format.
 
@@ -64,7 +65,17 @@ def export_EmpaQATool(
     :arg datetime_offsets: Tuple of two timedelta to use for the start and end datetime
     :arg substances: List of substances to export. You can also specify group names.
         If not specified, this will use the substances from `df_substances`.
+        If a substance is present here and not in `df_substances`, it will still be exported.
     :arg rounding_decimals: Number of decimals to round the values to.
+    :arg df_substances: DataFrame with substance information.
+        If provided, the substances to export will be taken from this dataframe.
+        Columns:
+        - index: substance name
+        - export: bool, whether to export the substance
+        - export_name: str, name to use in the export file
+
+
+    :returns: Path to the exported file.
 
     """
 
@@ -113,12 +124,42 @@ def export_EmpaQATool(
     logger.debug(f"df_out: {df_out.head()}")
     if not substances:
         substances = compounds_from_df(df)
+    if df_substances is not None and "export" in df_substances.columns:
+        # Remove the substances that should not be exported
+        substances = [
+            s
+            for s in substances
+            if s not in df_substances.index or df_substances.loc[s, "export"]
+        ]
 
     remove_infs = lambda x: x.replace([np.inf, -np.inf], np.nan)
     is_invalid = lambda x: x.isin([np.inf, -np.inf]) | pd.isna(x)
     clean_col = lambda x: remove_infs(x).round(rounding_decimals).astype(str)
 
+    if export_names is None:
+        export_names = {}
+
+    if df_substances is not None and "export_name" in df_substances.columns:
+        # Read export names from the dataframe if provided
+        for substance in substances:
+            if not substance or substance not in df_substances.index:
+                continue
+            export_name_df = df_substances.loc[substance, "export_name"]
+            if not export_name_df or pd.isna(export_name_df):
+                continue
+            if substance in export_names and export_names[substance] != export_name_df:
+                logger.warning(
+                    f"Substance {substance} found in both df_substances and"
+                    " export_names. Using the name from export_names.\n"
+                    f" - export_names (used): {export_names[substance]}\n"
+                    f" - df_substances: {export_name_df}"
+                )
+                continue
+            export_names[substance] = export_name_df
+
     for substance in substances:
+        if not substance:
+            continue
 
         export_name = export_names.get(substance, substance)
 
@@ -234,12 +275,12 @@ def read_empaqatool(file_path: Path, shift: timedelta | None = None) -> pd.DataF
         dt += shift
     columns[("-", "datetime")] = dt
 
-    # Last column is empty
-    compounds = [ '-'.join(s[:-1]) for col in df.columns if len(s:=col.split("-")) >= 2]
-
-
-    for compound in compounds:
+    # Last column is empty
+    compounds = [
+        "-".join(s[:-1]) for col in df.columns if len(s := col.split("-")) >= 2
+    ]
 
+    for compound in compounds:
 
         flag_col = f"{compound}-flag"
         value_col = f"{compound}-value"
@@ -248,8 +289,8 @@ def read_empaqatool(file_path: Path, shift: timedelta | None = None) -> pd.DataF
 
         mapping = {
             "conc": value_col,
-            "u_expanded":acc_col,
-            "u_precision":precision_col,
+            "u_expanded": acc_col,
+            "u_precision": precision_col,
         }
 
         flag_values = (pd.to_numeric(df[flag_col]) * 1e3).astype(int).mod(1000)
@@ -263,10 +304,10 @@ def read_empaqatool(file_path: Path, shift: timedelta | None = None) -> pd.DataF
             serie = pd.to_numeric(df[value])
             mask_nan = flags == QA_Flag.MISSING.value
             serie[mask_nan] = np.nan
-            columns[(compound, key)] = serie 
+            columns[(compound, key)] = serie
 
         columns[(compound, "flag")] = flags
-        
+
         mask_nan = columns[(compound, "conc")].isna()
         columns[(compound, "flag")][mask_nan] |= QA_Flag.MISSING.value
 
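Taken together, the new `df_substances` argument filters and renames exported columns. A minimal usage sketch mirroring the tests added in this release (the station name and output directory here are illustrative):

from datetime import timedelta
from pathlib import Path

import pandas as pd

from avoca.bindings.qa_tool import export_EmpaQATool
from avoca.testing.df import simple_df  # small test frame with (compound, variable) columns

df = simple_df.copy()
df[("compA", "flag")] = 0
df[("compB", "flag")] = 0
df[("-", "datetime")] = pd.date_range("2025-01-01", periods=len(df), freq="h")

df_substances = pd.DataFrame(
    {
        "substance": ["compA", "compB"],
        "export": [True, False],  # compB is dropped from the export
        "export_name": ["CustomCompA", "CustomCompB"],
    }
).set_index("substance")

out_file = export_EmpaQATool(
    df,
    Path("exports"),
    datetime_offsets=(timedelta(minutes=-5), timedelta(minutes=0)),
    station="TEST",
    df_substances=df_substances,
)
print(out_file)  # path of the generated ;-separated file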
@@ -20,6 +20,7 @@ class AssignerManager:
     _assigners_importpath = {
         "RetentionTimeChecker": "avoca.qa_class.rt",
         "ExtremeValues": "avoca.qa_class.zscore",
+        "RollingWindow": "avoca.qa_class.rolling",
         "ExtremeConcentrations": "avoca.qa_class.concs",
         "XY_Correlations": "avoca.qa_class.zscore",
         "TestAssigner": "avoca.qa_class.test",
@@ -69,46 +69,74 @@ def plot_yearly_plotly(
     df: pd.DataFrame,
     compound: str,
     df_new: pd.DataFrame | None = None,
+    opacity: float = 0.5,
+    size: int = 6,
 ) -> "plotly.graph_objs._figure.Figure":
     """Plot yearly data using plotly."""
     import plotly.express as px
     import plotly.graph_objects as go
 
     dt_column = ("-", "datetime")
-    serie = df[(compound, "conc")]
-    dt = df[dt_column]
+
     if ("-", "type") in df.columns:
         mask_air = df[("-", "type")] == "air"
-        serie = serie[mask_air]
-        dt = dt[mask_air]
-    if ("-", "type") in df_new.columns:
+        df = df[mask_air]
+    if df_new is not None and ("-", "type") in df_new.columns:
         mask_air_new = df_new[("-", "type")] == "air"
         df_new = df_new[mask_air_new]
 
+    dt = df[dt_column]
     x = dt.dt.day_of_year + dt.dt.hour / 24.0
-    df_to_plot = pd.DataFrame(
-        {
-            "conc": serie.values,
-            "year": dt.dt.year.values,
-        },
-        index=x.values,
-    )
-    # Break down by year, to have year as columns and conc as values
-    df_to_plot = df_to_plot.pivot_table(
-        index=df_to_plot.index, columns="year", values="conc"
-    )
-    fig = px.scatter(df_to_plot)
+
+    fig = go.Figure()
+
+    hover_template = "Timestamp: %{text}<br>Conc: %{y:.2f} ppt"
+
+    kwargs = {
+        "mode": "markers",
+        "opacity": opacity,
+        "marker": dict(size=size),
+        "hovertemplate": hover_template,
+    }
+
+    if (compound, "conc") in df:
+        serie = df[(compound, "conc")]
+        df_to_plot = pd.DataFrame(
+            {
+                "conc": serie.values,
+                "year": dt.dt.year.values,
+            },
+            index=x.values,
+        )
+        # Break down by year, to have year as columns and conc as values
+        df_to_plot = df_to_plot.pivot_table(
+            index=df_to_plot.index, columns="year", values="conc"
+        )
+        for year in df_to_plot.columns:
+            fig.add_trace(
+                go.Scatter(
+                    x=df_to_plot.index,
+                    y=df_to_plot[year],
+                    name=str(year),
+                    zorder=-year,
+                    text=dt[dt.dt.year == year].dt.strftime("%y%m%d.%H%M"),
+                    **kwargs,
+                )
+            )
+
     x_values = pd.date_range(start="2024-01-01", end="2024-12-31", freq="MS")
 
-    dt_new = df_new[dt_column]
-    fig.add_trace(
-        go.Scatter(
-            x=dt_new.dt.dayofyear + dt_new.dt.hour / 24.0,
-            y=df_new[(compound, "conc")],
-            mode="markers",
-            name="New Data",
+    if df_new is not None and (compound, "conc") in df_new:
+        dt_new = df_new[dt_column]
+        fig.add_trace(
+            go.Scatter(
+                x=dt_new.dt.dayofyear + dt_new.dt.hour / 24.0,
+                y=df_new[(compound, "conc")],
+                name="New Data",
+                text=dt_new.dt.strftime("%y%m%d.%H%M"),
+                **kwargs,
+            )
         )
-    )
     fig.update_layout(
         xaxis_title="Time of Year",
         yaxis_title=f"{compound} (ppt)",
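A short usage sketch of the reworked plot, with synthetic data in the (compound, variable) column layout the function expects (the compound name and values below are made up):

import numpy as np
import pandas as pd

from avoca.plots import plot_yearly_plotly

rng = np.random.default_rng(0)
df = pd.DataFrame(
    {
        ("-", "datetime"): pd.date_range("2022-01-01", periods=300, freq="D"),
        ("ethane", "conc"): rng.lognormal(size=300),
    }
)
# One trace per year; hover text shows the timestamp and concentration
fig = plot_yearly_plotly(df, "ethane", opacity=0.4, size=4)
fig.show()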
@@ -49,6 +49,7 @@ class AbstractQA_Assigner(ABC):
     flag: QA_Flag
     runtypes: list[str] | None
     required_packages: list[PythonPackageRequirement] | None = None
+    require_datetime_index: bool = False
 
     # Options that can be set by the user
     name: str
@@ -142,6 +143,14 @@ class AbstractQA_Assigner(ABC):
                 f"Please check the data and the settings for {self.name}"
             )
 
+        if self.require_datetime_index:
+            if not isinstance(df.index, pd.DatetimeIndex):
+                raise ValueError(
+                    f"Assigner {self} requires a DatetimeIndex but the dataframe"
+                    " does not have one. \n "
+                    f"Please check the data and the settings for {self.name}"
+                )
+
     @abstractmethod
     def fit(self, df: pd.DataFrame):
         """Fit the QA assigner on some data.
@@ -0,0 +1,136 @@
+"""Quality assurance based on statistical methods."""
+
+from __future__ import annotations
+
+from datetime import timedelta
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+
+from avoca.qa_class.zscore import ExtremeValues
+
+if TYPE_CHECKING:
+
+    from avoca.utils.torch_models import MultipleRegressionModel
+
+
+class RollingWindow(ExtremeValues):
+    """Detect outliers in rolling windows.
+
+    The method flags outliers in a rolling window using the median and standard deviation.
+    The training is done directly on the fitted data.
+
+    :param variable: The variable to check for extreme values.
+    :param threshold: The z-score threshold used to flag values.
+    :param use_log_normal: If True, the log of the values will be used to calculate the z-score.
+        This can be useful if the values are log-normally distributed.
+    :param only_greater: If True, only values greater than the threshold will be flagged.
+        Values lower than the negative threshold will not be flagged.
+        By default, this is True if use_log_normal is True, and False otherwise.
+    :param rolling_window: The size of the rolling window as a `timedelta` object.
+        See the `window` parameter in the pandas documentation for more details.
+        https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html#pandas-dataframe-rolling
+    """
+
+    require_datetime_index = True
+
+    rolling_window: timedelta
+
+    def __init__(
+        self,
+        *args,
+        rolling_window: timedelta = timedelta(days=7),
+        threshold: float = 1.5,
+        **kwargs,
+    ):
+        super().__init__(*args, threshold=threshold, **kwargs)
+        self.rolling_window = rolling_window
+
+    def fit(self, df: pd.DataFrame):
+
+        self.check_columns_or_raise(df, columns=self._stats_columns)
+
+        self.df_train = df[self._stats_columns]
+
+    def assign(self, df: pd.DataFrame) -> dict[str, pd.Index]:
+        df = df[self._stats_columns]
+        df = self._clean_data(df)
+        if self.use_log_normal:
+            # Replace <=0 with NaN
+            df = df.where(df > 0, np.nan)
+            df = df.map(lambda x: np.log(x))
+
+        rolling = df.rolling(window=self.rolling_window)
+        means = rolling.median()
+        stds = rolling.std()
+
+        self.rolling_median = means
+        self.rolling_std = stds
+
+        thresholds = means + stds * self.threshold
+
+        df_fail = df > thresholds
+        if not self.only_greater:
+            df_fail = df_fail | (df < (means - stds * self.threshold))
+
+        out_dict = {}
+        for compound in self.compounds:
+            col = (compound, self.variable)
+            this_c_fail = df_fail[col]
+            out_dict[compound] = this_c_fail.loc[this_c_fail].index
+
+        return out_dict
+
+    def plot(self):
+
+        import matplotlib.pyplot as plt
+
+        fig, axes = plt.subplots(
+            len(self.compounds), 1, figsize=(6, 3 * len(self.compounds)), sharex=True
+        )
+
+        x = self.dt if hasattr(self, "dt") else self.df_train.index
+        x = pd.Series(x, index=self.df_train.index)
+
+        outliers = self.assign(self.df_train)
+
+        for i, compound in enumerate(self.compounds):
+            ax = axes[i]
+            col = (compound, self.variable)
+            ax.scatter(
+                x,
+                self.df_train[col],
+                s=1,
+                color="darkblue",
+            )
+            median = self.rolling_median[col]
+            std = self.rolling_std[col]
+            top, bottom = median + std * self.threshold, median - std * self.threshold
+
+            ax.fill_between(
+                x,
+                top,
+                bottom,
+                color="lightgray",
+                label="Rolling threshold",
+                alpha=0.5,
+            )
+
+            outlier_indices = outliers[compound]
+            ax.scatter(
+                x.loc[outlier_indices],
+                self.df_train.loc[outlier_indices, col],
+                s=10,
+                marker="x",
+                color="red",
+                label="Extreme values",
+            )
+            ax.set_title(
+                f"{compound} +- {self.threshold} std",
+                # Under the top line
+                y=0.8,
+            )
+            ax.tick_params(axis="x", rotation=25)
+
+        return fig, axes
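A short end-to-end sketch of the new assigner (the window size and threshold are illustrative; the data helpers come from `avoca.testing`):

from datetime import timedelta

import avoca.testing.df as df_test
from avoca.qa_class.rolling import RollingWindow
from avoca.testing.utils import make_dt_index

assigner = RollingWindow(
    variable="test_var",
    compounds=["compA", "compB"],
    rolling_window=timedelta(days=3),
    threshold=1.5,
)

df = make_dt_index(df_test.df_one_extreme)
assigner.fit(df)               # stores the data as df_train
flagged = assigner.assign(df)  # dict: compound -> DatetimeIndex of outliers
fig, axes = assigner.plot()    # scatter plus shaded rolling-threshold band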
@@ -10,6 +10,7 @@ import numpy as np
 import pandas as pd
 
 empty_index = pd.Index([], dtype="int64")
+empty_index_dt = pd.DatetimeIndex([])
 
 simple_df = pd.DataFrame(
     np.ones((2, 4)),
@@ -0,0 +1,9 @@
+import pandas as pd
+
+
+def make_dt_index(df: pd.DataFrame | pd.Index) -> pd.DataFrame | pd.Index:
+    """Create a datetime index for the dataframe."""
+    index = pd.date_range(start="2023-01-01", periods=len(df), freq="h")
+    if isinstance(df, pd.Index):
+        return index
+    return df.set_index(index)
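This helper backs the DatetimeIndex-aware tests further down: it swaps in an hourly index starting 2023-01-01, or returns such an index directly when given a plain `pd.Index`. A quick demonstration:

import pandas as pd

from avoca.testing.utils import make_dt_index

df = pd.DataFrame({"a": [1, 2, 3]})
print(make_dt_index(df).index[0])       # 2023-01-01 00:00:00
print(make_dt_index(pd.Index([0, 1])))  # DatetimeIndex with two hourly stamps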
@@ -13,7 +13,7 @@ def compounds_from_df(df: pd.DataFrame) -> list[str]:
     Returns:
         The compounds in the dataframe.
     """
-    return [c for c in df.columns.get_level_values(0).unique() if c != "-"]
+    return [c for c in df.columns.get_level_values(0).unique() if c not in ["-", ""]]
 
 
 def runtypes_from_df(df: pd.DataFrame) -> list[str]:
@@ -56,6 +56,9 @@ def parse_enum_comments(filepath: Path, enum_class_name: str) -> dict[Enum, str]
         exec(code, module)
     enum_cls = module[enum_class_name]
     for name, comment in comment_dict.items():
+        if not hasattr(enum_cls, name):
+            # Probably somewhere else in the file
+            continue
         enum_member = getattr(enum_cls, name)
         enum_obj[enum_member] = comment
 
@@ -137,6 +137,7 @@
    "source": [
     "from avoca.qa_class.concs import ExtremeConcentrations\n",
     "from avoca.qa_class.zscore import XY_Correlations, Multiple_XY_Correlations\n",
+    "from avoca.qa_class.rolling import RollingWindow\n",
     "\n",
     "\n",
     "# Create assingers for each compound\n",
@@ -145,10 +146,13 @@
     "    XY_Correlations(\n",
     "        compounds=[\"ethane\", \"propane\", \"n-butane\"], variable=\"C\", threshold=4.0\n",
     "    ),\n",
-    "    # xy_benzene_toluene := XY_Correlations(compounds=[\"benzene\", \"toluene\"], variable=\"C\"),\n",
-    "    multiple_assigner := Multiple_XY_Correlations(\n",
-    "        number_of_regression=3, compounds=[\"benzene\", \"toluene\"], variable=\"C\"\n",
+    "    rolling_window := RollingWindow(\n",
+    "        compounds=compounds, variable=\"C\"\n",
     "    ),\n",
+    "    # xy_benzene_toluene := XY_Correlations(compounds=[\"benzene\", \"toluene\"], variable=\"C\"),\n",
+    "    # multiple_assigner := Multiple_XY_Correlations(\n",
+    "    #     number_of_regression=3, compounds=[\"benzene\", \"toluene\"], variable=\"C\"\n",
+    "    # ),\n",
    "]"
   ]
  },
@@ -329,6 +333,15 @@
    "fig.patch.set_alpha(0)\n"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "rolling_window.plot()"
+  ]
+ },
 {
  "cell_type": "code",
  "execution_count": null,
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
5
5
 
6
6
  [project]
7
7
  name = "avoca"
8
- version = "0.12.0"
8
+ version = "0.15.0"
9
9
  authors = [
10
10
  { name="Lionel Constantin", email="lionel.constantin@empa.ch" },
11
11
  ]
@@ -0,0 +1,216 @@
+from datetime import timedelta
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from avoca.bindings.qa_tool import export_EmpaQATool
+from avoca.testing import testdata_dir
+from avoca.testing.df import invalids_df, simple_df
+
+export_path = testdata_dir / "export_empa_qa_tool"
+
+
+@pytest.mark.parametrize(
+    "df, name",
+    [
+        (simple_df, "simple"),
+        (invalids_df, "invalids"),
+    ],
+)
+def test_export_EmpaQATool(df, name):
+    """Test the export_EmpaQATool function."""
+
+    # Create a test dataframe
+    df = df.copy()
+    df[("compA", "flag")] = 0
+    df[("compB", "flag")] = 0
+
+    df[("-", "datetime")] = pd.date_range(start="2025-01-01", periods=len(df), freq="h")
+
+    # Export the dataframe to a file
+    export_file = export_EmpaQATool(
+        df,
+        export_path,
+        datetime_offsets=(timedelta(minutes=-5), timedelta(minutes=0)),
+        station=name,
+    )
+
+    # Check that the file is created
+    assert Path(export_file).is_file()
+
+    # Read the file and check that the data is correct
+    df_exported = pd.read_csv(
+        export_file,
+        sep=";",
+    )
+    assert len(df_exported) == len(df)
+    # Check that the 'compB-Value' column is of float dtype
+    assert pd.api.types.is_float_dtype(df_exported["compB-Value"])
+    assert not pd.isna(df_exported["compB-Value"]).any(), "NAN values must be 999..."
+
+
+def _prepare_df_for_export(df: pd.DataFrame) -> pd.DataFrame:
+    """Prepare a dataframe for export testing."""
+    df = df.copy()
+    df[("compA", "flag")] = 0
+    df[("compB", "flag")] = 0
+    df[("-", "datetime")] = pd.date_range(start="2025-01-01", periods=len(df), freq="h")
+    df[("-", "datetime_start")] = df[("-", "datetime")] - timedelta(minutes=5)
+    df[("-", "datetime_end")] = df[("-", "datetime")] + timedelta(minutes=0)
+    return df
+
+
+def test_export_names_dict():
+    """Test that export names from the export_names dict are used."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        export_names={"compA": "CustomCompA", "compB": "CustomCompB"},
+        station="TEST_DICT",
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "CustomCompA-Value" in df_exported.columns
+    assert "CustomCompB-Value" in df_exported.columns
+    assert "compA-Value" not in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+
+
+def test_export_names_df():
+    """Test that export names from df_substances are used."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        station="TEST_NAMES_DF",
+        df_substances=pd.DataFrame(
+            {
+                "substance": ["compA", "compB"],
+                "export_name": ["CustomCompA", "CustomCompB"],
+            }
+        ).set_index("substance"),
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "CustomCompA-Value" in df_exported.columns
+    assert "CustomCompB-Value" in df_exported.columns
+    assert "compA-Value" not in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+
+
+def test_both_export_names_warns(caplog):
+    """Test that a warning is emitted when a name is given in both places."""
+
+    with caplog.at_level("WARNING"):
+        out_file = export_EmpaQATool(
+            _prepare_df_for_export(simple_df),
+            export_path,
+            station="TEST_BOTH_WARN",
+            export_names={"compA": "CustomCompA", "compB": "CustomCompB"},
+            df_substances=pd.DataFrame(
+                {
+                    "substance": ["compA", "compB"],
+                    "export_name": ["WrongCompA", "CustomCompB"],
+                }
+            ).set_index("substance"),
+        )
+
+    assert (
+        "Substance compA found in both df_substances and export_names." in caplog.text
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "CustomCompA-Value" in df_exported.columns
+    assert "CustomCompB-Value" in df_exported.columns
+    assert "compA-Value" not in df_exported.columns
+    assert "WrongCompA-Value" not in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+
+
+def test_export_no_export_substances():
+    """Test that substances with export=False in df_substances are not exported."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        station="TEST_NO_EXPORT_SUBSTANCES",
+        df_substances=pd.DataFrame(
+            {
+                "substance": ["compA", "compB"],
+                "export": [True, False],
+            }
+        ).set_index("substance"),
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "compA-Value" in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+
+
+def test_export_if_not_in_df_substances():
+    """Test that substances not listed in df_substances are still exported."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        station="TEST_IF_NOT_IN_DF_SUBSTANCES",
+        df_substances=pd.DataFrame(
+            {
+                "substance": ["compA"],
+                "export": [True],
+            }
+        ).set_index("substance"),
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "compA-Value" in df_exported.columns
+    assert "compB-Value" in df_exported.columns
+
+
+def test_export_and_rename_in_df_substances():
+    """Test that export and export_name can be combined in df_substances."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        station="TEST_EXPORT_AND_RENAME_IN_DF_SUBSTANCES",
+        df_substances=pd.DataFrame(
+            {
+                "substance": ["compA", "compB"],
+                "export_name": ["CustomCompA", "CustomCompB"],
+                "export": [True, False],
+            }
+        ).set_index("substance"),
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "CustomCompA-Value" in df_exported.columns
+    assert "compA-Value" not in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+    assert "CustomCompB-Value" not in df_exported.columns
@@ -3,21 +3,18 @@
 import pandas as pd
 import pytest
 
+from avoca.manager import AssignerManager
 from avoca.qa_class.abstract import AbstractQA_Assigner
 from avoca.qa_class.invalid import InvalidValues
 from avoca.qa_class.zscore import ExtremeValues, XY_Correlations
-from avoca.testing.df import (
-    df_around_zero,
-    df_full_nan,
-    df_nan_training,
-    df_one_extreme,
-    df_regular,
-    df_with_inf,
-    empty_index,
-)
+from avoca.qa_class.rolling import RollingWindow
+import avoca.testing.df as df_test
+from avoca.testing.utils import make_dt_index
 
 index_all_3 = pd.Index([0, 1, 2], dtype="int64")
+index_all_3_dt = make_dt_index(index_all_3)
 index_2 = pd.Index([2], dtype="int64")
+index_2_dt = index_all_3_dt[index_2]
 
 
 @pytest.fixture(
@@ -36,6 +33,8 @@ index_2 = pd.Index([2], dtype="int64")
             "name": "invalid_zeros_and_negative",
         },
     ),
+    (RollingWindow, {"rolling_window": pd.Timedelta(days=3)}),
+    (RollingWindow, {}),
     ]
 )
 def assigner(
@@ -46,27 +45,66 @@ def assigner(
     return assigner_type(variable="test_var", compounds=["compA", "compB"], **kwargs)
 
 
+def test_is_in_documentation(assigner: AbstractQA_Assigner):
+    """Test the assigner will appear in the documentation."""
+
+    assert type(assigner).__name__ in AssignerManager._assigners_importpath
+
+
 def test_simple(assigner: AbstractQA_Assigner):
+
+    df_one_extreme = df_test.df_one_extreme
+    df_regular = df_test.df_regular
+    if assigner.require_datetime_index:
+        df_one_extreme = make_dt_index(df_one_extreme)
+        df_regular = make_dt_index(df_regular)
+
     assigner.fit(df_regular)
     flagged = assigner.assign(df_one_extreme)
 
+    empty_index = (
+        df_test.empty_index
+        if not assigner.require_datetime_index
+        else df_test.empty_index_dt
+    )
+
     comparison_output_a = {
         InvalidValues: empty_index,
+        RollingWindow: index_2_dt,
     }
     comparison_output_b = {
-        ExtremeValues: empty_index,
         # Also b is outside of the correlation cloud
         XY_Correlations: index_2,
-        InvalidValues: empty_index,
     }
 
     pd.testing.assert_index_equal(
        flagged["compA"], comparison_output_a.get(type(assigner), index_2)
     )
-    pd.testing.assert_index_equal(flagged["compB"], comparison_output_b[type(assigner)])
+    pd.testing.assert_index_equal(
+        flagged["compB"], comparison_output_b.get(type(assigner), empty_index)
+    )
+
+
+def test_input_dataframe_dt_index(assigner: AbstractQA_Assigner):
+
+    df_regular = df_test.df_regular
+
+    if assigner.require_datetime_index:
+        with pytest.raises(ValueError, match="requires a DatetimeIndex"):
+            assigner.fit(df_regular)
 
 
 def test_nan_values_given_fit(assigner: AbstractQA_Assigner):
+
+    df_nan_training = df_test.df_nan_training
+    df_regular = df_test.df_regular
+    empty_index = df_test.empty_index
+
+    if assigner.require_datetime_index:
+        df_nan_training = make_dt_index(df_nan_training)
+        df_regular = make_dt_index(df_regular)
+        empty_index = df_test.empty_index_dt
+
     assigner.fit(df_nan_training)
     flagged = assigner.assign(df_regular)
 
@@ -76,6 +114,15 @@ def test_nan_values_given_fit(assigner: AbstractQA_Assigner):
 
 
 def test_only_nan_values_given_fit(assigner: AbstractQA_Assigner):
+
+    df_full_nan = df_test.df_full_nan
+    df_regular = df_test.df_regular
+    empty_index = df_test.empty_index
+    if assigner.require_datetime_index:
+        df_full_nan = make_dt_index(df_full_nan)
+        df_regular = make_dt_index(df_regular)
+        empty_index = df_test.empty_index_dt
+
     assigner.fit(df_full_nan)
     flagged = assigner.assign(df_regular)
 
@@ -85,11 +132,19 @@ def test_only_nan_values_given_fit(assigner: AbstractQA_Assigner):
 
 
 def test_fitting_nans(assigner: AbstractQA_Assigner):
+    df_regular = df_test.df_regular
+    df_nan_training = df_test.df_nan_training
+    df_full_nan = df_test.df_full_nan
+    empty_index = df_test.empty_index
+    if assigner.require_datetime_index:
+        df_regular = make_dt_index(df_regular)
+        df_nan_training = make_dt_index(df_nan_training)
+        df_full_nan = make_dt_index(df_full_nan)
+        empty_index = df_test.empty_index_dt
     assigner.fit(df_regular)
 
     flagged = assigner.assign(df_nan_training)
     flagged_allnans = assigner.assign(df_full_nan)
-
     comparison_output_a_one_nan = {
         InvalidValues: index_2,
     }
@@ -99,7 +154,8 @@ def test_fitting_nans(assigner: AbstractQA_Assigner):
 
     # Nothing should be flagged
     pd.testing.assert_index_equal(
-        flagged["compA"], comparison_output_a_one_nan.get(type(assigner), empty_index)
+        flagged["compA"],
+        comparison_output_a_one_nan.get(type(assigner), empty_index),
     )
     pd.testing.assert_index_equal(flagged["compB"], empty_index)
     pd.testing.assert_index_equal(
@@ -112,6 +168,12 @@ def test_fitting_nans(assigner: AbstractQA_Assigner):
 def test_zero_values(assigner: AbstractQA_Assigner):
     """Test that zero values are not flagged."""
 
+    df_around_zero = df_test.df_around_zero
+    empty_index = df_test.empty_index
+    if assigner.require_datetime_index:
+        df_around_zero = make_dt_index(df_around_zero)
+        empty_index = df_test.empty_index_dt
+
     assigner.fit(df_around_zero)
     flagged = assigner.assign(df_around_zero)
 
@@ -138,6 +200,12 @@ def test_zero_values(assigner: AbstractQA_Assigner):
 def test_inf_values(assigner: AbstractQA_Assigner):
     """Test that inf values are flagged."""
 
+    df_with_inf = df_test.df_with_inf
+    empty_index = df_test.empty_index
+    if assigner.require_datetime_index:
+        df_with_inf = make_dt_index(df_with_inf)
+        empty_index = df_test.empty_index_dt
+
     assigner.fit(df_with_inf)
     flagged = assigner.assign(df_with_inf)
 
@@ -1,49 +0,0 @@
-from datetime import timedelta
-from pathlib import Path
-
-import pandas as pd
-import pytest
-
-from avoca.bindings.qa_tool import export_EmpaQATool
-from avoca.testing import testdata_dir
-from avoca.testing.df import invalids_df, simple_df
-
-
-@pytest.mark.parametrize(
-    "df, name",
-    [
-        (simple_df, "simple"),
-        (invalids_df, "invalids"),
-    ],
-)
-def test_export_EmpaQATool(df, name):
-    """Test the export_EmpaQATool function."""
-
-    # Create a test dataframe
-    df = df.copy()
-    df[("compA", "flag")] = 0
-    df[("compB", "flag")] = 0
-
-    df[("-", "datetime")] = pd.date_range(start="2025-01-01", periods=len(df), freq="h")
-
-    # Export the dataframe to a file
-    export_path = testdata_dir / "export_empa_qa_tool"
-    export_file = export_EmpaQATool(
-        df,
-        export_path,
-        datetime_offsets=(timedelta(minutes=-5), timedelta(minutes=0)),
-        station=name,
-    )
-
-    # Check that the file is created
-    assert Path(export_file).is_file()
-
-    # Read the file and check that the data is correct
-    df_exported = pd.read_csv(
-        export_file,
-        sep=";",
-    )
-    assert len(df_exported) == len(df)
-    # Check that the 'compB-Value' column is of float dtype
-    assert pd.api.types.is_float_dtype(df_exported["compB-Value"])
-    assert not pd.isna(df_exported["compB-Value"]).any(), "NAN values must be 999..."