avoca 0.11.4__tar.gz → 0.14.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. {avoca-0.11.4 → avoca-0.14.0}/PKG-INFO +1 -1
  2. {avoca-0.11.4 → avoca-0.14.0}/avoca/bindings/ebas.py +16 -3
  3. {avoca-0.11.4 → avoca-0.14.0}/avoca/bindings/ebas_flags.py +3 -9
  4. {avoca-0.11.4 → avoca-0.14.0}/avoca/bindings/qa_tool.py +65 -1
  5. {avoca-0.11.4 → avoca-0.14.0}/avoca/flags.py +8 -0
  6. avoca-0.14.0/avoca/plots.py +146 -0
  7. {avoca-0.11.4 → avoca-0.14.0}/avoca/qa_class/abstract.py +9 -0
  8. avoca-0.14.0/avoca/qa_class/rolling.py +133 -0
  9. {avoca-0.11.4 → avoca-0.14.0}/avoca/testing/df.py +1 -0
  10. avoca-0.14.0/avoca/testing/utils.py +9 -0
  11. {avoca-0.11.4 → avoca-0.14.0}/examples/data_qa.ipynb +16 -3
  12. {avoca-0.11.4 → avoca-0.14.0}/pyproject.toml +1 -1
  13. {avoca-0.11.4 → avoca-0.14.0}/tests/test_assigners.py +75 -14
  14. {avoca-0.11.4 → avoca-0.14.0}/.gitignore +0 -0
  15. {avoca-0.11.4 → avoca-0.14.0}/.gitlab-ci.yml +0 -0
  16. {avoca-0.11.4 → avoca-0.14.0}/.readthedocs.yaml +0 -0
  17. {avoca-0.11.4 → avoca-0.14.0}/.vscode/settings.json +0 -0
  18. {avoca-0.11.4 → avoca-0.14.0}/LICENCE.txt +0 -0
  19. {avoca-0.11.4 → avoca-0.14.0}/README.md +0 -0
  20. {avoca-0.11.4 → avoca-0.14.0}/avoca/__init__.py +0 -0
  21. {avoca-0.11.4 → avoca-0.14.0}/avoca/bindings/__init__.py +0 -0
  22. {avoca-0.11.4 → avoca-0.14.0}/avoca/bindings/gcwerks-report.conf +0 -0
  23. {avoca-0.11.4 → avoca-0.14.0}/avoca/bindings/gcwerks.py +0 -0
  24. {avoca-0.11.4 → avoca-0.14.0}/avoca/bindings/gcwerks_gui.py +0 -0
  25. {avoca-0.11.4 → avoca-0.14.0}/avoca/bindings/nabel.py +0 -0
  26. {avoca-0.11.4 → avoca-0.14.0}/avoca/bindings/synspec.py +0 -0
  27. {avoca-0.11.4 → avoca-0.14.0}/avoca/export_nas.py +0 -0
  28. {avoca-0.11.4 → avoca-0.14.0}/avoca/flagging.py +0 -0
  29. {avoca-0.11.4 → avoca-0.14.0}/avoca/io.py +0 -0
  30. {avoca-0.11.4 → avoca-0.14.0}/avoca/logging.py +0 -0
  31. {avoca-0.11.4 → avoca-0.14.0}/avoca/manager.py +0 -0
  32. {avoca-0.11.4 → avoca-0.14.0}/avoca/qa_class/__init__.py +0 -0
  33. {avoca-0.11.4 → avoca-0.14.0}/avoca/qa_class/concs.py +0 -0
  34. {avoca-0.11.4 → avoca-0.14.0}/avoca/qa_class/generate_classes_doc.py +0 -0
  35. {avoca-0.11.4 → avoca-0.14.0}/avoca/qa_class/invalid.py +0 -0
  36. {avoca-0.11.4 → avoca-0.14.0}/avoca/qa_class/rt.py +0 -0
  37. {avoca-0.11.4 → avoca-0.14.0}/avoca/qa_class/test.py +0 -0
  38. {avoca-0.11.4 → avoca-0.14.0}/avoca/qa_class/zscore.py +0 -0
  39. {avoca-0.11.4 → avoca-0.14.0}/avoca/requirements.py +0 -0
  40. {avoca-0.11.4 → avoca-0.14.0}/avoca/settings.py +0 -0
  41. {avoca-0.11.4 → avoca-0.14.0}/avoca/testing/__init__.py +0 -0
  42. {avoca-0.11.4 → avoca-0.14.0}/avoca/utils/__init__.py +0 -0
  43. {avoca-0.11.4 → avoca-0.14.0}/avoca/utils/flags_doc.py +0 -0
  44. {avoca-0.11.4 → avoca-0.14.0}/avoca/utils/torch_models.py +0 -0
  45. {avoca-0.11.4 → avoca-0.14.0}/data/.avoca/config.yaml +0 -0
  46. {avoca-0.11.4 → avoca-0.14.0}/data/CH0001G.20240219123300.20240307132229.online_gc.NMHC.air.16d.61mn.CH01L_Agilent_GC-MS-MEDUSA_Medusa-12_JFJ.CH01L_gc_ms.lev0.nas +0 -0
  47. {avoca-0.11.4 → avoca-0.14.0}/data/tests/missing_area_cols.csv +0 -0
  48. {avoca-0.11.4 → avoca-0.14.0}/data/voc_jan2jun_2023.csv +0 -0
  49. {avoca-0.11.4 → avoca-0.14.0}/docs/Makefile +0 -0
  50. {avoca-0.11.4 → avoca-0.14.0}/docs/make.bat +0 -0
  51. {avoca-0.11.4 → avoca-0.14.0}/docs/source/bindings/ebas.md +0 -0
  52. {avoca-0.11.4 → avoca-0.14.0}/docs/source/bindings/gcwerks.md +0 -0
  53. {avoca-0.11.4 → avoca-0.14.0}/docs/source/bindings/index.rst +0 -0
  54. {avoca-0.11.4 → avoca-0.14.0}/docs/source/bindings/qa_tool.md +0 -0
  55. {avoca-0.11.4 → avoca-0.14.0}/docs/source/conf.py +0 -0
  56. {avoca-0.11.4 → avoca-0.14.0}/docs/source/index.rst +0 -0
  57. {avoca-0.11.4 → avoca-0.14.0}/docs/source/quickstart.ipynb +0 -0
  58. {avoca-0.11.4 → avoca-0.14.0}/examples/config.yaml +0 -0
  59. {avoca-0.11.4 → avoca-0.14.0}/examples/convert_synspec_to_gcwerks.py +0 -0
  60. {avoca-0.11.4 → avoca-0.14.0}/examples/data_qa_gcwerks.ipynb +0 -0
  61. {avoca-0.11.4 → avoca-0.14.0}/examples/export_gc_werks.py +0 -0
  62. {avoca-0.11.4 → avoca-0.14.0}/examples/export_gc_werks_secondary_peaks.py +0 -0
  63. {avoca-0.11.4 → avoca-0.14.0}/examples/get_tanks.ipynb +0 -0
  64. {avoca-0.11.4 → avoca-0.14.0}/examples/read_nas.ipynb +0 -0
  65. {avoca-0.11.4 → avoca-0.14.0}/tests/bindings/gcwerks.dat +0 -0
  66. {avoca-0.11.4 → avoca-0.14.0}/tests/bindings/test_gcwerks.py +0 -0
  67. {avoca-0.11.4 → avoca-0.14.0}/tests/bindings/test_qatool.py +0 -0
  68. {avoca-0.11.4 → avoca-0.14.0}/tests/test_flagging.py +0 -0
  69. {avoca-0.11.4 → avoca-0.14.0}/tests/test_io.py +0 -0
  70. {avoca-0.11.4 → avoca-0.14.0}/tests/test_manager.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: avoca
3
- Version: 0.11.4
3
+ Version: 0.14.0
4
4
  Summary: @voc@: Quality assessement of measurement data
5
5
  Project-URL: Homepage, https://gitlab.com/empa503/atmospheric-measurements/avoca
6
6
  Project-URL: Bug Tracker, https://gitlab.com/empa503/atmospheric-measurements/avoca/-/issues
@@ -171,7 +171,6 @@ def set_dataframe(
171
171
  )
172
172
  )
173
173
 
174
-
175
174
  this_nan_flags = nan_flags.copy()
176
175
 
177
176
  if data_level in concs_data_levels and invalidate_conc_calib:
@@ -188,7 +187,9 @@ def set_dataframe(
188
187
  )
189
188
  for flag in flag_col
190
189
  ]
191
- nan_flag = np.logical_or.reduce([flag_col & flag.value for flag in this_nan_flags])
190
+ nan_flag = np.logical_or.reduce(
191
+ [flag_col & flag.value for flag in this_nan_flags]
192
+ )
192
193
 
193
194
  for var in vars_to_export[data_level]:
194
195
  ebas_name = compounds[sub]
@@ -199,6 +200,16 @@ def set_dataframe(
199
200
  for val, isnan in zip(serie_to_export, nan_flag)
200
201
  ]
201
202
 
203
+ if var == "conc_calib":
204
+ # Invalidate calibration concentration for non-calibration samples
205
+ this_flags = [
206
+ flags_ebas
207
+ + ([] if (QA_Flag.CALIBRATION.value & flag_avoca) else [980])
208
+ for flags_ebas, flag_avoca in zip(flags, flag_col)
209
+ ]
210
+ else:
211
+ this_flags = flags
212
+
202
213
  metadata = DataObject()
203
214
  metadata.comp_name = (
204
215
  f"{ebas_name}_{ebas_compname_of_var[var]}"
@@ -214,7 +225,9 @@ def set_dataframe(
214
225
  metadata.matrix = "air"
215
226
  # add the variable
216
227
  nas.variables.append(
217
- DataObject(values_=values, flags=flags, flagcol=True, metadata=metadata)
228
+ DataObject(
229
+ values_=values, flags=this_flags, flagcol=True, metadata=metadata
230
+ )
218
231
  )
219
232
 
220
233
  if var == "conc_calib":
@@ -1,5 +1,5 @@
1
- # https://projects.nilu.no/ccc/flags/flags.html for more info on what ebas uses
2
- from avoca.flags import QA_Flag
1
+ # https://ebas-submit.nilu.no/templates/comments/fl_flag for more info on what ebas uses
2
+ from avoca.flags import QA_Flag, nan_flags
3
3
 
4
4
  flags_to_ebas: dict[QA_Flag, int] = {
5
5
  QA_Flag.MISSING: 999, # M Missing measurement, unspecified reason
@@ -40,13 +40,7 @@ if missing_flags:
40
40
  f"Not all QA flags are mapped to Ebas flags. Missing: {missing_flags}"
41
41
  )
42
42
 
43
- # Flags that are considered to have missing values
44
- nan_flags = [
45
- QA_Flag.MISSING,
46
- QA_Flag.ZERO_NEG_CONC_EXT,
47
- QA_Flag.INVALIDATED_EXT,
48
- QA_Flag.INVALID_VALUES,
49
- ]
43
+ nan_flags = nan_flags
50
44
 
51
45
  # priority of the flag to appear in the output
52
46
  # Useful when you can select only one flag value
@@ -12,7 +12,7 @@ import numpy as np
12
12
  import pandas as pd
13
13
  import pandas.errors
14
14
 
15
- from avoca.bindings.ebas_flags import flag_order, flags_to_ebas
15
+ from avoca.bindings.ebas_flags import flag_order, flags_to_ebas, ebas_flag_to_avoca
16
16
  from avoca.flags import QA_Flag
17
17
  from avoca.utils import compounds_from_df
18
18
 
@@ -207,3 +207,67 @@ def export_EmpaQATool(
207
207
  logger.info(f"Exported to `{out_filepath}`")
208
208
 
209
209
  return out_filepath
210
+
211
+
212
def read_empaqatool(file_path: Path, shift: timedelta | None = None) -> pd.DataFrame:
    """Read an EmpaQATool export file.

    Data is exported through : https://voc-qc.nilu.no/ExportData

    The file is parsed as a ``;``-separated table whose header is on the
    third row. Every column named ``<compound>-<suffix>`` is attributed to
    ``<compound>``; for each compound the ``-value``, ``-accuracy``,
    ``-precision`` and ``-flag`` columns are read.

    :arg file_path: Path to the EmpaQATool export file.
    :arg shift: Optional offset added to the start times to build the
        ``("-", "datetime")`` column. When None the start times are used
        unchanged.

    :returns: DataFrame whose column keys are ``(compound, variable)`` tuples:
        ``("-", "datetime_start")``, ``("-", "datetime_end")``,
        ``("-", "datetime")`` and, for every compound, ``conc``,
        ``u_expanded``, ``u_precision`` and ``flag``.
    """
    # Pandas skips the 2 empty rows
    df = pd.read_csv(file_path, sep=";", header=2)

    # Convert the datetime columns
    datetime_format = "%Y-%m-%d %H:%M:%S"
    columns = {
        ("-", "datetime_start"): pd.to_datetime(df["Start"], format=datetime_format),
        ("-", "datetime_end"): pd.to_datetime(df["End"], format=datetime_format),
    }

    # Get the datetime column as the start time
    dt = columns[("-", "datetime_start")].copy()
    if shift is not None:
        dt += shift
    columns[("-", "datetime")] = dt

    # Every "<compound>-<suffix>" column names a compound; columns without a
    # dash (Start, End, the trailing empty column) are ignored.
    # dict.fromkeys removes the duplicates (one per suffix) while keeping the
    # file order, so each compound is parsed only once.
    compounds = list(
        dict.fromkeys(
            "-".join(parts[:-1])
            for col in df.columns
            if len(parts := col.split("-")) >= 2
        )
    )

    missing_value = QA_Flag.MISSING.value

    for compound in compounds:
        mapping = {
            "conc": f"{compound}-value",
            "u_expanded": f"{compound}-accuracy",
            "u_precision": f"{compound}-precision",
        }

        # Flags are adding 1000 for specifying when set by qa tool or not,
        # hence the modulo. The tiny offset before flooring protects against
        # binary float error (e.g. 1.005 * 1e3 == 1004.9999999999999) without
        # rounding up genuine extra decimals.
        scaled_flags = pd.to_numeric(df[f"{compound}-flag"]) * 1e3
        flag_values = np.floor(scaled_flags + 1e-6).astype(int).mod(1000)
        flags = flag_values.apply(
            lambda x: ebas_flag_to_avoca[x].value if x else 0
        )

        # Since the nan flags values are set to 9999, we need to set them to nan
        is_missing = flags == missing_value
        for key, source_col in mapping.items():
            columns[(compound, key)] = pd.to_numeric(df[source_col]).mask(is_missing)

        # Any concentration that ends up NaN is flagged as missing as well
        conc_is_nan = columns[(compound, "conc")].isna()
        flags.loc[conc_is_nan] |= missing_value
        columns[(compound, "flag")] = flags

    return pd.DataFrame(columns)
@@ -46,6 +46,14 @@ class QA_Flag(Flag):
46
46
  # Invalid Values
47
47
  INVALID_VALUES = auto()
48
48
 
49
+ # Flags that are considered to have missing values
50
+ nan_flags = [
51
+ QA_Flag.MISSING,
52
+ QA_Flag.ZERO_NEG_CONC_EXT,
53
+ QA_Flag.INVALIDATED_EXT,
54
+ QA_Flag.INVALID_VALUES,
55
+ ]
56
+
49
57
 
50
58
  if __name__ == "__main__":
51
59
  # Print the flags and their values
@@ -0,0 +1,146 @@
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+
4
+
5
def plot_historical_comparison(
    df_new: pd.DataFrame, df_hist: pd.DataFrame, compound: str, ax=None
) -> tuple[plt.Figure, plt.Axes]:
    """Scatter the concentrations of a compound for historical and new data.

    Both dataframes are read through their ``(compound, "conc")`` and
    ``("-", "datetime")`` columns. When a ``("-", "type")`` column is present,
    only the rows whose type is ``"air"`` are plotted.

    :arg df_new: New data, drawn in red.
    :arg df_hist: Historical data, drawn in blue.
    :arg compound: Name of the compound to plot.
    :arg ax: Axes to draw on. A new 10x6 figure is created when None.

    :returns: The figure and the axes that were drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6))
    else:
        # Without this, `fig` was unbound whenever the caller supplied `ax`
        fig = ax.get_figure()

    dt_column = ("-", "datetime")
    type_column = ("-", "type")

    datasets = (
        ("Historical", "blue", df_hist),
        ("New", "red", df_new),
    )
    for data_type, color, df in datasets:
        serie = df[(compound, "conc")]
        dt = df[dt_column]
        if type_column in df.columns:
            mask_air = df[type_column] == "air"
            serie = serie[mask_air]
            dt = dt[mask_air]

        ax.scatter(dt, serie, label=data_type, color=color, alpha=0.5, s=4)

    ax.set_title(compound)
    ax.set_xlabel("Date")
    ax.set_ylabel("Concentration (ppt)")
    ax.legend()
    return fig, ax
32
+
33
+
34
def plot_yearly_data(
    df: pd.DataFrame, compound: str, ax=None
) -> tuple[plt.Figure, plt.Axes]:
    """Scatter a compound's concentration against the time of year, per year.

    The x position of a sample is its day of year plus the hour fraction, so
    all years overlap on the same axis. When a ``("-", "type")`` column is
    present, only the rows whose type is ``"air"`` are plotted.

    :arg df: Data with ``(compound, "conc")`` and ``("-", "datetime")`` columns.
    :arg compound: Name of the compound to plot.
    :arg ax: Axes to draw on. A new 10x6 figure is created when None.

    :returns: The figure and the axes that were drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6))
    else:
        # Without this, `fig` was unbound whenever the caller supplied `ax`
        fig = ax.get_figure()

    type_column = ("-", "type")
    serie = df[(compound, "conc")]
    dt = df[("-", "datetime")]
    if type_column in df.columns:
        mask_air = df[type_column] == "air"
        serie = serie[mask_air]
        dt = dt[mask_air]

    sample_years = dt.dt.year
    x = dt.dt.day_of_year + dt.dt.hour / 24.0
    for year in sample_years.unique():
        mask_year = sample_years == year
        ax.scatter(x[mask_year], serie[mask_year], label=str(year), alpha=0.5, s=4)

    ax.set_title(compound)
    ax.set_xlabel("Time of Year")
    ax.set_ylabel("Concentration (ppt)")

    # Add ticks with the months: one tick per month start. The range stops in
    # December so that no second "Jan" tick is stacked on day 1.
    month_starts = pd.date_range(start="2024-01-01", end="2024-12-01", freq="MS")
    ax.set_xticks(month_starts.dayofyear)
    ax.set_xticklabels(month_starts.strftime("%b"))
    ax.legend()
    return fig, ax
66
+
67
+
68
def plot_yearly_plotly(
    df: pd.DataFrame,
    compound: str,
    df_new: pd.DataFrame | None = None,
    opacity: float = 0.5,
    size: int = 6,
) -> "plotly.graph_objs._figure.Figure":
    """Plot yearly data using plotly.

    One scatter trace is added per calendar year found in ``df``, with the
    time of year (day of year plus hour fraction) on the x axis. When
    ``df_new`` is given, it is added as one extra "New Data" trace. In both
    dataframes, when a ``("-", "type")`` column is present, only rows whose
    type is ``"air"`` are kept.

    :arg df: Data with ``(compound, "conc")`` and ``("-", "datetime")`` columns.
    :arg compound: Name of the compound to plot.
    :arg df_new: Optional new data with the same columns. Skipped when None.
    :arg opacity: Opacity of the markers.
    :arg size: Size of the markers.

    :returns: The plotly figure.
    """
    import plotly.graph_objects as go

    dt_column = ("-", "datetime")
    type_column = ("-", "type")
    conc_column = (compound, "conc")

    def _air_only(frame: pd.DataFrame) -> pd.DataFrame:
        # Keep only the air samples when the sample type is known
        if type_column in frame.columns:
            return frame[frame[type_column] == "air"]
        return frame

    df = _air_only(df)
    # Samples without a concentration are not drawn
    df = df[df[conc_column].notna()]
    serie = df[conc_column]
    dt = df[dt_column]
    x = dt.dt.day_of_year + dt.dt.hour / 24.0
    sample_years = dt.dt.year

    fig = go.Figure()

    hover_template = "Timestamp: %{text}<br>Conc: %{y:.2f} ppt"

    kwargs = {
        "mode": "markers",
        "opacity": opacity,
        "marker": dict(size=size),
        "hovertemplate": hover_template,
    }

    # x, y and text are selected with the same mask, so every hover label
    # belongs to the point it is shown on. Each sample is drawn individually:
    # samples falling in the same hour are no longer averaged together.
    for year in sorted(sample_years.dropna().unique()):
        mask_year = sample_years == year
        fig.add_trace(
            go.Scatter(
                x=x[mask_year],
                y=serie[mask_year],
                name=str(int(year)),
                # Negative z-order draws the most recent years underneath
                zorder=-int(year),
                text=dt[mask_year].dt.strftime("%y%m%d.%H%M"),
                **kwargs,
            )
        )

    # `df_new` is optional: only add its trace when it was provided
    if df_new is not None:
        df_new = _air_only(df_new)
        dt_new = df_new[dt_column]
        fig.add_trace(
            go.Scatter(
                x=dt_new.dt.dayofyear + dt_new.dt.hour / 24.0,
                y=df_new[conc_column],
                name="New Data",
                text=dt_new.dt.strftime("%y%m%d.%H%M"),
                **kwargs,
            )
        )

    # One labelled tick at the start of each month
    month_starts = pd.date_range(start="2024-01-01", end="2024-12-31", freq="MS")
    fig.update_layout(
        xaxis_title="Time of Year",
        yaxis_title=f"{compound} (ppt)",
        xaxis=dict(
            tickmode="array",
            tickvals=month_starts.dayofyear,
            ticktext=month_starts.strftime("%b"),
        ),
    )

    return fig
@@ -49,6 +49,7 @@ class AbstractQA_Assigner(ABC):
49
49
  flag: QA_Flag
50
50
  runtypes: list[str] | None
51
51
  required_packages: list[PythonPackageRequirement] | None = None
52
+ require_datetime_index: bool = False
52
53
 
53
54
  # Options that can be set by the user
54
55
  name: str
@@ -142,6 +143,14 @@ class AbstractQA_Assigner(ABC):
142
143
  f"Please check the data and the settings for {self.name}"
143
144
  )
144
145
 
146
+ if self.require_datetime_index:
147
+ if not isinstance(df.index, pd.DatetimeIndex):
148
+ raise ValueError(
149
+ f"Assigner {self} requires a DatetimeIndex but the dataframe"
150
+ " does not have one. \n "
151
+ f"Please check the data and the settings for {self.name}"
152
+ )
153
+
145
154
  @abstractmethod
146
155
  def fit(self, df: pd.DataFrame):
147
156
  """Fit the QA assigner on some data.
@@ -0,0 +1,133 @@
1
+ """Quality assurance based on statistical methods."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import timedelta
6
+ from typing import TYPE_CHECKING
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+ from avoca.qa_class.zscore import ExtremeValues
12
+
13
+ if TYPE_CHECKING:
14
+
15
+ from avoca.utils.torch_models import MultipleRegressionModel
16
+
17
+
18
class RollingWindow(ExtremeValues):
    """Detect outliers in rolling time windows.

    A value is flagged when it lies further than ``threshold`` rolling
    standard deviations away from the rolling median of its window.
    The statistics are computed directly on the data given to
    :py:meth:`assign`; :py:meth:`fit` only stores the data for plotting.

    The dataframe must have a DatetimeIndex, as the window is a time span.

    :param variable: The variable to check for extreme values.
    :param rolling_window: Time span of the rolling window.
    :param threshold: Number of rolling standard deviations around the
        rolling median beyond which values are flagged.
    :param use_log_normal: If True, the log of the values will be used to
        calculate the statistics.
        This can be useful if the values are log-normal distributed.
    :param only_greater: If True, only values greater than the threshold will be flagged.
        The values lower than the negative threshold will not be flagged.
        By default, this is True if use_log_normal is True, and False otherwise.
    """

    require_datetime_index = True

    rolling_window: timedelta

    def __init__(
        self,
        *args,
        rolling_window: timedelta = timedelta(days=7),
        threshold: float = 1.5,
        **kwargs,
    ):
        super().__init__(*args, threshold=threshold, **kwargs)
        self.rolling_window = rolling_window

    def fit(self, df: pd.DataFrame):
        """Check the input columns and keep the data for :py:meth:`plot`."""
        self.check_columns_or_raise(df, columns=self._stats_columns)

        self.df_train = df[self._stats_columns]

    def assign(self, df: pd.DataFrame) -> dict[str, pd.Index]:
        """Return, for each compound, the index of the flagged rows.

        The rolling median and standard deviation are stored on the instance
        as ``rolling_median`` and ``rolling_std``. With ``use_log_normal``
        they are expressed in log space.
        """
        df = df[self._stats_columns]
        df = self._clean_data(df)
        if self.use_log_normal:
            # Replace <=0 with NaN, as the log is not defined there
            df = np.log(df.where(df > 0, np.nan))

        rolling = df.rolling(window=self.rolling_window)
        medians = rolling.median()
        stds = rolling.std()

        self.rolling_median = medians
        self.rolling_std = stds

        margin = stds * self.threshold

        df_fail = df > (medians + margin)
        if not self.only_greater:
            df_fail = df_fail | (df < (medians - margin))

        out_dict = {}
        for compound in self.compounds:
            this_c_fail = df_fail[(compound, self.variable)]
            out_dict[compound] = this_c_fail.loc[this_c_fail].index

        return out_dict

    def plot(self):
        """Plot the fitted data, the rolling threshold band and the outliers.

        One subplot is drawn per compound.

        :returns: The figure and the axes as returned by ``plt.subplots``.
        """
        import matplotlib.pyplot as plt

        n_compounds = len(self.compounds)
        fig, axes = plt.subplots(
            n_compounds, 1, figsize=(6, 3 * n_compounds), sharex=True
        )
        # plt.subplots returns a bare Axes (not an array) for a single subplot
        axes_list = axes if isinstance(axes, np.ndarray) else [axes]

        x = self.dt if hasattr(self, "dt") else self.df_train.index
        x = pd.Series(x, index=self.df_train.index)

        # Also refreshes self.rolling_median and self.rolling_std
        outliers = self.assign(self.df_train)

        for ax, compound in zip(axes_list, self.compounds):
            col = (compound, self.variable)
            ax.scatter(
                x,
                self.df_train[col],
                s=1,
                color="darkblue",
                label="Values",
            )
            median = self.rolling_median[col]
            std = self.rolling_std[col]
            top, bottom = median + std * self.threshold, median - std * self.threshold
            if self.use_log_normal:
                # The statistics are in log space while the values are drawn
                # in linear space: bring the band back to linear space.
                top, bottom = np.exp(top), np.exp(bottom)

            ax.fill_between(
                x,
                top,
                bottom,
                color="lightgray",
                label="Rolling threshold",
                alpha=0.5,
            )

            outlier_indices = outliers[compound]
            ax.scatter(
                x.loc[outlier_indices],
                self.df_train.loc[outlier_indices, col],
                s=10,
                marker="x",
                color="red",
                label="Extreme values",
            )
            ax.set_title(
                f"{compound} +- {self.threshold} std",
                # Under the top line
                y=0.8,
            )
            ax.tick_params(axis="x", rotation=25)

        return fig, axes
@@ -10,6 +10,7 @@ import numpy as np
10
10
  import pandas as pd
11
11
 
12
12
  empty_index = pd.Index([], dtype="int64")
13
+ empty_index_dt = pd.DatetimeIndex([])
13
14
 
14
15
  simple_df = pd.DataFrame(
15
16
  np.ones((2, 4)),
@@ -0,0 +1,9 @@
1
+ import pandas as pd
2
+
3
+
4
+ def make_dt_index(df: pd.DataFrame | pd.Index) -> pd.DataFrame | pd.Index:
5
+ """Create a datetime index for the dataframe."""
6
+ index = pd.date_range(start="2023-01-01", periods=len(df), freq="h")
7
+ if isinstance(df, pd.Index):
8
+ return index
9
+ return df.set_index(index)
@@ -137,6 +137,7 @@
137
137
  "source": [
138
138
  "from avoca.qa_class.concs import ExtremeConcentrations\n",
139
139
  "from avoca.qa_class.zscore import XY_Correlations, Multiple_XY_Correlations\n",
140
+ "from avoca.qa_class.rolling import RollingWindow\n",
140
141
  "\n",
141
142
  "\n",
142
143
  "# Create assingers for each compound\n",
@@ -145,10 +146,13 @@
145
146
  " XY_Correlations(\n",
146
147
  " compounds=[\"ethane\", \"propane\", \"n-butane\"], variable=\"C\", threshold=4.0\n",
147
148
  " ),\n",
148
- " # xy_benzene_toluene := XY_Correlations(compounds=[\"benzene\", \"toluene\"], variable=\"C\"),\n",
149
- " multiple_assigner := Multiple_XY_Correlations(\n",
150
- " number_of_regression=3, compounds=[\"benzene\", \"toluene\"], variable=\"C\"\n",
149
+ " rolling_window := RollingWindow(\n",
150
+ " compounds=compounds, variable=\"C\"\n",
151
151
  " ),\n",
152
+ " # xy_benzene_toluene := XY_Correlations(compounds=[\"benzene\", \"toluene\"], variable=\"C\"),\n",
153
+ " # multiple_assigner := Multiple_XY_Correlations(\n",
154
+ " # number_of_regression=3, compounds=[\"benzene\", \"toluene\"], variable=\"C\"\n",
155
+ " # ),\n",
152
156
  "]"
153
157
  ]
154
158
  },
@@ -329,6 +333,15 @@
329
333
  "fig.patch.set_alpha(0)\n"
330
334
  ]
331
335
  },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": null,
339
+ "metadata": {},
340
+ "outputs": [],
341
+ "source": [
342
+ "rolling_window.plot()"
343
+ ]
344
+ },
332
345
  {
333
346
  "cell_type": "code",
334
347
  "execution_count": null,
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
5
5
 
6
6
  [project]
7
7
  name = "avoca"
8
- version = "0.11.4"
8
+ version = "0.14.0"
9
9
  authors = [
10
10
  { name="Lionel Constantin", email="lionel.constantin@empa.ch" },
11
11
  ]
@@ -6,18 +6,14 @@ import pytest
6
6
  from avoca.qa_class.abstract import AbstractQA_Assigner
7
7
  from avoca.qa_class.invalid import InvalidValues
8
8
  from avoca.qa_class.zscore import ExtremeValues, XY_Correlations
9
- from avoca.testing.df import (
10
- df_around_zero,
11
- df_full_nan,
12
- df_nan_training,
13
- df_one_extreme,
14
- df_regular,
15
- df_with_inf,
16
- empty_index,
17
- )
9
+ from avoca.qa_class.rolling import RollingWindow
10
+ import avoca.testing.df as df_test
11
+ from avoca.testing.utils import make_dt_index
18
12
 
19
13
  index_all_3 = pd.Index([0, 1, 2], dtype="int64")
14
+ index_all_3_dt = make_dt_index(index_all_3)
20
15
  index_2 = pd.Index([2], dtype="int64")
16
+ index_2_dt = index_all_3_dt[index_2]
21
17
 
22
18
 
23
19
  @pytest.fixture(
@@ -36,6 +32,8 @@ index_2 = pd.Index([2], dtype="int64")
36
32
  "name": "invalid_zeros_and_negative",
37
33
  },
38
34
  ),
35
+ (RollingWindow, {"rolling_window": pd.Timedelta(days=3)}),
36
+ (RollingWindow, {}),
39
37
  ]
40
38
  )
41
39
  def assigner(
@@ -47,26 +45,59 @@ def assigner(
47
45
 
48
46
 
49
47
  def test_simple(assigner: AbstractQA_Assigner):
48
+
49
+ df_one_extreme = df_test.df_one_extreme
50
+ df_regular = df_test.df_regular
51
+ if assigner.require_datetime_index:
52
+ df_one_extreme = make_dt_index(df_one_extreme)
53
+ df_regular = make_dt_index(df_regular)
54
+
50
55
  assigner.fit(df_regular)
51
56
  flagged = assigner.assign(df_one_extreme)
52
57
 
58
+ empty_index = (
59
+ df_test.empty_index
60
+ if not assigner.require_datetime_index
61
+ else df_test.empty_index_dt
62
+ )
63
+
53
64
  comparison_output_a = {
54
65
  InvalidValues: empty_index,
66
+ RollingWindow: index_2_dt,
55
67
  }
56
68
  comparison_output_b = {
57
- ExtremeValues: empty_index,
58
69
  # Also b is outside of the correlation cloud
59
70
  XY_Correlations: index_2,
60
- InvalidValues: empty_index,
61
71
  }
62
72
 
63
73
  pd.testing.assert_index_equal(
64
74
  flagged["compA"], comparison_output_a.get(type(assigner), index_2)
65
75
  )
66
- pd.testing.assert_index_equal(flagged["compB"], comparison_output_b[type(assigner)])
76
+ pd.testing.assert_index_equal(
77
+ flagged["compB"], comparison_output_b.get(type(assigner), empty_index)
78
+ )
79
+
80
+
81
+ def test_input_dataframe_dt_index(assigner: AbstractQA_Assigner):
82
+
83
+ df_regular = df_test.df_regular
84
+
85
+ if assigner.require_datetime_index:
86
+ with pytest.raises(ValueError, match="requires a DatetimeIndex"):
87
+ assigner.fit(df_regular)
67
88
 
68
89
 
69
90
  def test_nan_values_given_fit(assigner: AbstractQA_Assigner):
91
+
92
+ df_nan_training = df_test.df_nan_training
93
+ df_regular = df_test.df_regular
94
+ empty_index = df_test.empty_index
95
+
96
+ if assigner.require_datetime_index:
97
+ df_nan_training = make_dt_index(df_nan_training)
98
+ df_regular = make_dt_index(df_regular)
99
+ empty_index = df_test.empty_index_dt
100
+
70
101
  assigner.fit(df_nan_training)
71
102
  flagged = assigner.assign(df_regular)
72
103
 
@@ -76,6 +107,15 @@ def test_nan_values_given_fit(assigner: AbstractQA_Assigner):
76
107
 
77
108
 
78
109
  def test_only_nan_values_given_fit(assigner: AbstractQA_Assigner):
110
+
111
+ df_full_nan = df_test.df_full_nan
112
+ df_regular = df_test.df_regular
113
+ empty_index = df_test.empty_index
114
+ if assigner.require_datetime_index:
115
+ df_full_nan = make_dt_index(df_full_nan)
116
+ df_regular = make_dt_index(df_regular)
117
+ empty_index = df_test.empty_index_dt
118
+
79
119
  assigner.fit(df_full_nan)
80
120
  flagged = assigner.assign(df_regular)
81
121
 
@@ -85,11 +125,19 @@ def test_only_nan_values_given_fit(assigner: AbstractQA_Assigner):
85
125
 
86
126
 
87
127
  def test_fitting_nans(assigner: AbstractQA_Assigner):
128
+ df_regular = df_test.df_regular
129
+ df_nan_training = df_test.df_nan_training
130
+ df_full_nan = df_test.df_full_nan
131
+ empty_index = df_test.empty_index
132
+ if assigner.require_datetime_index:
133
+ df_regular = make_dt_index(df_regular)
134
+ df_nan_training = make_dt_index(df_nan_training)
135
+ df_full_nan = make_dt_index(df_full_nan)
136
+ empty_index = df_test.empty_index_dt
88
137
  assigner.fit(df_regular)
89
138
 
90
139
  flagged = assigner.assign(df_nan_training)
91
140
  flagged_allnans = assigner.assign(df_full_nan)
92
-
93
141
  comparison_output_a_one_nan = {
94
142
  InvalidValues: index_2,
95
143
  }
@@ -99,7 +147,8 @@ def test_fitting_nans(assigner: AbstractQA_Assigner):
99
147
 
100
148
  # Nothing should be flagged
101
149
  pd.testing.assert_index_equal(
102
- flagged["compA"], comparison_output_a_one_nan.get(type(assigner), empty_index)
150
+ flagged["compA"],
151
+ comparison_output_a_one_nan.get(type(assigner), empty_index),
103
152
  )
104
153
  pd.testing.assert_index_equal(flagged["compB"], empty_index)
105
154
  pd.testing.assert_index_equal(
@@ -112,6 +161,12 @@ def test_fitting_nans(assigner: AbstractQA_Assigner):
112
161
  def test_zero_values(assigner: AbstractQA_Assigner):
113
162
  """Test that zero values are not flagged."""
114
163
 
164
+ df_around_zero = df_test.df_around_zero
165
+ empty_index = df_test.empty_index
166
+ if assigner.require_datetime_index:
167
+ df_around_zero = make_dt_index(df_around_zero)
168
+ empty_index = df_test.empty_index_dt
169
+
115
170
  assigner.fit(df_around_zero)
116
171
  flagged = assigner.assign(df_around_zero)
117
172
 
@@ -138,6 +193,12 @@ def test_zero_values(assigner: AbstractQA_Assigner):
138
193
  def test_inf_values(assigner: AbstractQA_Assigner):
139
194
  """Test that inf values are flagged."""
140
195
 
196
+ df_with_inf = df_test.df_with_inf
197
+ empty_index = df_test.empty_index
198
+ if assigner.require_datetime_index:
199
+ df_with_inf = make_dt_index(df_with_inf)
200
+ empty_index = df_test.empty_index_dt
201
+
141
202
  assigner.fit(df_with_inf)
142
203
  flagged = assigner.assign(df_with_inf)
143
204
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes