avoca 0.12.0__tar.gz → 0.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. {avoca-0.12.0 → avoca-0.15.0}/.gitignore +1 -0
  2. {avoca-0.12.0 → avoca-0.15.0}/PKG-INFO +1 -1
  3. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/ebas.py +16 -3
  4. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/ebas_flags.py +1 -1
  5. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/qa_tool.py +51 -10
  6. {avoca-0.12.0 → avoca-0.15.0}/avoca/manager.py +1 -0
  7. {avoca-0.12.0 → avoca-0.15.0}/avoca/plots.py +53 -25
  8. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/abstract.py +9 -0
  9. avoca-0.15.0/avoca/qa_class/rolling.py +136 -0
  10. {avoca-0.12.0 → avoca-0.15.0}/avoca/testing/df.py +1 -0
  11. avoca-0.15.0/avoca/testing/utils.py +9 -0
  12. {avoca-0.12.0 → avoca-0.15.0}/avoca/utils/__init__.py +1 -1
  13. {avoca-0.12.0 → avoca-0.15.0}/avoca/utils/flags_doc.py +3 -0
  14. {avoca-0.12.0 → avoca-0.15.0}/examples/data_qa.ipynb +16 -3
  15. {avoca-0.12.0 → avoca-0.15.0}/pyproject.toml +1 -1
  16. avoca-0.15.0/tests/bindings/test_qatool.py +216 -0
  17. {avoca-0.12.0 → avoca-0.15.0}/tests/test_assigners.py +82 -14
  18. avoca-0.12.0/tests/bindings/test_qatool.py +0 -49
  19. {avoca-0.12.0 → avoca-0.15.0}/.gitlab-ci.yml +0 -0
  20. {avoca-0.12.0 → avoca-0.15.0}/.readthedocs.yaml +0 -0
  21. {avoca-0.12.0 → avoca-0.15.0}/.vscode/settings.json +0 -0
  22. {avoca-0.12.0 → avoca-0.15.0}/LICENCE.txt +0 -0
  23. {avoca-0.12.0 → avoca-0.15.0}/README.md +0 -0
  24. {avoca-0.12.0 → avoca-0.15.0}/avoca/__init__.py +0 -0
  25. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/__init__.py +0 -0
  26. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/gcwerks-report.conf +0 -0
  27. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/gcwerks.py +0 -0
  28. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/gcwerks_gui.py +0 -0
  29. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/nabel.py +0 -0
  30. {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/synspec.py +0 -0
  31. {avoca-0.12.0 → avoca-0.15.0}/avoca/export_nas.py +0 -0
  32. {avoca-0.12.0 → avoca-0.15.0}/avoca/flagging.py +0 -0
  33. {avoca-0.12.0 → avoca-0.15.0}/avoca/flags.py +0 -0
  34. {avoca-0.12.0 → avoca-0.15.0}/avoca/io.py +0 -0
  35. {avoca-0.12.0 → avoca-0.15.0}/avoca/logging.py +0 -0
  36. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/__init__.py +0 -0
  37. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/concs.py +0 -0
  38. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/generate_classes_doc.py +0 -0
  39. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/invalid.py +0 -0
  40. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/rt.py +0 -0
  41. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/test.py +0 -0
  42. {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/zscore.py +0 -0
  43. {avoca-0.12.0 → avoca-0.15.0}/avoca/requirements.py +0 -0
  44. {avoca-0.12.0 → avoca-0.15.0}/avoca/settings.py +0 -0
  45. {avoca-0.12.0 → avoca-0.15.0}/avoca/testing/__init__.py +0 -0
  46. {avoca-0.12.0 → avoca-0.15.0}/avoca/utils/torch_models.py +0 -0
  47. {avoca-0.12.0 → avoca-0.15.0}/data/.avoca/config.yaml +0 -0
  48. {avoca-0.12.0 → avoca-0.15.0}/data/CH0001G.20240219123300.20240307132229.online_gc.NMHC.air.16d.61mn.CH01L_Agilent_GC-MS-MEDUSA_Medusa-12_JFJ.CH01L_gc_ms.lev0.nas +0 -0
  49. {avoca-0.12.0 → avoca-0.15.0}/data/tests/missing_area_cols.csv +0 -0
  50. {avoca-0.12.0 → avoca-0.15.0}/data/voc_jan2jun_2023.csv +0 -0
  51. {avoca-0.12.0 → avoca-0.15.0}/docs/Makefile +0 -0
  52. {avoca-0.12.0 → avoca-0.15.0}/docs/make.bat +0 -0
  53. {avoca-0.12.0 → avoca-0.15.0}/docs/source/bindings/ebas.md +0 -0
  54. {avoca-0.12.0 → avoca-0.15.0}/docs/source/bindings/gcwerks.md +0 -0
  55. {avoca-0.12.0 → avoca-0.15.0}/docs/source/bindings/index.rst +0 -0
  56. {avoca-0.12.0 → avoca-0.15.0}/docs/source/bindings/qa_tool.md +0 -0
  57. {avoca-0.12.0 → avoca-0.15.0}/docs/source/conf.py +0 -0
  58. {avoca-0.12.0 → avoca-0.15.0}/docs/source/index.rst +0 -0
  59. {avoca-0.12.0 → avoca-0.15.0}/docs/source/quickstart.ipynb +0 -0
  60. {avoca-0.12.0 → avoca-0.15.0}/examples/config.yaml +0 -0
  61. {avoca-0.12.0 → avoca-0.15.0}/examples/convert_synspec_to_gcwerks.py +0 -0
  62. {avoca-0.12.0 → avoca-0.15.0}/examples/data_qa_gcwerks.ipynb +0 -0
  63. {avoca-0.12.0 → avoca-0.15.0}/examples/export_gc_werks.py +0 -0
  64. {avoca-0.12.0 → avoca-0.15.0}/examples/export_gc_werks_secondary_peaks.py +0 -0
  65. {avoca-0.12.0 → avoca-0.15.0}/examples/get_tanks.ipynb +0 -0
  66. {avoca-0.12.0 → avoca-0.15.0}/examples/read_nas.ipynb +0 -0
  67. {avoca-0.12.0 → avoca-0.15.0}/tests/bindings/gcwerks.dat +0 -0
  68. {avoca-0.12.0 → avoca-0.15.0}/tests/bindings/test_gcwerks.py +0 -0
  69. {avoca-0.12.0 → avoca-0.15.0}/tests/test_flagging.py +0 -0
  70. {avoca-0.12.0 → avoca-0.15.0}/tests/test_io.py +0 -0
  71. {avoca-0.12.0 → avoca-0.15.0}/tests/test_manager.py +0 -0
@@ -13,3 +13,4 @@ dist/
 
 # Generated by pytests
 simple_df.csv
+data/tests/export_empa_qa_tool/*.csv
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: avoca
-Version: 0.12.0
+Version: 0.15.0
 Summary: @voc@: Quality assessement of measurement data
 Project-URL: Homepage, https://gitlab.com/empa503/atmospheric-measurements/avoca
 Project-URL: Bug Tracker, https://gitlab.com/empa503/atmospheric-measurements/avoca/-/issues
@@ -171,7 +171,6 @@ def set_dataframe(
             )
         )
 
-
         this_nan_flags = nan_flags.copy()
 
         if data_level in concs_data_levels and invalidate_conc_calib:
@@ -188,7 +187,9 @@ def set_dataframe(
             )
             for flag in flag_col
         ]
-        nan_flag = np.logical_or.reduce([flag_col & flag.value for flag in this_nan_flags])
+        nan_flag = np.logical_or.reduce(
+            [flag_col & flag.value for flag in this_nan_flags]
+        )
 
         for var in vars_to_export[data_level]:
             ebas_name = compounds[sub]
@@ -199,6 +200,16 @@ def set_dataframe(
                 for val, isnan in zip(serie_to_export, nan_flag)
             ]
 
+            if var == "conc_calib":
+                # Invalidate calibration concentration for non-calibration samples
+                this_flags = [
+                    flags_ebas
+                    + ([] if (QA_Flag.CALIBRATION.value & flag_avoca) else [980])
+                    for flags_ebas, flag_avoca in zip(flags, flag_col)
+                ]
+            else:
+                this_flags = flags
+
             metadata = DataObject()
             metadata.comp_name = (
                 f"{ebas_name}_{ebas_compname_of_var[var]}"
@@ -214,7 +225,9 @@ def set_dataframe(
             metadata.matrix = "air"
             # add the variable
             nas.variables.append(
-                DataObject(
-                DataObject(values_=values, flags=flags, flagcol=True, metadata=metadata)
+                DataObject(
+                    values_=values, flags=this_flags, flagcol=True, metadata=metadata
+                )
             )
 
             if var == "conc_calib":
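The new conc_calib branch appends EBAS flag 980 to every sample whose avoca bitmask flag lacks the CALIBRATION bit. A standalone sketch of that bitmask logic (the flag value and the sample data below are invented for illustration; the real enum lives in avoca.flags):

from enum import Flag

class QA_Flag(Flag):  # minimal stand-in for avoca.flags.QA_Flag
    CALIBRATION = 0x1

flags = [[], [], []]        # per-sample lists of numeric EBAS flags
flag_col = [0x1, 0x0, 0x1]  # per-sample avoca bitmask flags

# Append 980 wherever the CALIBRATION bit is not set
this_flags = [
    flags_ebas + ([] if (QA_Flag.CALIBRATION.value & flag_avoca) else [980])
    for flags_ebas, flag_avoca in zip(flags, flag_col)
]
print(this_flags)  # [[], [980], []]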
@@ -1,4 +1,4 @@
-# https://projects.nilu.no/ccc/flags/flags.html for more info on what ebas uses
+# https://ebas-submit.nilu.no/templates/comments/fl_flag for more info on what ebas uses
 from avoca.flags import QA_Flag, nan_flags
 
 flags_to_ebas: dict[QA_Flag, int] = {
@@ -37,10 +37,11 @@ def export_EmpaQATool(
     station: str = "XXX",
     revision_date: datetime | None = None,
     dataset: datetime | str | None = None,
-    export_names: dict[str, str] = {},
+    export_names: dict[str, str] | None = None,
     datetime_offsets: tuple[timedelta, timedelta] | None = None,
     substances: list[str] = [],
     rounding_decimals: int = 4,
+    df_substances: pd.DataFrame | None = None,
 ) -> Path:
     """Export to the EmpaQATool format.
 
@@ -64,7 +65,17 @@ def export_EmpaQATool(
     :arg datetime_offsets: Tuple of two timedelta to use for the start and end datetime
     :arg substances: List of substances to export. You can also specify group names.
         If not specified, this will use the substances from `df_substances`.
+        If a substance is present here and not in `df_substances`, it will still be exported.
     :arg rounding_decimals: Number of decimals to round the values to.
+    :arg df_substances: DataFrame with substance information.
+        If provided, the substances to export will be taken from this dataframe.
+        Columns:
+        - index: substance name
+        - export: bool, whether to export the substance
+        - export_name: str, name to use in the export file
+
+
+    :returns: Path to the exported file.
 
     """
 
@@ -113,12 +124,42 @@ def export_EmpaQATool(
     logger.debug(f"df_out: {df_out.head()}")
     if not substances:
         substances = compounds_from_df(df)
+    if df_substances is not None and "export" in df_substances.columns:
+        # Remove the substances that should not be exported
+        substances = [
+            s
+            for s in substances
+            if s not in df_substances.index or df_substances.loc[s, "export"]
+        ]
 
     remove_infs = lambda x: x.replace([np.inf, -np.inf], np.nan)
     is_invalid = lambda x: x.isin([np.inf, -np.inf]) | pd.isna(x)
     clean_col = lambda x: remove_infs(x).round(rounding_decimals).astype(str)
 
+    if export_names is None:
+        export_names = {}
+
+    if df_substances is not None and "export_name" in df_substances.columns:
+        # Read export names from the dataframe if provided
+        for substance in substances:
+            if not substance or substance not in df_substances.index:
+                continue
+            export_name_df = df_substances.loc[substance, "export_name"]
+            if not export_name_df or pd.isna(export_name_df):
+                continue
+            if substance in export_names and export_names[substance] != export_name_df:
+                logger.warning(
+                    f"Substance {substance} found in both df_substances and"
+                    " export_names. Using the name from export_names.\n"
+                    f" - export_names (used): {export_names[substance]}\n"
+                    f" - df_substances: {export_name_df}"
+                )
+                continue
+            export_names[substance] = export_name_df
+
     for substance in substances:
+        if not substance:
+            continue
 
         export_name = export_names.get(substance, substance)
 
@@ -234,12 +275,12 @@ def read_empaqatool(file_path: Path, shift: timedelta | None = None) -> pd.DataF
         dt += shift
     columns[("-", "datetime")] = dt
 
-    # Last column is empty
-    compounds = [ '-'.join(s[:-1]) for col in df.columns if len(s:=col.split("-")) >= 2]
-
-
-    for compound in compounds:
+    # Last column is empty
+    compounds = [
+        "-".join(s[:-1]) for col in df.columns if len(s := col.split("-")) >= 2
+    ]
 
+    for compound in compounds:
 
         flag_col = f"{compound}-flag"
         value_col = f"{compound}-value"
@@ -248,8 +289,8 @@ def read_empaqatool(file_path: Path, shift: timedelta | None = None) -> pd.DataF
 
         mapping = {
             "conc": value_col,
-            "u_expanded":acc_col,
-            "u_precision":precision_col,
+            "u_expanded": acc_col,
+            "u_precision": precision_col,
         }
 
         flag_values = (pd.to_numeric(df[flag_col]) * 1e3).astype(int).mod(1000)
@@ -263,10 +304,10 @@ def read_empaqatool(file_path: Path, shift: timedelta | None = None) -> pd.DataF
             serie = pd.to_numeric(df[value])
             mask_nan = flags == QA_Flag.MISSING.value
             serie[mask_nan] = np.nan
-            columns[(compound, key)] = serie 
+            columns[(compound, key)] = serie
 
         columns[(compound, "flag")] = flags
-        
+
         mask_nan = columns[(compound, "conc")].isna()
         columns[(compound, "flag")][mask_nan] |= QA_Flag.MISSING.value
 
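Taken together, the new `df_substances` argument filters and renames exported columns. A minimal usage sketch mirroring the tests added in this release (the station name and output directory here are illustrative):

from datetime import timedelta
from pathlib import Path

import pandas as pd

from avoca.bindings.qa_tool import export_EmpaQATool
from avoca.testing.df import simple_df  # small test frame with (compound, variable) columns

df = simple_df.copy()
df[("compA", "flag")] = 0
df[("compB", "flag")] = 0
df[("-", "datetime")] = pd.date_range("2025-01-01", periods=len(df), freq="h")

df_substances = pd.DataFrame(
    {
        "substance": ["compA", "compB"],
        "export": [True, False],  # compB is dropped from the export
        "export_name": ["CustomCompA", "CustomCompB"],
    }
).set_index("substance")

out_file = export_EmpaQATool(
    df,
    Path("exports"),
    datetime_offsets=(timedelta(minutes=-5), timedelta(minutes=0)),
    station="TEST",
    df_substances=df_substances,
)
print(out_file)  # path of the generated ;-separated file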
@@ -20,6 +20,7 @@ class AssignerManager:
     _assigners_importpath = {
         "RetentionTimeChecker": "avoca.qa_class.rt",
         "ExtremeValues": "avoca.qa_class.zscore",
+        "RollingWindow": "avoca.qa_class.rolling",
         "ExtremeConcentrations": "avoca.qa_class.concs",
         "XY_Correlations": "avoca.qa_class.zscore",
         "TestAssigner": "avoca.qa_class.test",
@@ -69,46 +69,74 @@ def plot_yearly_plotly(
     df: pd.DataFrame,
     compound: str,
     df_new: pd.DataFrame | None = None,
+    opacity: float = 0.5,
+    size: int = 6,
 ) -> "plotly.graph_objs._figure.Figure":
     """Plot yearly data using plotly."""
     import plotly.express as px
     import plotly.graph_objects as go
 
     dt_column = ("-", "datetime")
-    serie = df[(compound, "conc")]
-    dt = df[dt_column]
+
     if ("-", "type") in df.columns:
         mask_air = df[("-", "type")] == "air"
-        serie = serie[mask_air]
-        dt = dt[mask_air]
-    if ("-", "type") in df_new.columns:
+        df = df[mask_air]
+    if df_new is not None and ("-", "type") in df_new.columns:
         mask_air_new = df_new[("-", "type")] == "air"
         df_new = df_new[mask_air_new]
 
+    dt = df[dt_column]
     x = dt.dt.day_of_year + dt.dt.hour / 24.0
-    df_to_plot = pd.DataFrame(
-        {
-            "conc": serie.values,
-            "year": dt.dt.year.values,
-        },
-        index=x.values,
-    )
-    # Break down by year, to have year as columns and conc as values
-    df_to_plot = df_to_plot.pivot_table(
-        index=df_to_plot.index, columns="year", values="conc"
-    )
-    fig = px.scatter(df_to_plot)
+
+    fig = go.Figure()
+
+    hover_template = "Timestamp: %{text}<br>Conc: %{y:.2f} ppt"
+
+    kwargs = {
+        "mode": "markers",
+        "opacity": opacity,
+        "marker": dict(size=size),
+        "hovertemplate": hover_template,
+    }
+
+    if (compound, "conc") in df:
+        serie = df[(compound, "conc")]
+        df_to_plot = pd.DataFrame(
+            {
+                "conc": serie.values,
+                "year": dt.dt.year.values,
+            },
+            index=x.values,
+        )
+        # Break down by year, to have year as columns and conc as values
+        df_to_plot = df_to_plot.pivot_table(
+            index=df_to_plot.index, columns="year", values="conc"
+        )
+        for year in df_to_plot.columns:
+            fig.add_trace(
+                go.Scatter(
+                    x=df_to_plot.index,
+                    y=df_to_plot[year],
+                    name=str(year),
+                    zorder=-year,
+                    text=dt[dt.dt.year == year].dt.strftime("%y%m%d.%H%M"),
+                    **kwargs,
+                )
+            )
+
     x_values = pd.date_range(start="2024-01-01", end="2024-12-31", freq="MS")
 
-    dt_new = df_new[dt_column]
-    fig.add_trace(
-        go.Scatter(
-            x=dt_new.dt.dayofyear + dt_new.dt.hour / 24.0,
-            y=df_new[(compound, "conc")],
-            mode="markers",
-            name="New Data",
+    if df_new is not None and (compound, "conc") in df_new:
+        dt_new = df_new[dt_column]
+        fig.add_trace(
+            go.Scatter(
+                x=dt_new.dt.dayofyear + dt_new.dt.hour / 24.0,
+                y=df_new[(compound, "conc")],
+                name="New Data",
+                text=dt_new.dt.strftime("%y%m%d.%H%M"),
+                **kwargs,
+            )
         )
-    )
     fig.update_layout(
         xaxis_title="Time of Year",
         yaxis_title=f"{compound} (ppt)",
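A short usage sketch of the reworked plot, with synthetic data in the (compound, variable) column layout the function expects (the compound name and values below are made up):

import numpy as np
import pandas as pd

from avoca.plots import plot_yearly_plotly

rng = np.random.default_rng(0)
df = pd.DataFrame(
    {
        ("-", "datetime"): pd.date_range("2022-01-01", periods=300, freq="D"),
        ("ethane", "conc"): rng.lognormal(size=300),
    }
)
# One trace per year; hover text shows the timestamp and concentration
fig = plot_yearly_plotly(df, "ethane", opacity=0.4, size=4)
fig.show()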
@@ -49,6 +49,7 @@ class AbstractQA_Assigner(ABC):
     flag: QA_Flag
     runtypes: list[str] | None
     required_packages: list[PythonPackageRequirement] | None = None
+    require_datetime_index: bool = False
 
     # Options that can be set by the user
     name: str
@@ -142,6 +143,14 @@ class AbstractQA_Assigner(ABC):
                 f"Please check the data and the settings for {self.name}"
             )
 
+        if self.require_datetime_index:
+            if not isinstance(df.index, pd.DatetimeIndex):
+                raise ValueError(
+                    f"Assigner {self} requires a DatetimeIndex but the dataframe"
+                    " does not have one. \n "
+                    f"Please check the data and the settings for {self.name}"
+                )
+
     @abstractmethod
     def fit(self, df: pd.DataFrame):
         """Fit the QA assigner on some data.
@@ -0,0 +1,136 @@
+"""Quality assurance based on statistical methods."""
+
+from __future__ import annotations
+
+from datetime import timedelta
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+
+from avoca.qa_class.zscore import ExtremeValues
+
+if TYPE_CHECKING:
+
+    from avoca.utils.torch_models import MultipleRegressionModel
+
+
+class RollingWindow(ExtremeValues):
+    """Detect outliers in rolling windows.
+
+    The method flags outliers in a rolling window using the median and standard deviation.
+    The training is done directly on the fitted data.
+
+    :param variable: The variable to check for extreme values.
+    :param threshold: The z-score threshold used to flag values.
+    :param use_log_normal: If True, the log of the values will be used to calculate the z-score.
+        This can be useful if the values are log-normally distributed.
+    :param only_greater: If True, only values greater than the threshold will be flagged.
+        Values lower than the negative threshold will not be flagged.
+        By default, this is True if use_log_normal is True, and False otherwise.
+    :param rolling_window: The size of the rolling window as a `timedelta` object.
+        See the `window` parameter in the pandas documentation for more details.
+        https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html#pandas-dataframe-rolling
+    """
+
+    require_datetime_index = True
+
+    rolling_window: timedelta
+
+    def __init__(
+        self,
+        *args,
+        rolling_window: timedelta = timedelta(days=7),
+        threshold: float = 1.5,
+        **kwargs,
+    ):
+        super().__init__(*args, threshold=threshold, **kwargs)
+        self.rolling_window = rolling_window
+
+    def fit(self, df: pd.DataFrame):
+
+        self.check_columns_or_raise(df, columns=self._stats_columns)
+
+        self.df_train = df[self._stats_columns]
+
+    def assign(self, df: pd.DataFrame) -> dict[str, pd.Index]:
+        df = df[self._stats_columns]
+        df = self._clean_data(df)
+        if self.use_log_normal:
+            # Replace <=0 with NaN
+            df = df.where(df > 0, np.nan)
+            df = df.map(lambda x: np.log(x))
+
+        rolling = df.rolling(window=self.rolling_window)
+        means = rolling.median()
+        stds = rolling.std()
+
+        self.rolling_median = means
+        self.rolling_std = stds
+
+        thresholds = means + stds * self.threshold
+
+        df_fail = df > thresholds
+        if not self.only_greater:
+            df_fail = df_fail | (df < (means - stds * self.threshold))
+
+        out_dict = {}
+        for compound in self.compounds:
+            col = (compound, self.variable)
+            this_c_fail = df_fail[col]
+            out_dict[compound] = this_c_fail.loc[this_c_fail].index
+
+        return out_dict
+
+    def plot(self):
+
+        import matplotlib.pyplot as plt
+
+        fig, axes = plt.subplots(
+            len(self.compounds), 1, figsize=(6, 3 * len(self.compounds)), sharex=True
+        )
+
+        x = self.dt if hasattr(self, "dt") else self.df_train.index
+        x = pd.Series(x, index=self.df_train.index)
+
+        outliers = self.assign(self.df_train)
+
+        for i, compound in enumerate(self.compounds):
+            ax = axes[i]
+            col = (compound, self.variable)
+            ax.scatter(
+                x,
+                self.df_train[col],
+                s=1,
+                color="darkblue",
+            )
+            median = self.rolling_median[col]
+            std = self.rolling_std[col]
+            top, bottom = median + std * self.threshold, median - std * self.threshold
+
+            ax.fill_between(
+                x,
+                top,
+                bottom,
+                color="lightgray",
+                label="Rolling threshold",
+                alpha=0.5,
+            )
+
+            outlier_indices = outliers[compound]
+            ax.scatter(
+                x.loc[outlier_indices],
+                self.df_train.loc[outlier_indices, col],
+                s=10,
+                marker="x",
+                color="red",
+                label="Extreme values",
+            )
+            ax.set_title(
+                f"{compound} +- {self.threshold} std",
+                # Under the top line
+                y=0.8,
+            )
+            ax.tick_params(axis="x", rotation=25)
+
+        return fig, axes
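A short end-to-end sketch of the new assigner (the window size and threshold are illustrative; the data helpers come from `avoca.testing`):

from datetime import timedelta

import avoca.testing.df as df_test
from avoca.qa_class.rolling import RollingWindow
from avoca.testing.utils import make_dt_index

assigner = RollingWindow(
    variable="test_var",
    compounds=["compA", "compB"],
    rolling_window=timedelta(days=3),
    threshold=1.5,
)

df = make_dt_index(df_test.df_one_extreme)
assigner.fit(df)               # stores the data as df_train
flagged = assigner.assign(df)  # dict: compound -> DatetimeIndex of outliers
fig, axes = assigner.plot()    # scatter plus shaded rolling-threshold band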
@@ -10,6 +10,7 @@ import numpy as np
 import pandas as pd
 
 empty_index = pd.Index([], dtype="int64")
+empty_index_dt = pd.DatetimeIndex([])
 
 simple_df = pd.DataFrame(
     np.ones((2, 4)),
@@ -0,0 +1,9 @@
+import pandas as pd
+
+
+def make_dt_index(df: pd.DataFrame | pd.Index) -> pd.DataFrame | pd.Index:
+    """Create a datetime index for the dataframe."""
+    index = pd.date_range(start="2023-01-01", periods=len(df), freq="h")
+    if isinstance(df, pd.Index):
+        return index
+    return df.set_index(index)
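This helper backs the DatetimeIndex-aware tests further down: it swaps in an hourly index starting 2023-01-01, or returns such an index directly when given a plain `pd.Index`. A quick demonstration:

import pandas as pd

from avoca.testing.utils import make_dt_index

df = pd.DataFrame({"a": [1, 2, 3]})
print(make_dt_index(df).index[0])       # 2023-01-01 00:00:00
print(make_dt_index(pd.Index([0, 1])))  # DatetimeIndex with two hourly stamps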
@@ -13,7 +13,7 @@ def compounds_from_df(df: pd.DataFrame) -> list[str]:
     Returns:
         The compounds in the dataframe.
     """
-    return [c for c in df.columns.get_level_values(0).unique() if c != "-"]
+    return [c for c in df.columns.get_level_values(0).unique() if c not in ["-", ""]]
 
 
 def runtypes_from_df(df: pd.DataFrame) -> list[str]:
@@ -56,6 +56,9 @@ def parse_enum_comments(filepath: Path, enum_class_name: str) -> dict[Enum, str]
         exec(code, module)
     enum_cls = module[enum_class_name]
     for name, comment in comment_dict.items():
+        if not hasattr(enum_cls, name):
+            # Probably somewhere else in the file
+            continue
         enum_member = getattr(enum_cls, name)
         enum_obj[enum_member] = comment
 
@@ -137,6 +137,7 @@
    "source": [
     "from avoca.qa_class.concs import ExtremeConcentrations\n",
     "from avoca.qa_class.zscore import XY_Correlations, Multiple_XY_Correlations\n",
+    "from avoca.qa_class.rolling import RollingWindow\n",
     "\n",
     "\n",
     "# Create assingers for each compound\n",
@@ -145,10 +146,13 @@
     "    XY_Correlations(\n",
     "        compounds=[\"ethane\", \"propane\", \"n-butane\"], variable=\"C\", threshold=4.0\n",
     "    ),\n",
-    "    # xy_benzene_toluene := XY_Correlations(compounds=[\"benzene\", \"toluene\"], variable=\"C\"),\n",
-    "    multiple_assigner := Multiple_XY_Correlations(\n",
-    "        number_of_regression=3, compounds=[\"benzene\", \"toluene\"], variable=\"C\"\n",
+    "    rolling_window := RollingWindow(\n",
+    "        compounds=compounds, variable=\"C\"\n",
     "    ),\n",
+    "    # xy_benzene_toluene := XY_Correlations(compounds=[\"benzene\", \"toluene\"], variable=\"C\"),\n",
+    "    # multiple_assigner := Multiple_XY_Correlations(\n",
+    "    #     number_of_regression=3, compounds=[\"benzene\", \"toluene\"], variable=\"C\"\n",
+    "    # ),\n",
    "]"
   ]
  },
@@ -329,6 +333,15 @@
    "fig.patch.set_alpha(0)\n"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "rolling_window.plot()"
+  ]
+ },
 {
  "cell_type": "code",
  "execution_count": null,
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
5
5
 
6
6
  [project]
7
7
  name = "avoca"
8
- version = "0.12.0"
8
+ version = "0.15.0"
9
9
  authors = [
10
10
  { name="Lionel Constantin", email="lionel.constantin@empa.ch" },
11
11
  ]
@@ -0,0 +1,216 @@
+from datetime import timedelta
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from avoca.bindings.qa_tool import export_EmpaQATool
+from avoca.testing import testdata_dir
+from avoca.testing.df import invalids_df, simple_df
+
+export_path = testdata_dir / "export_empa_qa_tool"
+
+
+@pytest.mark.parametrize(
+    "df, name",
+    [
+        (simple_df, "simple"),
+        (invalids_df, "invalids"),
+    ],
+)
+def test_export_EmpaQATool(df, name):
+    """Test the export_EmpaQATool function."""
+
+    # Create a test dataframe
+    df = df.copy()
+    df[("compA", "flag")] = 0
+    df[("compB", "flag")] = 0
+
+    df[("-", "datetime")] = pd.date_range(start="2025-01-01", periods=len(df), freq="h")
+
+    # Export the dataframe to a file
+    export_file = export_EmpaQATool(
+        df,
+        export_path,
+        datetime_offsets=(timedelta(minutes=-5), timedelta(minutes=0)),
+        station=name,
+    )
+
+    # Check that the file is created
+    assert Path(export_file).is_file()
+
+    # Read the file and check that the data is correct
+    df_exported = pd.read_csv(
+        export_file,
+        sep=";",
+    )
+    assert len(df_exported) == len(df)
+    # Check that the 'compB-Value' column is of float dtype
+    assert pd.api.types.is_float_dtype(df_exported["compB-Value"])
+    assert not pd.isna(df_exported["compB-Value"]).any(), "NAN values must be 999..."
+
+
+def _prepare_df_for_export(df: pd.DataFrame) -> pd.DataFrame:
+    """Prepare a dataframe for export testing."""
+    df = df.copy()
+    df[("compA", "flag")] = 0
+    df[("compB", "flag")] = 0
+    df[("-", "datetime")] = pd.date_range(start="2025-01-01", periods=len(df), freq="h")
+    df[("-", "datetime_start")] = df[("-", "datetime")] - timedelta(minutes=5)
+    df[("-", "datetime_end")] = df[("-", "datetime")] + timedelta(minutes=0)
+    return df
+
+
+def test_export_names_dict():
+    """Test that export names from the export_names dict are used."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        export_names={"compA": "CustomCompA", "compB": "CustomCompB"},
+        station="TEST_DICT",
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "CustomCompA-Value" in df_exported.columns
+    assert "CustomCompB-Value" in df_exported.columns
+    assert "compA-Value" not in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+
+
+def test_export_names_df():
+    """Test that export names from df_substances are used."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        station="TEST_NAMES_DF",
+        df_substances=pd.DataFrame(
+            {
+                "substance": ["compA", "compB"],
+                "export_name": ["CustomCompA", "CustomCompB"],
+            }
+        ).set_index("substance"),
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "CustomCompA-Value" in df_exported.columns
+    assert "CustomCompB-Value" in df_exported.columns
+    assert "compA-Value" not in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+
+
+def test_both_export_names_warns(caplog):
+    """Test that a warning is emitted when a name is given in both places."""
+
+    with caplog.at_level("WARNING"):
+        out_file = export_EmpaQATool(
+            _prepare_df_for_export(simple_df),
+            export_path,
+            station="TEST_BOTH_WARN",
+            export_names={"compA": "CustomCompA", "compB": "CustomCompB"},
+            df_substances=pd.DataFrame(
+                {
+                    "substance": ["compA", "compB"],
+                    "export_name": ["WrongCompA", "CustomCompB"],
+                }
+            ).set_index("substance"),
+        )
+
+    assert (
+        "Substance compA found in both df_substances and export_names." in caplog.text
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "CustomCompA-Value" in df_exported.columns
+    assert "CustomCompB-Value" in df_exported.columns
+    assert "compA-Value" not in df_exported.columns
+    assert "WrongCompA-Value" not in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+
+
+def test_export_no_export_substances():
+    """Test that substances with export=False in df_substances are not exported."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        station="TEST_NO_EXPORT_SUBSTANCES",
+        df_substances=pd.DataFrame(
+            {
+                "substance": ["compA", "compB"],
+                "export": [True, False],
+            }
+        ).set_index("substance"),
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "compA-Value" in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+
+
+def test_export_if_not_in_df_substances():
+    """Test that substances not listed in df_substances are still exported."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        station="TEST_IF_NOT_IN_DF_SUBSTANCES",
+        df_substances=pd.DataFrame(
+            {
+                "substance": ["compA"],
+                "export": [True],
+            }
+        ).set_index("substance"),
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "compA-Value" in df_exported.columns
+    assert "compB-Value" in df_exported.columns
+
+
+def test_export_and_rename_in_df_substances():
+    """Test that export and export_name can be combined in df_substances."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        station="TEST_EXPORT_AND_RENAME_IN_DF_SUBSTANCES",
+        df_substances=pd.DataFrame(
+            {
+                "substance": ["compA", "compB"],
+                "export_name": ["CustomCompA", "CustomCompB"],
+                "export": [True, False],
+            }
+        ).set_index("substance"),
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "CustomCompA-Value" in df_exported.columns
+    assert "compA-Value" not in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+    assert "CustomCompB-Value" not in df_exported.columns
@@ -3,21 +3,18 @@
 import pandas as pd
 import pytest
 
+from avoca.manager import AssignerManager
 from avoca.qa_class.abstract import AbstractQA_Assigner
 from avoca.qa_class.invalid import InvalidValues
 from avoca.qa_class.zscore import ExtremeValues, XY_Correlations
-from avoca.testing.df import (
-    df_around_zero,
-    df_full_nan,
-    df_nan_training,
-    df_one_extreme,
-    df_regular,
-    df_with_inf,
-    empty_index,
-)
+from avoca.qa_class.rolling import RollingWindow
+import avoca.testing.df as df_test
+from avoca.testing.utils import make_dt_index
 
 index_all_3 = pd.Index([0, 1, 2], dtype="int64")
+index_all_3_dt = make_dt_index(index_all_3)
 index_2 = pd.Index([2], dtype="int64")
+index_2_dt = index_all_3_dt[index_2]
 
 
 @pytest.fixture(
@@ -36,6 +33,8 @@ index_2 = pd.Index([2], dtype="int64")
             "name": "invalid_zeros_and_negative",
         },
     ),
+    (RollingWindow, {"rolling_window": pd.Timedelta(days=3)}),
+    (RollingWindow, {}),
     ]
 )
 def assigner(
@@ -46,27 +45,66 @@ def assigner(
     return assigner_type(variable="test_var", compounds=["compA", "compB"], **kwargs)
 
 
+def test_is_in_documentation(assigner: AbstractQA_Assigner):
+    """Test the assigner will appear in the documentation."""
+
+    assert type(assigner).__name__ in AssignerManager._assigners_importpath
+
+
 def test_simple(assigner: AbstractQA_Assigner):
+
+    df_one_extreme = df_test.df_one_extreme
+    df_regular = df_test.df_regular
+    if assigner.require_datetime_index:
+        df_one_extreme = make_dt_index(df_one_extreme)
+        df_regular = make_dt_index(df_regular)
+
     assigner.fit(df_regular)
     flagged = assigner.assign(df_one_extreme)
 
+    empty_index = (
+        df_test.empty_index
+        if not assigner.require_datetime_index
+        else df_test.empty_index_dt
+    )
+
     comparison_output_a = {
         InvalidValues: empty_index,
+        RollingWindow: index_2_dt,
     }
     comparison_output_b = {
-        ExtremeValues: empty_index,
         # Also b is outside of the correlation cloud
         XY_Correlations: index_2,
-        InvalidValues: empty_index,
     }
 
     pd.testing.assert_index_equal(
        flagged["compA"], comparison_output_a.get(type(assigner), index_2)
     )
-    pd.testing.assert_index_equal(flagged["compB"], comparison_output_b[type(assigner)])
+    pd.testing.assert_index_equal(
+        flagged["compB"], comparison_output_b.get(type(assigner), empty_index)
+    )
+
+
+def test_input_dataframe_dt_index(assigner: AbstractQA_Assigner):
+
+    df_regular = df_test.df_regular
+
+    if assigner.require_datetime_index:
+        with pytest.raises(ValueError, match="requires a DatetimeIndex"):
+            assigner.fit(df_regular)
 
 
 def test_nan_values_given_fit(assigner: AbstractQA_Assigner):
+
+    df_nan_training = df_test.df_nan_training
+    df_regular = df_test.df_regular
+    empty_index = df_test.empty_index
+
+    if assigner.require_datetime_index:
+        df_nan_training = make_dt_index(df_nan_training)
+        df_regular = make_dt_index(df_regular)
+        empty_index = df_test.empty_index_dt
+
     assigner.fit(df_nan_training)
     flagged = assigner.assign(df_regular)
 
@@ -76,6 +114,15 @@ def test_nan_values_given_fit(assigner: AbstractQA_Assigner):
 
 
 def test_only_nan_values_given_fit(assigner: AbstractQA_Assigner):
+
+    df_full_nan = df_test.df_full_nan
+    df_regular = df_test.df_regular
+    empty_index = df_test.empty_index
+    if assigner.require_datetime_index:
+        df_full_nan = make_dt_index(df_full_nan)
+        df_regular = make_dt_index(df_regular)
+        empty_index = df_test.empty_index_dt
+
     assigner.fit(df_full_nan)
     flagged = assigner.assign(df_regular)
 
@@ -85,11 +132,19 @@ def test_only_nan_values_given_fit(assigner: AbstractQA_Assigner):
 
 
 def test_fitting_nans(assigner: AbstractQA_Assigner):
+    df_regular = df_test.df_regular
+    df_nan_training = df_test.df_nan_training
+    df_full_nan = df_test.df_full_nan
+    empty_index = df_test.empty_index
+    if assigner.require_datetime_index:
+        df_regular = make_dt_index(df_regular)
+        df_nan_training = make_dt_index(df_nan_training)
+        df_full_nan = make_dt_index(df_full_nan)
+        empty_index = df_test.empty_index_dt
     assigner.fit(df_regular)
 
     flagged = assigner.assign(df_nan_training)
     flagged_allnans = assigner.assign(df_full_nan)
-
     comparison_output_a_one_nan = {
         InvalidValues: index_2,
     }
@@ -99,7 +154,8 @@ def test_fitting_nans(assigner: AbstractQA_Assigner):
 
     # Nothing should be flagged
     pd.testing.assert_index_equal(
-        flagged["compA"], comparison_output_a_one_nan.get(type(assigner), empty_index)
+        flagged["compA"],
+        comparison_output_a_one_nan.get(type(assigner), empty_index),
     )
     pd.testing.assert_index_equal(flagged["compB"], empty_index)
     pd.testing.assert_index_equal(
@@ -112,6 +168,12 @@ def test_fitting_nans(assigner: AbstractQA_Assigner):
 def test_zero_values(assigner: AbstractQA_Assigner):
     """Test that zero values are not flagged."""
 
+    df_around_zero = df_test.df_around_zero
+    empty_index = df_test.empty_index
+    if assigner.require_datetime_index:
+        df_around_zero = make_dt_index(df_around_zero)
+        empty_index = df_test.empty_index_dt
+
     assigner.fit(df_around_zero)
     flagged = assigner.assign(df_around_zero)
 
@@ -138,6 +200,12 @@ def test_zero_values(assigner: AbstractQA_Assigner):
 def test_inf_values(assigner: AbstractQA_Assigner):
     """Test that inf values are flagged."""
 
+    df_with_inf = df_test.df_with_inf
+    empty_index = df_test.empty_index
+    if assigner.require_datetime_index:
+        df_with_inf = make_dt_index(df_with_inf)
+        empty_index = df_test.empty_index_dt
+
     assigner.fit(df_with_inf)
     flagged = assigner.assign(df_with_inf)
 
@@ -1,49 +0,0 @@
-from datetime import timedelta
-from pathlib import Path
-
-import pandas as pd
-import pytest
-
-from avoca.bindings.qa_tool import export_EmpaQATool
-from avoca.testing import testdata_dir
-from avoca.testing.df import invalids_df, simple_df
-
-
-@pytest.mark.parametrize(
-    "df, name",
-    [
-        (simple_df, "simple"),
-        (invalids_df, "invalids"),
-    ],
-)
-def test_export_EmpaQATool(df, name):
-    """Test the export_EmpaQATool function."""
-
-    # Create a test dataframe
-    df = df.copy()
-    df[("compA", "flag")] = 0
-    df[("compB", "flag")] = 0
-
-    df[("-", "datetime")] = pd.date_range(start="2025-01-01", periods=len(df), freq="h")
-
-    # Export the dataframe to a file
-    export_path = testdata_dir / "export_empa_qa_tool"
-    export_file = export_EmpaQATool(
-        df,
-        export_path,
-        datetime_offsets=(timedelta(minutes=-5), timedelta(minutes=0)),
-        station=name,
-    )
-
-    # Check that the file is created
-    assert Path(export_file).is_file()
-
-    # Read the file and check that the data is correct
-    df_exported = pd.read_csv(
-        export_file,
-        sep=";",
-    )
-    assert len(df_exported) == len(df)
-    # Check that the 'compB-Value' column is of float dtype
-    assert pd.api.types.is_float_dtype(df_exported["compB-Value"])
-    assert not pd.isna(df_exported["compB-Value"]).any(), "NAN values must be 999..."