avoca 0.12.0__tar.gz → 0.15.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {avoca-0.12.0 → avoca-0.15.0}/.gitignore +1 -0
- {avoca-0.12.0 → avoca-0.15.0}/PKG-INFO +1 -1
- {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/ebas.py +16 -3
- {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/ebas_flags.py +1 -1
- {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/qa_tool.py +51 -10
- {avoca-0.12.0 → avoca-0.15.0}/avoca/manager.py +1 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/plots.py +53 -25
- {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/abstract.py +9 -0
- avoca-0.15.0/avoca/qa_class/rolling.py +136 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/testing/df.py +1 -0
- avoca-0.15.0/avoca/testing/utils.py +9 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/utils/__init__.py +1 -1
- {avoca-0.12.0 → avoca-0.15.0}/avoca/utils/flags_doc.py +3 -0
- {avoca-0.12.0 → avoca-0.15.0}/examples/data_qa.ipynb +16 -3
- {avoca-0.12.0 → avoca-0.15.0}/pyproject.toml +1 -1
- avoca-0.15.0/tests/bindings/test_qatool.py +216 -0
- {avoca-0.12.0 → avoca-0.15.0}/tests/test_assigners.py +82 -14
- avoca-0.12.0/tests/bindings/test_qatool.py +0 -49
- {avoca-0.12.0 → avoca-0.15.0}/.gitlab-ci.yml +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/.readthedocs.yaml +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/.vscode/settings.json +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/LICENCE.txt +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/README.md +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/__init__.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/__init__.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/gcwerks-report.conf +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/gcwerks.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/gcwerks_gui.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/nabel.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/synspec.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/export_nas.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/flagging.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/flags.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/io.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/logging.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/__init__.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/concs.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/generate_classes_doc.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/invalid.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/rt.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/test.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/zscore.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/requirements.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/settings.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/testing/__init__.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/avoca/utils/torch_models.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/data/.avoca/config.yaml +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/data/CH0001G.20240219123300.20240307132229.online_gc.NMHC.air.16d.61mn.CH01L_Agilent_GC-MS-MEDUSA_Medusa-12_JFJ.CH01L_gc_ms.lev0.nas +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/data/tests/missing_area_cols.csv +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/data/voc_jan2jun_2023.csv +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/docs/Makefile +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/docs/make.bat +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/docs/source/bindings/ebas.md +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/docs/source/bindings/gcwerks.md +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/docs/source/bindings/index.rst +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/docs/source/bindings/qa_tool.md +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/docs/source/conf.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/docs/source/index.rst +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/docs/source/quickstart.ipynb +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/examples/config.yaml +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/examples/convert_synspec_to_gcwerks.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/examples/data_qa_gcwerks.ipynb +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/examples/export_gc_werks.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/examples/export_gc_werks_secondary_peaks.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/examples/get_tanks.ipynb +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/examples/read_nas.ipynb +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/tests/bindings/gcwerks.dat +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/tests/bindings/test_gcwerks.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/tests/test_flagging.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/tests/test_io.py +0 -0
- {avoca-0.12.0 → avoca-0.15.0}/tests/test_manager.py +0 -0
{avoca-0.12.0 → avoca-0.15.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: avoca
-Version: 0.12.0
+Version: 0.15.0
 Summary: @voc@: Quality assessement of measurement data
 Project-URL: Homepage, https://gitlab.com/empa503/atmospheric-measurements/avoca
 Project-URL: Bug Tracker, https://gitlab.com/empa503/atmospheric-measurements/avoca/-/issues
{avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/ebas.py
@@ -171,7 +171,6 @@ def set_dataframe(
         )
     )
 
-
     this_nan_flags = nan_flags.copy()
 
     if data_level in concs_data_levels and invalidate_conc_calib:
@@ -188,7 +187,9 @@ def set_dataframe(
             )
             for flag in flag_col
         ]
-        nan_flag = np.logical_or.reduce(
+        nan_flag = np.logical_or.reduce(
+            [flag_col & flag.value for flag in this_nan_flags]
+        )
 
         for var in vars_to_export[data_level]:
             ebas_name = compounds[sub]
@@ -199,6 +200,16 @@ def set_dataframe(
                 for val, isnan in zip(serie_to_export, nan_flag)
             ]
 
+            if var == "conc_calib":
+                # Invalidate calibration concentration for non-calibration samples
+                this_flags = [
+                    flags_ebas
+                    + ([] if (QA_Flag.CALIBRATION.value & flag_avoca) else [980])
+                    for flags_ebas, flag_avoca in zip(flags, flag_col)
+                ]
+            else:
+                this_flags = flags
+
             metadata = DataObject()
             metadata.comp_name = (
                 f"{ebas_name}_{ebas_compname_of_var[var]}"
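Note on the block above: avoca's QA flags are a bitmask, so `QA_Flag.CALIBRATION.value & flag_avoca` is non-zero exactly when the calibration bit is set for a sample. A minimal sketch of that test (the bit layout below is illustrative, not avoca's actual flag values):

```python
from enum import Flag

class QA_Flag(Flag):  # illustrative bit layout, not avoca's real assignments
    MISSING = 1
    CALIBRATION = 2

flag_col = [0, 2, 3]   # per-sample avoca bitmasks
flags = [[], [], []]   # per-sample lists of EBAS flags

# Append EBAS flag 980 to every sample whose calibration bit is NOT set
this_flags = [
    flags_ebas + ([] if (QA_Flag.CALIBRATION.value & flag_avoca) else [980])
    for flags_ebas, flag_avoca in zip(flags, flag_col)
]
print(this_flags)  # [[980], [], []]
```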
@@ -214,7 +225,9 @@ def set_dataframe(
             metadata.matrix = "air"
             # add the variable
             nas.variables.append(
-                DataObject(
+                DataObject(
+                    values_=values, flags=this_flags, flagcol=True, metadata=metadata
+                )
             )
 
             if var == "conc_calib":
{avoca-0.12.0 → avoca-0.15.0}/avoca/bindings/qa_tool.py
@@ -37,10 +37,11 @@ def export_EmpaQATool(
     station: str = "XXX",
     revision_date: datetime | None = None,
     dataset: datetime | str | None = None,
-    export_names: dict[str, str] =
+    export_names: dict[str, str] | None = None,
     datetime_offsets: tuple[timedelta, timedelta] | None = None,
     substances: list[str] = [],
     rounding_decimals: int = 4,
+    df_substances: pd.DataFrame | None = None,
 ) -> Path:
     """Export to the EmpaQATool format.
 
@@ -64,7 +65,17 @@ def export_EmpaQATool(
     :arg datetime_offsets: Tuple of two timedelta to use for the start and end datetime
     :arg substances: List of substances to export. You can also specify group names.
         If not specified, this will use the substances from `df_substances`.
+        If a substance is present here and not in `df_substances`, it will still be exported.
     :arg rounding_decimals: Number of decimals to round the values to.
+    :arg df_substances: DataFrame with substance information.
+        If provided, the substances to export will be taken from this dataframe.
+        Columns:
+            - index: substance name
+            - export: bool, whether to export the substance
+            - export_name: str, name to use in the export file
+
+
+    :returns: Path to the exported file.
 
     """
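For context, the new `df_substances` argument is exercised by the tests added further down in this diff; a minimal usage sketch, assuming `df` is an avoca two-level-column dataframe and `export_dir` an output directory:

```python
import pandas as pd
from avoca.bindings.qa_tool import export_EmpaQATool

# Substance table indexed by substance name, matching the docstring above:
# 'export' selects substances, 'export_name' renames them in the output file.
df_substances = pd.DataFrame(
    {
        "substance": ["compA", "compB"],
        "export_name": ["CustomCompA", "CustomCompB"],
        "export": [True, False],
    }
).set_index("substance")

out_file = export_EmpaQATool(df, export_dir, df_substances=df_substances)
```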
@@ -113,12 +124,42 @@ def export_EmpaQATool(
     logger.debug(f"df_out: {df_out.head()}")
     if not substances:
         substances = compounds_from_df(df)
+    if df_substances is not None and "export" in df_substances.columns:
+        # Remove the substances that should not be exported
+        substances = [
+            s
+            for s in substances
+            if s not in df_substances.index or df_substances.loc[s, "export"]
+        ]
 
     remove_infs = lambda x: x.replace([np.inf, -np.inf], np.nan)
     is_invalid = lambda x: x.isin([np.inf, -np.inf]) | pd.isna(x)
     clean_col = lambda x: remove_infs(x).round(rounding_decimals).astype(str)
 
+    if export_names is None:
+        export_names = {}
+
+    if df_substances is not None and "export_name" in df_substances.columns:
+        # Read export names from the dataframe if provided
+        for substance in substances:
+            if not substance or substance not in df_substances.index:
+                continue
+            export_name_df = df_substances.loc[substance, "export_name"]
+            if not export_name_df or pd.isna(export_name_df):
+                continue
+            if substance in export_names and export_names[substance] != export_name_df:
+                logger.warning(
+                    f"Substance {substance} found in both df_substances and"
+                    " export_names. Using the name from export_names.\n"
+                    f" - export_names (used): {export_names[substance]}\n"
+                    f" - df_substances: {export_name_df}"
+                )
+                continue
+            export_names[substance] = export_name_df
+
     for substance in substances:
+        if not substance:
+            continue
 
         export_name = export_names.get(substance, substance)
 
@@ -234,12 +275,12 @@ def read_empaqatool(file_path: Path, shift: timedelta | None = None) -> pd.DataFrame:
         dt += shift
     columns[("-", "datetime")] = dt
 
-
-    compounds = [
-
-
-    for compound in compounds:
+    # Last column is empty
+    compounds = [
+        "-".join(s[:-1]) for col in df.columns if len(s := col.split("-")) >= 2
+    ]
 
+    for compound in compounds:
 
         flag_col = f"{compound}-flag"
         value_col = f"{compound}-value"
@@ -248,8 +289,8 @@ def read_empaqatool(file_path: Path, shift: timedelta | None = None) -> pd.DataFrame:
 
         mapping = {
             "conc": value_col,
-            "u_expanded":acc_col,
-            "u_precision":precision_col,
+            "u_expanded": acc_col,
+            "u_precision": precision_col,
         }
 
         flag_values = (pd.to_numeric(df[flag_col]) * 1e3).astype(int).mod(1000)
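The unchanged `flag_values` line above decodes the EmpaQATool flag column: scaling by 1e3 turns the three fractional digits into an integer, and `.mod(1000)` drops anything above them. A worked example (values chosen to be exact in binary floating point):

```python
import pandas as pd

flags_raw = pd.Series([0.0, 0.125, 1.125])
decoded = (pd.to_numeric(flags_raw) * 1e3).astype(int).mod(1000)
print(decoded.tolist())  # [0, 125, 125]; the integer part is discarded
```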
@@ -263,10 +304,10 @@ def read_empaqatool(file_path: Path, shift: timedelta | None = None) -> pd.DataFrame:
             serie = pd.to_numeric(df[value])
             mask_nan = flags == QA_Flag.MISSING.value
             serie[mask_nan] = np.nan
-        columns[(compound, key)] = serie
+            columns[(compound, key)] = serie
 
         columns[(compound, "flag")] = flags
-
+
         mask_nan = columns[(compound, "conc")].isna()
         columns[(compound, "flag")][mask_nan] |= QA_Flag.MISSING.value
 
{avoca-0.12.0 → avoca-0.15.0}/avoca/manager.py
@@ -20,6 +20,7 @@ class AssignerManager:
     _assigners_importpath = {
         "RetentionTimeChecker": "avoca.qa_class.rt",
         "ExtremeValues": "avoca.qa_class.zscore",
+        "RollingWindow": "avoca.qa_class.rolling",
         "ExtremeConcentrations": "avoca.qa_class.concs",
         "XY_Correlations": "avoca.qa_class.zscore",
         "TestAssigner": "avoca.qa_class.test",
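The registry maps an assigner's class name to its module path; presumably the manager resolves entries lazily at lookup time. A hypothetical sketch of such a resolution step (the manager's actual loading code is not part of this diff):

```python
import importlib

_assigners_importpath = {"RollingWindow": "avoca.qa_class.rolling"}

def resolve_assigner(name: str) -> type:
    # Import the module on demand, then fetch the class of the same name
    module = importlib.import_module(_assigners_importpath[name])
    return getattr(module, name)
```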
{avoca-0.12.0 → avoca-0.15.0}/avoca/plots.py
@@ -69,46 +69,74 @@ def plot_yearly_plotly(
     df: pd.DataFrame,
     compound: str,
     df_new: pd.DataFrame | None = None,
+    opacity: float = 0.5,
+    size: int = 6,
 ) -> "plotly.graph_objs._figure.Figure":
     """Plot yearly data using plotly."""
     import plotly.express as px
     import plotly.graph_objects as go
 
     dt_column = ("-", "datetime")
-
-    dt = df[dt_column]
+
     if ("-", "type") in df.columns:
         mask_air = df[("-", "type")] == "air"
-
-
-    if ("-", "type") in df_new.columns:
+        df = df[mask_air]
+    if df_new is not None and ("-", "type") in df_new.columns:
         mask_air_new = df_new[("-", "type")] == "air"
         df_new = df_new[mask_air_new]
 
+    dt = df[dt_column]
     x = dt.dt.day_of_year + dt.dt.hour / 24.0
-
-
-
-
-
-
-
-
-
-
-
-
+
+    fig = go.Figure()
+
+    hover_template = "Timestamp: %{text}<br>Conc: %{y:.2f} ppt"
+
+    kwargs = {
+        "mode": "markers",
+        "opacity": opacity,
+        "marker": dict(size=size),
+        "hovertemplate": hover_template,
+    }
+
+    if (compound, "conc") in df:
+        serie = df[(compound, "conc")]
+        df_to_plot = pd.DataFrame(
+            {
+                "conc": serie.values,
+                "year": dt.dt.year.values,
+            },
+            index=x.values,
+        )
+        # Break down by year, to have year as columns and conc as values
+        df_to_plot = df_to_plot.pivot_table(
+            index=df_to_plot.index, columns="year", values="conc"
+        )
+        for year in df_to_plot.columns:
+            fig.add_trace(
+                go.Scatter(
+                    x=df_to_plot.index,
+                    y=df_to_plot[year],
+                    name=str(year),
+                    zorder=-year,
+                    text=dt[dt.dt.year == year].dt.strftime("%y%m%d.%H%M"),
+                    **kwargs,
+                )
+            )
+
     x_values = pd.date_range(start="2024-01-01", end="2024-12-31", freq="MS")
 
-
-
-
-
-
-
+    if df_new is not None and (compound, "conc") in df_new:
+        dt_new = df_new[dt_column]
+        fig.add_trace(
+            go.Scatter(
+                x=dt_new.dt.dayofyear + dt_new.dt.hour / 24.0,
+                y=df_new[(compound, "conc")],
+                name="New Data",
+                text=dt_new.dt.strftime("%y%m%d.%H%M"),
+                **kwargs,
+            )
         )
-        )
     fig.update_layout(
         xaxis_title="Time of Year",
         yaxis_title=f"{compound} (ppt)",
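With the two new keyword arguments, a call to the updated plot would look like the sketch below; `df` and `df_recent` are placeholders for avoca two-level-column dataframes carrying a `("-", "datetime")` column and `(compound, "conc")` values:

```python
from avoca.plots import plot_yearly_plotly

fig = plot_yearly_plotly(df, "ethane", df_new=df_recent, opacity=0.3, size=4)
fig.show()
```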
{avoca-0.12.0 → avoca-0.15.0}/avoca/qa_class/abstract.py
@@ -49,6 +49,7 @@ class AbstractQA_Assigner(ABC):
     flag: QA_Flag
     runtypes: list[str] | None
     required_packages: list[PythonPackageRequirement] | None = None
+    require_datetime_index: bool = False
 
     # Options that can be set by the user
     name: str
@@ -142,6 +143,14 @@ class AbstractQA_Assigner(ABC):
                 f"Please check the data and the settings for {self.name}"
             )
 
+        if self.require_datetime_index:
+            if not isinstance(df.index, pd.DatetimeIndex):
+                raise ValueError(
+                    f"Assigner {self} requires a DatetimeIndex but the dataframe"
+                    " does not have one. \n "
+                    f"Please check the data and the settings for {self.name}"
+                )
+
     @abstractmethod
     def fit(self, df: pd.DataFrame):
         """Fit the QA assigner on some data.
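Subclasses opt in to the new check by overriding the class attribute, as the new RollingWindow assigner does below. A minimal sketch with a hypothetical subclass:

```python
import pandas as pd
from avoca.qa_class.abstract import AbstractQA_Assigner

class MyTimeAwareAssigner(AbstractQA_Assigner):  # hypothetical example
    # The base-class check now raises ValueError when the data passed to
    # fit/assign lacks a pd.DatetimeIndex.
    require_datetime_index = True

    def fit(self, df: pd.DataFrame):
        ...

    def assign(self, df: pd.DataFrame) -> dict[str, pd.Index]:
        ...
```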
avoca-0.15.0/avoca/qa_class/rolling.py (new file)
@@ -0,0 +1,136 @@
+"""Quality assurance based on statistical methods."""
+
+from __future__ import annotations
+
+from datetime import timedelta
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+
+from avoca.qa_class.zscore import ExtremeValues
+
+if TYPE_CHECKING:
+
+    from avoca.utils.torch_models import MultipleRegressionModel
+
+
+class RollingWindow(ExtremeValues):
+    """Detect in rolling windows.
+
+    The method is based on outliers in a rolling window using the median and standard deviation.
+    The training is done directly on the fitted data.
+
+    :param variable: The variable to check for extreme values.
+    :param threshold: The threshold for the z-score. To flag values.
+    :param use_log_normal: If True, the log of the values will be used to calculate the z-score.
+        This can be useful if the values are log-normal distributed.
+    :param only_greater: If True, only values greater than the threshold will be flagged.
+        The values lower than the negative threshold will not be flagged.
+        By default, this is True if use_log_normal is True, and False otherwise.
+    :param rolling_window: The size of the rolling window as a `timedelta` object.
+        See `window` parameters in pandas documentation for more details.
+        https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html#pandas-dataframe-rolling
+    """
+
+    require_datetime_index = True
+
+    rolling_window: timedelta
+
+    def __init__(
+        self,
+        *args,
+        rolling_window: timedelta = timedelta(days=7),
+        threshold: float = 1.5,
+        **kwargs,
+    ):
+        super().__init__(*args, threshold=threshold, **kwargs)
+        self.rolling_window = rolling_window
+
+    def fit(self, df: pd.DataFrame):
+
+        self.check_columns_or_raise(df, columns=self._stats_columns)
+
+        self.df_train = df[self._stats_columns]
+
+    def assign(self, df: pd.DataFrame) -> dict[str, pd.Index]:
+        df = df[self._stats_columns]
+        df = self._clean_data(df)
+        if self.use_log_normal:
+            # Replace <=0 with NaN
+            df = df.where(df > 0, np.nan)
+            df = df.map(lambda x: np.log(x))
+
+        rolling = df.rolling(window=self.rolling_window)
+        means = rolling.median()
+        stds = rolling.std()
+
+        self.rolling_median = means
+        self.rolling_std = stds
+
+        thresholds = means + stds * self.threshold
+
+        df_fail = df > thresholds
+        if not self.only_greater:
+            df_fail = df_fail | (df < (means - stds * self.threshold))
+
+        out_dict = {}
+        for compound in self.compounds:
+            col = (compound, self.variable)
+            this_c_fail = df_fail[col]
+            out_dict[compound] = this_c_fail.loc[this_c_fail].index
+
+        return out_dict
+
+    def plot(self):
+
+        import matplotlib.pyplot as plt
+
+        fig, axes = plt.subplots(
+            len(self.compounds), 1, figsize=(6, 3 * len(self.compounds)), sharex=True
+        )
+
+        x = self.dt if hasattr(self, "dt") else self.df_train.index
+        x = pd.Series(x, index=self.df_train.index)
+
+        outliers = self.assign(self.df_train)
+
+        for i, compound in enumerate(self.compounds):
+            ax = axes[i]
+            col = (compound, self.variable)
+            ax.scatter(
+                x,
+                self.df_train[col],
+                s=1,
+                label="darkblue",
+            )
+            median = self.rolling_median[col]
+            std = self.rolling_std[col]
+            top, bottom = median + std * self.threshold, median - std * self.threshold
+
+            ax.fill_between(
+                x,
+                top,
+                bottom,
+                color="lightgray",
+                label="Rolling threshold",
+                alpha=0.5,
+            )
+
+            outlier_indices = outliers[compound]
+            ax.scatter(
+                x.loc[outlier_indices],
+                self.df_train.loc[outlier_indices, col],
+                s=10,
+                marker="x",
+                color="red",
+                label="Extreme values",
+            )
+            ax.set_title(
+                f"{compound} +- {self.threshold} std",
+                # Under teh top line
+                y=0.8,
+            )
+            ax.tick_params(axis="x", rotation=25)
+
+        return fig, axes
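Usage follows the other assigners; a sketch based on the new tests (`test_var`, `compA`, and `compB` are the test suite's placeholder names, and `df` is assumed to have two-level columns and a DatetimeIndex):

```python
import pandas as pd
from avoca.qa_class.rolling import RollingWindow

assigner = RollingWindow(
    variable="test_var",
    compounds=["compA", "compB"],
    rolling_window=pd.Timedelta(days=3),
)
assigner.fit(df)
flagged = assigner.assign(df)  # dict mapping compound -> index of flagged rows
```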
avoca-0.15.0/avoca/testing/utils.py (new file)
@@ -0,0 +1,9 @@
+import pandas as pd
+
+
+def make_dt_index(df: pd.DataFrame | pd.Index) -> pd.DataFrame | pd.Index:
+    """Create a datetime index for the dataframe."""
+    index = pd.date_range(start="2023-01-01", periods=len(df), freq="h")
+    if isinstance(df, pd.Index):
+        return index
+    return df.set_index(index)
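The helper gives the shared test frames an hourly DatetimeIndex, so the same fixtures can exercise assigners that set `require_datetime_index`. For example:

```python
import pandas as pd
from avoca.testing.utils import make_dt_index

df = pd.DataFrame({"a": [1.0, 2.0, 3.0]})
df_dt = make_dt_index(df)
print(df_dt.index[0])  # 2023-01-01 00:00:00
```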
{avoca-0.12.0 → avoca-0.15.0}/avoca/utils/__init__.py
@@ -13,7 +13,7 @@ def compounds_from_df(df: pd.DataFrame) -> list[str]:
     Returns:
         The compounds in the dataframe.
     """
-    return [c for c in df.columns.get_level_values(0).unique() if c
+    return [c for c in df.columns.get_level_values(0).unique() if c not in ["-", ""]]
 
 
 def runtypes_from_df(df: pd.DataFrame) -> list[str]:
{avoca-0.12.0 → avoca-0.15.0}/avoca/utils/flags_doc.py
@@ -56,6 +56,9 @@ def parse_enum_comments(filepath: Path, enum_class_name: str) -> dict[Enum, str]:
     exec(code, module)
     enum_cls = module[enum_class_name]
     for name, comment in comment_dict.items():
+        if not hasattr(enum_cls, name):
+            # Probably somehwere else in the file
+            continue
         enum_member = getattr(enum_cls, name)
         enum_obj[enum_member] = comment
 
{avoca-0.12.0 → avoca-0.15.0}/examples/data_qa.ipynb
@@ -137,6 +137,7 @@
    "source": [
     "from avoca.qa_class.concs import ExtremeConcentrations\n",
     "from avoca.qa_class.zscore import XY_Correlations, Multiple_XY_Correlations\n",
+    "from avoca.qa_class.rolling import RollingWindow\n",
     "\n",
     "\n",
     "# Create assingers for each compound\n",
@@ -145,10 +146,13 @@
     "    XY_Correlations(\n",
     "        compounds=[\"ethane\", \"propane\", \"n-butane\"], variable=\"C\", threshold=4.0\n",
     "    ),\n",
-    "
-    "
-    "        number_of_regression=3, compounds=[\"benzene\", \"toluene\"], variable=\"C\"\n",
+    "    rolling_window := RollingWindow(\n",
+    "        compounds=compounds, variable=\"C\"\n",
     "    ),\n",
+    "    # xy_benzene_toluene := XY_Correlations(compounds=[\"benzene\", \"toluene\"], variable=\"C\"),\n",
+    "    # multiple_assigner := Multiple_XY_Correlations(\n",
+    "    #     number_of_regression=3, compounds=[\"benzene\", \"toluene\"], variable=\"C\"\n",
+    "    # ),\n",
     "]"
    ]
   },
@@ -329,6 +333,15 @@
     "fig.patch.set_alpha(0)\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rolling_window.plot()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
avoca-0.15.0/tests/bindings/test_qatool.py (new file)
@@ -0,0 +1,216 @@
+from datetime import timedelta
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from avoca.bindings.qa_tool import export_EmpaQATool
+from avoca.testing import testdata_dir
+from avoca.testing.df import invalids_df, simple_df
+
+export_path = testdata_dir / "export_empa_qa_tool"
+
+
+@pytest.mark.parametrize(
+    "df, name",
+    [
+        (simple_df, "simple"),
+        (invalids_df, "invalids"),
+    ],
+)
+def test_export_EmpaQATool(df, name):
+    """Test the export_EmpaQATool function."""
+
+    # Create a test dataframe
+    df = df.copy()
+    df[("compA", "flag")] = 0
+    df[("compB", "flag")] = 0
+
+    df[("-", "datetime")] = pd.date_range(start="2025-01-01", periods=len(df), freq="h")
+
+    # Export the dataframe to a file
+    export_file = export_EmpaQATool(
+        df,
+        export_path,
+        datetime_offsets=(timedelta(minutes=-5), timedelta(minutes=0)),
+        station=name,
+    )
+
+    # Check that the file is created
+    assert Path(export_file).is_file()
+
+    # Read the file and check that the data is correct
+    df_exported = pd.read_csv(
+        export_file,
+        sep=";",
+    )
+    assert len(df_exported) == len(df)
+    # Check that the 'compB-Value' column is of float dtype
+    assert pd.api.types.is_float_dtype(df_exported["compB-Value"])
+    assert not pd.isna(df_exported["compB-Value"]).any(), "NAN values must be 999..."
+
+
+def _prepare_df_for_export(df: pd.DataFrame) -> pd.DataFrame:
+    """Prepare a dataframe for export testing."""
+    df = df.copy()
+    df[("compA", "flag")] = 0
+    df[("compB", "flag")] = 0
+    df[("-", "datetime")] = pd.date_range(start="2025-01-01", periods=len(df), freq="h")
+    df[("-", "datetime_start")] = df[("-", "datetime")] - timedelta(minutes=5)
+    df[("-", "datetime_end")] = df[("-", "datetime")] + timedelta(minutes=0)
+    return df
+
+
+def test_export_names_dict():
+    """test that export names from dict are used correctly."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        export_names={"compA": "CustomCompA", "compB": "CustomCompB"},
+        station="TEST_DICT",
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "CustomCompA-Value" in df_exported.columns
+    assert "CustomCompB-Value" in df_exported.columns
+    assert "compA-Value" not in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+
+
+def test_export_names_df():
+    """test that export names from dict are used correctly."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        station="TEST_NAMES_DF",
+        df_substances=pd.DataFrame(
+            {
+                "substance": ["compA", "compB"],
+                "export_name": ["CustomCompA", "CustomCompB"],
+            }
+        ).set_index("substance"),
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "CustomCompA-Value" in df_exported.columns
+    assert "CustomCompB-Value" in df_exported.columns
+    assert "compA-Value" not in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+
+
+def test_both_export_names_warns(caplog):
+    """test that export names from dict are used correctly."""
+
+    with caplog.at_level("WARNING"):
+        out_file = export_EmpaQATool(
+            _prepare_df_for_export(simple_df),
+            export_path,
+            station="TEST_BOTH_WARN",
+            export_names={"compA": "CustomCompA", "compB": "CustomCompB"},
+            df_substances=pd.DataFrame(
+                {
+                    "substance": ["compA", "compB"],
+                    "export_name": ["WrongCompA", "CustomCompB"],
+                }
+            ).set_index("substance"),
+        )
+
+    assert (
+        "Substance compA found in both df_substances and export_names." in caplog.text
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "CustomCompA-Value" in df_exported.columns
+    assert "CustomCompB-Value" in df_exported.columns
+    assert "compA-Value" not in df_exported.columns
+    assert "WrongCompA-Value" not in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+
+
+def test_export_no_export_substances():
+    """test that substances with export=False in df_substances are not exported."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        station="TEST_NO_EXPORT_SUBSTANCES",
+        df_substances=pd.DataFrame(
+            {
+                "substance": ["compA", "compB"],
+                "export": [True, False],
+            }
+        ).set_index("substance"),
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "compA-Value" in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+
+
+def test_export_if_not_in_df_substances():
+    """test that substances not in df_substances are exported."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        station="TEST_IF_NOT_IN_DF_SUBSTANCES",
+        df_substances=pd.DataFrame(
+            {
+                "substance": ["compA"],
+                "export": [True],
+            }
+        ).set_index("substance"),
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "compA-Value" in df_exported.columns
+    assert "compB-Value" in df_exported.columns
+
+
+def test_export_and_rename_in_df_substances():
+    """test that export names from dict are used correctly."""
+
+    out_file = export_EmpaQATool(
+        _prepare_df_for_export(simple_df),
+        export_path,
+        station="TEST_EXPORT_AND_RENAME_IN_DF_SUBSTANCES",
+        df_substances=pd.DataFrame(
+            {
+                "substance": ["compA", "compB"],
+                "export_name": ["CustomCompA", "CustomCompB"],
+                "export": [True, False],
+            }
+        ).set_index("substance"),
+    )
+
+    df_exported = pd.read_csv(
+        out_file,
+        sep=";",
+    )
+
+    assert "CustomCompA-Value" in df_exported.columns
+    assert "compA-Value" not in df_exported.columns
+    assert "compB-Value" not in df_exported.columns
+    assert "CustomCompB-Value" not in df_exported.columns
{avoca-0.12.0 → avoca-0.15.0}/tests/test_assigners.py
@@ -3,21 +3,18 @@
 import pandas as pd
 import pytest
 
+from avoca.manager import AssignerManager
 from avoca.qa_class.abstract import AbstractQA_Assigner
 from avoca.qa_class.invalid import InvalidValues
 from avoca.qa_class.zscore import ExtremeValues, XY_Correlations
-from avoca.
-
-
-    df_nan_training,
-    df_one_extreme,
-    df_regular,
-    df_with_inf,
-    empty_index,
-)
+from avoca.qa_class.rolling import RollingWindow
+import avoca.testing.df as df_test
+from avoca.testing.utils import make_dt_index
 
 index_all_3 = pd.Index([0, 1, 2], dtype="int64")
+index_all_3_dt = make_dt_index(index_all_3)
 index_2 = pd.Index([2], dtype="int64")
+index_2_dt = index_all_3_dt[index_2]
 
 
 @pytest.fixture(
@@ -36,6 +33,8 @@ index_2 = pd.Index([2], dtype="int64")
                 "name": "invalid_zeros_and_negative",
             },
         ),
+        (RollingWindow, {"rolling_window": pd.Timedelta(days=3)}),
+        (RollingWindow, {}),
     ]
 )
 def assigner(
@@ -46,27 +45,66 @@ def assigner(
     return assigner_type(variable="test_var", compounds=["compA", "compB"], **kwargs)
 
 
+def test_is_in_documentation(assigner: AbstractQA_Assigner):
+    """Test the assigner will appear in the documentation."""
+
+    assert type(assigner).__name__ in AssignerManager._assigners_importpath
+
+
 def test_simple(assigner: AbstractQA_Assigner):
+
+    df_one_extreme = df_test.df_one_extreme
+    df_regular = df_test.df_regular
+    if assigner.require_datetime_index:
+        df_one_extreme = make_dt_index(df_one_extreme)
+        df_regular = make_dt_index(df_regular)
+
     assigner.fit(df_regular)
     flagged = assigner.assign(df_one_extreme)
 
+    empty_index = (
+        df_test.empty_index
+        if not assigner.require_datetime_index
+        else df_test.empty_index_dt
+    )
+
     comparison_output_a = {
         InvalidValues: empty_index,
+        RollingWindow: index_2_dt,
     }
     comparison_output_b = {
-        ExtremeValues: empty_index,
         # Also b is outside of the correlation cloud
         XY_Correlations: index_2,
-        InvalidValues: empty_index,
     }
 
     pd.testing.assert_index_equal(
         flagged["compA"], comparison_output_a.get(type(assigner), index_2)
     )
-    pd.testing.assert_index_equal(
+    pd.testing.assert_index_equal(
+        flagged["compB"], comparison_output_b.get(type(assigner), empty_index)
+    )
+
+
+def test_input_dataframe_dt_index(assigner: AbstractQA_Assigner):
+
+    df_regular = df_test.df_regular
+
+    if assigner.require_datetime_index:
+        with pytest.raises(ValueError, match="requires a DatetimeIndex"):
+            assigner.fit(df_regular)
 
 
 def test_nan_values_given_fit(assigner: AbstractQA_Assigner):
+
+    df_nan_training = df_test.df_nan_training
+    df_regular = df_test.df_regular
+    empty_index = df_test.empty_index
+
+    if assigner.require_datetime_index:
+        df_nan_training = make_dt_index(df_nan_training)
+        df_regular = make_dt_index(df_regular)
+        empty_index = df_test.empty_index_dt
+
     assigner.fit(df_nan_training)
     flagged = assigner.assign(df_regular)
 
@@ -76,6 +114,15 @@ def test_nan_values_given_fit(assigner: AbstractQA_Assigner):
 
 
 def test_only_nan_values_given_fit(assigner: AbstractQA_Assigner):
+
+    df_full_nan = df_test.df_full_nan
+    df_regular = df_test.df_regular
+    empty_index = df_test.empty_index
+    if assigner.require_datetime_index:
+        df_full_nan = make_dt_index(df_full_nan)
+        df_regular = make_dt_index(df_regular)
+        empty_index = df_test.empty_index_dt
+
     assigner.fit(df_full_nan)
     flagged = assigner.assign(df_regular)
 
@@ -85,11 +132,19 @@ def test_only_nan_values_given_fit(assigner: AbstractQA_Assigner):
 
 
 def test_fitting_nans(assigner: AbstractQA_Assigner):
+    df_regular = df_test.df_regular
+    df_nan_training = df_test.df_nan_training
+    df_full_nan = df_test.df_full_nan
+    empty_index = df_test.empty_index
+    if assigner.require_datetime_index:
+        df_regular = make_dt_index(df_regular)
+        df_nan_training = make_dt_index(df_nan_training)
+        df_full_nan = make_dt_index(df_full_nan)
+        empty_index = df_test.empty_index_dt
     assigner.fit(df_regular)
 
     flagged = assigner.assign(df_nan_training)
     flagged_allnans = assigner.assign(df_full_nan)
-
     comparison_output_a_one_nan = {
         InvalidValues: index_2,
     }
@@ -99,7 +154,8 @@ def test_fitting_nans(assigner: AbstractQA_Assigner):
 
     # Nothing should be flagged
     pd.testing.assert_index_equal(
-        flagged["compA"],
+        flagged["compA"],
+        comparison_output_a_one_nan.get(type(assigner), empty_index),
     )
     pd.testing.assert_index_equal(flagged["compB"], empty_index)
     pd.testing.assert_index_equal(
@@ -112,6 +168,12 @@ def test_fitting_nans(assigner: AbstractQA_Assigner):
 def test_zero_values(assigner: AbstractQA_Assigner):
     """Test that zero values are not flagged."""
 
+    df_around_zero = df_test.df_around_zero
+    empty_index = df_test.empty_index
+    if assigner.require_datetime_index:
+        df_around_zero = make_dt_index(df_around_zero)
+        empty_index = df_test.empty_index_dt
+
     assigner.fit(df_around_zero)
     flagged = assigner.assign(df_around_zero)
 
@@ -138,6 +200,12 @@ def test_zero_values(assigner: AbstractQA_Assigner):
 def test_inf_values(assigner: AbstractQA_Assigner):
     """Test that inf values are flagged."""
 
+    df_with_inf = df_test.df_with_inf
+    empty_index = df_test.empty_index
+    if assigner.require_datetime_index:
+        df_with_inf = make_dt_index(df_with_inf)
+        empty_index = df_test.empty_index_dt
+
     assigner.fit(df_with_inf)
     flagged = assigner.assign(df_with_inf)
 
avoca-0.12.0/tests/bindings/test_qatool.py (deleted)
@@ -1,49 +0,0 @@
-from datetime import timedelta
-from pathlib import Path
-
-import pandas as pd
-import pytest
-
-from avoca.bindings.qa_tool import export_EmpaQATool
-from avoca.testing import testdata_dir
-from avoca.testing.df import invalids_df, simple_df
-
-
-@pytest.mark.parametrize(
-    "df, name",
-    [
-        (simple_df, "simple"),
-        (invalids_df, "invalids"),
-    ],
-)
-def test_export_EmpaQATool(df, name):
-    """Test the export_EmpaQATool function."""
-
-    # Create a test dataframe
-    df = df.copy()
-    df[("compA", "flag")] = 0
-    df[("compB", "flag")] = 0
-
-    df[("-", "datetime")] = pd.date_range(start="2025-01-01", periods=len(df), freq="h")
-
-    # Export the dataframe to a file
-    export_path = testdata_dir / "export_empa_qa_tool"
-    export_file = export_EmpaQATool(
-        df,
-        export_path,
-        datetime_offsets=(timedelta(minutes=-5), timedelta(minutes=0)),
-        station=name,
-    )
-
-    # Check that the file is created
-    assert Path(export_file).is_file()
-
-    # Read the file and check that the data is correct
-    df_exported = pd.read_csv(
-        export_file,
-        sep=";",
-    )
-    assert len(df_exported) == len(df)
-    # Check that the 'compB-Value' column is of float dtype
-    assert pd.api.types.is_float_dtype(df_exported["compB-Value"])
-    assert not pd.isna(df_exported["compB-Value"]).any(), "NAN values must be 999..."