AeroViz 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- AeroViz/__init__.py +13 -0
- AeroViz/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/data/DEFAULT_DATA.csv +1417 -0
- AeroViz/data/DEFAULT_PNSD_DATA.csv +1417 -0
- AeroViz/data/hysplit_example_data.txt +101 -0
- AeroViz/dataProcess/Chemistry/__init__.py +149 -0
- AeroViz/dataProcess/Chemistry/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/dataProcess/Chemistry/_calculate.py +557 -0
- AeroViz/dataProcess/Chemistry/_isoropia.py +150 -0
- AeroViz/dataProcess/Chemistry/_mass_volume.py +487 -0
- AeroViz/dataProcess/Chemistry/_ocec.py +172 -0
- AeroViz/dataProcess/Chemistry/isrpia.cnf +21 -0
- AeroViz/dataProcess/Chemistry/isrpia2.exe +0 -0
- AeroViz/dataProcess/Optical/PyMieScatt_update.py +577 -0
- AeroViz/dataProcess/Optical/_IMPROVE.py +452 -0
- AeroViz/dataProcess/Optical/__init__.py +281 -0
- AeroViz/dataProcess/Optical/__pycache__/PyMieScatt_update.cpython-312.pyc +0 -0
- AeroViz/dataProcess/Optical/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/dataProcess/Optical/__pycache__/mie_theory.cpython-312.pyc +0 -0
- AeroViz/dataProcess/Optical/_derived.py +518 -0
- AeroViz/dataProcess/Optical/_extinction.py +123 -0
- AeroViz/dataProcess/Optical/_mie_sd.py +912 -0
- AeroViz/dataProcess/Optical/_retrieve_RI.py +243 -0
- AeroViz/dataProcess/Optical/coefficient.py +72 -0
- AeroViz/dataProcess/Optical/fRH.pkl +0 -0
- AeroViz/dataProcess/Optical/mie_theory.py +260 -0
- AeroViz/dataProcess/README.md +271 -0
- AeroViz/dataProcess/SizeDistr/__init__.py +245 -0
- AeroViz/dataProcess/SizeDistr/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/dataProcess/SizeDistr/__pycache__/_size_dist.cpython-312.pyc +0 -0
- AeroViz/dataProcess/SizeDistr/_size_dist.py +810 -0
- AeroViz/dataProcess/SizeDistr/merge/README.md +93 -0
- AeroViz/dataProcess/SizeDistr/merge/__init__.py +20 -0
- AeroViz/dataProcess/SizeDistr/merge/_merge_v0.py +251 -0
- AeroViz/dataProcess/SizeDistr/merge/_merge_v0_1.py +246 -0
- AeroViz/dataProcess/SizeDistr/merge/_merge_v1.py +255 -0
- AeroViz/dataProcess/SizeDistr/merge/_merge_v2.py +244 -0
- AeroViz/dataProcess/SizeDistr/merge/_merge_v3.py +518 -0
- AeroViz/dataProcess/SizeDistr/merge/_merge_v4.py +422 -0
- AeroViz/dataProcess/SizeDistr/prop.py +62 -0
- AeroViz/dataProcess/VOC/__init__.py +14 -0
- AeroViz/dataProcess/VOC/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/dataProcess/VOC/_potential_par.py +108 -0
- AeroViz/dataProcess/VOC/support_voc.json +446 -0
- AeroViz/dataProcess/__init__.py +66 -0
- AeroViz/dataProcess/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/dataProcess/core/__init__.py +272 -0
- AeroViz/dataProcess/core/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/mcp_server.py +352 -0
- AeroViz/plot/__init__.py +13 -0
- AeroViz/plot/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/bar.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/box.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/pie.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/radar.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/regression.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/scatter.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/violin.cpython-312.pyc +0 -0
- AeroViz/plot/bar.py +126 -0
- AeroViz/plot/box.py +69 -0
- AeroViz/plot/distribution/__init__.py +1 -0
- AeroViz/plot/distribution/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/distribution/__pycache__/distribution.cpython-312.pyc +0 -0
- AeroViz/plot/distribution/distribution.py +576 -0
- AeroViz/plot/meteorology/CBPF.py +295 -0
- AeroViz/plot/meteorology/__init__.py +3 -0
- AeroViz/plot/meteorology/__pycache__/CBPF.cpython-312.pyc +0 -0
- AeroViz/plot/meteorology/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/meteorology/__pycache__/hysplit.cpython-312.pyc +0 -0
- AeroViz/plot/meteorology/__pycache__/wind_rose.cpython-312.pyc +0 -0
- AeroViz/plot/meteorology/hysplit.py +93 -0
- AeroViz/plot/meteorology/wind_rose.py +77 -0
- AeroViz/plot/optical/__init__.py +1 -0
- AeroViz/plot/optical/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/optical/__pycache__/optical.cpython-312.pyc +0 -0
- AeroViz/plot/optical/optical.py +388 -0
- AeroViz/plot/pie.py +210 -0
- AeroViz/plot/radar.py +184 -0
- AeroViz/plot/regression.py +200 -0
- AeroViz/plot/scatter.py +174 -0
- AeroViz/plot/templates/__init__.py +6 -0
- AeroViz/plot/templates/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/templates/__pycache__/ammonium_rich.cpython-312.pyc +0 -0
- AeroViz/plot/templates/__pycache__/contour.cpython-312.pyc +0 -0
- AeroViz/plot/templates/__pycache__/corr_matrix.cpython-312.pyc +0 -0
- AeroViz/plot/templates/__pycache__/diurnal_pattern.cpython-312.pyc +0 -0
- AeroViz/plot/templates/__pycache__/koschmieder.cpython-312.pyc +0 -0
- AeroViz/plot/templates/__pycache__/metal_heatmap.cpython-312.pyc +0 -0
- AeroViz/plot/templates/ammonium_rich.py +34 -0
- AeroViz/plot/templates/contour.py +47 -0
- AeroViz/plot/templates/corr_matrix.py +267 -0
- AeroViz/plot/templates/diurnal_pattern.py +61 -0
- AeroViz/plot/templates/koschmieder.py +95 -0
- AeroViz/plot/templates/metal_heatmap.py +164 -0
- AeroViz/plot/timeseries/__init__.py +2 -0
- AeroViz/plot/timeseries/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/timeseries/__pycache__/template.cpython-312.pyc +0 -0
- AeroViz/plot/timeseries/__pycache__/timeseries.cpython-312.pyc +0 -0
- AeroViz/plot/timeseries/template.py +47 -0
- AeroViz/plot/timeseries/timeseries.py +446 -0
- AeroViz/plot/utils/__init__.py +4 -0
- AeroViz/plot/utils/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/utils/__pycache__/_color.cpython-312.pyc +0 -0
- AeroViz/plot/utils/__pycache__/_unit.cpython-312.pyc +0 -0
- AeroViz/plot/utils/__pycache__/plt_utils.cpython-312.pyc +0 -0
- AeroViz/plot/utils/__pycache__/sklearn_utils.cpython-312.pyc +0 -0
- AeroViz/plot/utils/_color.py +71 -0
- AeroViz/plot/utils/_unit.py +55 -0
- AeroViz/plot/utils/fRH.json +390 -0
- AeroViz/plot/utils/plt_utils.py +92 -0
- AeroViz/plot/utils/sklearn_utils.py +49 -0
- AeroViz/plot/utils/units.json +89 -0
- AeroViz/plot/violin.py +80 -0
- AeroViz/rawDataReader/FLOW.md +138 -0
- AeroViz/rawDataReader/__init__.py +220 -0
- AeroViz/rawDataReader/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/config/__init__.py +0 -0
- AeroViz/rawDataReader/config/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/config/__pycache__/supported_instruments.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/config/supported_instruments.py +135 -0
- AeroViz/rawDataReader/core/__init__.py +658 -0
- AeroViz/rawDataReader/core/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/core/__pycache__/logger.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/core/__pycache__/pre_process.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/core/__pycache__/qc.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/core/__pycache__/report.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/core/logger.py +171 -0
- AeroViz/rawDataReader/core/pre_process.py +308 -0
- AeroViz/rawDataReader/core/qc.py +961 -0
- AeroViz/rawDataReader/core/report.py +579 -0
- AeroViz/rawDataReader/script/AE33.py +173 -0
- AeroViz/rawDataReader/script/AE43.py +151 -0
- AeroViz/rawDataReader/script/APS.py +339 -0
- AeroViz/rawDataReader/script/Aurora.py +191 -0
- AeroViz/rawDataReader/script/BAM1020.py +90 -0
- AeroViz/rawDataReader/script/BC1054.py +161 -0
- AeroViz/rawDataReader/script/EPA.py +79 -0
- AeroViz/rawDataReader/script/GRIMM.py +68 -0
- AeroViz/rawDataReader/script/IGAC.py +140 -0
- AeroViz/rawDataReader/script/MA350.py +179 -0
- AeroViz/rawDataReader/script/Minion.py +218 -0
- AeroViz/rawDataReader/script/NEPH.py +199 -0
- AeroViz/rawDataReader/script/OCEC.py +173 -0
- AeroViz/rawDataReader/script/Q-ACSM.py +12 -0
- AeroViz/rawDataReader/script/SMPS.py +389 -0
- AeroViz/rawDataReader/script/TEOM.py +181 -0
- AeroViz/rawDataReader/script/VOC.py +106 -0
- AeroViz/rawDataReader/script/Xact.py +244 -0
- AeroViz/rawDataReader/script/__init__.py +28 -0
- AeroViz/rawDataReader/script/__pycache__/AE33.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/AE43.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/APS.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/Aurora.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/BAM1020.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/BC1054.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/EPA.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/GRIMM.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/IGAC.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/MA350.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/Minion.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/NEPH.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/OCEC.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/Q-ACSM.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/SMPS.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/TEOM.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/VOC.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/Xact.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/tools/__init__.py +2 -0
- AeroViz/tools/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/tools/__pycache__/database.cpython-312.pyc +0 -0
- AeroViz/tools/__pycache__/dataclassifier.cpython-312.pyc +0 -0
- AeroViz/tools/database.py +95 -0
- AeroViz/tools/dataclassifier.py +117 -0
- AeroViz/tools/dataprinter.py +58 -0
- aeroviz-0.1.21.dist-info/METADATA +294 -0
- aeroviz-0.1.21.dist-info/RECORD +180 -0
- aeroviz-0.1.21.dist-info/WHEEL +5 -0
- aeroviz-0.1.21.dist-info/licenses/LICENSE +21 -0
- aeroviz-0.1.21.dist-info/top_level.txt +1 -0
AeroViz/rawDataReader/core/qc.py
@@ -0,0 +1,961 @@
from dataclasses import dataclass
from typing import Callable

import numpy as np
import pandas as pd


# =============================================================================
# QC Flag System
# =============================================================================

@dataclass
class QCRule:
    """
    Declarative QC rule definition.

    Parameters
    ----------
    name : str
        Short identifier for the flag (e.g., 'Status Error')
    condition : Callable[[pd.DataFrame], pd.Series]
        Function that takes DataFrame and returns boolean Series
        where True = flagged (problematic data)
    description : str, optional
        Detailed explanation of what this rule checks

    Examples
    --------
    >>> rule = QCRule(
    ...     name='Invalid BC',
    ...     condition=lambda df: (df['BC6'] <= 0) | (df['BC6'] > 20000),
    ...     description='BC concentration outside valid range 0-20000 ng/m³'
    ... )
    """
    name: str
    condition: Callable[[pd.DataFrame], pd.Series]
    description: str = ''

class QCFlagBuilder:
    """
    Centralized QC flag aggregation system.

    This class collects multiple QC rules and applies them efficiently
    using vectorized operations, producing a single QC_Flag column.

    Examples
    --------
    >>> builder = QCFlagBuilder()
    >>> builder.add_rule(QCRule('Invalid Value', lambda df: df['value'] < 0))
    >>> builder.add_rule(QCRule('Missing Data', lambda df: df['value'].isna()))
    >>> df_with_flags = builder.apply(df)
    """

    def __init__(self):
        self.rules: list[QCRule] = []

    def add_rule(self, rule: QCRule) -> 'QCFlagBuilder':
        """Add a QC rule. Returns self for method chaining."""
        self.rules.append(rule)
        return self

    def add_rules(self, rules: list[QCRule]) -> 'QCFlagBuilder':
        """Add multiple QC rules. Returns self for method chaining."""
        self.rules.extend(rules)
        return self

    def apply(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Apply all registered QC rules and add QC_Flag column.

        Parameters
        ----------
        df : pd.DataFrame
            Input DataFrame to apply QC rules to

        Returns
        -------
        pd.DataFrame
            DataFrame with added 'QC_Flag' column containing
            comma-separated flag names or 'Valid'
        """
        if not self.rules:
            df = df.copy()
            df['QC_Flag'] = 'Valid'
            return df

        # Create a mask DataFrame: each column is a boolean mask for one rule
        # This is much faster than iterating row by row
        flag_masks = {}
        for rule in self.rules:
            try:
                mask = rule.condition(df)
                if isinstance(mask, pd.Series):
                    flag_masks[rule.name] = mask
                else:
                    # Handle scalar or array results
                    flag_masks[rule.name] = pd.Series(mask, index=df.index)
            except Exception as e:
                print(f"Warning: QC rule '{rule.name}' failed: {e}")
                flag_masks[rule.name] = pd.Series(False, index=df.index)

        # Convert to DataFrame so the masks can be combined column-wise
        mask_df = pd.DataFrame(flag_masks)

        # Build the flag string for each row: comma-separated names of all
        # rules whose mask is True, or 'Valid' if none fired
        def build_flag_string(row):
            flags = [col for col, val in row.items() if val]
            return ', '.join(flags) if flags else 'Valid'

        # The rule masks are computed vectorized; only the final string
        # assembly falls back to a row-wise apply
        df = df.copy()
        df['QC_Flag'] = mask_df.apply(build_flag_string, axis=1)

        return df

    def get_summary(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Get summary statistics of QC flags.

        Returns DataFrame with counts and percentages for each flag.
        """
        results = []
        total = len(df)
        flagged_mask = pd.Series(False, index=df.index)

        for rule in self.rules:
            try:
                mask = rule.condition(df)
                flagged_mask |= mask
                count = mask.sum()
                results.append({
                    'Rule': rule.name,
                    'Count': count,
                    'Percentage': f'{count / total * 100:.1f}%',
                    'Description': rule.description
                })
            except Exception:
                results.append({
                    'Rule': rule.name,
                    'Count': 'Error',
                    'Percentage': '-',
                    'Description': rule.description
                })

        # Add Valid count
        valid_count = (~flagged_mask).sum()
        results.append({
            'Rule': 'Valid',
            'Count': valid_count,
            'Percentage': f'{valid_count / total * 100:.1f}%',
            'Description': 'Passed all QC checks'
        })

        return pd.DataFrame(results)

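# --- Illustrative usage (a sketch, not part of the packaged module) ---
# A minimal end-to-end run of the QCRule/QCFlagBuilder workflow described
# above; the column name 'BC6' and the thresholds are assumptions.
#
# >>> data = pd.DataFrame({'BC6': [500.0, -1.0, 30000.0, np.nan]})
# >>> builder = QCFlagBuilder().add_rules([
# ...     QCRule('Invalid BC', lambda d: (d['BC6'] <= 0) | (d['BC6'] > 20000)),
# ...     QCRule('Missing Data', lambda d: d['BC6'].isna()),
# ... ])
# >>> flagged = builder.apply(data)        # adds a 'QC_Flag' column
# >>> print(builder.get_summary(data))     # per-rule counts and percentages
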
class QualityControl:
    """A class providing various methods for data quality control and outlier detection"""

    @staticmethod
    def _ensure_dataframe(df: pd.DataFrame | pd.Series) -> pd.DataFrame:
        """Ensure input data is in DataFrame format"""
        return df.to_frame() if isinstance(df, pd.Series) else df

    @staticmethod
    def _transform_if_log(df: pd.DataFrame, log_dist: bool) -> pd.DataFrame:
        """Transform data to log scale if required"""
        return np.log10(df) if log_dist else df

    @classmethod
    def n_sigma(cls, df: pd.DataFrame, std_range: int = 5) -> pd.DataFrame:
        """
        Detect outliers using n-sigma method

        Parameters
        ----------
        df : pd.DataFrame
            Input data
        std_range : int, default=5
            Number of standard deviations to use as threshold

        Returns
        -------
        pd.DataFrame
            Cleaned DataFrame with outliers masked as NaN
        """
        df = cls._ensure_dataframe(df)
        df_ave = df.mean()
        df_std = df.std()

        lower_bound = df < (df_ave - df_std * std_range)
        upper_bound = df > (df_ave + df_std * std_range)

        return df.mask(lower_bound | upper_bound)

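    # Illustrative usage of n_sigma (a sketch, not part of the original file;
    # 'PM25' is a hypothetical column name):
    #
    # >>> cleaned = QualityControl.n_sigma(raw[['PM25']], std_range=5)
    # >>> cleaned['PM25'].isna().sum()  # values beyond mean ± 5·std become NaN
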
    @classmethod
    def iqr(cls, df: pd.DataFrame, log_dist: bool = False) -> pd.DataFrame:
        """
        Detect outliers using Interquartile Range (IQR) method

        Parameters
        ----------
        df : pd.DataFrame
            Input data
        log_dist : bool, default=False
            Whether to apply log transformation to data

        Returns
        -------
        pd.DataFrame
            Cleaned DataFrame with outliers masked as NaN
        """
        df = cls._ensure_dataframe(df)
        df_transformed = cls._transform_if_log(df, log_dist)

        q1 = df_transformed.quantile(0.25)
        q3 = df_transformed.quantile(0.75)
        iqr = q3 - q1

        lower_bound = df_transformed < (q1 - 1.5 * iqr)
        upper_bound = df_transformed > (q3 + 1.5 * iqr)

        return df.mask(lower_bound | upper_bound)

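    # Illustrative usage of iqr (a sketch). With log_dist=True the Q1/Q3 fences
    # are computed in log10 space, which suits roughly log-normal quantities:
    #
    # >>> cleaned = QualityControl.iqr(raw, log_dist=True)
    # >>> # values outside [Q1 - 1.5·IQR, Q3 + 1.5·IQR] (in log space) are NaN
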
    @classmethod
    def time_aware_rolling_iqr(cls, df: pd.DataFrame, window_size: str = '24h',
                               log_dist: bool = False, iqr_factor: float = 5,
                               min_periods: int = 5) -> pd.DataFrame:
        """
        Detect outliers using rolling time-aware IQR method with handling for initial periods

        Parameters
        ----------
        df : pd.DataFrame
            Input data
        window_size : str, default='24h'
            Size of the rolling window
        log_dist : bool, default=False
            Whether to apply log transformation to data
        iqr_factor : float, default=5
            The factor by which to multiply the IQR
        min_periods : int, default=5
            Minimum number of observations required in window

        Returns
        -------
        pd.DataFrame
            Cleaned DataFrame with outliers masked as NaN
        """
        df = cls._ensure_dataframe(df)
        df_transformed = cls._transform_if_log(df, log_dist)

        # Create result DataFrame
        result = pd.DataFrame(index=df.index)

        # Apply rolling IQR to each column
        for col in df_transformed.columns:
            series = df_transformed[col]

            # Calculate global IQR for initial values
            global_q1 = series.quantile(0.25)
            global_q3 = series.quantile(0.75)
            global_iqr = global_q3 - global_q1

            global_lower = global_q1 - iqr_factor * global_iqr
            global_upper = global_q3 + iqr_factor * global_iqr

            # Calculate rolling IQR
            rolling_q1 = series.rolling(window_size, min_periods=min_periods).quantile(0.25)
            rolling_q3 = series.rolling(window_size, min_periods=min_periods).quantile(0.75)
            rolling_iqr = rolling_q3 - rolling_q1

            # Calculate dynamic thresholds
            lower_bound = rolling_q1 - iqr_factor * rolling_iqr
            upper_bound = rolling_q3 + iqr_factor * rolling_iqr

            # Use global thresholds for initial NaN values
            lower_bound = lower_bound.fillna(global_lower)
            upper_bound = upper_bound.fillna(global_upper)

            # Mark data points within thresholds
            mask = (series >= lower_bound) & (series <= upper_bound)
            result[col] = mask

        # Set values in original data that don't meet conditions to NaN
        return df.where(result, np.nan)

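    # Illustrative usage of time_aware_rolling_iqr (a sketch; assumes a
    # DatetimeIndex so pandas can resolve the '24h' rolling window):
    #
    # >>> idx = pd.date_range('2024-01-01', periods=240, freq='6min')
    # >>> raw = pd.DataFrame({'PM25': np.random.lognormal(3, 0.3, 240)}, index=idx)
    # >>> cleaned = QualityControl.time_aware_rolling_iqr(raw, window_size='24h')
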
    def time_aware_std_QC(self, df: pd.DataFrame, time_window: str = '6h',
                          std_factor: float = 3.0, min_periods: int = 4) -> pd.DataFrame:
        """
        Time-aware outlier detection using rolling standard deviation

        Parameters
        ----------
        df : pd.DataFrame
            Input data
        time_window : str, default='6h'
            Rolling window size
        std_factor : float, default=3.0
            Standard deviation multiplier (e.g., 3 means 3σ)
        min_periods : int, default=4
            Minimum number of observations required in window

        Returns
        -------
        pd.DataFrame
            Quality controlled DataFrame with outliers marked as NaN
        """
        df = self._ensure_dataframe(df)

        # Create result DataFrame
        result = pd.DataFrame(index=df.index)

        # Apply rolling standard deviation to each column
        for col in df.columns:
            series = df[col]

            # Calculate global standard deviation for initial values
            global_mean = series.mean()
            global_std = series.std()

            global_lower = global_mean - std_factor * global_std
            global_upper = global_mean + std_factor * global_std

            # Calculate rolling mean and standard deviation
            rolling_mean = series.rolling(time_window, min_periods=min_periods).mean()
            rolling_std = series.rolling(time_window, min_periods=min_periods).std()

            # Calculate dynamic thresholds
            lower_bound = rolling_mean - std_factor * rolling_std
            upper_bound = rolling_mean + std_factor * rolling_std

            # Use global thresholds for initial NaN values
            lower_bound = lower_bound.fillna(global_lower)
            upper_bound = upper_bound.fillna(global_upper)

            # Mark data points within thresholds
            mask = (series >= lower_bound) & (series <= upper_bound)
            result[col] = mask

        # Set values in original data that don't meet conditions to NaN
        return df.where(result, np.nan)

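    # Illustrative usage of time_aware_std_QC (a sketch). Note that, unlike the
    # surrounding classmethods, this one is defined as an instance method, so
    # it is called on a QualityControl() instance:
    #
    # >>> qc = QualityControl()
    # >>> cleaned = qc.time_aware_std_QC(raw, time_window='6h', std_factor=3.0)
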
    @classmethod
    def bidirectional_trend_std_QC(cls, df: pd.DataFrame, window_size: str = '6h',
                                   std_factor: float = 3.0, trend_window: str = '30min',
                                   trend_factor: float = 2, min_periods: int = 4) -> pd.Series:
        """
        Perform quality control using standard deviation with awareness of both upward and downward trends.

        This method identifies outliers considering both upward and downward trends in the data,
        applying more lenient criteria when consistent trends are detected.

        Parameters
        ----------
        df : pd.DataFrame
            Input data frame with time series (QC_Flag column is now optional)
        window_size : str, default='6h'
            Size of the rolling window for std calculation
        std_factor : float, default=3.0
            Base factor for standard deviation threshold
        trend_window : str, default='30min'
            Window for trend detection
        trend_factor : float, default=2
            Factor to increase std_factor when trends are detected
        min_periods : int, default=4
            Minimum number of observations in window

        Returns
        -------
        pd.Series
            Boolean mask where True indicates outliers
        """
        df = cls._ensure_dataframe(df)

        # Use a pre-allocated NumPy array instead of a pandas Series
        index = df.index
        n_rows = len(index)
        outlier_array = np.zeros(n_rows, dtype=bool)  # more efficient initialization

        # Only process numeric columns; skip non-numeric columns such as QC_Flag
        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()  # as list for faster indexing

        # Pre-compute the rolling window size in points rather than time.
        # This only applies to fixed-frequency data; irregular data keeps the original time window.
        try:
            if hasattr(df.index, 'freq') and df.index.freq is not None:
                # Convert the time windows to point counts
                window_points = int(pd.Timedelta(window_size) / df.index.freq)
                trend_points = int(pd.Timedelta(trend_window) / df.index.freq)
                use_points = True
            else:
                # Try to compute the average time interval
                if isinstance(df.index, pd.DatetimeIndex) and len(df.index) > 1:
                    avg_interval = (df.index[-1] - df.index[0]) / (len(df.index) - 1)
                    window_points = int(pd.Timedelta(window_size) / avg_interval)
                    trend_points = int(pd.Timedelta(trend_window) / avg_interval)
                    use_points = True
                else:
                    use_points = False
                    window_points = None
                    trend_points = None
        except Exception:
            use_points = False
            window_points = None
            trend_points = None

        # Pre-compile the trend calculation function with numba (if available)
        try:
            import numba

            @numba.jit(nopython=True)
            def calc_trend_numba(values):
                n = len(values)
                if n > 3:
                    # More efficient linear regression implementation
                    x = np.arange(n)
                    sum_x = np.sum(x)
                    sum_y = np.sum(values)
                    sum_xx = np.sum(x * x)
                    sum_xy = np.sum(x * values)

                    # Compute the slope
                    denom = (n * sum_xx - sum_x * sum_x)
                    if denom != 0:
                        slope = (n * sum_xy - sum_x * sum_y) / denom
                        return slope
                return 0.0

            use_numba = True
        except ImportError:
            use_numba = False

            # Fallback function
            def calc_trend_numba(values):
                n = len(values)
                if n > 3:
                    try:
                        return np.polyfit(range(len(values)), values, 1)[0]
                    except Exception:
                        return 0
                return 0

        # Process each column in parallel
        try:
            from concurrent.futures import ThreadPoolExecutor
            from functools import partial

            def process_column(col, df, use_points, window_points, trend_points, std_factor,
                               min_periods, trend_factor, use_numba):
                # Extract the column from the DataFrame
                if isinstance(df, pd.DataFrame):
                    series = df[col].values
                else:
                    # A Series was passed in directly
                    series = df.values

                # Handle NaN values
                valid_mask = ~np.isnan(series)
                valid_indices = np.where(valid_mask)[0]

                if len(valid_indices) < min_periods:
                    return np.zeros(len(series), dtype=bool)

                # Global statistics are computed from valid values only
                valid_values = series[valid_mask]
                global_mean = np.mean(valid_values)
                global_std = np.std(valid_values)
                if global_std == 0:
                    global_std = 1e-6  # avoid division by zero

                # Initialize the result array
                col_outlier_mask = np.zeros(len(series), dtype=bool)

                # Rolling statistics calculation
                # Point-count-based rolling calculation
                if use_points and window_points is not None and window_points > 0:
                    # Initialize arrays
                    rolling_mean = np.full_like(series, np.nan, dtype=float)
                    rolling_std = np.full_like(series, np.nan, dtype=float)
                    trends = np.full_like(series, np.nan, dtype=float)
                    trend_significance = np.full_like(series, np.nan, dtype=float)

                    # Manually implement the rolling window
                    for i in valid_indices:
                        # Rolling mean and standard deviation
                        start_idx = max(0, i - window_points + 1)
                        window_vals = series[start_idx:i + 1]
                        valid_window = window_vals[~np.isnan(window_vals)]

                        if len(valid_window) >= min_periods:
                            rolling_mean[i] = np.mean(valid_window)
                            rolling_std[i] = np.std(valid_window)

                        # Trend calculation
                        if trend_points > 0:
                            trend_start = max(0, i - trend_points + 1)
                            trend_vals = series[trend_start:i + 1]
                            valid_trend = trend_vals[~np.isnan(trend_vals)]

                            if len(valid_trend) >= 3:
                                # numba-accelerated trend calculation
                                trends[i] = calc_trend_numba(valid_trend)
                                trend_std = np.std(valid_trend)
                                if trend_std > 0:
                                    trend_significance[i] = abs(trends[i]) / trend_std

                    # Compute the rolling rate of change
                    pct_change = np.full_like(series, np.nan, dtype=float)
                    for i in range(1, len(series)):
                        if not np.isnan(series[i]) and not np.isnan(series[i - 1]) and series[i - 1] != 0:
                            pct_change[i] = abs((series[i] - series[i - 1]) / series[i - 1])

                    # Rolling average rate of change
                    avg_change_rates = np.full_like(series, np.nan, dtype=float)
                    for i in valid_indices:
                        if trend_points > 0:
                            rate_start = max(0, i - trend_points + 1)
                            rate_vals = pct_change[rate_start:i + 1]
                            valid_rates = rate_vals[~np.isnan(rate_vals)]

                            if len(valid_rates) >= 3:
                                avg_change_rates[i] = np.mean(valid_rates)
                else:
                    # Use pandas rolling windows (for time-indexed data)
                    # Note: we actually need a temporary Series with a time index here
                    temp_series = pd.Series(series, index=df.index)

                    # Compute rolling statistics
                    rolling_mean = temp_series.rolling(window_size, min_periods=min_periods).mean().values
                    rolling_std = temp_series.rolling(window_size, min_periods=min_periods).std().values

                    # Trend calculation
                    if use_numba:
                        # Use apply + numba
                        trend_series = temp_series.rolling(trend_window, min_periods=3).apply(
                            lambda x: calc_trend_numba(x.values))
                    else:
                        # Use the built-in apply + polyfit
                        trend_series = temp_series.rolling(trend_window, min_periods=3).apply(
                            lambda x: np.polyfit(range(len(x)), x, 1)[0] if len(x) > 3 else 0)

                    trends = trend_series.values

                    # Compute trend significance
                    series_std = temp_series.rolling(trend_window, min_periods=3).std().values
                    trend_significance = np.zeros_like(trends)
                    for i in range(len(trends)):
                        if not np.isnan(trends[i]) and not np.isnan(series_std[i]) and series_std[i] > 0:
                            trend_significance[i] = abs(trends[i]) / series_std[i]
                        elif not np.isnan(trends[i]):
                            trend_significance[i] = abs(trends[i]) / (global_std * 0.1)

                    # Compute the rate of change
                    pct_change = temp_series.pct_change(fill_method=None).abs().values

                    # Reuse temp_series to compute the rolling average rate of change
                    temp_change_series = pd.Series(pct_change, index=df.index)
                    avg_change_rates = temp_change_series.rolling(trend_window, min_periods=3).mean().values

                # Dynamically adjust the standard deviation factor
                dynamic_factor = np.full(len(series), std_factor)
                for i in valid_indices:
                    if not np.isnan(trend_significance[i]) and trend_significance[i] > 0.1:
                        dynamic_factor[i] = std_factor * trend_factor

                # Adjust extremely low standard deviations
                min_std = global_std * 0.1
                adjusted_std = np.copy(rolling_std)
                for i in valid_indices:
                    if not np.isnan(adjusted_std[i]) and adjusted_std[i] < min_std:
                        adjusted_std[i] = min_std

                # Compute thresholds
                lower_bound = np.full_like(series, np.nan, dtype=float)
                upper_bound = np.full_like(series, np.nan, dtype=float)

                for i in valid_indices:
                    if not np.isnan(rolling_mean[i]) and not np.isnan(adjusted_std[i]):
                        lower_bound[i] = rolling_mean[i] - dynamic_factor[i] * adjusted_std[i]
                        upper_bound[i] = rolling_mean[i] + dynamic_factor[i] * adjusted_std[i]
                    else:
                        # Use global statistics
                        lower_bound[i] = global_mean - std_factor * global_std
                        upper_bound[i] = global_mean + std_factor * global_std

                # Mark points outside the thresholds
                for i in valid_indices:
                    if not (lower_bound[i] <= series[i] <= upper_bound[i]):
                        col_outlier_mask[i] = True

                # Trend consistency check
                trend_consistent = np.zeros_like(col_outlier_mask, dtype=bool)
                for i in valid_indices:
                    if i > 0 and not np.isnan(pct_change[i]) and not np.isnan(avg_change_rates[i]):
                        trend_consistent[i] = pct_change[i] <= (avg_change_rates[i] * 3)

                # Significant trend check
                significant_trend_mask = np.zeros_like(col_outlier_mask, dtype=bool)
                for i in valid_indices:
                    if not np.isnan(trend_significance[i]) and trend_significance[i] > 0.1:
                        significant_trend_mask[i] = True

                # Final mask: flag a point only if it is out of range and does not
                # follow a significant, consistent trend
                col_final_mask = col_outlier_mask.copy()
                for i in valid_indices:
                    if col_outlier_mask[i] and trend_consistent[i] and significant_trend_mask[i]:
                        col_final_mask[i] = False

                return col_final_mask

            # Try parallel processing
            with ThreadPoolExecutor(max_workers=min(4, len(numeric_cols))) as executor:
                col_results = list(executor.map(
                    partial(process_column, df=df, use_points=use_points,
                            window_points=window_points, trend_points=trend_points,
                            std_factor=std_factor, min_periods=min_periods,
                            trend_factor=trend_factor, use_numba=use_numba),
                    numeric_cols))

            # Merge the results
            for col_mask in col_results:
                outlier_array = outlier_array | col_mask

        except Exception as e:
            # If parallel processing fails, fall back to the original implementation
            print(f"Warning: Parallel processing failed, falling back to original implementation. Error: {e}")

            # Create the result mask - initially all False (not outliers)
            outlier_mask = pd.Series(False, index=df.index)

            for col in numeric_cols:
                series = df[col]

                # Compute global statistics
                global_mean = series.mean()
                global_std = series.std()

                # Detect trend direction and strength
                def calc_trend(x):
                    if len(x) > 3:
                        try:
                            return np.polyfit(range(len(x)), x, 1)[0]
                        except Exception:
                            return 0
                    return 0

                trend = series.rolling(trend_window, min_periods=3).apply(calc_trend)

                # Compute trend significance
                series_std = series.rolling(trend_window, min_periods=3).std()
                # Avoid division by zero
                trend_significance = np.abs(trend) / series_std.replace(0, np.nan).fillna(global_std * 0.1)

                # Dynamic factor adjustment
                dynamic_factor = pd.Series(std_factor, index=df.index)
                significant_trend = trend_significance > 0.1
                dynamic_factor[significant_trend] = std_factor * trend_factor

                # Compute rolling statistics
                rolling_mean = series.rolling(window_size, min_periods=min_periods).mean()
                rolling_std = series.rolling(window_size, min_periods=min_periods).std()

                # Adjust extremely low standard deviations
                min_std_threshold = global_std * 0.1
                adjusted_std = rolling_std.clip(lower=min_std_threshold)

                # Compute thresholds
                lower_bound = rolling_mean - dynamic_factor * adjusted_std
                upper_bound = rolling_mean + dynamic_factor * adjusted_std

                # Fill initial NaN values
                lower_bound = lower_bound.fillna(global_mean - std_factor * global_std)
                upper_bound = upper_bound.fillna(global_mean + std_factor * global_std)

                # Check rate-of-change consistency
                rate_of_change = series.pct_change(fill_method=None).abs()
                avg_change_rate = rate_of_change.rolling(trend_window, min_periods=3).mean()

                # Mark outliers
                col_outlier_mask = ~((series >= lower_bound) & (series <= upper_bound))
                trend_consistent = rate_of_change <= (avg_change_rate * 3)

                # Final mask: flag a point only if it is out of range and not part
                # of a consistent, significant trend
                col_final_outlier_mask = col_outlier_mask & ~(col_outlier_mask & trend_consistent & significant_trend)

                # Update the overall mask - flag if any column is anomalous
                outlier_mask = outlier_mask | col_final_outlier_mask

            return outlier_mask

        # Convert back to a pandas Series
        return pd.Series(outlier_array, index=index)

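    # Illustrative usage of bidirectional_trend_std_QC (a sketch). It returns a
    # boolean outlier mask rather than a cleaned frame, so the caller applies it:
    #
    # >>> outliers = QualityControl.bidirectional_trend_std_QC(raw, window_size='6h')
    # >>> cleaned = raw.mask(outliers)  # row-wise mask broadcast over columns
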
    @staticmethod
    def filter_error_status(_df, error_codes=None, special_codes=None, return_mask=True,
                            status_column='Status', status_type='bitwise', ok_value=None):
        """
        Filter data based on error status codes.

        Parameters
        ----------
        _df : pd.DataFrame
            Input DataFrame
        error_codes : list or array-like, optional
            Codes indicating errors (for 'bitwise' type)
        special_codes : list or array-like, optional
            Special codes to handle differently (exact match)
        return_mask : bool, default=True
            If True, returns a boolean mask where True indicates errors;
            If False, returns filtered DataFrame
        status_column : str, default='Status'
            Name of the status column in DataFrame
        status_type : str, default='bitwise'
            Type of status check:
            - 'bitwise': Use bitwise AND to check error codes (AE33, AE43, BC1054, MA350)
            - 'numeric': Check if status != ok_value (TEOM, Aurora, NEPH)
            - 'text': Check if status != ok_value as string (SMPS)
            - 'binary_string': Parse binary string and check if > 0 (APS)
        ok_value : any, optional
            The value indicating OK status (for 'numeric', 'text' types)
            - For 'numeric': typically 0
            - For 'text': typically 'Normal Scan'

        Returns
        -------
        Union[pd.DataFrame, pd.Series]
            If return_mask=True: boolean Series with True for error points
            If return_mask=False: Filtered DataFrame with error points masked
        """
        # Check if status column exists
        if status_column not in _df.columns:
            # No status column, return all False (no errors)
            if return_mask:
                return pd.Series(False, index=_df.index)
            else:
                return _df

        # Create an empty mask
        error_mask = pd.Series(False, index=_df.index)

        if status_type == 'bitwise':
            # Original bitwise logic for AE33, AE43, BC1054, MA350
            status_values = pd.to_numeric(_df[status_column], errors='coerce').fillna(0).astype(int)

            # Bitwise test normal error codes
            if error_codes:
                for code in error_codes:
                    error_mask = error_mask | ((status_values & code) != 0)

            # Exact matching for special codes
            if special_codes:
                error_mask = error_mask | status_values.isin(special_codes)

        elif status_type == 'numeric':
            # Simple numeric comparison for TEOM, Aurora, NEPH
            status_values = pd.to_numeric(_df[status_column], errors='coerce')
            if ok_value is not None:
                error_mask = (status_values != ok_value) & status_values.notna()
            else:
                # Default: 0 is OK
                error_mask = (status_values != 0) & status_values.notna()

        elif status_type == 'text':
            # Text comparison for SMPS
            status_values = _df[status_column].astype(str).str.strip()
            if ok_value is not None:
                error_mask = (status_values != ok_value) & (status_values != '') & (status_values != 'nan')
            else:
                # No ok_value specified, can't determine errors
                error_mask = pd.Series(False, index=_df.index)

        elif status_type == 'binary_string':
            # Binary string parsing for APS ('0000 0000 0000 0000')
            def parse_binary_status(status_str):
                if not isinstance(status_str, str) or status_str in ('nan', ''):
                    return 0
                binary_str = status_str.replace(' ', '')
                try:
                    return int(binary_str, 2)
                except ValueError:
                    return 0

            status_values = _df[status_column].apply(parse_binary_status)
            error_mask = status_values > 0

        else:
            raise ValueError(f"Unknown status_type: {status_type}")

        # Return either the mask or the filtered DataFrame
        if return_mask:
            return error_mask
        else:
            return _df.mask(error_mask)

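    # Illustrative usage of filter_error_status (a sketch; the error-code values
    # below are assumptions, not the instrument scripts' actual configuration):
    #
    # >>> # bitwise status word, as used for AE33/AE43-style instruments
    # >>> err = QualityControl.filter_error_status(raw, error_codes=[1, 2, 4],
    # ...                                          status_type='bitwise')
    # >>> # text status, as used for SMPS-style scan-state columns
    # >>> err = QualityControl.filter_error_status(raw, status_type='text',
    # ...                                          ok_value='Normal Scan')
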
    @classmethod
    def spike_detection(cls, df: pd.DataFrame,
                        max_change_rate: float = 3.0,
                        min_abs_change: float = None) -> pd.Series:
        """
        Vectorized spike detection using change rate analysis.

        Detects sudden unreasonable value changes while allowing legitimate
        gradual changes during events (pollution episodes, etc.).

        This method is much faster than rolling window methods because it uses
        pure numpy vectorized operations.

        Parameters
        ----------
        df : pd.DataFrame
            Input data frame with time series
        max_change_rate : float, default=3.0
            Maximum allowed ratio of current change to median absolute change.
            Higher values = more permissive. A value of 3.0 means a change
            must be 3x larger than the median change to be flagged.
        min_abs_change : float, optional
            Minimum absolute change required to be considered a spike.
            If None, uses 10% of the data's standard deviation.

        Returns
        -------
        pd.Series
            Boolean mask where True indicates detected spikes

        Notes
        -----
        The algorithm:
        1. Calculate absolute difference between consecutive points
        2. Calculate the median absolute change (robust baseline)
        3. Flag points where change > max_change_rate * median_change
        4. Also detect "reversals" (spike up then immediately down)

        This approach allows gradual changes during events while catching
        sudden spikes that are likely instrument errors.

        Examples
        --------
        >>> qc = QualityControl()
        >>> spike_mask = qc.spike_detection(df, max_change_rate=3.0)
        """
        df = cls._ensure_dataframe(df)

        # Initialize result mask
        spike_mask = pd.Series(False, index=df.index)

        # Process each numeric column
        numeric_cols = df.select_dtypes(include=np.number).columns

        for col in numeric_cols:
            values = df[col].values
            n = len(values)

            if n < 3:
                continue

            # Calculate absolute differences (vectorized)
            diff = np.abs(np.diff(values))

            # Handle NaN values
            valid_diff = diff[~np.isnan(diff)]

            if len(valid_diff) < 3:
                continue

            # Calculate median absolute change (robust measure)
            median_change = np.median(valid_diff)

            # Set minimum threshold
            if min_abs_change is None:
                # Use 10% of std as minimum meaningful change
                std_val = np.nanstd(values)
                min_threshold = std_val * 0.1
            else:
                min_threshold = min_abs_change

            # Ensure median_change is not too small
            if median_change < min_threshold:
                median_change = min_threshold

            # Calculate spike threshold
            spike_threshold = max_change_rate * median_change

            # Detect spikes: diff[i] is the change from values[i] to values[i+1]
            # So spike at index i+1 if diff[i] > threshold
            large_changes = diff > spike_threshold

            # Detect reversals: sudden up then immediate down (or vice versa)
            # A reversal at index i means: sign(diff[i-1]) != sign(diff[i])
            # and both changes are large
            signed_diff = np.diff(values)  # Keep sign for reversal detection

            # Reversal detection (vectorized)
            # Check if consecutive changes have opposite signs and both are significant
            if len(signed_diff) >= 2:
                sign_change = signed_diff[:-1] * signed_diff[1:] < 0  # Opposite signs
                both_large = (np.abs(signed_diff[:-1]) > spike_threshold * 0.5) & \
                             (np.abs(signed_diff[1:]) > spike_threshold * 0.5)
                reversals = sign_change & both_large

                # Mark the middle point of a reversal as spike
                # reversals[i] indicates reversal at values[i+1]
                col_spike_mask = np.zeros(n, dtype=bool)

                # Large changes: mark the point after the change
                col_spike_mask[1:] = large_changes

                # Reversals: mark the middle point (already aligned to i+1)
                col_spike_mask[1:-1] = col_spike_mask[1:-1] | reversals
            else:
                col_spike_mask = np.zeros(n, dtype=bool)
                col_spike_mask[1:] = large_changes

            # Update overall mask
            spike_mask = spike_mask | pd.Series(col_spike_mask, index=df.index)

        return spike_mask

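    # Illustrative behavior of spike_detection (a sketch with made-up values):
    #
    # >>> s = pd.Series([1.0, 1.1, 1.2, 9.0, 1.3, 1.4])
    # >>> QualityControl.spike_detection(s, max_change_rate=3.0)
    # Both the jump to 9.0 and the drop back to baseline exceed 3x the
    # effective median change, so indices 3 and 4 are marked True, while
    # the gradual 0.1-step ramp elsewhere passes.
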
    @classmethod
    def hourly_completeness_QC(cls, df: pd.DataFrame, freq: str,
                               threshold: float = 0.5) -> pd.Series:
        """
        Check if each hour has sufficient data points.

        Parameters
        ----------
        df : pd.DataFrame
            Input data frame with time series
        freq : str
            Data frequency (e.g., '6min')
        threshold : float, default=0.5
            Minimum required proportion of data points per hour (0-1)

        Returns
        -------
        pd.Series
            Boolean mask where True indicates insufficient data
        """
        # Ensure input is DataFrame
        df = cls._ensure_dataframe(df)

        # Create result mask
        completeness_mask = pd.Series(False, index=df.index)

        # Calculate expected data points per hour
        points_per_hour = pd.Timedelta('1h') / pd.Timedelta(freq)
        min_points = points_per_hour * threshold

        # Only process numeric columns
        numeric_cols = df.select_dtypes(include=np.number).columns

        for col in numeric_cols:
            # Calculate actual data points per hour
            hourly_count = df[col].dropna().resample('1h').size().reindex(df.index).ffill()

            # Mark points with insufficient data
            insufficient_mask = hourly_count < min_points
            completeness_mask = completeness_mask | insufficient_mask

        return completeness_mask
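
# Illustrative usage of hourly_completeness_QC (a sketch; assumes a
# DatetimeIndex for resample). With 6-minute data an hour holds 10 points,
# so threshold=0.5 requires at least 5 valid points per hour:
#
# >>> sparse_mask = QualityControl.hourly_completeness_QC(raw, freq='6min',
# ...                                                     threshold=0.5)
# >>> cleaned = raw.mask(sparse_mask)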