aeroviz-0.1.21-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. AeroViz/__init__.py +13 -0
  2. AeroViz/__pycache__/__init__.cpython-312.pyc +0 -0
  3. AeroViz/data/DEFAULT_DATA.csv +1417 -0
  4. AeroViz/data/DEFAULT_PNSD_DATA.csv +1417 -0
  5. AeroViz/data/hysplit_example_data.txt +101 -0
  6. AeroViz/dataProcess/Chemistry/__init__.py +149 -0
  7. AeroViz/dataProcess/Chemistry/__pycache__/__init__.cpython-312.pyc +0 -0
  8. AeroViz/dataProcess/Chemistry/_calculate.py +557 -0
  9. AeroViz/dataProcess/Chemistry/_isoropia.py +150 -0
  10. AeroViz/dataProcess/Chemistry/_mass_volume.py +487 -0
  11. AeroViz/dataProcess/Chemistry/_ocec.py +172 -0
  12. AeroViz/dataProcess/Chemistry/isrpia.cnf +21 -0
  13. AeroViz/dataProcess/Chemistry/isrpia2.exe +0 -0
  14. AeroViz/dataProcess/Optical/PyMieScatt_update.py +577 -0
  15. AeroViz/dataProcess/Optical/_IMPROVE.py +452 -0
  16. AeroViz/dataProcess/Optical/__init__.py +281 -0
  17. AeroViz/dataProcess/Optical/__pycache__/PyMieScatt_update.cpython-312.pyc +0 -0
  18. AeroViz/dataProcess/Optical/__pycache__/__init__.cpython-312.pyc +0 -0
  19. AeroViz/dataProcess/Optical/__pycache__/mie_theory.cpython-312.pyc +0 -0
  20. AeroViz/dataProcess/Optical/_derived.py +518 -0
  21. AeroViz/dataProcess/Optical/_extinction.py +123 -0
  22. AeroViz/dataProcess/Optical/_mie_sd.py +912 -0
  23. AeroViz/dataProcess/Optical/_retrieve_RI.py +243 -0
  24. AeroViz/dataProcess/Optical/coefficient.py +72 -0
  25. AeroViz/dataProcess/Optical/fRH.pkl +0 -0
  26. AeroViz/dataProcess/Optical/mie_theory.py +260 -0
  27. AeroViz/dataProcess/README.md +271 -0
  28. AeroViz/dataProcess/SizeDistr/__init__.py +245 -0
  29. AeroViz/dataProcess/SizeDistr/__pycache__/__init__.cpython-312.pyc +0 -0
  30. AeroViz/dataProcess/SizeDistr/__pycache__/_size_dist.cpython-312.pyc +0 -0
  31. AeroViz/dataProcess/SizeDistr/_size_dist.py +810 -0
  32. AeroViz/dataProcess/SizeDistr/merge/README.md +93 -0
  33. AeroViz/dataProcess/SizeDistr/merge/__init__.py +20 -0
  34. AeroViz/dataProcess/SizeDistr/merge/_merge_v0.py +251 -0
  35. AeroViz/dataProcess/SizeDistr/merge/_merge_v0_1.py +246 -0
  36. AeroViz/dataProcess/SizeDistr/merge/_merge_v1.py +255 -0
  37. AeroViz/dataProcess/SizeDistr/merge/_merge_v2.py +244 -0
  38. AeroViz/dataProcess/SizeDistr/merge/_merge_v3.py +518 -0
  39. AeroViz/dataProcess/SizeDistr/merge/_merge_v4.py +422 -0
  40. AeroViz/dataProcess/SizeDistr/prop.py +62 -0
  41. AeroViz/dataProcess/VOC/__init__.py +14 -0
  42. AeroViz/dataProcess/VOC/__pycache__/__init__.cpython-312.pyc +0 -0
  43. AeroViz/dataProcess/VOC/_potential_par.py +108 -0
  44. AeroViz/dataProcess/VOC/support_voc.json +446 -0
  45. AeroViz/dataProcess/__init__.py +66 -0
  46. AeroViz/dataProcess/__pycache__/__init__.cpython-312.pyc +0 -0
  47. AeroViz/dataProcess/core/__init__.py +272 -0
  48. AeroViz/dataProcess/core/__pycache__/__init__.cpython-312.pyc +0 -0
  49. AeroViz/mcp_server.py +352 -0
  50. AeroViz/plot/__init__.py +13 -0
  51. AeroViz/plot/__pycache__/__init__.cpython-312.pyc +0 -0
  52. AeroViz/plot/__pycache__/bar.cpython-312.pyc +0 -0
  53. AeroViz/plot/__pycache__/box.cpython-312.pyc +0 -0
  54. AeroViz/plot/__pycache__/pie.cpython-312.pyc +0 -0
  55. AeroViz/plot/__pycache__/radar.cpython-312.pyc +0 -0
  56. AeroViz/plot/__pycache__/regression.cpython-312.pyc +0 -0
  57. AeroViz/plot/__pycache__/scatter.cpython-312.pyc +0 -0
  58. AeroViz/plot/__pycache__/violin.cpython-312.pyc +0 -0
  59. AeroViz/plot/bar.py +126 -0
  60. AeroViz/plot/box.py +69 -0
  61. AeroViz/plot/distribution/__init__.py +1 -0
  62. AeroViz/plot/distribution/__pycache__/__init__.cpython-312.pyc +0 -0
  63. AeroViz/plot/distribution/__pycache__/distribution.cpython-312.pyc +0 -0
  64. AeroViz/plot/distribution/distribution.py +576 -0
  65. AeroViz/plot/meteorology/CBPF.py +295 -0
  66. AeroViz/plot/meteorology/__init__.py +3 -0
  67. AeroViz/plot/meteorology/__pycache__/CBPF.cpython-312.pyc +0 -0
  68. AeroViz/plot/meteorology/__pycache__/__init__.cpython-312.pyc +0 -0
  69. AeroViz/plot/meteorology/__pycache__/hysplit.cpython-312.pyc +0 -0
  70. AeroViz/plot/meteorology/__pycache__/wind_rose.cpython-312.pyc +0 -0
  71. AeroViz/plot/meteorology/hysplit.py +93 -0
  72. AeroViz/plot/meteorology/wind_rose.py +77 -0
  73. AeroViz/plot/optical/__init__.py +1 -0
  74. AeroViz/plot/optical/__pycache__/__init__.cpython-312.pyc +0 -0
  75. AeroViz/plot/optical/__pycache__/optical.cpython-312.pyc +0 -0
  76. AeroViz/plot/optical/optical.py +388 -0
  77. AeroViz/plot/pie.py +210 -0
  78. AeroViz/plot/radar.py +184 -0
  79. AeroViz/plot/regression.py +200 -0
  80. AeroViz/plot/scatter.py +174 -0
  81. AeroViz/plot/templates/__init__.py +6 -0
  82. AeroViz/plot/templates/__pycache__/__init__.cpython-312.pyc +0 -0
  83. AeroViz/plot/templates/__pycache__/ammonium_rich.cpython-312.pyc +0 -0
  84. AeroViz/plot/templates/__pycache__/contour.cpython-312.pyc +0 -0
  85. AeroViz/plot/templates/__pycache__/corr_matrix.cpython-312.pyc +0 -0
  86. AeroViz/plot/templates/__pycache__/diurnal_pattern.cpython-312.pyc +0 -0
  87. AeroViz/plot/templates/__pycache__/koschmieder.cpython-312.pyc +0 -0
  88. AeroViz/plot/templates/__pycache__/metal_heatmap.cpython-312.pyc +0 -0
  89. AeroViz/plot/templates/ammonium_rich.py +34 -0
  90. AeroViz/plot/templates/contour.py +47 -0
  91. AeroViz/plot/templates/corr_matrix.py +267 -0
  92. AeroViz/plot/templates/diurnal_pattern.py +61 -0
  93. AeroViz/plot/templates/koschmieder.py +95 -0
  94. AeroViz/plot/templates/metal_heatmap.py +164 -0
  95. AeroViz/plot/timeseries/__init__.py +2 -0
  96. AeroViz/plot/timeseries/__pycache__/__init__.cpython-312.pyc +0 -0
  97. AeroViz/plot/timeseries/__pycache__/template.cpython-312.pyc +0 -0
  98. AeroViz/plot/timeseries/__pycache__/timeseries.cpython-312.pyc +0 -0
  99. AeroViz/plot/timeseries/template.py +47 -0
  100. AeroViz/plot/timeseries/timeseries.py +446 -0
  101. AeroViz/plot/utils/__init__.py +4 -0
  102. AeroViz/plot/utils/__pycache__/__init__.cpython-312.pyc +0 -0
  103. AeroViz/plot/utils/__pycache__/_color.cpython-312.pyc +0 -0
  104. AeroViz/plot/utils/__pycache__/_unit.cpython-312.pyc +0 -0
  105. AeroViz/plot/utils/__pycache__/plt_utils.cpython-312.pyc +0 -0
  106. AeroViz/plot/utils/__pycache__/sklearn_utils.cpython-312.pyc +0 -0
  107. AeroViz/plot/utils/_color.py +71 -0
  108. AeroViz/plot/utils/_unit.py +55 -0
  109. AeroViz/plot/utils/fRH.json +390 -0
  110. AeroViz/plot/utils/plt_utils.py +92 -0
  111. AeroViz/plot/utils/sklearn_utils.py +49 -0
  112. AeroViz/plot/utils/units.json +89 -0
  113. AeroViz/plot/violin.py +80 -0
  114. AeroViz/rawDataReader/FLOW.md +138 -0
  115. AeroViz/rawDataReader/__init__.py +220 -0
  116. AeroViz/rawDataReader/__pycache__/__init__.cpython-312.pyc +0 -0
  117. AeroViz/rawDataReader/config/__init__.py +0 -0
  118. AeroViz/rawDataReader/config/__pycache__/__init__.cpython-312.pyc +0 -0
  119. AeroViz/rawDataReader/config/__pycache__/supported_instruments.cpython-312.pyc +0 -0
  120. AeroViz/rawDataReader/config/supported_instruments.py +135 -0
  121. AeroViz/rawDataReader/core/__init__.py +658 -0
  122. AeroViz/rawDataReader/core/__pycache__/__init__.cpython-312.pyc +0 -0
  123. AeroViz/rawDataReader/core/__pycache__/logger.cpython-312.pyc +0 -0
  124. AeroViz/rawDataReader/core/__pycache__/pre_process.cpython-312.pyc +0 -0
  125. AeroViz/rawDataReader/core/__pycache__/qc.cpython-312.pyc +0 -0
  126. AeroViz/rawDataReader/core/__pycache__/report.cpython-312.pyc +0 -0
  127. AeroViz/rawDataReader/core/logger.py +171 -0
  128. AeroViz/rawDataReader/core/pre_process.py +308 -0
  129. AeroViz/rawDataReader/core/qc.py +961 -0
  130. AeroViz/rawDataReader/core/report.py +579 -0
  131. AeroViz/rawDataReader/script/AE33.py +173 -0
  132. AeroViz/rawDataReader/script/AE43.py +151 -0
  133. AeroViz/rawDataReader/script/APS.py +339 -0
  134. AeroViz/rawDataReader/script/Aurora.py +191 -0
  135. AeroViz/rawDataReader/script/BAM1020.py +90 -0
  136. AeroViz/rawDataReader/script/BC1054.py +161 -0
  137. AeroViz/rawDataReader/script/EPA.py +79 -0
  138. AeroViz/rawDataReader/script/GRIMM.py +68 -0
  139. AeroViz/rawDataReader/script/IGAC.py +140 -0
  140. AeroViz/rawDataReader/script/MA350.py +179 -0
  141. AeroViz/rawDataReader/script/Minion.py +218 -0
  142. AeroViz/rawDataReader/script/NEPH.py +199 -0
  143. AeroViz/rawDataReader/script/OCEC.py +173 -0
  144. AeroViz/rawDataReader/script/Q-ACSM.py +12 -0
  145. AeroViz/rawDataReader/script/SMPS.py +389 -0
  146. AeroViz/rawDataReader/script/TEOM.py +181 -0
  147. AeroViz/rawDataReader/script/VOC.py +106 -0
  148. AeroViz/rawDataReader/script/Xact.py +244 -0
  149. AeroViz/rawDataReader/script/__init__.py +28 -0
  150. AeroViz/rawDataReader/script/__pycache__/AE33.cpython-312.pyc +0 -0
  151. AeroViz/rawDataReader/script/__pycache__/AE43.cpython-312.pyc +0 -0
  152. AeroViz/rawDataReader/script/__pycache__/APS.cpython-312.pyc +0 -0
  153. AeroViz/rawDataReader/script/__pycache__/Aurora.cpython-312.pyc +0 -0
  154. AeroViz/rawDataReader/script/__pycache__/BAM1020.cpython-312.pyc +0 -0
  155. AeroViz/rawDataReader/script/__pycache__/BC1054.cpython-312.pyc +0 -0
  156. AeroViz/rawDataReader/script/__pycache__/EPA.cpython-312.pyc +0 -0
  157. AeroViz/rawDataReader/script/__pycache__/GRIMM.cpython-312.pyc +0 -0
  158. AeroViz/rawDataReader/script/__pycache__/IGAC.cpython-312.pyc +0 -0
  159. AeroViz/rawDataReader/script/__pycache__/MA350.cpython-312.pyc +0 -0
  160. AeroViz/rawDataReader/script/__pycache__/Minion.cpython-312.pyc +0 -0
  161. AeroViz/rawDataReader/script/__pycache__/NEPH.cpython-312.pyc +0 -0
  162. AeroViz/rawDataReader/script/__pycache__/OCEC.cpython-312.pyc +0 -0
  163. AeroViz/rawDataReader/script/__pycache__/Q-ACSM.cpython-312.pyc +0 -0
  164. AeroViz/rawDataReader/script/__pycache__/SMPS.cpython-312.pyc +0 -0
  165. AeroViz/rawDataReader/script/__pycache__/TEOM.cpython-312.pyc +0 -0
  166. AeroViz/rawDataReader/script/__pycache__/VOC.cpython-312.pyc +0 -0
  167. AeroViz/rawDataReader/script/__pycache__/Xact.cpython-312.pyc +0 -0
  168. AeroViz/rawDataReader/script/__pycache__/__init__.cpython-312.pyc +0 -0
  169. AeroViz/tools/__init__.py +2 -0
  170. AeroViz/tools/__pycache__/__init__.cpython-312.pyc +0 -0
  171. AeroViz/tools/__pycache__/database.cpython-312.pyc +0 -0
  172. AeroViz/tools/__pycache__/dataclassifier.cpython-312.pyc +0 -0
  173. AeroViz/tools/database.py +95 -0
  174. AeroViz/tools/dataclassifier.py +117 -0
  175. AeroViz/tools/dataprinter.py +58 -0
  176. aeroviz-0.1.21.dist-info/METADATA +294 -0
  177. aeroviz-0.1.21.dist-info/RECORD +180 -0
  178. aeroviz-0.1.21.dist-info/WHEEL +5 -0
  179. aeroviz-0.1.21.dist-info/licenses/LICENSE +21 -0
  180. aeroviz-0.1.21.dist-info/top_level.txt +1 -0
AeroViz/rawDataReader/core/qc.py
@@ -0,0 +1,961 @@
+ from dataclasses import dataclass
+ from typing import Callable
+
+ import numpy as np
+ import pandas as pd
+
+
+ # =============================================================================
+ # QC Flag System
+ # =============================================================================
+
+ @dataclass
+ class QCRule:
+     """
+     Declarative QC rule definition.
+
+     Parameters
+     ----------
+     name : str
+         Short identifier for the flag (e.g., 'Status Error')
+     condition : Callable[[pd.DataFrame], pd.Series]
+         Function that takes a DataFrame and returns a boolean Series
+         where True = flagged (problematic data)
+     description : str, optional
+         Detailed explanation of what this rule checks
+
+     Examples
+     --------
+     >>> rule = QCRule(
+     ...     name='Invalid BC',
+     ...     condition=lambda df: (df['BC6'] <= 0) | (df['BC6'] > 20000),
+     ...     description='BC concentration outside valid range 0-20000 ng/m³'
+     ... )
+     """
+     name: str
+     condition: Callable[[pd.DataFrame], pd.Series]
+     description: str = ''
+
+
+ class QCFlagBuilder:
+     """
+     Centralized QC flag aggregation system.
+
+     This class collects multiple QC rules and applies them efficiently
+     using vectorized operations, producing a single QC_Flag column.
+
+     Examples
+     --------
+     >>> builder = QCFlagBuilder()
+     >>> builder.add_rule(QCRule('Invalid Value', lambda df: df['value'] < 0))
+     >>> builder.add_rule(QCRule('Missing Data', lambda df: df['value'].isna()))
+     >>> df_with_flags = builder.apply(df)
+     """
+
+     def __init__(self):
+         self.rules: list[QCRule] = []
+
+     def add_rule(self, rule: QCRule) -> 'QCFlagBuilder':
+         """Add a QC rule. Returns self for method chaining."""
+         self.rules.append(rule)
+         return self
+
+     def add_rules(self, rules: list[QCRule]) -> 'QCFlagBuilder':
+         """Add multiple QC rules. Returns self for method chaining."""
+         self.rules.extend(rules)
+         return self
+
+     def apply(self, df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Apply all registered QC rules and add a QC_Flag column.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Input DataFrame to apply QC rules to
+
+         Returns
+         -------
+         pd.DataFrame
+             DataFrame with an added 'QC_Flag' column containing
+             comma-separated flag names or 'Valid'
+         """
+         if not self.rules:
+             df = df.copy()
+             df['QC_Flag'] = 'Valid'
+             return df
+
+         # Create a mask DataFrame: each column is a boolean mask for one rule
+         # This is much faster than iterating row by row
+         flag_masks = {}
+         for rule in self.rules:
+             try:
+                 mask = rule.condition(df)
+                 if isinstance(mask, pd.Series):
+                     flag_masks[rule.name] = mask
+                 else:
+                     # Handle scalar or array results
+                     flag_masks[rule.name] = pd.Series(mask, index=df.index)
+             except Exception as e:
+                 print(f"Warning: QC rule '{rule.name}' failed: {e}")
+                 flag_masks[rule.name] = pd.Series(False, index=df.index)
+
+         # Convert to DataFrame for vectorized boolean operations
+         mask_df = pd.DataFrame(flag_masks)
+
+         # Build the flag string for each row
+         def build_flag_string(row):
+             flags = [col for col, val in row.items() if val]
+             return ', '.join(flags) if flags else 'Valid'
+
+         # String assembly cannot be fully vectorized, so fall back to a row-wise apply
+         df = df.copy()
+         df['QC_Flag'] = mask_df.apply(build_flag_string, axis=1)
+
+         return df
+
+     def get_summary(self, df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Get summary statistics of QC flags.
+
+         Returns a DataFrame with counts and percentages for each flag.
+         """
+         results = []
+         total = len(df)
+         flagged_mask = pd.Series(False, index=df.index)
+
+         for rule in self.rules:
+             try:
+                 mask = rule.condition(df)
+                 flagged_mask |= mask
+                 count = mask.sum()
+                 results.append({
+                     'Rule': rule.name,
+                     'Count': count,
+                     'Percentage': f'{count / total * 100:.1f}%',
+                     'Description': rule.description
+                 })
+             except Exception:
+                 results.append({
+                     'Rule': rule.name,
+                     'Count': 'Error',
+                     'Percentage': '-',
+                     'Description': rule.description
+                 })
+
+         # Add Valid count
+         valid_count = (~flagged_mask).sum()
+         results.append({
+             'Rule': 'Valid',
+             'Count': valid_count,
+             'Percentage': f'{valid_count / total * 100:.1f}%',
+             'Description': 'Passed all QC checks'
+         })
+
+         return pd.DataFrame(results)
+
+
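A minimal end-to-end sketch of the flag system above (the column name, thresholds, and data are illustrative, not taken from the package):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'BC6': [500.0, -3.0, np.nan, 25000.0]},
                      index=pd.date_range('2024-01-01', periods=4, freq='h'))

    builder = QCFlagBuilder()
    builder.add_rules([
        QCRule('Invalid BC', lambda d: (d['BC6'] <= 0) | (d['BC6'] > 20000),
               'BC concentration outside valid range 0-20000 ng/m3'),
        QCRule('Missing Data', lambda d: d['BC6'].isna(), 'No measurement'),
    ])

    flagged = builder.apply(df)     # adds a 'QC_Flag' column
    print(builder.get_summary(df))  # per-rule counts and percentages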
+ class QualityControl:
+     """A class providing various methods for data quality control and outlier detection"""
+
+     @staticmethod
+     def _ensure_dataframe(df: pd.DataFrame | pd.Series) -> pd.DataFrame:
+         """Ensure input data is in DataFrame format"""
+         return df.to_frame() if isinstance(df, pd.Series) else df
+
+     @staticmethod
+     def _transform_if_log(df: pd.DataFrame, log_dist: bool) -> pd.DataFrame:
+         """Transform data to log scale if required"""
+         return np.log10(df) if log_dist else df
+
+     @classmethod
+     def n_sigma(cls, df: pd.DataFrame, std_range: int = 5) -> pd.DataFrame:
+         """
+         Detect outliers using n-sigma method
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Input data
+         std_range : int, default=5
+             Number of standard deviations to use as threshold
+
+         Returns
+         -------
+         pd.DataFrame
+             Cleaned DataFrame with outliers masked as NaN
+         """
+         df = cls._ensure_dataframe(df)
+         df_ave = df.mean()
+         df_std = df.std()
+
+         lower_bound = df < (df_ave - df_std * std_range)
+         upper_bound = df > (df_ave + df_std * std_range)
+
+         return df.mask(lower_bound | upper_bound)
+
+     @classmethod
+     def iqr(cls, df: pd.DataFrame, log_dist: bool = False) -> pd.DataFrame:
+         """
+         Detect outliers using Interquartile Range (IQR) method
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Input data
+         log_dist : bool, default=False
+             Whether to apply log transformation to data
+
+         Returns
+         -------
+         pd.DataFrame
+             Cleaned DataFrame with outliers masked as NaN
+         """
+         df = cls._ensure_dataframe(df)
+         df_transformed = cls._transform_if_log(df, log_dist)
+
+         q1 = df_transformed.quantile(0.25)
+         q3 = df_transformed.quantile(0.75)
+         iqr = q3 - q1
+
+         lower_bound = df_transformed < (q1 - 1.5 * iqr)
+         upper_bound = df_transformed > (q3 + 1.5 * iqr)
+
+         return df.mask(lower_bound | upper_bound)
+
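Both n_sigma and iqr return a copy of the input with outliers replaced by NaN. A quick sketch on synthetic data (illustrative only):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    df = pd.DataFrame({'PM25': rng.normal(20, 5, 1000)})
    df.iloc[10] = 500.0  # inject an obvious outlier

    by_sigma = QualityControl.n_sigma(df, std_range=5)
    by_iqr = QualityControl.iqr(df)
    print(by_sigma['PM25'].isna().sum(), by_iqr['PM25'].isna().sum())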
+     @classmethod
+     def time_aware_rolling_iqr(cls, df: pd.DataFrame, window_size: str = '24h',
+                                log_dist: bool = False, iqr_factor: float = 5,
+                                min_periods: int = 5) -> pd.DataFrame:
+         """
+         Detect outliers using a rolling time-aware IQR method with handling for initial periods
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Input data
+         window_size : str, default='24h'
+             Size of the rolling window
+         log_dist : bool, default=False
+             Whether to apply log transformation to data
+         iqr_factor : float, default=5
+             The factor by which to multiply the IQR
+         min_periods : int, default=5
+             Minimum number of observations required in window
+
+         Returns
+         -------
+         pd.DataFrame
+             Cleaned DataFrame with outliers masked as NaN
+         """
+         df = cls._ensure_dataframe(df)
+         df_transformed = cls._transform_if_log(df, log_dist)
+
+         # Create result DataFrame
+         result = pd.DataFrame(index=df.index)
+
+         # Apply rolling IQR to each column
+         for col in df_transformed.columns:
+             series = df_transformed[col]
+
+             # Calculate global IQR for initial values
+             global_q1 = series.quantile(0.25)
+             global_q3 = series.quantile(0.75)
+             global_iqr = global_q3 - global_q1
+
+             global_lower = global_q1 - iqr_factor * global_iqr
+             global_upper = global_q3 + iqr_factor * global_iqr
+
+             # Calculate rolling IQR
+             rolling_q1 = series.rolling(window_size, min_periods=min_periods).quantile(0.25)
+             rolling_q3 = series.rolling(window_size, min_periods=min_periods).quantile(0.75)
+             rolling_iqr = rolling_q3 - rolling_q1
+
+             # Calculate dynamic thresholds
+             lower_bound = rolling_q1 - iqr_factor * rolling_iqr
+             upper_bound = rolling_q3 + iqr_factor * rolling_iqr
+
+             # Use global thresholds for initial NaN values
+             lower_bound = lower_bound.fillna(global_lower)
+             upper_bound = upper_bound.fillna(global_upper)
+
+             # Mark data points within thresholds
+             mask = (series >= lower_bound) & (series <= upper_bound)
+             result[col] = mask
+
+         # Set values in original data that don't meet conditions to NaN
+         return df.where(result, np.nan)
+
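time_aware_rolling_iqr needs a DatetimeIndex so the string window (e.g. '24h') can be resolved; a sketch under that assumption:

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2024-01-01', periods=288, freq='5min')
    scat = pd.Series(50 + 10 * np.sin(np.linspace(0, 8, 288)), index=idx, name='scat')
    scat.iloc[100] = 500.0  # single bad reading

    cleaned = QualityControl.time_aware_rolling_iqr(scat, window_size='6h',
                                                    iqr_factor=5, min_periods=5)
    print(cleaned['scat'].iloc[100])  # NaN: the spike is masked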
+     def time_aware_std_QC(self, df: pd.DataFrame, time_window: str = '6h',
+                           std_factor: float = 3.0, min_periods: int = 4) -> pd.DataFrame:
+         """
+         Time-aware outlier detection using rolling standard deviation
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Input data
+         time_window : str, default='6h'
+             Rolling window size
+         std_factor : float, default=3.0
+             Standard deviation multiplier (e.g., 3 means 3σ)
+         min_periods : int, default=4
+             Minimum number of observations required in window
+
+         Returns
+         -------
+         pd.DataFrame
+             Quality controlled DataFrame with outliers marked as NaN
+         """
+         df = self._ensure_dataframe(df)
+
+         # Create result DataFrame
+         result = pd.DataFrame(index=df.index)
+
+         # Apply rolling standard deviation to each column
+         for col in df.columns:
+             series = df[col]
+
+             # Calculate global standard deviation for initial values
+             global_mean = series.mean()
+             global_std = series.std()
+
+             global_lower = global_mean - std_factor * global_std
+             global_upper = global_mean + std_factor * global_std
+
+             # Calculate rolling mean and standard deviation
+             rolling_mean = series.rolling(time_window, min_periods=min_periods).mean()
+             rolling_std = series.rolling(time_window, min_periods=min_periods).std()
+
+             # Calculate dynamic thresholds
+             lower_bound = rolling_mean - std_factor * rolling_std
+             upper_bound = rolling_mean + std_factor * rolling_std
+
+             # Use global thresholds for initial NaN values
+             lower_bound = lower_bound.fillna(global_lower)
+             upper_bound = upper_bound.fillna(global_upper)
+
+             # Mark data points within thresholds
+             mask = (series >= lower_bound) & (series <= upper_bound)
+             result[col] = mask
+
+         # Set values in original data that don't meet conditions to NaN
+         return df.where(result, np.nan)
+
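time_aware_std_QC is written as an instance method, so it is called through an instance; otherwise the pattern is the same as above (data below is illustrative):

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2024-01-01', periods=144, freq='5min')
    df = pd.DataFrame({'scat': np.full(144, 30.0)}, index=idx)
    df.iloc[72] = 300.0  # step error in an otherwise flat signal

    qc = QualityControl()
    cleaned = qc.time_aware_std_QC(df, time_window='6h', std_factor=3.0)
    print(int(cleaned['scat'].isna().sum()))  # only the injected point is masked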
+     @classmethod
+     def bidirectional_trend_std_QC(cls, df: pd.DataFrame, window_size: str = '6h',
+                                    std_factor: float = 3.0, trend_window: str = '30min',
+                                    trend_factor: float = 2, min_periods: int = 4) -> pd.Series:
+         """
+         Perform quality control using standard deviation with awareness of both upward and downward trends.
+
+         This method identifies outliers considering both upward and downward trends in the data,
+         applying more lenient criteria when consistent trends are detected.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Input data frame with time series (QC_Flag column is now optional)
+         window_size : str, default='6h'
+             Size of the rolling window for std calculation
+         std_factor : float, default=3.0
+             Base factor for standard deviation threshold
+         trend_window : str, default='30min'
+             Window for trend detection
+         trend_factor : float, default=2
+             Factor to increase std_factor when trends are detected
+         min_periods : int, default=4
+             Minimum number of observations in window
+
+         Returns
+         -------
+         pd.Series
+             Boolean mask where True indicates outliers
+         """
+         df = cls._ensure_dataframe(df)
+
+         # Use a pre-allocated NumPy array instead of a pandas Series
+         index = df.index
+         n_rows = len(index)
+         outlier_array = np.zeros(n_rows, dtype=bool)  # more efficient initialization
+
+         # Only process numeric columns; skip non-numeric columns such as QC_Flag
+         numeric_cols = df.select_dtypes(include=np.number).columns.tolist()  # as a list for faster indexing
+
+         # Pre-compute the rolling window size in points rather than time.
+         # This only works for fixed-frequency data; irregular data keeps the original time window.
+         try:
+             if hasattr(df.index, 'freq') and df.index.freq is not None:
+                 # Convert the time windows to numbers of points
+                 window_points = int(pd.Timedelta(window_size) / df.index.freq)
+                 trend_points = int(pd.Timedelta(trend_window) / df.index.freq)
+                 use_points = True
+             else:
+                 # Try to estimate the average sampling interval
+                 if isinstance(df.index, pd.DatetimeIndex) and len(df.index) > 1:
+                     avg_interval = (df.index[-1] - df.index[0]) / (len(df.index) - 1)
+                     window_points = int(pd.Timedelta(window_size) / avg_interval)
+                     trend_points = int(pd.Timedelta(trend_window) / avg_interval)
+                     use_points = True
+                 else:
+                     use_points = False
+                     window_points = None
+                     trend_points = None
+         except Exception:
+             use_points = False
+             window_points = None
+             trend_points = None
+
+         # Pre-compile the trend function with numba, if available
+         try:
+             import numba
+
+             @numba.jit(nopython=True)
+             def calc_trend_numba(values):
+                 n = len(values)
+                 if n > 3:
+                     # More efficient linear regression implementation
+                     x = np.arange(n)
+                     sum_x = np.sum(x)
+                     sum_y = np.sum(values)
+                     sum_xx = np.sum(x * x)
+                     sum_xy = np.sum(x * values)
+
+                     # Compute the slope
+                     denom = (n * sum_xx - sum_x * sum_x)
+                     if denom != 0:
+                         slope = (n * sum_xy - sum_x * sum_y) / denom
+                         return slope
+                 return 0.0
+
+             use_numba = True
+         except ImportError:
+             use_numba = False
+
+             # Fallback implementation
+             def calc_trend_numba(values):
+                 n = len(values)
+                 if n > 3:
+                     try:
+                         return np.polyfit(range(len(values)), values, 1)[0]
+                     except Exception:
+                         return 0
+                 return 0
+
+         # Process the columns in parallel
+         try:
+             from concurrent.futures import ThreadPoolExecutor
+             from functools import partial
+
+             def process_column(col, df, use_points, window_points, trend_points, std_factor,
+                                min_periods, trend_factor, use_numba):
+                 # Extract the column from the DataFrame
+                 if isinstance(df, pd.DataFrame):
+                     series = df[col].values
+                 else:
+                     # A Series was passed in directly
+                     series = df.values
+
+                 # Handle NaN values
+                 valid_mask = ~np.isnan(series)
+                 valid_indices = np.where(valid_mask)[0]
+
+                 if len(valid_indices) < min_periods:
+                     return np.zeros(len(series), dtype=bool)
+
+                 # Global statistics are computed from valid values only
+                 valid_values = series[valid_mask]
+                 global_mean = np.mean(valid_values)
+                 global_std = np.std(valid_values)
+                 if global_std == 0:
+                     global_std = 1e-6  # avoid division by zero
+
+                 # Initialize the result array
+                 col_outlier_mask = np.zeros(len(series), dtype=bool)
+
+                 # Rolling statistics
+                 # Point-count-based rolling computation
+                 if use_points and window_points is not None and window_points > 0:
+                     # Initialize arrays
+                     rolling_mean = np.full_like(series, np.nan, dtype=float)
+                     rolling_std = np.full_like(series, np.nan, dtype=float)
+                     trends = np.full_like(series, np.nan, dtype=float)
+                     trend_significance = np.full_like(series, np.nan, dtype=float)
+
+                     # Manually implemented rolling window
+                     for i in valid_indices:
+                         # Rolling mean and standard deviation
+                         start_idx = max(0, i - window_points + 1)
+                         window_vals = series[start_idx:i + 1]
+                         valid_window = window_vals[~np.isnan(window_vals)]
+
+                         if len(valid_window) >= min_periods:
+                             rolling_mean[i] = np.mean(valid_window)
+                             rolling_std[i] = np.std(valid_window)
+
+                         # Trend calculation
+                         if trend_points > 0:
+                             trend_start = max(0, i - trend_points + 1)
+                             trend_vals = series[trend_start:i + 1]
+                             valid_trend = trend_vals[~np.isnan(trend_vals)]
+
+                             if len(valid_trend) >= 3:
+                                 # numba-accelerated trend calculation
+                                 trends[i] = calc_trend_numba(valid_trend)
+                                 trend_std = np.std(valid_trend)
+                                 if trend_std > 0:
+                                     trend_significance[i] = abs(trends[i]) / trend_std
+
+                     # Compute the point-to-point rate of change
+                     pct_change = np.full_like(series, np.nan, dtype=float)
+                     for i in range(1, len(series)):
+                         if not np.isnan(series[i]) and not np.isnan(series[i - 1]) and series[i - 1] != 0:
+                             pct_change[i] = abs((series[i] - series[i - 1]) / series[i - 1])
+
+                     # Rolling average rate of change
+                     avg_change_rates = np.full_like(series, np.nan, dtype=float)
+                     for i in valid_indices:
+                         if trend_points > 0:
+                             rate_start = max(0, i - trend_points + 1)
+                             rate_vals = pct_change[rate_start:i + 1]
+                             valid_rates = rate_vals[~np.isnan(rate_vals)]
+
+                             if len(valid_rates) >= 3:
+                                 avg_change_rates[i] = np.mean(valid_rates)
+                 else:
+                     # Use pandas rolling windows (for time-indexed data)
+                     # Note: this requires a temporary Series carrying the time index
+                     temp_series = pd.Series(series, index=df.index)
+
+                     # Rolling statistics
+                     rolling_mean = temp_series.rolling(window_size, min_periods=min_periods).mean().values
+                     rolling_std = temp_series.rolling(window_size, min_periods=min_periods).std().values
+
+                     # Trend calculation
+                     if use_numba:
+                         # apply with the numba-compiled function
+                         trend_series = temp_series.rolling(trend_window, min_periods=3).apply(
+                             lambda x: calc_trend_numba(x.values))
+                     else:
+                         # plain apply with polyfit
+                         trend_series = temp_series.rolling(trend_window, min_periods=3).apply(
+                             lambda x: np.polyfit(range(len(x)), x, 1)[0] if len(x) > 3 else 0)
+
+                     trends = trend_series.values
+
+                     # Compute trend significance
+                     series_std = temp_series.rolling(trend_window, min_periods=3).std().values
+                     trend_significance = np.zeros_like(trends)
+                     for i in range(len(trends)):
+                         if not np.isnan(trends[i]) and not np.isnan(series_std[i]) and series_std[i] > 0:
+                             trend_significance[i] = abs(trends[i]) / series_std[i]
+                         elif not np.isnan(trends[i]):
+                             trend_significance[i] = abs(trends[i]) / (global_std * 0.1)
+
+                     # Rate of change
+                     pct_change = temp_series.pct_change(fill_method=None).abs().values
+
+                     # Rolling average rate of change
+                     temp_change_series = pd.Series(pct_change, index=df.index)
+                     avg_change_rates = temp_change_series.rolling(trend_window, min_periods=3).mean().values
+
+                 # Dynamically adjust the standard-deviation factor
+                 dynamic_factor = np.full(len(series), std_factor)
+                 for i in valid_indices:
+                     if not np.isnan(trend_significance[i]) and trend_significance[i] > 0.1:
+                         dynamic_factor[i] = std_factor * trend_factor
+
+                 # Floor extremely low standard deviations
+                 min_std = global_std * 0.1
+                 adjusted_std = np.copy(rolling_std)
+                 for i in valid_indices:
+                     if not np.isnan(adjusted_std[i]) and adjusted_std[i] < min_std:
+                         adjusted_std[i] = min_std
+
+                 # Compute thresholds
+                 lower_bound = np.full_like(series, np.nan, dtype=float)
+                 upper_bound = np.full_like(series, np.nan, dtype=float)
+
+                 for i in valid_indices:
+                     if not np.isnan(rolling_mean[i]) and not np.isnan(adjusted_std[i]):
+                         lower_bound[i] = rolling_mean[i] - dynamic_factor[i] * adjusted_std[i]
+                         upper_bound[i] = rolling_mean[i] + dynamic_factor[i] * adjusted_std[i]
+                     else:
+                         # Fall back to global statistics
+                         lower_bound[i] = global_mean - std_factor * global_std
+                         upper_bound[i] = global_mean + std_factor * global_std
+
+                 # Flag points outside the thresholds
+                 for i in valid_indices:
+                     if not (lower_bound[i] <= series[i] <= upper_bound[i]):
+                         col_outlier_mask[i] = True
+
+                 # Trend-consistency check
+                 trend_consistent = np.zeros_like(col_outlier_mask, dtype=bool)
+                 for i in valid_indices:
+                     if i > 0 and not np.isnan(pct_change[i]) and not np.isnan(avg_change_rates[i]):
+                         trend_consistent[i] = pct_change[i] <= (avg_change_rates[i] * 3)
+
+                 # Significant-trend check
+                 significant_trend_mask = np.zeros_like(col_outlier_mask, dtype=bool)
+                 for i in valid_indices:
+                     if not np.isnan(trend_significance[i]) and trend_significance[i] > 0.1:
+                         significant_trend_mask[i] = True
+
+                 # Final mask: flag a point only if it is out of range and not part of a significant trend
+                 col_final_mask = col_outlier_mask.copy()
+                 for i in valid_indices:
+                     if col_outlier_mask[i] and trend_consistent[i] and significant_trend_mask[i]:
+                         col_final_mask[i] = False
+
+                 return col_final_mask
+
+             # Try to process columns in parallel
+             with ThreadPoolExecutor(max_workers=min(4, len(numeric_cols))) as executor:
+                 col_results = list(executor.map(
+                     partial(process_column, df=df, use_points=use_points,
+                             window_points=window_points, trend_points=trend_points,
+                             std_factor=std_factor, min_periods=min_periods,
+                             trend_factor=trend_factor, use_numba=use_numba),
+                     numeric_cols))
+
+             # Combine the per-column results
+             for col_mask in col_results:
+                 outlier_array = outlier_array | col_mask
+
+         except Exception as e:
+             # If parallel processing fails, fall back to the original implementation
+             print(f"Warning: Parallel processing failed, falling back to original implementation. Error: {e}")
+
+             # Create the result mask - initially all False (not outliers)
+             outlier_mask = pd.Series(False, index=df.index)
+
+             for col in numeric_cols:
+                 series = df[col]
+
+                 # Compute global statistics
+                 global_mean = series.mean()
+                 global_std = series.std()
+
+                 # Detect trend direction and strength
+                 def calc_trend(x):
+                     if len(x) > 3:
+                         try:
+                             return np.polyfit(range(len(x)), x, 1)[0]
+                         except Exception:
+                             return 0
+                     return 0
+
+                 trend = series.rolling(trend_window, min_periods=3).apply(calc_trend)
+
+                 # Compute trend significance
+                 series_std = series.rolling(trend_window, min_periods=3).std()
+                 # Avoid division by zero
+                 trend_significance = np.abs(trend) / series_std.replace(0, np.nan).fillna(global_std * 0.1)
+
+                 # Dynamic factor adjustment
+                 dynamic_factor = pd.Series(std_factor, index=df.index)
+                 significant_trend = trend_significance > 0.1
+                 dynamic_factor[significant_trend] = std_factor * trend_factor
+
+                 # Compute rolling statistics
+                 rolling_mean = series.rolling(window_size, min_periods=min_periods).mean()
+                 rolling_std = series.rolling(window_size, min_periods=min_periods).std()
+
+                 # Floor extremely low standard deviations
+                 min_std_threshold = global_std * 0.1
+                 adjusted_std = rolling_std.clip(lower=min_std_threshold)
+
+                 # Compute thresholds
+                 lower_bound = rolling_mean - dynamic_factor * adjusted_std
+                 upper_bound = rolling_mean + dynamic_factor * adjusted_std
+
+                 # Fill initial NaN values
+                 lower_bound = lower_bound.fillna(global_mean - std_factor * global_std)
+                 upper_bound = upper_bound.fillna(global_mean + std_factor * global_std)
+
+                 # Check rate-of-change consistency
+                 rate_of_change = series.pct_change(fill_method=None).abs()
+                 avg_change_rate = rate_of_change.rolling(trend_window, min_periods=3).mean()
+
+                 # Flag outliers
+                 col_outlier_mask = ~((series >= lower_bound) & (series <= upper_bound))
+                 trend_consistent = rate_of_change <= (avg_change_rate * 3)
+
+                 # Final mask: flag a point only if it is out of range and not part of a consistent trend
+                 col_final_outlier_mask = col_outlier_mask & ~(col_outlier_mask & trend_consistent & significant_trend)
+
+                 # Update the overall mask - a row is flagged if any column flags it
+                 outlier_mask = outlier_mask | col_final_outlier_mask
+
+             return outlier_mask
+
+         # Convert back to a pandas Series
+         return pd.Series(outlier_array, index=index)
+
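Unlike the earlier methods, this one returns a boolean mask, so masking is left to the caller. A sketch, assuming a fixed-frequency DatetimeIndex (which selects the point-based fast path); data and parameters are illustrative:

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2024-01-01', periods=288, freq='5min')
    df = pd.DataFrame({'BC': np.linspace(1000, 4000, 288)}, index=idx)  # steady ramp
    df.iloc[150] = 50000.0  # one-point spike riding on the trend

    mask = QualityControl.bidirectional_trend_std_QC(df, window_size='6h',
                                                     std_factor=3.0,
                                                     trend_window='30min')
    cleaned = df.mask(mask)  # the ramp should survive; the spike should not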
+     @staticmethod
+     def filter_error_status(_df, error_codes=None, special_codes=None, return_mask=True,
+                             status_column='Status', status_type='bitwise', ok_value=None):
+         """
+         Filter data based on error status codes.
+
+         Parameters
+         ----------
+         _df : pd.DataFrame
+             Input DataFrame
+         error_codes : list or array-like, optional
+             Codes indicating errors (for 'bitwise' type)
+         special_codes : list or array-like, optional
+             Special codes to handle differently (exact match)
+         return_mask : bool, default=True
+             If True, returns a boolean mask where True indicates errors;
+             if False, returns the filtered DataFrame
+         status_column : str, default='Status'
+             Name of the status column in the DataFrame
+         status_type : str, default='bitwise'
+             Type of status check:
+             - 'bitwise': Use bitwise AND to check error codes (AE33, AE43, BC1054, MA350)
+             - 'numeric': Check if status != ok_value (TEOM, Aurora, NEPH)
+             - 'text': Check if status != ok_value as string (SMPS)
+             - 'binary_string': Parse binary string and check if > 0 (APS)
+         ok_value : any, optional
+             The value indicating OK status (for 'numeric' and 'text' types)
+             - For 'numeric': typically 0
+             - For 'text': typically 'Normal Scan'
+
+         Returns
+         -------
+         Union[pd.DataFrame, pd.Series]
+             If return_mask=True: boolean Series with True for error points
+             If return_mask=False: filtered DataFrame with error points masked
+         """
+         # Check if the status column exists
+         if status_column not in _df.columns:
+             # No status column, return all False (no errors)
+             if return_mask:
+                 return pd.Series(False, index=_df.index)
+             else:
+                 return _df
+
+         # Create an empty mask
+         error_mask = pd.Series(False, index=_df.index)
+
+         if status_type == 'bitwise':
+             # Original bitwise logic for AE33, AE43, BC1054, MA350
+             status_values = pd.to_numeric(_df[status_column], errors='coerce').fillna(0).astype(int)
+
+             # Bitwise-AND test against the regular error codes
+             if error_codes:
+                 for code in error_codes:
+                     error_mask = error_mask | ((status_values & code) != 0)
+
+             # Exact matching for special codes
+             if special_codes:
+                 error_mask = error_mask | status_values.isin(special_codes)
+
+         elif status_type == 'numeric':
+             # Simple numeric comparison for TEOM, Aurora, NEPH
+             status_values = pd.to_numeric(_df[status_column], errors='coerce')
+             if ok_value is not None:
+                 error_mask = (status_values != ok_value) & status_values.notna()
+             else:
+                 # Default: 0 is OK
+                 error_mask = (status_values != 0) & status_values.notna()
+
+         elif status_type == 'text':
+             # Text comparison for SMPS
+             status_values = _df[status_column].astype(str).str.strip()
+             if ok_value is not None:
+                 error_mask = (status_values != ok_value) & (status_values != '') & (status_values != 'nan')
+             else:
+                 # No ok_value specified, so errors cannot be determined
+                 error_mask = pd.Series(False, index=_df.index)
+
+         elif status_type == 'binary_string':
+             # Binary string parsing for APS ('0000 0000 0000 0000')
+             def parse_binary_status(status_str):
+                 if not isinstance(status_str, str) or status_str in ('nan', ''):
+                     return 0
+                 binary_str = status_str.replace(' ', '')
+                 try:
+                     return int(binary_str, 2)
+                 except ValueError:
+                     return 0
+
+             status_values = _df[status_column].apply(parse_binary_status)
+             error_mask = status_values > 0
+
+         else:
+             raise ValueError(f"Unknown status_type: {status_type}")
+
+         # Return either the mask or the filtered DataFrame
+         if return_mask:
+             return error_mask
+         else:
+             return _df.mask(error_mask)
+
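A sketch of the bitwise mode, where each set bit in the status word is an independent error condition (the bit values below are illustrative, not actual instrument codes):

    import pandas as pd

    df = pd.DataFrame({'BC6': [850, 900, 870], 'Status': [0, 1, 32]})

    err = QualityControl.filter_error_status(df, error_codes=[1, 32],
                                             status_type='bitwise')
    print(err.tolist())  # [False, True, True]

    cleaned = QualityControl.filter_error_status(df, error_codes=[1, 32],
                                                 status_type='bitwise',
                                                 return_mask=False)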
+     @classmethod
+     def spike_detection(cls, df: pd.DataFrame,
+                         max_change_rate: float = 3.0,
+                         min_abs_change: float = None) -> pd.Series:
+         """
+         Vectorized spike detection using change rate analysis.
+
+         Detects sudden unreasonable value changes while allowing legitimate
+         gradual changes during events (pollution episodes, etc.).
+
+         This method is much faster than rolling window methods because it uses
+         pure numpy vectorized operations.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Input data frame with time series
+         max_change_rate : float, default=3.0
+             Maximum allowed ratio of current change to median absolute change.
+             Higher values = more permissive. A value of 3.0 means a change
+             must be 3x larger than the median change to be flagged.
+         min_abs_change : float, optional
+             Minimum absolute change required to be considered a spike.
+             If None, uses 10% of the data's standard deviation.
+
+         Returns
+         -------
+         pd.Series
+             Boolean mask where True indicates detected spikes
+
+         Notes
+         -----
+         The algorithm:
+         1. Calculate absolute difference between consecutive points
+         2. Calculate the median absolute change (robust baseline)
+         3. Flag points where change > max_change_rate * median_change
+         4. Also detect "reversals" (spike up then immediately down)
+
+         This approach allows gradual changes during events while catching
+         sudden spikes that are likely instrument errors.
+
+         Examples
+         --------
+         >>> qc = QualityControl()
+         >>> spike_mask = qc.spike_detection(df, max_change_rate=3.0)
+         """
+         df = cls._ensure_dataframe(df)
+
+         # Initialize result mask
+         spike_mask = pd.Series(False, index=df.index)
+
+         # Process each numeric column
+         numeric_cols = df.select_dtypes(include=np.number).columns
+
+         for col in numeric_cols:
+             values = df[col].values
+             n = len(values)
+
+             if n < 3:
+                 continue
+
+             # Calculate absolute differences (vectorized)
+             diff = np.abs(np.diff(values))
+
+             # Handle NaN values
+             valid_diff = diff[~np.isnan(diff)]
+
+             if len(valid_diff) < 3:
+                 continue
+
+             # Calculate median absolute change (robust measure)
+             median_change = np.median(valid_diff)
+
+             # Set minimum threshold
+             if min_abs_change is None:
+                 # Use 10% of std as minimum meaningful change
+                 std_val = np.nanstd(values)
+                 min_threshold = std_val * 0.1
+             else:
+                 min_threshold = min_abs_change
+
+             # Ensure median_change is not too small
+             if median_change < min_threshold:
+                 median_change = min_threshold
+
+             # Calculate spike threshold
+             spike_threshold = max_change_rate * median_change
+
+             # Detect spikes: diff[i] is the change from values[i] to values[i+1],
+             # so there is a spike at index i+1 if diff[i] > threshold
+             large_changes = diff > spike_threshold
+
+             # Detect reversals: sudden up then immediate down (or vice versa).
+             # A reversal at index i means sign(diff[i-1]) != sign(diff[i])
+             # and both changes are large
+             signed_diff = np.diff(values)  # keep sign for reversal detection
+
+             # Reversal detection (vectorized):
+             # check if consecutive changes have opposite signs and both are significant
+             if len(signed_diff) >= 2:
+                 sign_change = signed_diff[:-1] * signed_diff[1:] < 0  # opposite signs
+                 both_large = (np.abs(signed_diff[:-1]) > spike_threshold * 0.5) & \
+                              (np.abs(signed_diff[1:]) > spike_threshold * 0.5)
+                 reversals = sign_change & both_large
+
+                 # Mark the middle point of a reversal as a spike;
+                 # reversals[i] indicates a reversal at values[i+1]
+                 col_spike_mask = np.zeros(n, dtype=bool)
+
+                 # Large changes: mark the point after the change
+                 col_spike_mask[1:] = large_changes
+
+                 # Reversals: mark the middle point (already aligned to i+1)
+                 col_spike_mask[1:-1] = col_spike_mask[1:-1] | reversals
+             else:
+                 col_spike_mask = np.zeros(n, dtype=bool)
+                 col_spike_mask[1:] = large_changes
+
+             # Update overall mask
+             spike_mask = spike_mask | pd.Series(col_spike_mask, index=df.index)
+
+         return spike_mask
+
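A sketch of the spike detector on synthetic data; because both consecutive changes around a one-sample spike exceed the threshold, the spike and the sample immediately after it are typically flagged together:

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2024-01-01', periods=100, freq='6min')
    vals = 10.0 + np.random.default_rng(1).normal(0, 0.2, 100)
    vals[50] = 80.0  # one-sample spike
    df = pd.DataFrame({'PM10': vals}, index=idx)

    spikes = QualityControl.spike_detection(df, max_change_rate=3.0)
    cleaned = df.mask(spikes)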
+     @classmethod
+     def hourly_completeness_QC(cls, df: pd.DataFrame, freq: str,
+                                threshold: float = 0.5) -> pd.Series:
+         """
+         Check if each hour has sufficient data points.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Input data frame with time series
+         freq : str
+             Data frequency (e.g., '6min')
+         threshold : float, default=0.5
+             Minimum required proportion of data points per hour (0-1)
+
+         Returns
+         -------
+         pd.Series
+             Boolean mask where True indicates insufficient data
+         """
+         # Ensure input is a DataFrame
+         df = cls._ensure_dataframe(df)
+
+         # Create result mask
+         completeness_mask = pd.Series(False, index=df.index)
+
+         # Calculate expected data points per hour
+         points_per_hour = pd.Timedelta('1h') / pd.Timedelta(freq)
+         min_points = points_per_hour * threshold
+
+         # Only process numeric columns
+         numeric_cols = df.select_dtypes(include=np.number).columns
+
+         for col in numeric_cols:
+             # Calculate actual data points per hour
+             hourly_count = df[col].dropna().resample('1h').size().reindex(df.index).ffill()
+
+             # Mark points with insufficient data
+             insufficient_mask = hourly_count < min_points
+             completeness_mask = completeness_mask | insufficient_mask
+
+         return completeness_mask
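A sketch of the completeness check: at '6min' resolution an hour should contain 10 samples, so threshold=0.5 requires at least 5 valid points per hour (data below is illustrative):

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2024-01-01', periods=20, freq='6min')
    df = pd.DataFrame({'OC': np.ones(20)}, index=idx)
    df.iloc[2:9] = np.nan  # hour 0 keeps only 3 valid points

    sparse = QualityControl.hourly_completeness_QC(df, freq='6min', threshold=0.5)
    cleaned = df.mask(sparse)  # every row in hour 0 is masked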