AeroViz 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of AeroViz might be problematic.
- AeroViz/data/240228_00.txt +101 -0
- AeroViz/dataProcess/Chemistry/_ocec.py +20 -7
- AeroViz/plot/__init__.py +2 -0
- AeroViz/plot/hysplit/__init__.py +1 -0
- AeroViz/plot/hysplit/hysplit.py +79 -0
- AeroViz/plot/meteorology/meteorology.py +2 -0
- AeroViz/plot/optical/optical.py +60 -59
- AeroViz/plot/pie.py +14 -2
- AeroViz/plot/radar.py +184 -0
- AeroViz/plot/scatter.py +16 -7
- AeroViz/plot/templates/diurnal_pattern.py +24 -7
- AeroViz/plot/templates/koschmieder.py +11 -8
- AeroViz/plot/timeseries/template.py +2 -2
- AeroViz/plot/timeseries/timeseries.py +47 -7
- AeroViz/rawDataReader/__init__.py +75 -68
- AeroViz/rawDataReader/config/supported_instruments.py +52 -19
- AeroViz/rawDataReader/core/__init__.py +194 -106
- AeroViz/rawDataReader/script/AE33.py +11 -6
- AeroViz/rawDataReader/script/AE43.py +10 -5
- AeroViz/rawDataReader/script/Aurora.py +14 -10
- AeroViz/rawDataReader/script/BC1054.py +10 -6
- AeroViz/rawDataReader/script/EPA.py +39 -0
- AeroViz/rawDataReader/script/GRIMM.py +1 -2
- AeroViz/rawDataReader/script/IGAC.py +6 -23
- AeroViz/rawDataReader/script/MA350.py +12 -5
- AeroViz/rawDataReader/script/Minion.py +107 -30
- AeroViz/rawDataReader/script/NEPH.py +15 -5
- AeroViz/rawDataReader/script/OCEC.py +39 -15
- AeroViz/rawDataReader/script/SMPS.py +1 -0
- AeroViz/rawDataReader/script/TEOM.py +15 -11
- AeroViz/rawDataReader/script/VOC.py +1 -1
- AeroViz/rawDataReader/script/XRF.py +11 -0
- AeroViz/rawDataReader/script/__init__.py +2 -2
- {AeroViz-0.1.6.dist-info → AeroViz-0.1.8.dist-info}/METADATA +54 -30
- {AeroViz-0.1.6.dist-info → AeroViz-0.1.8.dist-info}/RECORD +40 -51
- AeroViz/process/__init__.py +0 -31
- AeroViz/process/core/DataProc.py +0 -19
- AeroViz/process/core/SizeDist.py +0 -90
- AeroViz/process/core/__init__.py +0 -4
- AeroViz/process/method/__init__.py +0 -2
- AeroViz/process/method/prop.py +0 -62
- AeroViz/process/script/AbstractDistCalc.py +0 -143
- AeroViz/process/script/Chemical.py +0 -177
- AeroViz/process/script/IMPACT.py +0 -49
- AeroViz/process/script/IMPROVE.py +0 -161
- AeroViz/process/script/Others.py +0 -65
- AeroViz/process/script/PSD.py +0 -103
- AeroViz/process/script/PSD_dry.py +0 -93
- AeroViz/process/script/__init__.py +0 -5
- AeroViz/process/script/retrieve_RI.py +0 -69
- AeroViz/rawDataReader/script/EPA_vertical.py +0 -46
- AeroViz/rawDataReader/script/Table.py +0 -27
- /AeroViz/{process/method → plot/optical}/PyMieScatt_update.py +0 -0
- /AeroViz/{process/method → plot/optical}/mie_theory.py +0 -0
- {AeroViz-0.1.6.dist-info → AeroViz-0.1.8.dist-info}/LICENSE +0 -0
- {AeroViz-0.1.6.dist-info → AeroViz-0.1.8.dist-info}/WHEEL +0 -0
- {AeroViz-0.1.6.dist-info → AeroViz-0.1.8.dist-info}/top_level.txt +0 -0
AeroViz/rawDataReader/core/__init__.py

@@ -1,25 +1,21 @@
 import json
 import logging
-import pickle as pkl
 from abc import ABC, abstractmethod
-from datetime import datetime
+from datetime import datetime
 from pathlib import Path
-from typing import
+from typing import Optional
 
 import numpy as np
 import pandas as pd
-from pandas import DataFrame,
+from pandas import DataFrame, concat, read_pickle
 from rich.console import Console
 from rich.progress import Progress, TextColumn, BarColumn, TimeRemainingColumn, TaskProgressColumn
 
-from
+from AeroViz.rawDataReader.config.supported_instruments import meta
 
 __all__ = ['AbstractReader']
 
 
-console = Console(force_terminal=True, color_system="auto")
-
-
 class AbstractReader(ABC):
     """
     Abstract class for reading raw data from different instruments. Each instrument should have a separate class that
@@ -34,9 +30,9 @@ class AbstractReader(ABC):
 
     def __init__(self,
                  path: Path | str,
-                 qc: bool = True,
-                 csv_raw: bool = True,
                  reset: bool = False,
+                 qc: bool = True,
+                 qc_freq: Optional[str] = None,
                  rate: bool = True,
                  append_data: bool = False):
 
@@ -45,9 +41,9 @@
         self.logger = self._setup_logger()
 
         self.reset = reset
-        self.rate = rate
         self.qc = qc
-        self.
+        self.qc_freq = qc_freq
+        self.rate = rate
         self.append = append_data and reset
 
         self.pkl_nam = self.path / f'_read_{self.nam.lower()}.pkl'
@@ -57,15 +53,12 @@
         self.csv_out = self.path / f'output_{self.nam.lower()}.csv'
 
     def __call__(self,
-                 start:
-                 end:
+                 start: datetime,
+                 end: datetime,
                  mean_freq: str = '1h',
                  csv_out: bool = True,
                  ) -> DataFrame:
 
-        if start and end and end <= start:
-            raise ValueError(f"Invalid time range: start {start} is after end {end}")
-
         data = self._run(start, end)
 
         if data is not None:
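With the revised `__init__` and `__call__` signatures above, a reader is configured once and then invoked with explicit datetimes. A minimal usage sketch, assuming the concrete AE33 reader and a hypothetical data directory; note the old `end <= start` guard was removed, so the caller should pass a sane range:

from datetime import datetime

from AeroViz.rawDataReader.script.AE33 import Reader

# 'MS' (month start) is an assumption: qc_freq is handed straight to pd.Grouper,
# so any pandas offset alias should be accepted
reader = Reader('data/AE33', reset=True, qc=True, qc_freq='MS', rate=True)  # 'data/AE33' is a placeholder path

# returns the QC'd DataFrame; mean_freq sets the averaged output resolution
df = reader(start=datetime(2024, 2, 1), end=datetime(2024, 3, 1), mean_freq='1h', csv_out=True)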
@@ -81,15 +74,8 @@
         pass
 
     @abstractmethod
-    def _QC(self, df: DataFrame):
-        return df
-
-    @staticmethod
-    def basic_QC(df: DataFrame):
-        df_ave, df_std = df.mean(), df.std()
-        df_lowb, df_highb = df < (df_ave - df_std * 1.5), df > (df_ave + df_std * 1.5)
-
-        return df.mask(df_lowb | df_highb).copy()
+    def _QC(self, df: DataFrame) -> DataFrame:
+        return self.n_sigma_QC(df)
 
     def _setup_logger(self) -> logging.Logger:
         logger = logging.getLogger(self.nam)
@@ -99,72 +85,87 @@ class AbstractReader(ABC):
             logger.removeHandler(handler)
 
         handler = logging.FileHandler(self.path / f'{self.nam}.log')
-        handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
+        handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S'))
         logger.addHandler(handler)
         return logger
 
-    def _rate_calculate(self,
-
-
-
-        _drop_how = 'any'
-        _the_size = len(_fout_raw.resample('1h').mean().index)
+    def _rate_calculate(self, raw_data, qc_data) -> None:
+        def __base_rate(raw_data, qc_data):
+            period_size = len(raw_data.resample('1h').mean().index)
 
             for _nam, _key in self.meta['deter_key'].items():
-                if _key
-                    _key, _drop_how = _fout_qc.keys(), 'all'
+                _key, _drop_how = (qc_data.keys(), 'all') if _key is ['all'] else (_key, 'any')
 
-
-
+                sample_size = len(raw_data[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
+                qc_size = len(qc_data[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
 
-
-
-
-
-
+                # validate rate calculation
+                if period_size < sample_size or sample_size < qc_size or period_size == 0 or sample_size == 0:
+                    raise ValueError(f"Invalid sample sizes: period={period_size}, sample={sample_size}, QC={qc_size}")
+
+                _acq_rate = round((sample_size / period_size) * 100, 1)
+                _yid_rate = round((qc_size / sample_size) * 100, 1)
+                _OEE_rate = round((qc_size / period_size) * 100, 1)
 
                 self.logger.info(f'{_nam}:')
                 self.logger.info(f"\tAcquisition rate: {_acq_rate}%")
                 self.logger.info(f'\tYield rate: {_yid_rate}%')
+                self.logger.info(f'\tOEE rate: {_OEE_rate}%')
                 self.logger.info(f"{'=' * 60}")
 
                 print(f'\n\t{_nam} : ')
-                print(f'\t\tacquisition rate :
-
+                print(f'\t\tacquisition rate | yield rate | OEE rate :'
+                      f' \033[91m{_acq_rate}% | {_yid_rate}% -> {_OEE_rate}%\033[0m')
 
-
-
-
-
-
-        freq=self.meta['freq'])
-        _tm_index.name = 'time'
+        if self.meta['deter_key'] is not None:
+            # use qc_freq to calculate each period rate
+            if self.qc_freq is not None:
+                raw_data_grouped = raw_data.groupby(pd.Grouper(freq=self.qc_freq))
+                qc_data_grouped = qc_data.groupby(pd.Grouper(freq=self.qc_freq))
 
-
+                for (month, _sub_raw_data), (_, _sub_qc_data) in zip(raw_data_grouped, qc_data_grouped):
+                    self.logger.info(
+                        f"\tProcessing: {_sub_raw_data.index[0].strftime('%F')} to {_sub_raw_data.index[-1].strftime('%F')}")
+                    print(
+                        f"\n\tProcessing: {_sub_raw_data.index[0].strftime('%F')} to {_sub_raw_data.index[-1].strftime('%F')}")
 
-
-    @staticmethod
-    def _tmidx_process(_start, _end, _df):
-        _st, _ed = _df.index.sort_values()[[0, -1]]
-        _start, _end = to_datetime(_start) or _st, to_datetime(_end) or _ed
-        _idx = date_range(_start, _end, freq=_df.index.freq.copy())
-        _idx.name = 'time'
+                    __base_rate(_sub_raw_data, _sub_qc_data)
 
-
+            else:
+                __base_rate(raw_data, qc_data)
 
-
-
-
+    def _timeIndex_process(self, _df, user_start=None, user_end=None, append_df=None):
+        """
+        Process time index, resample data, extract specified time range, and optionally append new data.
 
-
-
+        :param _df: Input DataFrame with time index
+        :param user_start: Start of user-specified time range (optional)
+        :param user_end: End of user-specified time range (optional)
+        :param append_df: DataFrame to append (optional)
+        :return: Processed DataFrame
+        """
+        # Round timestamps and remove duplicates
+        _df = _df.groupby(_df.index.round('1min')).first()
 
-
-
+        # Determine frequency
+        freq = _df.index.inferred_freq or self.meta['freq']
 
-
+        # Append new data if provided
+        if append_df is not None:
+            append_df.index = append_df.index.round('1min')
+            _df = pd.concat([append_df.dropna(how='all'), _df.dropna(how='all')])
+            _df = _df.loc[~_df.index.duplicated()]
 
-
+        # Determine time range
+        df_start, df_end = _df.index.sort_values()[[0, -1]]
+
+        # Create new time index
+        new_index = pd.date_range(user_start or df_start, user_end or df_end, freq=freq, name='time')
+
+        # Process data: convert to numeric, resample, and reindex
+        return (_df.apply(pd.to_numeric, errors='coerce')
+                .resample(freq).mean()
+                .reindex(new_index))
 
     def _outlier_process(self, _df):
         outlier_file = self.path / 'outlier.json'
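The three percentages logged by `__base_rate` satisfy OEE ≈ acquisition × yield. A minimal arithmetic sketch with invented sizes, mirroring the computation above:

period_size = 720   # hourly slots in the evaluated window (30 days)
sample_size = 684   # hourly slots that actually contain raw data
qc_size = 650       # hourly slots whose data also pass QC

acq_rate = round(sample_size / period_size * 100, 1)  # 95.0 -> instrument was acquiring 95% of the time
yield_rate = round(qc_size / sample_size * 100, 1)    # 95.0 -> 95% of acquired hours survived QC
oee_rate = round(qc_size / period_size * 100, 1)      # 90.3 -> overall usable fraction, ~acq * yield / 100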
@@ -180,31 +181,17 @@
 
         return _df
 
-    # save pickle file
     def _save_data(self, raw_data: DataFrame, qc_data: DataFrame) -> None:
-
-
-
+        try:
+            raw_data.to_pickle(self.pkl_nam_raw)
+            raw_data.to_csv(self.csv_nam_raw)
 
-
-
-
-        raw_data.to_csv(self.csv_nam_raw)
+            if self.meta['deter_key'] is not None:
+                qc_data.to_pickle(self.pkl_nam)
+                qc_data.to_csv(self.csv_nam)
 
-    @staticmethod
-    def _safe_pickle_dump(file_path: Path, data: Any) -> None:
-        try:
-            with file_path.open('wb') as f:
-                pkl.dump(data, f, protocol=pkl.HIGHEST_PROTOCOL)
-        except PermissionError as e:
-            raise IOError(f"Unable to write to {file_path}. The file may be in use or you may not have permission: {e}")
         except Exception as e:
-            raise IOError(f"Error
-
-    # read pickle file
-    def _read_pkl(self):
-        with self.pkl_nam.open('rb') as qc_data, self.pkl_nam_raw.open('rb') as raw_data:
-            return pkl.load(raw_data), pkl.load(qc_data)
+            raise IOError(f"Error saving data. {e}")
 
     def _read_raw_files(self) -> tuple[DataFrame | None, DataFrame | None]:
         files = [f
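`_save_data` now relies on pandas' built-in serialization instead of the removed `_safe_pickle_dump`/`_read_pkl` helpers. A minimal round-trip sketch of that pattern (the file name is a placeholder):

import pandas as pd

df = pd.DataFrame({'BC': [1.2, 3.4]},
                  index=pd.date_range('2024-02-28', periods=2, freq='1h', name='time'))

df.to_pickle('_read_demo.pkl')               # what _save_data() does for the raw and QC frames
restored = pd.read_pickle('_read_demo.pkl')  # what _run() does when cached pickles exist and reset=False
assert restored.equals(df)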
@@ -223,7 +210,7 @@
             TaskProgressColumn(),
             TimeRemainingColumn(),
             TextColumn("{task.fields[filename]}", style="yellow"),
-            console=
+            console=Console(force_terminal=True, color_system="auto"),
             expand=False
         ) as progress:
             task = progress.add_task(f"Reading {self.nam} files", total=len(files), filename="")
@@ -246,47 +233,148 @@
         if not df_list:
             raise ValueError("All files were either empty or failed to read.")
 
-        raw_data =
+        raw_data = concat(df_list, axis=0).groupby(level=0).first()
+
+        raw_data = self._timeIndex_process(raw_data)
         qc_data = self._QC(raw_data)
 
         return raw_data, qc_data
 
-    def _run(self,
+    def _run(self, user_start, user_end):
         # read pickle if pickle file exists and 'reset=False' or process raw data or append new data
         if self.pkl_nam_raw.exists() and self.pkl_nam.exists() and not self.reset:
-            print(f"\n{
-            f"from {
+            print(f"\n{datetime.now().strftime('%m/%d %X')} : Reading {self.nam} \033[96mPICKLE\033[0m "
+                  f"from {user_start} to {user_end}\n")
 
-            _f_raw_done, _f_qc_done = self.
+            _f_raw_done, _f_qc_done = read_pickle(self.pkl_nam_raw), read_pickle(self.pkl_nam)
 
             if self.append:
-                print(f"Appending new data from {
+                print(f"Appending new data from {user_start} to {user_end}")
                 _f_raw_new, _f_qc_new = self._read_raw_files()
-                _f_raw = self.
-                _f_qc = self.
+                _f_raw = self._timeIndex_process(_f_raw_done, append_df=_f_raw_new)
+                _f_qc = self._timeIndex_process(_f_qc_done, append_df=_f_qc_new)
             else:
                 _f_raw, _f_qc = _f_raw_done, _f_qc_done
+            return _f_qc if self.qc else _f_raw
 
         else:
-            print(f"\n{
-            f"from {
+            print(f"\n{datetime.now().strftime('%m/%d %X')} : Reading {self.nam} \033[96mRAW DATA\033[0m "
+                  f"from {user_start} to {user_end}\n")
+
             _f_raw, _f_qc = self._read_raw_files()
 
             # process time index
-
-            _f_qc, _start_raw, _end_raw = self._tmidx_process(_start, _end, _f_qc)
+            data_start, data_end = _f_raw.index.sort_values()[[0, -1]]
 
+            _f_raw = self._timeIndex_process(_f_raw, user_start, user_end)
+            _f_qc = self._timeIndex_process(_f_qc, user_start, user_end)
             _f_qc = self._outlier_process(_f_qc)
 
             # save
             self._save_data(_f_raw, _f_qc)
 
             self.logger.info(f"{'=' * 60}")
-            self.logger.info(f"Raw data time : {
-            self.logger.info(f"Output time : {
+            self.logger.info(f"Raw data time : {data_start} to {data_end}")
+            self.logger.info(f"Output time : {user_start} to {user_end}")
             self.logger.info(f"{'-' * 60}")
 
             if self.rate:
-                self._rate_calculate(_f_raw, _f_qc
+                self._rate_calculate(_f_raw, _f_qc)
 
             return _f_qc if self.qc else _f_raw
+
+    @staticmethod
+    def reorder_dataframe_columns(df, order_lists, others_col=False):
+        new_order = []
+
+        for order in order_lists:
+            # only add columns that exist in the DataFrame, and do not add them twice
+            new_order.extend([col for col in order if col in df.columns and col not in new_order])
+
+        if others_col:
+            # append all original columns not in the ordered lists, keeping their original order
+            new_order.extend([col for col in df.columns if col not in new_order])
+
+        return df[new_order]
+
+    @staticmethod
+    def n_sigma_QC(df: pd.DataFrame, std_range: int = 5) -> pd.DataFrame:
+        # ensure the input is a DataFrame
+        df = df.to_frame() if isinstance(df, pd.Series) else df
+
+        df_ave = df.mean()
+        df_std = df.std()
+
+        lower_bound = df < (df_ave - df_std * std_range)
+        upper_bound = df > (df_ave + df_std * std_range)
+
+        return df.mask(lower_bound | upper_bound)
+
+    @staticmethod
+    def IQR_QC(df: pd.DataFrame, log_dist=False) -> pd.DataFrame:
+        # ensure the input is a DataFrame
+        df = df.to_frame() if isinstance(df, pd.Series) else df
+
+        df_transformed = np.log10(df) if log_dist else df
+
+        _df_q1 = df_transformed.quantile(0.25)
+        _df_q3 = df_transformed.quantile(0.75)
+
+        _df_iqr = _df_q3 - _df_q1
+
+        # Calculate lower and upper bounds
+        lower_bound = df_transformed < (_df_q1 - 1.5 * _df_iqr)
+        upper_bound = df_transformed > (_df_q3 + 1.5 * _df_iqr)
+
+        # Apply the filter to the original dataframe
+        return df.mask(lower_bound | upper_bound)
+
+    @staticmethod
+    def rolling_IQR_QC(df: pd.DataFrame, window_size=24, log_dist=False) -> pd.DataFrame:
+        df = df.to_frame() if isinstance(df, pd.Series) else df
+        df_transformed = np.log10(df) if log_dist else df
+
+        def iqr_filter(x):
+            q1, q3 = x.quantile(0.25), x.quantile(0.75)
+            iqr = q3 - q1
+            lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
+            return (x >= lower) & (x <= upper)
+
+        mask = df_transformed.rolling(window=window_size, center=True, min_periods=1).apply(iqr_filter)
+        return df.where(mask, np.nan)
+
+    @staticmethod
+    def time_aware_IQR_QC(df: pd.DataFrame, time_window='1D', log_dist=False) -> pd.DataFrame:
+        df = df.to_frame() if isinstance(df, pd.Series) else df
+        df_transformed = np.log10(df) if log_dist else df
+
+        def iqr_filter(group):
+            q1, q3 = group.quantile(0.25), group.quantile(0.75)
+            iqr = q3 - q1
+            lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
+            return (group >= lower) & (group <= upper)
+
+        mask = df_transformed.groupby(pd.Grouper(freq=time_window)).transform(iqr_filter)
+        return df.where(mask, np.nan)
+
+    @staticmethod
+    def mad_iqr_hybrid_QC(df: pd.DataFrame, mad_threshold=3.5, log_dist=False) -> pd.DataFrame:
+        df = df.to_frame() if isinstance(df, pd.Series) else df
+        df_transformed = np.log10(df) if log_dist else df
+
+        # IQR method
+        q1, q3 = df_transformed.quantile(0.25), df_transformed.quantile(0.75)
+        iqr = q3 - q1
+        iqr_lower, iqr_upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
+
+        # MAD method
+        median = df_transformed.median()
+        mad = (df_transformed - median).abs().median()
+        mad_lower, mad_upper = median - mad_threshold * mad, median + mad_threshold * mad
+
+        # combine the two methods
+        lower = np.maximum(iqr_lower, mad_lower)
+        upper = np.minimum(iqr_upper, mad_upper)
+
+        mask = (df_transformed >= lower) & (df_transformed <= upper)
+        return df.where(mask, np.nan)
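All of the new QC helpers share one contract: the shape and index of the input are preserved, and rejected points become NaN. A short usage sketch on synthetic data (column name and values are invented):

import numpy as np
import pandas as pd

from AeroViz.rawDataReader.core import AbstractReader

idx = pd.date_range('2024-01-01', periods=96, freq='1h', name='time')
df = pd.DataFrame({'BC': np.random.lognormal(1.0, 0.3, 96)}, index=idx)
df.iloc[10, 0] = 500.0  # inject one obvious spike

flagged_global = AbstractReader.n_sigma_QC(df, std_range=5)  # global mean +/- 5 sigma fence
flagged_daily = AbstractReader.time_aware_IQR_QC(df, time_window='1D', log_dist=True)  # per-day IQR fence in log space

assert flagged_daily.shape == df.shape  # nothing is dropped; rejected points are NaN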
AeroViz/rawDataReader/script/AE33.py

@@ -1,4 +1,4 @@
-from pandas import read_table
+from pandas import read_table, to_numeric
 
 from AeroViz.rawDataReader.core import AbstractReader
 
@@ -8,10 +8,10 @@ class Reader(AbstractReader):
 
     def _raw_reader(self, file):
         if file.stat().st_size / 1024 < 550:
-
+            self.logger.info(f'\t {file} may not be a whole daily data. Make sure the file is correct.')
 
         _df = read_table(file, parse_dates={'time': [0, 1]}, index_col='time',
-                         delimiter=r'\s+', skiprows=5, usecols=range(67))
+                         delimiter=r'\s+', skiprows=5, usecols=range(67)).apply(to_numeric, errors='coerce')
         _df.columns = _df.columns.str.strip(';')
 
         # remove data without Status=0, 128 (Not much filter tape), 256 (Not much filter tape)
@@ -23,8 +23,13 @@ class Reader(AbstractReader):
         return _df.loc[~_df.index.duplicated() & _df.index.notna()]
 
     def _QC(self, _df):
+        _index = _df.index.copy()
+
         # remove negative value
-        _df = _df
+        _df = _df.mask((_df <= 0) | (_df > 20000))
+
+        # use IQR_QC
+        _df = self.time_aware_IQR_QC(_df, time_window='1h')
 
-        #
-        return _df.
+        # make sure all columns have values, otherwise set to nan
+        return _df.dropna(how='any').reindex(_index)
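AE33, AE43, and BC1054 now share the same three-step `_QC` shape: a physical-range mask, an hourly IQR screen, and a `dropna`/`reindex` pair so that a row failing in any channel comes back as all-NaN instead of silently disappearing. A condensed sketch of that pattern with invented values:

import pandas as pd

from AeroViz.rawDataReader.core import AbstractReader

idx = pd.date_range('2024-02-28', periods=6, freq='1min', name='time')
df = pd.DataFrame({'BC1': [105., -5., 98., 102., 99., 101.],
                   'BC2': [88., 90., 30000., 91., 89., 92.]}, index=idx)

_index = df.index.copy()
df = df.mask((df <= 0) | (df > 20000))                       # step 1: physical range
df = AbstractReader.time_aware_IQR_QC(df, time_window='1h')  # step 2: distribution screen
result = df.dropna(how='any').reindex(_index)                # step 3: same index back, bad rows all-NaN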
AeroViz/rawDataReader/script/AE43.py

@@ -1,4 +1,4 @@
-from pandas import read_csv
+from pandas import read_csv, to_numeric
 
 from AeroViz.rawDataReader.core import AbstractReader
 
@@ -7,7 +7,7 @@ class Reader(AbstractReader):
     nam = 'AE43'
 
     def _raw_reader(self, file):
-        _df = read_csv(file, parse_dates={'time': ['StartTime']}, index_col='time')
+        _df = read_csv(file, parse_dates={'time': ['StartTime']}, index_col='time').apply(to_numeric, errors='coerce')
         _df_id = _df['SetupID'].iloc[-1]
 
         # get last SetupID data
@@ -24,8 +24,13 @@ class Reader(AbstractReader):
 
     # QC data
     def _QC(self, _df):
+        _index = _df.index.copy()
+
         # remove negative value
-        _df = _df.mask((_df
+        _df = _df.mask((_df <= 0) | (_df > 20000))
+
+        # use IQR_QC
+        _df = self.time_aware_IQR_QC(_df, time_window='1h')
 
-        #
-        return _df.
+        # make sure all columns have values, otherwise set to nan
+        return _df.dropna(how='any').reindex(_index)
AeroViz/rawDataReader/script/Aurora.py

@@ -1,4 +1,4 @@
-from pandas import to_datetime, read_csv
+from pandas import to_datetime, read_csv, to_numeric
 
 from AeroViz.rawDataReader.core import AbstractReader
 
@@ -8,7 +8,7 @@ class Reader(AbstractReader):
 
     def _raw_reader(self, file):
         with file.open('r', encoding='utf-8-sig', errors='ignore') as f:
-            _df = read_csv(f, low_memory=False, index_col=0)
+            _df = read_csv(f, low_memory=False, index_col=0).apply(to_numeric, errors='coerce')
 
         _df.index = to_datetime(_df.index, errors='coerce')
         _df.index.name = 'time'
@@ -24,17 +24,21 @@ class Reader(AbstractReader):
             'RH': 'RH'
         })
 
-        _df = _df[['B', 'G', 'R', 'BB', 'BG', 'BR'
+        _df = _df[['B', 'G', 'R', 'BB', 'BG', 'BR']]
 
         return _df.loc[~_df.index.duplicated() & _df.index.notna()]
 
-    # QC data
     def _QC(self, _df):
-
-        _df = _df.mask((_df <= 0) | (_df > 2000)).copy()
+        _index = _df.index.copy()
 
-
-        _df = _df[(_df['BB'] < _df['B']) & (_df['BG'] < _df['G']) & (_df['BR'] < _df['R'])]
+        _df = _df.mask((_df <= 0) | (_df > 2000))
 
-
-
+        _df = _df.loc[(_df['BB'] < _df['B']) & (_df['BG'] < _df['G']) & (_df['BR'] < _df['R'])]
+
+        _df = _df.loc[(_df['B'] > _df['G']) & (_df['G'] > _df['R'])]
+
+        # use IQR_QC
+        _df = self.time_aware_IQR_QC(_df)
+
+        # make sure all columns have values, otherwise set to nan
+        return _df.dropna(how='any').reindex(_index)
AeroViz/rawDataReader/script/BC1054.py

@@ -1,4 +1,4 @@
-from pandas import read_csv
+from pandas import read_csv, to_numeric
 
 from AeroViz.rawDataReader.core import AbstractReader
 
@@ -8,7 +8,7 @@ class Reader(AbstractReader):
 
     def _raw_reader(self, file):
         with open(file, 'r', encoding='utf-8', errors='ignore') as f:
-            _df = read_csv(f, parse_dates=True, index_col=0)
+            _df = read_csv(f, parse_dates=True, index_col=0).apply(to_numeric, errors='coerce')
 
         _df.columns = _df.columns.str.replace(' ', '')
 
@@ -33,10 +33,14 @@ class Reader(AbstractReader):
 
         return _df.loc[~_df.index.duplicated() & _df.index.notna()]
 
-    # QC data
     def _QC(self, _df):
+        _index = _df.index.copy()
+
         # remove negative value
-        _df = _df
+        _df = _df.mask((_df <= 0) | (_df > 20000))
+
+        # use IQR_QC
+        _df = self.time_aware_IQR_QC(_df, time_window='1h')
 
-        #
-        return _df.
+        # make sure all columns have values, otherwise set to nan
+        return _df.dropna(how='any').reindex(_index)
AeroViz/rawDataReader/script/EPA.py (new file)

@@ -0,0 +1,39 @@
+from pandas import read_csv, to_numeric
+
+from AeroViz.rawDataReader.core import AbstractReader
+
+desired_order1 = ['SO2', 'NO', 'NOx', 'NO2', 'CO', 'O3', 'THC', 'NMHC',
+                  'CH4', 'PM10', 'PM2.5', 'PM1', 'WS', 'WD', 'AT', 'RH']
+
+desired_order2 = ['Benzene', 'Toluene', 'EthylBenzene', 'm/p-Xylene', 'o-Xylene']
+
+
+class Reader(AbstractReader):
+    nam = 'EPA'
+
+    def _raw_reader(self, file):
+        # accepts both 查詢小時值(測項).csv and 查詢小時值(直式).csv exports (with or without the valid-value filter applied)
+        df = read_csv(file, encoding='big5', encoding_errors='ignore', index_col=0, parse_dates=True,
+                      on_bad_lines='skip').apply(to_numeric, errors='coerce')
+
+        if len(df.groupby('測站')) > 1:
+            raise ValueError(f'Multiple stations found in the file: {df['測站'].unique()}')
+        else:
+            if '測站' in df.columns:
+                df.drop(columns=['測站'], inplace=True)
+
+        if '測項' in df.columns:
+            df = df.pivot(columns='測項', values='資料')
+
+        df.rename(columns={'AMB_TEMP': 'AT', 'WIND_SPEED': 'WS', 'WIND_DIREC': 'WD'}, inplace=True)
+        df.index.name = 'Time'
+
+        # if invalid values were not removed before export, replace strings containing # or L with '#' or '_'
+        df = df.replace(to_replace=r'\d*[#]\b', value='#', regex=True)
+        df = df.replace(to_replace=r'\d*[L]\b', value='_', regex=True)
+
+        # column ordering
+        return self.reorder_dataframe_columns(df, [desired_order1])
+
+    def _QC(self, _df):
+        return _df