AeroViz 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of AeroViz might be problematic.
- AeroViz/dataProcess/Chemistry/_mass_volume.py +4 -3
- AeroViz/dataProcess/Chemistry/_ocec.py +20 -7
- AeroViz/dataProcess/Optical/_IMPROVE.py +2 -3
- AeroViz/dataProcess/SizeDistr/__init__.py +6 -10
- AeroViz/plot/__init__.py +1 -0
- AeroViz/plot/meteorology/meteorology.py +2 -0
- AeroViz/plot/optical/optical.py +1 -1
- AeroViz/plot/pie.py +14 -2
- AeroViz/plot/radar.py +184 -0
- AeroViz/plot/scatter.py +16 -7
- AeroViz/plot/templates/koschmieder.py +11 -8
- AeroViz/plot/timeseries/timeseries.py +0 -1
- AeroViz/rawDataReader/__init__.py +75 -70
- AeroViz/rawDataReader/config/supported_instruments.py +70 -38
- AeroViz/rawDataReader/core/__init__.py +208 -178
- AeroViz/rawDataReader/script/AE33.py +1 -1
- AeroViz/rawDataReader/script/AE43.py +1 -1
- AeroViz/rawDataReader/script/APS_3321.py +2 -2
- AeroViz/rawDataReader/script/Aurora.py +1 -1
- AeroViz/rawDataReader/script/BC1054.py +1 -1
- AeroViz/rawDataReader/script/EPA.py +39 -0
- AeroViz/rawDataReader/script/GRIMM.py +1 -1
- AeroViz/rawDataReader/script/IGAC.py +6 -23
- AeroViz/rawDataReader/script/MA350.py +1 -1
- AeroViz/rawDataReader/script/Minion.py +102 -30
- AeroViz/rawDataReader/script/NEPH.py +1 -1
- AeroViz/rawDataReader/script/{Sunset_OCEC.py → OCEC.py} +2 -2
- AeroViz/rawDataReader/script/SMPS.py +77 -0
- AeroViz/rawDataReader/script/TEOM.py +2 -2
- AeroViz/rawDataReader/script/VOC.py +2 -2
- AeroViz/rawDataReader/script/XRF.py +11 -0
- AeroViz/rawDataReader/script/__init__.py +4 -6
- {AeroViz-0.1.5.dist-info → AeroViz-0.1.7.dist-info}/METADATA +57 -32
- {AeroViz-0.1.5.dist-info → AeroViz-0.1.7.dist-info}/RECORD +37 -55
- AeroViz/process/__init__.py +0 -31
- AeroViz/process/core/DataProc.py +0 -19
- AeroViz/process/core/SizeDist.py +0 -90
- AeroViz/process/core/__init__.py +0 -4
- AeroViz/process/method/PyMieScatt_update.py +0 -567
- AeroViz/process/method/__init__.py +0 -2
- AeroViz/process/method/mie_theory.py +0 -260
- AeroViz/process/method/prop.py +0 -62
- AeroViz/process/script/AbstractDistCalc.py +0 -143
- AeroViz/process/script/Chemical.py +0 -177
- AeroViz/process/script/IMPACT.py +0 -49
- AeroViz/process/script/IMPROVE.py +0 -161
- AeroViz/process/script/Others.py +0 -65
- AeroViz/process/script/PSD.py +0 -103
- AeroViz/process/script/PSD_dry.py +0 -93
- AeroViz/process/script/__init__.py +0 -5
- AeroViz/process/script/retrieve_RI.py +0 -69
- AeroViz/rawDataReader/script/EPA_vertical.py +0 -46
- AeroViz/rawDataReader/script/SMPS_TH.py +0 -41
- AeroViz/rawDataReader/script/SMPS_aim11.py +0 -51
- AeroViz/rawDataReader/script/SMPS_genr.py +0 -51
- AeroViz/rawDataReader/script/Table.py +0 -27
- {AeroViz-0.1.5.dist-info → AeroViz-0.1.7.dist-info}/LICENSE +0 -0
- {AeroViz-0.1.5.dist-info → AeroViz-0.1.7.dist-info}/WHEEL +0 -0
- {AeroViz-0.1.5.dist-info → AeroViz-0.1.7.dist-info}/top_level.txt +0 -0
AeroViz/rawDataReader/core/__init__.py

@@ -1,36 +1,39 @@
-import json
+import json
 import logging
-import pickle as pkl
 from abc import ABC, abstractmethod
-from datetime import datetime
+from datetime import datetime
 from pathlib import Path
-from typing import
+from typing import Optional
 
 import numpy as np
 import pandas as pd
-from pandas import DataFrame,
+from pandas import DataFrame, concat, read_pickle
+from rich.console import Console
+from rich.progress import Progress, TextColumn, BarColumn, TimeRemainingColumn, TaskProgressColumn
 
-from
+from AeroViz.rawDataReader.config.supported_instruments import meta
 
 __all__ = ['AbstractReader']
 
 
 class AbstractReader(ABC):
-
+    """
+    Abstract class for reading raw data from different instruments. Each instrument should have a separate class that
+    inherits from this class and implements the abstract methods. The abstract methods are `_raw_reader` and `_QC`.
 
-
-
+    List the file in the path and read pickle file if it exists, else read raw data and dump the pickle file the
+    pickle file will be generated after read raw data first time, if you want to re-read the rawdata, please set
+    'reset=True'
+    """
 
-
-    # pickle file will be generated after read raw data first time, if you want to re-read the rawdata, please set
-    # 'reset=True'
+    nam = 'AbstractReader'
 
     def __init__(self,
                  path: Path | str,
-                 qc: bool = True,
-                 csv_raw: bool = True,
                  reset: bool = False,
-
+                 qc: bool = True,
+                 qc_freq: Optional[str] = None,
+                 rate: bool = True,
                  append_data: bool = False):
 
         self.path = Path(path)
@@ -38,10 +41,10 @@ class AbstractReader(ABC):
         self.logger = self._setup_logger()
 
         self.reset = reset
-        self.rate = rate
         self.qc = qc
-        self.
-        self.
+        self.qc_freq = qc_freq
+        self.rate = rate
+        self.append = append_data and reset
 
         self.pkl_nam = self.path / f'_read_{self.nam.lower()}.pkl'
         self.csv_nam = self.path / f'_read_{self.nam.lower()}.csv'
@@ -49,24 +52,12 @@
         self.csv_nam_raw = self.path / f'_read_{self.nam.lower()}_raw.csv'
         self.csv_out = self.path / f'output_{self.nam.lower()}.csv'
 
-    # dependency injection function, customize each instrument
-    @abstractmethod
-    def _raw_reader(self, file):
-        pass
-
-    @abstractmethod
-    def _QC(self, df: DataFrame):
-        return df
-
     def __call__(self,
-                 start:
-                 end:
+                 start: datetime,
+                 end: datetime,
                  mean_freq: str = '1h',
                  csv_out: bool = True,
-                 ) -> DataFrame
-
-        if start and end and end <= start:
-            raise ValueError(f"Invalid time range: start {start} is after end {end}")
+                 ) -> DataFrame:
 
         data = self._run(start, end)
 
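Note: with the reworked constructor, cache and QC behavior is now controlled per reader instance (`reset`, `qc`, `qc_freq`, `rate`, `append_data`), and `__call__` takes datetime bounds directly. A minimal usage sketch against a concrete subclass such as the AE33 reader; the data path and date range below are hypothetical:

from datetime import datetime

from AeroViz.rawDataReader.script.AE33 import Reader

# reset=True re-reads the raw files instead of the cached pickle;
# qc_freq='1MS' reports acquisition/yield rates per calendar month
reader = Reader('data/AE33', reset=True, qc=True, qc_freq='1MS', rate=True)
df = reader(start=datetime(2024, 1, 1), end=datetime(2024, 6, 30), mean_freq='1h')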
@@ -78,210 +69,249 @@
 
         return data
 
-    @
-    def
-
-        df_lowb, df_highb = df < (df_ave - df_std * 1.5), df > (df_ave + df_std * 1.5)
-
-        return df.mask(df_lowb | df_highb).copy()
-
-    # set each to true datetime(18:30:01 -> 18:30:00) and rindex data
-    def _raw_process(self, _df):
-        # get time from df and set time to whole time to create time index
-        _st, _ed = _df.index.sort_values()[[0, -1]]
-        _tm_index = date_range(_st.strftime('%Y%m%d %H00'), _ed.floor('h').strftime('%Y%m%d %H00'),
-                               freq=self.meta['freq'])
-        _tm_index.name = 'time'
+    @abstractmethod
+    def _raw_reader(self, file):
+        pass
 
-
+    @abstractmethod
+    def _QC(self, df: DataFrame) -> DataFrame:
+        return self.n_sigma_QC(df)
 
     def _setup_logger(self) -> logging.Logger:
         logger = logging.getLogger(self.nam)
         logger.setLevel(logging.INFO)
+
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+
         handler = logging.FileHandler(self.path / f'{self.nam}.log')
-        handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
+        handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S'))
         logger.addHandler(handler)
         return logger
 
-
-
-
-        _start, _end = _fout_qc.index[[0, -1]]
-
-        _drop_how = 'any'
-        _the_size = len(_fout_raw.resample('1h').mean().index)
-
-        self.logger.info(f"{'=' * 60}")
-        self.logger.info(
-            f"Raw data time : {_st_raw.strftime('%Y-%m-%d %H:%M:%S')} to {_ed_raw.strftime('%Y-%m-%d %H:%M:%S')}")
-        self.logger.info(
-            f"Output time : {_start.strftime('%Y-%m-%d %H:%M:%S')} to {_end.strftime('%Y-%m-%d %H:%M:%S')}")
-        self.logger.info(f"{'-' * 60}")
-        print(f"\n\n\t\tfrom {_start.strftime('%Y-%m-%d %H:%M:%S')} to {_end.strftime('%Y-%m-%d %H:%M:%S')}\n")
+    def _rate_calculate(self, raw_data, qc_data) -> None:
+        def __base_rate(raw_data, qc_data):
+            period_size = len(raw_data.resample('1h').mean().index)
 
             for _nam, _key in self.meta['deter_key'].items():
-                if _key
-                    _key, _drop_how = _fout_qc.keys(), 'all'
+                _key, _drop_how = (qc_data.keys(), 'all') if _key is ['all'] else (_key, 'any')
 
-
-
+                sample_size = len(raw_data[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
+                qc_size = len(qc_data[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
 
-
-
-
-
-
+                # validate rate calculation
+                if period_size < sample_size or sample_size < qc_size or period_size == 0 or sample_size == 0:
+                    raise ValueError(f"Invalid sample sizes: period={period_size}, sample={sample_size}, QC={qc_size}")
+
+                _acq_rate = round((sample_size / period_size) * 100, 1)
+                _yid_rate = round((qc_size / sample_size) * 100, 1)
 
                 self.logger.info(f'{_nam}:')
                 self.logger.info(f"\tAcquisition rate: {_acq_rate}%")
                 self.logger.info(f'\tYield rate: {_yid_rate}%')
                 self.logger.info(f"{'=' * 60}")
 
-                print(f'\
-                print(f'\t\
-                print(f'\t\
-
-    # process time index
-    @staticmethod
-    def _tmidx_process(_start, _end, _df):
-        _st, _ed = _df.index.sort_values()[[0, -1]]
-        _start, _end = to_datetime(_start) or _st, to_datetime(_end) or _ed
-        _idx = date_range(_start, _end, freq=_df.index.freq.copy())
-        _idx.name = 'time'
-
-        return _df.reindex(_idx), _st, _ed
-
-    # append new data to exist pkl
-    @staticmethod
-    def _apnd_prcs(_df_done, _df_apnd):
-
-        if _df_apnd is not None:
-            _df = concat([_df_apnd.dropna(how='all').copy(), _df_done.dropna(how='all').copy()])
-
-            _idx = date_range(*_df.index.sort_values()[[0, -1]], freq=_df_done.index.freq.copy())
-            _idx.name = 'time'
-
-            return _df.loc[~_df.index.duplicated()].copy().reindex(_idx)
-
-        return _df_done
+                print(f'\n\t{_nam} : ')
+                print(f'\t\tacquisition rate : \033[91m{_acq_rate}%\033[0m')
+                print(f'\t\tyield rate : \033[91m{_yid_rate}%\033[0m')
 
-
-
-
-
+        if self.meta['deter_key'] is not None:
+            # use qc_freq to calculate each period rate
+            if self.qc_freq is not None:
+                raw_data_grouped = raw_data.groupby(pd.Grouper(freq=self.qc_freq))
+                qc_data_grouped = qc_data.groupby(pd.Grouper(freq=self.qc_freq))
+
+                for (month, _sub_raw_data), (_, _sub_qc_data) in zip(raw_data_grouped, qc_data_grouped):
+                    self.logger.info(
+                        f"\tProcessing: {_sub_raw_data.index[0].strftime('%F')} to {_sub_raw_data.index[-1].strftime('%F')}")
+                    print(
+                        f"\n\tProcessing: {_sub_raw_data.index[0].strftime('%F')} to {_sub_raw_data.index[-1].strftime('%F')}")
+
+                    __base_rate(_sub_raw_data, _sub_qc_data)
+
+            else:
+                __base_rate(raw_data, qc_data)
+
+    def _timeIndex_process(self, _df, user_start=None, user_end=None, append_df=None):
+        """
+        Process time index, resample data, extract specified time range, and optionally append new data.
+
+        :param _df: Input DataFrame with time index
+        :param user_start: Start of user-specified time range (optional)
+        :param user_end: End of user-specified time range (optional)
+        :param append_df: DataFrame to append (optional)
+        :return: Processed DataFrame
+        """
+        # Round timestamps and remove duplicates
+        _df = _df.groupby(_df.index.round('1min')).first()
+
+        # Determine frequency
+        freq = _df.index.inferred_freq or self.meta['freq']
+
+        # Append new data if provided
+        if append_df is not None:
+            append_df.index = append_df.index.round('1min')
+            _df = pd.concat([append_df.dropna(how='all'), _df.dropna(how='all')])
+            _df = _df.loc[~_df.index.duplicated()]
+
+        # Determine time range
+        df_start, df_end = _df.index.sort_values()[[0, -1]]
+
+        # Create new time index
+        new_index = pd.date_range(user_start or df_start, user_end or df_end, freq=freq, name='time')
+
+        # Process data: convert to numeric, resample, and reindex
+        return (_df.apply(pd.to_numeric, errors='coerce')
+                .resample(freq).mean()
+                .reindex(new_index))
+
+    def _outlier_process(self, _df):
+        outlier_file = self.path / 'outlier.json'
+
+        if not outlier_file.exists():
             return _df
 
-        with
-
+        with outlier_file.open('r', encoding='utf-8', errors='ignore') as f:
+            outliers = json.load(f)
 
-        for _st, _ed in
+        for _st, _ed in outliers.values():
             _df.loc[_st:_ed] = np.nan
 
         return _df
 
-    # save pickle file
     def _save_data(self, raw_data: DataFrame, qc_data: DataFrame) -> None:
-
-
-
+        try:
+            raw_data.to_pickle(self.pkl_nam_raw)
+            raw_data.to_csv(self.csv_nam_raw)
 
-
-
-
-            raw_data.to_csv(self.csv_nam_raw)
+            if self.meta['deter_key'] is not None:
+                qc_data.to_pickle(self.pkl_nam)
+                qc_data.to_csv(self.csv_nam)
 
-
-
-        while True:
-            try:
-                with file_path.open('wb') as f:
-                    pkl.dump(data, f, protocol=pkl.HIGHEST_PROTOCOL)
-                break
-            except PermissionError as err:
-                print('\n', err)
-                input('\t\t\33[41m Please close the file and press "Enter" \33[0m\n')
-
-    # read pickle file
-    def _read_pkl(self):
-        with self.pkl_nam.open('rb') as qc_data, self.pkl_nam_raw.open('rb') as raw_data:
-            return pkl.load(raw_data), pkl.load(qc_data)
+        except Exception as e:
+            raise IOError(f"Error saving data. {e}")
 
     def _read_raw_files(self) -> tuple[DataFrame | None, DataFrame | None]:
-
-
+        files = [f
+                 for file_pattern in self.meta['pattern']
+                 for pattern in {file_pattern.lower(), file_pattern.upper(), file_pattern}
+                 for f in self.path.glob(pattern)
                  if f.name not in [self.csv_out.name, self.csv_nam.name, self.csv_nam_raw.name, f'{self.nam}.log']]
 
        if not files:
-            raise FileNotFoundError(f"
-                                    f"Please check the current path.\033[0m")
+            raise FileNotFoundError(f"No files in '{self.path}' could be read. Please check the current path.")
 
         df_list = []
-
-
-
-
-
+        with Progress(
+                TextColumn("[bold blue]{task.description}", style="bold blue"),
+                BarColumn(bar_width=18, complete_style="green", finished_style="bright_green"),
+                TaskProgressColumn(),
+                TimeRemainingColumn(),
+                TextColumn("{task.fields[filename]}", style="yellow"),
+                console=Console(force_terminal=True, color_system="auto"),
+                expand=False
+        ) as progress:
+            task = progress.add_task(f"Reading {self.nam} files", total=len(files), filename="")
+            for file in files:
+                progress.update(task, advance=1, filename=file.name)
+                try:
+                    df = self._raw_reader(file)
 
-
-
-
-
+                    if df is not None and not df.empty:
+                        df_list.append(df)
+                    else:
+                        self.logger.warning(f"File {file.name} produced an empty DataFrame or None.")
 
-
-
+                except pd.errors.ParserError as e:
+                    self.logger.error(f"Error tokenizing data: {e}")
 
-
-
+                except Exception as e:
+                    self.logger.error(f"Error reading {file.name}: {e}")
 
         if not df_list:
             raise ValueError("All files were either empty or failed to read.")
 
-        raw_data =
+        raw_data = concat(df_list, axis=0).groupby(level=0).first()
+
+        raw_data = self._timeIndex_process(raw_data)
         qc_data = self._QC(raw_data)
 
         return raw_data, qc_data
 
-
-    def _run(self, _start, _end):
-        _f_raw_done, _f_qc_done = None, None
-
+    def _run(self, user_start, user_end):
         # read pickle if pickle file exists and 'reset=False' or process raw data or append new data
-        if self.pkl_nam_raw.exists() and self.pkl_nam.exists() and
-            print(f"\n
+        if self.pkl_nam_raw.exists() and self.pkl_nam.exists() and not self.reset:
+            print(f"\n{datetime.now().strftime('%m/%d %X')} : Reading {self.nam} \033[96mPICKLE\033[0m "
+                  f"from {user_start} to {user_end}\n")
 
-            _f_raw_done, _f_qc_done = self.
+            _f_raw_done, _f_qc_done = read_pickle(self.pkl_nam_raw), read_pickle(self.pkl_nam)
 
-            if
-
-
+            if self.append:
+                print(f"Appending new data from {user_start} to {user_end}")
+                _f_raw_new, _f_qc_new = self._read_raw_files()
+                _f_raw = self._timeIndex_process(_f_raw_done, append_df=_f_raw_new)
+                _f_qc = self._timeIndex_process(_f_qc_done, append_df=_f_qc_new)
+            else:
+                _f_raw, _f_qc = _f_raw_done, _f_qc_done
+            return _f_qc if self.qc else _f_raw
 
-
+        else:
+            print(f"\n{datetime.now().strftime('%m/%d %X')} : Reading {self.nam} \033[96mRAW DATA\033[0m "
+                  f"from {user_start} to {user_end}\n")
 
-
-            self._rate_calculate(_f_raw_done, _f_qc_done, _start_raw, _end_raw)
+            _f_raw, _f_qc = self._read_raw_files()
 
-
-
-        # read raw data
-        print(f"\n\t{dtm.now().strftime('%m/%d %X')} : Reading \033[96mRAW DATA\033[0m of {self.nam} and process it")
-
-        _f_raw, _f_qc = self._read_raw_files()
-
-        # append new data and pickle data
-        if self.append and self.pkl_nam.exists():
-            _f_raw = self._apnd_prcs(_f_raw_done, _f_raw)
-            _f_qc = self._apnd_prcs(_f_qc_done, _f_qc)
+            # process time index
+            data_start, data_end = _f_raw.index.sort_values()[[0, -1]]
 
-
+            _f_raw = self._timeIndex_process(_f_raw, user_start, user_end)
+            _f_qc = self._timeIndex_process(_f_qc, user_start, user_end)
+            _f_qc = self._outlier_process(_f_qc)
 
             # save
             self._save_data(_f_raw, _f_qc)
 
-
-
-
-
+            self.logger.info(f"{'=' * 60}")
+            self.logger.info(f"Raw data time : {data_start} to {data_end}")
+            self.logger.info(f"Output time : {user_start} to {user_end}")
+            self.logger.info(f"{'-' * 60}")
 
-            self.
+            if self.rate:
+                self._rate_calculate(_f_raw, _f_qc)
 
             return _f_qc if self.qc else _f_raw
+
+    @staticmethod
+    def reorder_dataframe_columns(df, order_lists, others_col=False):
+        new_order = []
+
+        for order in order_lists:
+            # only add columns that exist in the DataFrame, without duplicates
+            new_order.extend([col for col in order if col in df.columns and col not in new_order])
+
+        if others_col:
+            # append the remaining original columns, preserving their original order
+            new_order.extend([col for col in df.columns if col not in new_order])
+
+        return df[new_order]
+
+    @staticmethod
+    def n_sigma_QC(df: DataFrame, std_range: int = 5) -> DataFrame:
+        df_ave, df_std = df.mean(), df.std()
+        df_lowb, df_highb = df < (df_ave - df_std * std_range), df > (df_ave + df_std * std_range)
+
+        return df.mask(df_lowb | df_highb).copy()
+
+    # inter-quartile range (IQR) method
+    @staticmethod
+    def IQR_QC(df: DataFrame, log_dist=False) -> tuple[DataFrame, DataFrame]:
+        df = np.log10(df) if log_dist else df
+
+        _df_qua = df.quantile([.25, .75])
+        _df_q1, _df_q3 = _df_qua.loc[.25].copy(), _df_qua.loc[.75].copy()
+        _df_iqr = _df_q3 - _df_q1
+
+        _se = concat([_df_q1 - 1.5 * _df_iqr] * len(df), axis=1).T.set_index(df.index)
+        _le = concat([_df_q3 + 1.5 * _df_iqr] * len(df), axis=1).T.set_index(df.index)
+
+        return (10 ** _se, 10 ** _le) if log_dist else (_se, _le)
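The two QC helpers that replace the removed module-level filtering are static methods, so they can be exercised without instantiating a reader. A small sketch on synthetic data (column name and values are made up):

import numpy as np
import pandas as pd

from AeroViz.rawDataReader.core import AbstractReader

idx = pd.date_range('2024-01-01', periods=100, freq='h', name='time')
df = pd.DataFrame({'BC1': np.random.default_rng(0).normal(500, 50, 100)}, index=idx)
df.iloc[10] = 1e5  # inject an outlier

# n_sigma_QC masks values outside mean +/- std_range * std (default 5 sigma)
clean = AbstractReader.n_sigma_QC(df, std_range=5)

# IQR_QC returns lower/upper fence frames; log_dist=True computes them in log10 space
lower, upper = AbstractReader.IQR_QC(df, log_dist=True)
masked = df.mask((df < lower) | (df > upper))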
AeroViz/rawDataReader/script/AE33.py

@@ -27,4 +27,4 @@ class Reader(AbstractReader):
         _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7']].mask((_df < 0).copy())
 
         # QC data in 1h
-        return _df.resample('1h').apply(self.
+        return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
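Several readers below (Aurora, BC1054, MA350, EPA) now share this `_QC` pattern: the sigma filter runs within each 1 h window before the data are averaged back to the instrument frequency from `meta`, so the mean and standard deviation are local to the hour rather than computed over the whole record. A sketch of the same idea written with an explicit groupby, on synthetic data and assuming pandas' `group_keys=False` semantics to keep the original DatetimeIndex:

import numpy as np
import pandas as pd

from AeroViz.rawDataReader.core import AbstractReader

idx = pd.date_range('2024-01-01', periods=120, freq='min', name='time')
df = pd.DataFrame({'BC6': np.random.default_rng(1).normal(800, 30, 120)}, index=idx)
df.iloc[5] = 1e4  # spike removed by the within-hour sigma filter

hourly_qc = df.groupby(pd.Grouper(freq='1h'), group_keys=False).apply(AbstractReader.n_sigma_QC)
out = hourly_qc.resample('1h').mean()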
AeroViz/rawDataReader/script/APS_3321.py

@@ -1,4 +1,4 @@
-import numpy as
+import numpy as np
 from pandas import to_datetime, read_table
 
 from AeroViz.rawDataReader.core import AbstractReader
@@ -29,7 +29,7 @@ class Reader(AbstractReader):
     # QC data
     def _QC(self, _df):
         # mask out the data size lower than 7
-        _df['total'] = _df.sum(axis=1, min_count=1) * (
+        _df['total'] = _df.sum(axis=1, min_count=1) * (np.diff(np.log(_df.keys().to_numpy(float)))).mean()
         _df_size = _df['total'].dropna().resample('1h').size().resample(_df.index.freq).ffill()
         _df = _df.mask(_df_size < 7)
 
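The restored line integrates the size distribution: the APS columns are channel diameters, so `np.diff(np.log(...))` is the per-channel logarithmic bin width, and the row sum of dN/dlogDp times the mean bin width approximates total concentration. A numeric sketch of that step (channel diameters and values are illustrative, not actual APS 3321 bins):

import numpy as np
import pandas as pd

diam = [0.542, 0.583, 0.626, 0.673, 0.723]  # hypothetical mid-diameters (um)
df = pd.DataFrame([[120., 150., 180., 160., 130.]], columns=diam,
                  index=pd.DatetimeIndex(['2024-01-01 00:00'], name='time'))

dlogdp = np.diff(np.log(df.keys().to_numpy(float))).mean()  # mean log bin width
total = df.sum(axis=1, min_count=1) * dlogdp  # same expression as the fixed line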
AeroViz/rawDataReader/script/Aurora.py

@@ -37,4 +37,4 @@ class Reader(AbstractReader):
         _df = _df[(_df['BB'] < _df['B']) & (_df['BG'] < _df['G']) & (_df['BR'] < _df['R'])]
 
         # QC data in 1h
-        return _df.resample('1h').apply(self.
+        return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
AeroViz/rawDataReader/script/BC1054.py

@@ -39,4 +39,4 @@ class Reader(AbstractReader):
         _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7', 'BC8', 'BC9', 'BC10']].mask((_df < 0).copy())
 
         # QC data in 1h
-        return _df.resample('1h').apply(self.
+        return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
AeroViz/rawDataReader/script/EPA.py (new file)

@@ -0,0 +1,39 @@
+from pandas import read_csv
+
+from AeroViz.rawDataReader.core import AbstractReader
+
+desired_order1 = ['SO2', 'NO', 'NOx', 'NO2', 'CO', 'O3', 'THC', 'NMHC',
+                  'CH4', 'PM10', 'PM2.5', 'PM1', 'WS', 'WD', 'AT', 'RH']
+
+desired_order2 = ['Benzene', 'Toluene', 'EthylBenzene', 'm/p-Xylene', 'o-Xylene']
+
+
+class Reader(AbstractReader):
+    nam = 'EPA'
+
+    def _raw_reader(self, file):
+        # supports 查詢小時值(測項).csv and 查詢小時值(直式).csv (with or without valid-value filtering applied on export)
+        df = read_csv(file, encoding='big5', encoding_errors='ignore', index_col=0, parse_dates=True,
+                      on_bad_lines='skip')
+
+        if len(df.groupby('測站')) > 1:
+            raise ValueError(f'Multiple stations found in the file: {df['測站'].unique()}')
+        else:
+            if '測站' in df.columns:
+                df.drop(columns=['測站'], inplace=True)
+
+        if '測項' in df.columns:
+            df = df.pivot(columns='測項', values='資料')
+
+        df.rename(columns={'AMB_TEMP': 'AT', 'WIND_SPEED': 'WS', 'WIND_DIREC': 'WD'}, inplace=True)
+        df.index.name = 'Time'
+
+        # if invalid values were not removed before export, replace strings containing '#' or 'L' with '#' or '_'
+        df = df.replace(to_replace=r'\d*[#]\b', value='#', regex=True)
+        df = df.replace(to_replace=r'\d*[L]\b', value='_', regex=True)
+
+        # column ordering
+        return self.reorder_dataframe_columns(df, [desired_order1])
+
+    def _QC(self, _df):
+        return _df.resample('6h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
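EPA.py is the first reader to use the new `reorder_dataframe_columns` helper on `AbstractReader`; a quick sketch of its behavior (the column set here is illustrative):

import pandas as pd

from AeroViz.rawDataReader.core import AbstractReader

df = pd.DataFrame(columns=['RH', 'SO2', 'PM2.5', 'extra'])

# columns named in the order lists come first (only those actually present);
# with others_col=True the leftover columns keep their original relative order
ordered = AbstractReader.reorder_dataframe_columns(df, [['SO2', 'NO', 'PM2.5', 'RH']], others_col=True)
print(list(ordered.columns))  # ['SO2', 'PM2.5', 'RH', 'extra']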
AeroViz/rawDataReader/script/IGAC.py

@@ -1,8 +1,7 @@
 # read meteorological data from google sheet
 
 
-import
-from pandas import read_csv, concat, to_numeric
+from pandas import read_csv, to_numeric
 
 from AeroViz.rawDataReader.core import AbstractReader
 
@@ -35,24 +34,8 @@ class Reader(AbstractReader):
             'SO42-': 0.08,
         }
 
-        # _mdl.update(self._oth_set.get('mdl', {}))
-
-        def _se_le(_df_, _log=False):
-            _df_ = np.log10(_df_) if _log else _df_
-
-            _df_qua = _df_.quantile([.25, .75])
-            _df_q1, _df_q3 = _df_qua.loc[.25].copy(), _df_qua.loc[.75].copy()
-            _df_iqr = _df_q3 - _df_q1
-
-            _se = concat([_df_q1 - 1.5 * _df_iqr] * len(_df_), axis=1).T.set_index(_df_.index)
-            _le = concat([_df_q3 + 1.5 * _df_iqr] * len(_df_), axis=1).T.set_index(_df_.index)
-
-            if _log:
-                return 10 ** _se, 10 ** _le
-            return _se, _le
-
         _cation, _anion, _main = (['Na+', 'NH4+', 'K+', 'Mg2+', 'Ca2+'],
-                                  ['Cl-', 'NO2-', 'NO3-', 'SO42-', ],
+                                  ['Cl-', 'NO2-', 'NO3-', 'PO43-', 'SO42-', ],
                                   ['SO42-', 'NO3-', 'NH4+'])
 
         _df_salt = _df[_mdl.keys()].copy()
@@ -68,23 +51,23 @@ class Reader(AbstractReader):
 
         # calculate SE LE
         # salt < LE
-        _se, _le =
+        _se, _le = self.IQR_QC(_df_salt, log_dist=True)
         _df_salt = _df_salt.mask(_df_salt > _le).copy()
 
         # C/A, A/C
         _rat_CA = (_df_salt[_cation].sum(axis=1) / _df_salt[_anion].sum(axis=1)).to_frame()
         _rat_AC = (1 / _rat_CA).copy()
 
-        _se, _le =
+        _se, _le = self.IQR_QC(_rat_CA, )
         _cond_CA = (_rat_CA < _le) & (_rat_CA > 0)
 
-        _se, _le =
+        _se, _le = self.IQR_QC(_rat_AC, )
         _cond_AC = (_rat_AC < _le) & (_rat_AC > 0)
 
         _df_salt = _df_salt.where((_cond_CA * _cond_AC)[0]).copy()
 
         # conc. of main salt > SE
-        _se, _le =
+        _se, _le = self.IQR_QC(_df_salt[_main], log_dist=True)
         _df_salt[_main] = _df_salt[_main].mask(_df_salt[_main] < _se).copy()
 
         return _df_salt.reindex(_df.index)
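The three `_se, _le =` assignments now delegate to the shared `IQR_QC` static method instead of the deleted local `_se_le` helper. A worked sketch of the fence logic on a cation/anion ratio frame (values are made up; the column name 0 mirrors the `.to_frame()` output used above):

import pandas as pd

from AeroViz.rawDataReader.core import AbstractReader

_rat_CA = pd.DataFrame({0: [0.9, 1.0, 1.1, 1.05, 4.2]},
                       index=pd.date_range('2024-01-01', periods=5, freq='h', name='time'))

_se, _le = AbstractReader.IQR_QC(_rat_CA)
_cond_CA = (_rat_CA < _le) & (_rat_CA > 0)  # 4.2 exceeds Q3 + 1.5*IQR = 1.25 and is rejected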
AeroViz/rawDataReader/script/MA350.py

@@ -35,4 +35,4 @@ class Reader(AbstractReader):
         _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BB mass', 'FF mass', 'AAE', 'BB']].mask((_df < 0).copy())
 
         # QC data in 1h
-        return _df.resample('1h').apply(self.
+        return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()