AeroViz 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (57)
  1. AeroViz/data/240228_00.txt +101 -0
  2. AeroViz/dataProcess/Chemistry/_ocec.py +20 -7
  3. AeroViz/plot/__init__.py +2 -0
  4. AeroViz/plot/hysplit/__init__.py +1 -0
  5. AeroViz/plot/hysplit/hysplit.py +79 -0
  6. AeroViz/plot/meteorology/meteorology.py +2 -0
  7. AeroViz/plot/optical/optical.py +60 -59
  8. AeroViz/plot/pie.py +14 -2
  9. AeroViz/plot/radar.py +184 -0
  10. AeroViz/plot/scatter.py +16 -7
  11. AeroViz/plot/templates/diurnal_pattern.py +24 -7
  12. AeroViz/plot/templates/koschmieder.py +11 -8
  13. AeroViz/plot/timeseries/template.py +2 -2
  14. AeroViz/plot/timeseries/timeseries.py +47 -7
  15. AeroViz/rawDataReader/__init__.py +75 -68
  16. AeroViz/rawDataReader/config/supported_instruments.py +52 -19
  17. AeroViz/rawDataReader/core/__init__.py +194 -106
  18. AeroViz/rawDataReader/script/AE33.py +11 -6
  19. AeroViz/rawDataReader/script/AE43.py +10 -5
  20. AeroViz/rawDataReader/script/Aurora.py +14 -10
  21. AeroViz/rawDataReader/script/BC1054.py +10 -6
  22. AeroViz/rawDataReader/script/EPA.py +39 -0
  23. AeroViz/rawDataReader/script/GRIMM.py +1 -2
  24. AeroViz/rawDataReader/script/IGAC.py +6 -23
  25. AeroViz/rawDataReader/script/MA350.py +12 -5
  26. AeroViz/rawDataReader/script/Minion.py +107 -30
  27. AeroViz/rawDataReader/script/NEPH.py +15 -5
  28. AeroViz/rawDataReader/script/OCEC.py +39 -15
  29. AeroViz/rawDataReader/script/SMPS.py +1 -0
  30. AeroViz/rawDataReader/script/TEOM.py +15 -11
  31. AeroViz/rawDataReader/script/VOC.py +1 -1
  32. AeroViz/rawDataReader/script/XRF.py +11 -0
  33. AeroViz/rawDataReader/script/__init__.py +2 -2
  34. {AeroViz-0.1.6.dist-info → AeroViz-0.1.8.dist-info}/METADATA +54 -30
  35. {AeroViz-0.1.6.dist-info → AeroViz-0.1.8.dist-info}/RECORD +40 -51
  36. AeroViz/process/__init__.py +0 -31
  37. AeroViz/process/core/DataProc.py +0 -19
  38. AeroViz/process/core/SizeDist.py +0 -90
  39. AeroViz/process/core/__init__.py +0 -4
  40. AeroViz/process/method/__init__.py +0 -2
  41. AeroViz/process/method/prop.py +0 -62
  42. AeroViz/process/script/AbstractDistCalc.py +0 -143
  43. AeroViz/process/script/Chemical.py +0 -177
  44. AeroViz/process/script/IMPACT.py +0 -49
  45. AeroViz/process/script/IMPROVE.py +0 -161
  46. AeroViz/process/script/Others.py +0 -65
  47. AeroViz/process/script/PSD.py +0 -103
  48. AeroViz/process/script/PSD_dry.py +0 -93
  49. AeroViz/process/script/__init__.py +0 -5
  50. AeroViz/process/script/retrieve_RI.py +0 -69
  51. AeroViz/rawDataReader/script/EPA_vertical.py +0 -46
  52. AeroViz/rawDataReader/script/Table.py +0 -27
  53. /AeroViz/{process/method → plot/optical}/PyMieScatt_update.py +0 -0
  54. /AeroViz/{process/method → plot/optical}/mie_theory.py +0 -0
  55. {AeroViz-0.1.6.dist-info → AeroViz-0.1.8.dist-info}/LICENSE +0 -0
  56. {AeroViz-0.1.6.dist-info → AeroViz-0.1.8.dist-info}/WHEEL +0 -0
  57. {AeroViz-0.1.6.dist-info → AeroViz-0.1.8.dist-info}/top_level.txt +0 -0
AeroViz/rawDataReader/core/__init__.py
@@ -1,25 +1,21 @@
  import json
  import logging
- import pickle as pkl
  from abc import ABC, abstractmethod
- from datetime import datetime as dtm
+ from datetime import datetime
  from pathlib import Path
- from typing import Any
+ from typing import Optional

  import numpy as np
  import pandas as pd
- from pandas import DataFrame, date_range, concat, to_numeric, to_datetime
+ from pandas import DataFrame, concat, read_pickle
  from rich.console import Console
  from rich.progress import Progress, TextColumn, BarColumn, TimeRemainingColumn, TaskProgressColumn

- from ..config.supported_instruments import meta
+ from AeroViz.rawDataReader.config.supported_instruments import meta

  __all__ = ['AbstractReader']


- console = Console(force_terminal=True, color_system="auto")
-
-
  class AbstractReader(ABC):
      """
      Abstract class for reading raw data from different instruments. Each instrument should have a separate class that
@@ -34,9 +30,9 @@ class AbstractReader(ABC):

      def __init__(self,
                   path: Path | str,
-                  qc: bool = True,
-                  csv_raw: bool = True,
                   reset: bool = False,
+                  qc: bool = True,
+                  qc_freq: Optional[str] = None,
                   rate: bool = True,
                   append_data: bool = False):

@@ -45,9 +41,9 @@ class AbstractReader(ABC):
          self.logger = self._setup_logger()

          self.reset = reset
-         self.rate = rate
          self.qc = qc
-         self.csv = csv_raw
+         self.qc_freq = qc_freq
+         self.rate = rate
          self.append = append_data and reset

          self.pkl_nam = self.path / f'_read_{self.nam.lower()}.pkl'
@@ -57,15 +53,12 @@ class AbstractReader(ABC):
          self.csv_out = self.path / f'output_{self.nam.lower()}.csv'

      def __call__(self,
-                  start: dtm | None = None,
-                  end: dtm | None = None,
+                  start: datetime,
+                  end: datetime,
                   mean_freq: str = '1h',
                   csv_out: bool = True,
                   ) -> DataFrame:

-         if start and end and end <= start:
-             raise ValueError(f"Invalid time range: start {start} is after end {end}")
-
          data = self._run(start, end)

          if data is not None:
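
Note: `__call__` now requires concrete `start`/`end` datetimes instead of optional ones, and the inline range check was dropped. A minimal usage sketch; the reader class, path, and dates here are illustrative, not taken from this diff:

    from datetime import datetime
    from pathlib import Path

    from AeroViz.rawDataReader.script.NEPH import Reader  # any concrete AbstractReader subclass

    # qc_freq takes a pandas offset alias (e.g. 'MS' for per-month QC-rate reporting)
    reader = Reader(path=Path('data/NEPH'), reset=False, qc=True, qc_freq='MS', rate=True)
    df = reader(start=datetime(2024, 2, 1), end=datetime(2024, 2, 29), mean_freq='1h')
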
@@ -81,15 +74,8 @@ class AbstractReader(ABC):
          pass

      @abstractmethod
-     def _QC(self, df: DataFrame):
-         return df
-
-     @staticmethod
-     def basic_QC(df: DataFrame):
-         df_ave, df_std = df.mean(), df.std()
-         df_lowb, df_highb = df < (df_ave - df_std * 1.5), df > (df_ave + df_std * 1.5)
-
-         return df.mask(df_lowb | df_highb).copy()
+     def _QC(self, df: DataFrame) -> DataFrame:
+         return self.n_sigma_QC(df)

      def _setup_logger(self) -> logging.Logger:
          logger = logging.getLogger(self.nam)
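
Note: the abstract `_QC` default now delegates to the class-level `n_sigma_QC` helper (added near the bottom of this file) instead of the removed `basic_QC`. A sketch of how a subclass override might look, following the pattern the instrument scripts below use; the value bounds are illustrative only:

    def _QC(self, _df):
        # keep the original time axis so QC only masks values, never reshapes
        _index = _df.index.copy()

        # mask physically impossible values (instrument-specific bounds)
        _df = _df.mask((_df <= 0) | (_df > 20000))

        # fall back to the generic n-sigma filter from AbstractReader
        return self.n_sigma_QC(_df).reindex(_index)
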
@@ -99,72 +85,87 @@ class AbstractReader(ABC):
              logger.removeHandler(handler)

          handler = logging.FileHandler(self.path / f'{self.nam}.log')
-         handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
+         handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S'))
          logger.addHandler(handler)
          return logger

-     def _rate_calculate(self, _fout_raw, _fout_qc, _st_raw, _ed_raw) -> None:
-         if self.meta['deter_key'] is not None:
-             _start, _end = _fout_qc.index[[0, -1]]
-
-             _drop_how = 'any'
-             _the_size = len(_fout_raw.resample('1h').mean().index)
+     def _rate_calculate(self, raw_data, qc_data) -> None:
+         def __base_rate(raw_data, qc_data):
+             period_size = len(raw_data.resample('1h').mean().index)

              for _nam, _key in self.meta['deter_key'].items():
-                 if _key == ['all']:
-                     _key, _drop_how = _fout_qc.keys(), 'all'
+                 _key, _drop_how = (qc_data.keys(), 'all') if _key == ['all'] else (_key, 'any')

-                 _real_size = len(_fout_raw[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
-                 _QC_size = len(_fout_qc[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
+                 sample_size = len(raw_data[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
+                 qc_size = len(qc_data[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)

-                 try:
-                     _acq_rate = round((_real_size / _the_size) * 100, 1)
-                     _yid_rate = round((_QC_size / _real_size) * 100, 1)
-                 except ZeroDivisionError:
-                     _acq_rate, _yid_rate = 0, 0
+                 # validate rate calculation
+                 if period_size < sample_size or sample_size < qc_size or period_size == 0 or sample_size == 0:
+                     raise ValueError(f"Invalid sample sizes: period={period_size}, sample={sample_size}, QC={qc_size}")
+
+                 _acq_rate = round((sample_size / period_size) * 100, 1)
+                 _yid_rate = round((qc_size / sample_size) * 100, 1)
+                 _OEE_rate = round((qc_size / period_size) * 100, 1)

                  self.logger.info(f'{_nam}:')
                  self.logger.info(f"\tAcquisition rate: {_acq_rate}%")
                  self.logger.info(f'\tYield rate: {_yid_rate}%')
+                 self.logger.info(f'\tOEE rate: {_OEE_rate}%')
                  self.logger.info(f"{'=' * 60}")

                  print(f'\n\t{_nam} : ')
-                 print(f'\t\tacquisition rate : \033[91m{_acq_rate}%\033[0m')
-                 print(f'\t\tyield rate : \033[91m{_yid_rate}%\033[0m')
+                 print(f'\t\tacquisition rate | yield rate | OEE rate :'
+                       f' \033[91m{_acq_rate}% | {_yid_rate}% -> {_OEE_rate}%\033[0m')

-     # set each to true datetime(18:30:01 -> 18:30:00) and rindex data
-     def _raw_process(self, _df):
-         # get time from df and set time to whole time to create time index
-         _st, _ed = _df.index.sort_values()[[0, -1]]
-         _tm_index = date_range(_st.strftime('%Y%m%d %H00'), _ed.floor('h').strftime('%Y%m%d %H00'),
-                                freq=self.meta['freq'])
-         _tm_index.name = 'time'
+         if self.meta['deter_key'] is not None:
+             # use qc_freq to calculate the rates for each period
+             if self.qc_freq is not None:
+                 raw_data_grouped = raw_data.groupby(pd.Grouper(freq=self.qc_freq))
+                 qc_data_grouped = qc_data.groupby(pd.Grouper(freq=self.qc_freq))

-         return _df.apply(to_numeric, errors='coerce').resample(self.meta['freq']).mean().reindex(_tm_index)
+                 for (month, _sub_raw_data), (_, _sub_qc_data) in zip(raw_data_grouped, qc_data_grouped):
+                     self.logger.info(
+                         f"\tProcessing: {_sub_raw_data.index[0].strftime('%F')} to {_sub_raw_data.index[-1].strftime('%F')}")
+                     print(
+                         f"\n\tProcessing: {_sub_raw_data.index[0].strftime('%F')} to {_sub_raw_data.index[-1].strftime('%F')}")

-     # process time index
-     @staticmethod
-     def _tmidx_process(_start, _end, _df):
-         _st, _ed = _df.index.sort_values()[[0, -1]]
-         _start, _end = to_datetime(_start) or _st, to_datetime(_end) or _ed
-         _idx = date_range(_start, _end, freq=_df.index.freq.copy())
-         _idx.name = 'time'
+                     __base_rate(_sub_raw_data, _sub_qc_data)

-         return _df.reindex(_idx), _st, _ed
+             else:
+                 __base_rate(raw_data, qc_data)

-     # append new data to exist pkl
-     @staticmethod
-     def _append_process(_df_done, _df_apnd):
+     def _timeIndex_process(self, _df, user_start=None, user_end=None, append_df=None):
+         """
+         Process time index, resample data, extract specified time range, and optionally append new data.

-         if _df_apnd is not None:
-             _df = concat([_df_apnd.dropna(how='all').copy(), _df_done.dropna(how='all').copy()])
+         :param _df: Input DataFrame with time index
+         :param user_start: Start of user-specified time range (optional)
+         :param user_end: End of user-specified time range (optional)
+         :param append_df: DataFrame to append (optional)
+         :return: Processed DataFrame
+         """
+         # Round timestamps and remove duplicates
+         _df = _df.groupby(_df.index.round('1min')).first()

-             _idx = date_range(*_df.index.sort_values()[[0, -1]], freq=_df_done.index.freq.copy())
-             _idx.name = 'time'
+         # Determine frequency
+         freq = _df.index.inferred_freq or self.meta['freq']

-             return _df.loc[~_df.index.duplicated()].copy().reindex(_idx)
+         # Append new data if provided
+         if append_df is not None:
+             append_df.index = append_df.index.round('1min')
+             _df = pd.concat([append_df.dropna(how='all'), _df.dropna(how='all')])
+             _df = _df.loc[~_df.index.duplicated()]

-         return _df_done
+         # Determine time range
+         df_start, df_end = _df.index.sort_values()[[0, -1]]
+
+         # Create new time index
+         new_index = pd.date_range(user_start or df_start, user_end or df_end, freq=freq, name='time')
+
+         # Process data: convert to numeric, resample, and reindex
+         return (_df.apply(pd.to_numeric, errors='coerce')
+                 .resample(freq).mean()
+                 .reindex(new_index))

      def _outlier_process(self, _df):
          outlier_file = self.path / 'outlier.json'
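
Note: `_rate_calculate` now reports three figures per `deter_key` group: the acquisition rate (hours acquired / hours in the period), the yield rate (hours surviving QC / hours acquired), and their product, the OEE rate (hours surviving QC / hours in the period). When `qc_freq` is set, both frames are split with `pd.Grouper` and the figures are logged per sub-period. A worked example with illustrative numbers:

    # a 30-day period at 1 h resolution
    period_size = 720   # expected hourly records
    sample_size = 684   # hourly records actually acquired
    qc_size = 650       # hourly records surviving QC

    acq_rate = round(sample_size / period_size * 100, 1)  # 95.0
    yid_rate = round(qc_size / sample_size * 100, 1)      # 95.0
    oee_rate = round(qc_size / period_size * 100, 1)      # 90.3, i.e. acq * yield as fractions
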
@@ -180,31 +181,17 @@ class AbstractReader(ABC):

          return _df

-     # save pickle file
      def _save_data(self, raw_data: DataFrame, qc_data: DataFrame) -> None:
-         self._safe_pickle_dump(self.pkl_nam, qc_data)
-         if self.csv:
-             qc_data.to_csv(self.csv_nam)
+         try:
+             raw_data.to_pickle(self.pkl_nam_raw)
+             raw_data.to_csv(self.csv_nam_raw)

-         if self.meta['deter_key'] is not None:
-             self._safe_pickle_dump(self.pkl_nam_raw, raw_data)
-             if self.csv:
-                 raw_data.to_csv(self.csv_nam_raw)
+             if self.meta['deter_key'] is not None:
+                 qc_data.to_pickle(self.pkl_nam)
+                 qc_data.to_csv(self.csv_nam)

-     @staticmethod
-     def _safe_pickle_dump(file_path: Path, data: Any) -> None:
-         try:
-             with file_path.open('wb') as f:
-                 pkl.dump(data, f, protocol=pkl.HIGHEST_PROTOCOL)
-         except PermissionError as e:
-             raise IOError(f"Unable to write to {file_path}. The file may be in use or you may not have permission: {e}")
          except Exception as e:
-             raise IOError(f"Error writing to {file_path}: {e}")
-
-     # read pickle file
-     def _read_pkl(self):
-         with self.pkl_nam.open('rb') as qc_data, self.pkl_nam_raw.open('rb') as raw_data:
-             return pkl.load(raw_data), pkl.load(qc_data)
+             raise IOError(f"Error saving data. {e}")

      def _read_raw_files(self) -> tuple[DataFrame | None, DataFrame | None]:
          files = [f
@@ -223,7 +210,7 @@ class AbstractReader(ABC):
                  TaskProgressColumn(),
                  TimeRemainingColumn(),
                  TextColumn("{task.fields[filename]}", style="yellow"),
-                 console=console,
+                 console=Console(force_terminal=True, color_system="auto"),
                  expand=False
          ) as progress:
              task = progress.add_task(f"Reading {self.nam} files", total=len(files), filename="")
@@ -246,47 +233,148 @@ class AbstractReader(ABC):
          if not df_list:
              raise ValueError("All files were either empty or failed to read.")

-         raw_data = self._raw_process(concat(df_list))
+         raw_data = concat(df_list, axis=0).groupby(level=0).first()
+
+         raw_data = self._timeIndex_process(raw_data)
          qc_data = self._QC(raw_data)

          return raw_data, qc_data

-     def _run(self, _start, _end):
+     def _run(self, user_start, user_end):
          # read pickle if pickle file exists and 'reset=False' or process raw data or append new data
          if self.pkl_nam_raw.exists() and self.pkl_nam.exists() and not self.reset:
-             print(f"\n{dtm.now().strftime('%m/%d %X')} : Reading {self.nam} \033[96mPICKLE\033[0m "
-                   f"from {_start} to {_end}\n")
+             print(f"\n{datetime.now().strftime('%m/%d %X')} : Reading {self.nam} \033[96mPICKLE\033[0m "
+                   f"from {user_start} to {user_end}\n")

-             _f_raw_done, _f_qc_done = self._read_pkl()
+             _f_raw_done, _f_qc_done = read_pickle(self.pkl_nam_raw), read_pickle(self.pkl_nam)

              if self.append:
-                 print(f"Appending new data from {_start} to {_end}")
+                 print(f"Appending new data from {user_start} to {user_end}")
                  _f_raw_new, _f_qc_new = self._read_raw_files()
-                 _f_raw = self._append_process(_f_raw_done, _f_raw_new)
-                 _f_qc = self._append_process(_f_qc_done, _f_qc_new)
+                 _f_raw = self._timeIndex_process(_f_raw_done, append_df=_f_raw_new)
+                 _f_qc = self._timeIndex_process(_f_qc_done, append_df=_f_qc_new)
              else:
                  _f_raw, _f_qc = _f_raw_done, _f_qc_done
+                 return _f_qc if self.qc else _f_raw

          else:
-             print(f"\n{dtm.now().strftime('%m/%d %X')} : Reading {self.nam} \033[96mRAW DATA\033[0m "
-                   f"from {_start} to {_end}\n")
+             print(f"\n{datetime.now().strftime('%m/%d %X')} : Reading {self.nam} \033[96mRAW DATA\033[0m "
+                   f"from {user_start} to {user_end}\n")
+
              _f_raw, _f_qc = self._read_raw_files()

          # process time index
-         _f_raw, _start_raw, _end_raw = self._tmidx_process(_start, _end, _f_raw)
-         _f_qc, _start_raw, _end_raw = self._tmidx_process(_start, _end, _f_qc)
+         data_start, data_end = _f_raw.index.sort_values()[[0, -1]]

+         _f_raw = self._timeIndex_process(_f_raw, user_start, user_end)
+         _f_qc = self._timeIndex_process(_f_qc, user_start, user_end)
          _f_qc = self._outlier_process(_f_qc)

          # save
          self._save_data(_f_raw, _f_qc)

          self.logger.info(f"{'=' * 60}")
-         self.logger.info(f"Raw data time : {_start_raw} to {_end_raw}")
-         self.logger.info(f"Output time : {_start} to {_end}")
+         self.logger.info(f"Raw data time : {data_start} to {data_end}")
+         self.logger.info(f"Output time : {user_start} to {user_end}")
          self.logger.info(f"{'-' * 60}")

          if self.rate:
-             self._rate_calculate(_f_raw, _f_qc, _start_raw, _end_raw)
+             self._rate_calculate(_f_raw, _f_qc)

          return _f_qc if self.qc else _f_raw
+
+     @staticmethod
+     def reorder_dataframe_columns(df, order_lists, others_col=False):
+         new_order = []
+
+         for order in order_lists:
+             # only add columns that exist in the DataFrame, without adding duplicates
+             new_order.extend([col for col in order if col in df.columns and col not in new_order])
+
+         if others_col:
+             # append all original columns not in the new order, preserving their original order
+             new_order.extend([col for col in df.columns if col not in new_order])
+
+         return df[new_order]
+
+     @staticmethod
+     def n_sigma_QC(df: pd.DataFrame, std_range: int = 5) -> pd.DataFrame:
+         # make sure the input is a DataFrame
+         df = df.to_frame() if isinstance(df, pd.Series) else df
+
+         df_ave = df.mean()
+         df_std = df.std()
+
+         lower_bound = df < (df_ave - df_std * std_range)
+         upper_bound = df > (df_ave + df_std * std_range)
+
+         return df.mask(lower_bound | upper_bound)
+
+     @staticmethod
+     def IQR_QC(df: pd.DataFrame, log_dist=False) -> pd.DataFrame:
+         # make sure the input is a DataFrame
+         df = df.to_frame() if isinstance(df, pd.Series) else df
+
+         df_transformed = np.log10(df) if log_dist else df
+
+         _df_q1 = df_transformed.quantile(0.25)
+         _df_q3 = df_transformed.quantile(0.75)
+
+         _df_iqr = _df_q3 - _df_q1
+
+         # Calculate lower and upper bounds
+         lower_bound = df_transformed < (_df_q1 - 1.5 * _df_iqr)
+         upper_bound = df_transformed > (_df_q3 + 1.5 * _df_iqr)
+
+         # Apply the filter to the original dataframe
+         return df.mask(lower_bound | upper_bound)
+
+     @staticmethod
+     def rolling_IQR_QC(df: pd.DataFrame, window_size=24, log_dist=False) -> pd.DataFrame:
+         df = df.to_frame() if isinstance(df, pd.Series) else df
+         df_transformed = np.log10(df) if log_dist else df
+
+         def iqr_filter(x):
+             q1, q3 = x.quantile(0.25), x.quantile(0.75)
+             iqr = q3 - q1
+             lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
+             return (x >= lower) & (x <= upper)
+
+         mask = df_transformed.rolling(window=window_size, center=True, min_periods=1).apply(iqr_filter)
+         return df.where(mask, np.nan)
+
+     @staticmethod
+     def time_aware_IQR_QC(df: pd.DataFrame, time_window='1D', log_dist=False) -> pd.DataFrame:
+         df = df.to_frame() if isinstance(df, pd.Series) else df
+         df_transformed = np.log10(df) if log_dist else df
+
+         def iqr_filter(group):
+             q1, q3 = group.quantile(0.25), group.quantile(0.75)
+             iqr = q3 - q1
+             lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
+             return (group >= lower) & (group <= upper)
+
+         mask = df_transformed.groupby(pd.Grouper(freq=time_window)).transform(iqr_filter)
+         return df.where(mask, np.nan)
+
+     @staticmethod
+     def mad_iqr_hybrid_QC(df: pd.DataFrame, mad_threshold=3.5, log_dist=False) -> pd.DataFrame:
+         df = df.to_frame() if isinstance(df, pd.Series) else df
+         df_transformed = np.log10(df) if log_dist else df
+
+         # IQR method
+         q1, q3 = df_transformed.quantile(0.25), df_transformed.quantile(0.75)
+         iqr = q3 - q1
+         iqr_lower, iqr_upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
+
+         # MAD method
+         median = df_transformed.median()
+         mad = (df_transformed - median).abs().median()
+         mad_lower, mad_upper = median - mad_threshold * mad, median + mad_threshold * mad
+
+         # combine the two methods
+         lower = np.maximum(iqr_lower, mad_lower)
+         upper = np.minimum(iqr_upper, mad_upper)
+
+         mask = (df_transformed >= lower) & (df_transformed <= upper)
+         return df.where(mask, np.nan)
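
Note: the new QC helpers are static DataFrame-in/DataFrame-out filters, so they can be exercised in isolation. A quick sketch on synthetic data; the column name and values are illustrative:

    import numpy as np
    import pandas as pd

    from AeroViz.rawDataReader.core import AbstractReader

    idx = pd.date_range('2024-02-28', periods=240, freq='h', name='time')
    df = pd.DataFrame({'BC1': np.random.default_rng(0).normal(1000, 50, 240)}, index=idx)
    df.iloc[10] = 50_000  # inject an outlier

    cleaned = AbstractReader.n_sigma_QC(df, std_range=5)              # global 5-sigma mask
    cleaned = AbstractReader.IQR_QC(df)                               # global 1.5*IQR mask
    cleaned = AbstractReader.time_aware_IQR_QC(df, time_window='1D')  # IQR mask per calendar day

    # each call returns the same shape with flagged values set to NaN
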
AeroViz/rawDataReader/script/AE33.py
@@ -1,4 +1,4 @@
- from pandas import read_table
+ from pandas import read_table, to_numeric

  from AeroViz.rawDataReader.core import AbstractReader

@@ -8,10 +8,10 @@ class Reader(AbstractReader):

      def _raw_reader(self, file):
          if file.stat().st_size / 1024 < 550:
-             print('\t It may not be a whole daily data.')
+             self.logger.info(f'\t {file} may not be a whole daily data. Make sure the file is correct.')

          _df = read_table(file, parse_dates={'time': [0, 1]}, index_col='time',
-                          delimiter=r'\s+', skiprows=5, usecols=range(67))
+                          delimiter=r'\s+', skiprows=5, usecols=range(67)).apply(to_numeric, errors='coerce')
          _df.columns = _df.columns.str.strip(';')

          # remove data without Status=0, 128 (Not much filter tape), 256 (Not much filter tape)
@@ -23,8 +23,13 @@ class Reader(AbstractReader):
          return _df.loc[~_df.index.duplicated() & _df.index.notna()]

      def _QC(self, _df):
+         _index = _df.index.copy()
+
          # remove negative value
-         _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7']].mask((_df < 0).copy())
+         _df = _df.mask((_df <= 0) | (_df > 20000))
+
+         # use IQR_QC
+         _df = self.time_aware_IQR_QC(_df, time_window='1h')

-         # QC data in 1h
-         return _df.resample('1h').apply(self.basic_QC).resample(self.meta.get("freq")).mean()
+         # make sure all columns have values, otherwise set to nan
+         return _df.dropna(how='any').reindex(_index)
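
Note: the AE33 `_QC` (and the AE43/BC1054 versions below) now follows a shared pattern: mask out-of-range values, filter hour by hour with `time_aware_IQR_QC`, drop any timestamp where a channel is missing, then reindex back to the original time axis so the output shape is unchanged. A condensed sketch of the final dropna/reindex step on synthetic data:

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2024-02-28', periods=4, freq='min', name='time')
    df = pd.DataFrame({'BC1': [800, -5, 900, 850],
                       'BC2': [750, 780, np.nan, 820]}, index=idx)

    df = df.mask((df <= 0) | (df > 20000))   # -5 becomes NaN
    out = df.dropna(how='any').reindex(idx)  # rows with any missing channel come back as all-NaN

    # all four timestamps survive, but the 2nd and 3rd rows are entirely NaN
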
AeroViz/rawDataReader/script/AE43.py
@@ -1,4 +1,4 @@
- from pandas import read_csv
+ from pandas import read_csv, to_numeric

  from AeroViz.rawDataReader.core import AbstractReader

@@ -7,7 +7,7 @@ class Reader(AbstractReader):
      nam = 'AE43'

      def _raw_reader(self, file):
-         _df = read_csv(file, parse_dates={'time': ['StartTime']}, index_col='time')
+         _df = read_csv(file, parse_dates={'time': ['StartTime']}, index_col='time').apply(to_numeric, errors='coerce')
          _df_id = _df['SetupID'].iloc[-1]

          # get last SetupID data
@@ -24,8 +24,13 @@ class Reader(AbstractReader):

      # QC data
      def _QC(self, _df):
+         _index = _df.index.copy()
+
          # remove negative value
-         _df = _df.mask((_df < 0).copy())
+         _df = _df.mask((_df <= 0) | (_df > 20000))
+
+         # use IQR_QC
+         _df = self.time_aware_IQR_QC(_df, time_window='1h')

-         # QC data in 1h
-         return _df.resample('1h').apply(self.basic_QC).resample(self.meta.get("freq")).mean()
+         # make sure all columns have values, otherwise set to nan
+         return _df.dropna(how='any').reindex(_index)
AeroViz/rawDataReader/script/Aurora.py
@@ -1,4 +1,4 @@
- from pandas import to_datetime, read_csv
+ from pandas import to_datetime, read_csv, to_numeric

  from AeroViz.rawDataReader.core import AbstractReader

@@ -8,7 +8,7 @@ class Reader(AbstractReader):

      def _raw_reader(self, file):
          with file.open('r', encoding='utf-8-sig', errors='ignore') as f:
-             _df = read_csv(f, low_memory=False, index_col=0)
+             _df = read_csv(f, low_memory=False, index_col=0).apply(to_numeric, errors='coerce')

          _df.index = to_datetime(_df.index, errors='coerce')
          _df.index.name = 'time'
@@ -24,17 +24,21 @@ class Reader(AbstractReader):
              'RH': 'RH'
          })

-         _df = _df[['B', 'G', 'R', 'BB', 'BG', 'BR', 'RH']]
+         _df = _df[['B', 'G', 'R', 'BB', 'BG', 'BR']]

          return _df.loc[~_df.index.duplicated() & _df.index.notna()]

-     # QC data
      def _QC(self, _df):
-         # remove negative value
-         _df = _df.mask((_df <= 0) | (_df > 2000)).copy()
+         _index = _df.index.copy()

-         # total scattering is larger than back scattering
-         _df = _df[(_df['BB'] < _df['B']) & (_df['BG'] < _df['G']) & (_df['BR'] < _df['R'])]
+         _df = _df.mask((_df <= 0) | (_df > 2000))

-         # QC data in 1h
-         return _df.resample('1h').apply(self.basic_QC).resample(self.meta.get("freq")).mean()
+         _df = _df.loc[(_df['BB'] < _df['B']) & (_df['BG'] < _df['G']) & (_df['BR'] < _df['R'])]
+
+         _df = _df.loc[(_df['B'] > _df['G']) & (_df['G'] > _df['R'])]
+
+         # use IQR_QC
+         _df = self.time_aware_IQR_QC(_df)
+
+         # make sure all columns have values, otherwise set to nan
+         return _df.dropna(how='any').reindex(_index)
AeroViz/rawDataReader/script/BC1054.py
@@ -1,4 +1,4 @@
- from pandas import read_csv
+ from pandas import read_csv, to_numeric

  from AeroViz.rawDataReader.core import AbstractReader

@@ -8,7 +8,7 @@ class Reader(AbstractReader):

      def _raw_reader(self, file):
          with open(file, 'r', encoding='utf-8', errors='ignore') as f:
-             _df = read_csv(f, parse_dates=True, index_col=0)
+             _df = read_csv(f, parse_dates=True, index_col=0).apply(to_numeric, errors='coerce')

          _df.columns = _df.columns.str.replace(' ', '')

@@ -33,10 +33,14 @@ class Reader(AbstractReader):

          return _df.loc[~_df.index.duplicated() & _df.index.notna()]

-     # QC data
      def _QC(self, _df):
+         _index = _df.index.copy()
+
          # remove negative value
-         _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7', 'BC8', 'BC9', 'BC10']].mask((_df < 0).copy())
+         _df = _df.mask((_df <= 0) | (_df > 20000))
+
+         # use IQR_QC
+         _df = self.time_aware_IQR_QC(_df, time_window='1h')

-         # QC data in 1h
-         return _df.resample('1h').apply(self.basic_QC).resample(self.meta.get("freq")).mean()
+         # make sure all columns have values, otherwise set to nan
+         return _df.dropna(how='any').reindex(_index)
AeroViz/rawDataReader/script/EPA.py (new file)
@@ -0,0 +1,39 @@
+ from pandas import read_csv, to_numeric
+
+ from AeroViz.rawDataReader.core import AbstractReader
+
+ desired_order1 = ['SO2', 'NO', 'NOx', 'NO2', 'CO', 'O3', 'THC', 'NMHC',
+                   'CH4', 'PM10', 'PM2.5', 'PM1', 'WS', 'WD', 'AT', 'RH']
+
+ desired_order2 = ['Benzene', 'Toluene', 'EthylBenzene', 'm/p-Xylene', 'o-Xylene']
+
+
+ class Reader(AbstractReader):
+     nam = 'EPA'
+
+     def _raw_reader(self, file):
+         # supports both 查詢小時值(測項).csv and 查詢小時值(直式).csv exports, with or without invalid values removed
+         df = read_csv(file, encoding='big5', encoding_errors='ignore', index_col=0, parse_dates=True,
+                       on_bad_lines='skip').apply(to_numeric, errors='coerce')
+
+         if len(df.groupby('測站')) > 1:
+             raise ValueError(f"Multiple stations found in the file: {df['測站'].unique()}")
+         else:
+             if '測站' in df.columns:
+                 df.drop(columns=['測站'], inplace=True)
+
+             if '測項' in df.columns:
+                 df = df.pivot(columns='測項', values='資料')
+
+         df.rename(columns={'AMB_TEMP': 'AT', 'WIND_SPEED': 'WS', 'WIND_DIREC': 'WD'}, inplace=True)
+         df.index.name = 'Time'
+
+         # if invalid values were not removed before export, replace strings containing '#' or 'L' with '#' or '_'
+         df = df.replace(to_replace=r'\d*[#]\b', value='#', regex=True)
+         df = df.replace(to_replace=r'\d*[L]\b', value='_', regex=True)
+
+         # reorder columns
+         return self.reorder_dataframe_columns(df, [desired_order1])
+
+     def _QC(self, _df):
+         return _df
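
Note: the new EPA reader accepts both export layouts of the Taiwan EPA hourly files; the vertical layout carries one row per item and hour and is reshaped to wide form with `pivot`. A small sketch of that reshape on synthetic rows (測項/資料 are the item/value columns of the EPA export):

    import pandas as pd

    idx = pd.to_datetime(['2024-02-28 00:00', '2024-02-28 00:00',
                          '2024-02-28 01:00', '2024-02-28 01:00'])
    vertical = pd.DataFrame({'測項': ['PM2.5', 'SO2', 'PM2.5', 'SO2'],
                             '資料': [12.0, 1.3, 15.0, 1.1]}, index=idx)

    wide = vertical.pivot(columns='測項', values='資料')
    # wide: one column per pollutant ('PM2.5', 'SO2') on the hourly time index
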
AeroViz/rawDataReader/script/GRIMM.py
@@ -24,5 +24,4 @@ class Reader(AbstractReader):
          return _df / 0.035

      def _QC(self, _df):
-         # QC data in 1h
-         return _df.resample('1h').apply(self.basic_QC).resample(self.meta.get("freq")).mean()
+         return _df