AeroViz-0.1.5-py3-none-any.whl → AeroViz-0.1.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of AeroViz may be problematic.

Files changed (59)
  1. AeroViz/dataProcess/Chemistry/_mass_volume.py +4 -3
  2. AeroViz/dataProcess/Chemistry/_ocec.py +20 -7
  3. AeroViz/dataProcess/Optical/_IMPROVE.py +2 -3
  4. AeroViz/dataProcess/SizeDistr/__init__.py +6 -10
  5. AeroViz/plot/__init__.py +1 -0
  6. AeroViz/plot/meteorology/meteorology.py +2 -0
  7. AeroViz/plot/optical/optical.py +1 -1
  8. AeroViz/plot/pie.py +14 -2
  9. AeroViz/plot/radar.py +184 -0
  10. AeroViz/plot/scatter.py +16 -7
  11. AeroViz/plot/templates/koschmieder.py +11 -8
  12. AeroViz/plot/timeseries/timeseries.py +0 -1
  13. AeroViz/rawDataReader/__init__.py +75 -70
  14. AeroViz/rawDataReader/config/supported_instruments.py +70 -38
  15. AeroViz/rawDataReader/core/__init__.py +208 -178
  16. AeroViz/rawDataReader/script/AE33.py +1 -1
  17. AeroViz/rawDataReader/script/AE43.py +1 -1
  18. AeroViz/rawDataReader/script/APS_3321.py +2 -2
  19. AeroViz/rawDataReader/script/Aurora.py +1 -1
  20. AeroViz/rawDataReader/script/BC1054.py +1 -1
  21. AeroViz/rawDataReader/script/EPA.py +39 -0
  22. AeroViz/rawDataReader/script/GRIMM.py +1 -1
  23. AeroViz/rawDataReader/script/IGAC.py +6 -23
  24. AeroViz/rawDataReader/script/MA350.py +1 -1
  25. AeroViz/rawDataReader/script/Minion.py +102 -30
  26. AeroViz/rawDataReader/script/NEPH.py +1 -1
  27. AeroViz/rawDataReader/script/{Sunset_OCEC.py → OCEC.py} +2 -2
  28. AeroViz/rawDataReader/script/SMPS.py +77 -0
  29. AeroViz/rawDataReader/script/TEOM.py +2 -2
  30. AeroViz/rawDataReader/script/VOC.py +2 -2
  31. AeroViz/rawDataReader/script/XRF.py +11 -0
  32. AeroViz/rawDataReader/script/__init__.py +4 -6
  33. {AeroViz-0.1.5.dist-info → AeroViz-0.1.7.dist-info}/METADATA +57 -32
  34. {AeroViz-0.1.5.dist-info → AeroViz-0.1.7.dist-info}/RECORD +37 -55
  35. AeroViz/process/__init__.py +0 -31
  36. AeroViz/process/core/DataProc.py +0 -19
  37. AeroViz/process/core/SizeDist.py +0 -90
  38. AeroViz/process/core/__init__.py +0 -4
  39. AeroViz/process/method/PyMieScatt_update.py +0 -567
  40. AeroViz/process/method/__init__.py +0 -2
  41. AeroViz/process/method/mie_theory.py +0 -260
  42. AeroViz/process/method/prop.py +0 -62
  43. AeroViz/process/script/AbstractDistCalc.py +0 -143
  44. AeroViz/process/script/Chemical.py +0 -177
  45. AeroViz/process/script/IMPACT.py +0 -49
  46. AeroViz/process/script/IMPROVE.py +0 -161
  47. AeroViz/process/script/Others.py +0 -65
  48. AeroViz/process/script/PSD.py +0 -103
  49. AeroViz/process/script/PSD_dry.py +0 -93
  50. AeroViz/process/script/__init__.py +0 -5
  51. AeroViz/process/script/retrieve_RI.py +0 -69
  52. AeroViz/rawDataReader/script/EPA_vertical.py +0 -46
  53. AeroViz/rawDataReader/script/SMPS_TH.py +0 -41
  54. AeroViz/rawDataReader/script/SMPS_aim11.py +0 -51
  55. AeroViz/rawDataReader/script/SMPS_genr.py +0 -51
  56. AeroViz/rawDataReader/script/Table.py +0 -27
  57. {AeroViz-0.1.5.dist-info → AeroViz-0.1.7.dist-info}/LICENSE +0 -0
  58. {AeroViz-0.1.5.dist-info → AeroViz-0.1.7.dist-info}/WHEEL +0 -0
  59. {AeroViz-0.1.5.dist-info → AeroViz-0.1.7.dist-info}/top_level.txt +0 -0
AeroViz/rawDataReader/core/__init__.py
@@ -1,36 +1,39 @@
- import json as jsn
+ import json
  import logging
- import pickle as pkl
  from abc import ABC, abstractmethod
- from datetime import datetime as dtm
+ from datetime import datetime
  from pathlib import Path
- from typing import Any
+ from typing import Optional

  import numpy as np
  import pandas as pd
- from pandas import DataFrame, date_range, concat, to_numeric, to_datetime
+ from pandas import DataFrame, concat, read_pickle
+ from rich.console import Console
+ from rich.progress import Progress, TextColumn, BarColumn, TimeRemainingColumn, TaskProgressColumn

- from ..config.supported_instruments import meta
+ from AeroViz.rawDataReader.config.supported_instruments import meta

  __all__ = ['AbstractReader']


  class AbstractReader(ABC):
-     nam = 'AbstractReader'
+     """
+     Abstract class for reading raw data from different instruments. Each instrument should have its own class
+     that inherits from this one and implements the abstract methods `_raw_reader` and `_QC`.

-     # initial data
-     # input : file path, reset switch
+     Lists the files in the path and reads the pickle file if one exists; otherwise it reads the raw data and
+     dumps a pickle file. The pickle file is generated the first time the raw data are read; to re-read the raw
+     data, set 'reset=True'.
+     """

-     # list the file in the path and read pickle file if it exists, else read raw data and dump the pickle file the
-     # pickle file will be generated after read raw data first time, if you want to re-read the rawdata, please set
-     # 'reset=True'
+     nam = 'AbstractReader'

      def __init__(self,
                   path: Path | str,
-                  qc: bool = True,
-                  csv_raw: bool = True,
                   reset: bool = False,
-                  rate: bool = False,
+                  qc: bool = True,
+                  qc_freq: Optional[str] = None,
+                  rate: bool = True,
                   append_data: bool = False):

          self.path = Path(path)
@@ -38,10 +41,10 @@ class AbstractReader(ABC):
          self.logger = self._setup_logger()

          self.reset = reset
-         self.rate = rate
          self.qc = qc
-         self.csv = csv_raw
-         self.append = append_data & reset
+         self.qc_freq = qc_freq
+         self.rate = rate
+         self.append = append_data and reset

          self.pkl_nam = self.path / f'_read_{self.nam.lower()}.pkl'
          self.csv_nam = self.path / f'_read_{self.nam.lower()}.csv'
@@ -49,24 +52,12 @@
          self.csv_nam_raw = self.path / f'_read_{self.nam.lower()}_raw.csv'
          self.csv_out = self.path / f'output_{self.nam.lower()}.csv'

-     # dependency injection function, customize each instrument
-     @abstractmethod
-     def _raw_reader(self, file):
-         pass
-
-     @abstractmethod
-     def _QC(self, df: DataFrame):
-         return df
-
      def __call__(self,
-                  start: dtm | None = None,
-                  end: dtm | None = None,
+                  start: datetime,
+                  end: datetime,
                   mean_freq: str = '1h',
                   csv_out: bool = True,
-                  ) -> DataFrame | None:
-
-         if start and end and end <= start:
-             raise ValueError(f"Invalid time range: start {start} is after end {end}")
+                  ) -> DataFrame:

          data = self._run(start, end)

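Note: the reader entry points changed shape in 0.1.7. `csv_raw` is gone, `qc_freq` is new (periodic acquisition/yield-rate reporting), `rate` now defaults to True, and `__call__` takes required `start`/`end` datetimes and always returns a DataFrame. A minimal usage sketch, assuming the NEPH script exposes the same `Reader(AbstractReader)` class seen in the other script hunks (path and dates are placeholders):

    from datetime import datetime
    from AeroViz.rawDataReader.script.NEPH import Reader

    # reset=True re-reads raw files instead of the cached pickle;
    # qc_freq='1MS' reports acquisition/yield rates per calendar month
    neph = Reader('/path/to/NEPH_data', reset=True, qc=True, qc_freq='1MS', rate=True)
    df = neph(start=datetime(2024, 1, 1), end=datetime(2024, 6, 30), mean_freq='1h')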
@@ -78,210 +69,249 @@ class AbstractReader(ABC):

          return data

-     @staticmethod
-     def basic_QC(df: DataFrame):
-         df_ave, df_std = df.mean(), df.std()
-         df_lowb, df_highb = df < (df_ave - df_std * 1.5), df > (df_ave + df_std * 1.5)
-
-         return df.mask(df_lowb | df_highb).copy()
-
-     # set each to true datetime(18:30:01 -> 18:30:00) and rindex data
-     def _raw_process(self, _df):
-         # get time from df and set time to whole time to create time index
-         _st, _ed = _df.index.sort_values()[[0, -1]]
-         _tm_index = date_range(_st.strftime('%Y%m%d %H00'), _ed.floor('h').strftime('%Y%m%d %H00'),
-                                freq=self.meta['freq'])
-         _tm_index.name = 'time'
+     @abstractmethod
+     def _raw_reader(self, file):
+         pass

-         return _df.apply(to_numeric, errors='coerce').resample(self.meta['freq']).mean().reindex(_tm_index)
+     @abstractmethod
+     def _QC(self, df: DataFrame) -> DataFrame:
+         return self.n_sigma_QC(df)

      def _setup_logger(self) -> logging.Logger:
          logger = logging.getLogger(self.nam)
          logger.setLevel(logging.INFO)
+
+         for handler in logger.handlers[:]:
+             logger.removeHandler(handler)
+
          handler = logging.FileHandler(self.path / f'{self.nam}.log')
-         handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
+         handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S'))
          logger.addHandler(handler)
          return logger

-     # acquisition rate and yield rate
-     def _rate_calculate(self, _fout_raw, _fout_qc, _st_raw, _ed_raw):
-         if self.meta['deter_key'] is not None:
-             _start, _end = _fout_qc.index[[0, -1]]
-
-             _drop_how = 'any'
-             _the_size = len(_fout_raw.resample('1h').mean().index)
-
-             self.logger.info(f"{'=' * 60}")
-             self.logger.info(
-                 f"Raw data time : {_st_raw.strftime('%Y-%m-%d %H:%M:%S')} to {_ed_raw.strftime('%Y-%m-%d %H:%M:%S')}")
-             self.logger.info(
-                 f"Output time : {_start.strftime('%Y-%m-%d %H:%M:%S')} to {_end.strftime('%Y-%m-%d %H:%M:%S')}")
-             self.logger.info(f"{'-' * 60}")
-             print(f"\n\n\t\tfrom {_start.strftime('%Y-%m-%d %H:%M:%S')} to {_end.strftime('%Y-%m-%d %H:%M:%S')}\n")
+     def _rate_calculate(self, raw_data, qc_data) -> None:
+         def __base_rate(raw_data, qc_data):
+             period_size = len(raw_data.resample('1h').mean().index)

              for _nam, _key in self.meta['deter_key'].items():
-                 if _key == ['all']:
-                     _key, _drop_how = _fout_qc.keys(), 'all'
+                 _key, _drop_how = (qc_data.keys(), 'all') if _key is ['all'] else (_key, 'any')

-                 _real_size = len(_fout_raw[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
-                 _QC_size = len(_fout_qc[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
+                 sample_size = len(raw_data[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
+                 qc_size = len(qc_data[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)

-                 try:
-                     _acq_rate = round((_real_size / _the_size) * 100, 1)
-                     _yid_rate = round((_QC_size / _real_size) * 100, 1)
-                 except ZeroDivisionError:
-                     _acq_rate, _yid_rate = 0, 0
+                 # validate rate calculation
+                 if period_size < sample_size or sample_size < qc_size or period_size == 0 or sample_size == 0:
+                     raise ValueError(f"Invalid sample sizes: period={period_size}, sample={sample_size}, QC={qc_size}")
+
+                 _acq_rate = round((sample_size / period_size) * 100, 1)
+                 _yid_rate = round((qc_size / sample_size) * 100, 1)

                  self.logger.info(f'{_nam}:')
                  self.logger.info(f"\tAcquisition rate: {_acq_rate}%")
                  self.logger.info(f'\tYield rate: {_yid_rate}%')
                  self.logger.info(f"{'=' * 60}")

-                 print(f'\t\t{_nam} : ')
-                 print(f'\t\t\tacquisition rate : \033[91m{_acq_rate}%\033[0m')
-                 print(f'\t\t\tyield rate : \033[91m{_yid_rate}%\033[0m')
-
-     # process time index
-     @staticmethod
-     def _tmidx_process(_start, _end, _df):
-         _st, _ed = _df.index.sort_values()[[0, -1]]
-         _start, _end = to_datetime(_start) or _st, to_datetime(_end) or _ed
-         _idx = date_range(_start, _end, freq=_df.index.freq.copy())
-         _idx.name = 'time'
-
-         return _df.reindex(_idx), _st, _ed
-
-     # append new data to exist pkl
-     @staticmethod
-     def _apnd_prcs(_df_done, _df_apnd):
-
-         if _df_apnd is not None:
-             _df = concat([_df_apnd.dropna(how='all').copy(), _df_done.dropna(how='all').copy()])
-
-             _idx = date_range(*_df.index.sort_values()[[0, -1]], freq=_df_done.index.freq.copy())
-             _idx.name = 'time'
-
-             return _df.loc[~_df.index.duplicated()].copy().reindex(_idx)
-
-         return _df_done
+                 print(f'\n\t{_nam} : ')
+                 print(f'\t\tacquisition rate : \033[91m{_acq_rate}%\033[0m')
+                 print(f'\t\tyield rate : \033[91m{_yid_rate}%\033[0m')

-     # remove outlier
-     def _outlier_prcs(self, _df):
-
-         if (self.path / 'outlier.json') not in self.path.glob('*.json'):
+         if self.meta['deter_key'] is not None:
+             # use qc_freq to calculate the rate for each period
+             if self.qc_freq is not None:
+                 raw_data_grouped = raw_data.groupby(pd.Grouper(freq=self.qc_freq))
+                 qc_data_grouped = qc_data.groupby(pd.Grouper(freq=self.qc_freq))
+
+                 for (month, _sub_raw_data), (_, _sub_qc_data) in zip(raw_data_grouped, qc_data_grouped):
+                     self.logger.info(
+                         f"\tProcessing: {_sub_raw_data.index[0].strftime('%F')} to {_sub_raw_data.index[-1].strftime('%F')}")
+                     print(
+                         f"\n\tProcessing: {_sub_raw_data.index[0].strftime('%F')} to {_sub_raw_data.index[-1].strftime('%F')}")
+
+                     __base_rate(_sub_raw_data, _sub_qc_data)
+
+             else:
+                 __base_rate(raw_data, qc_data)
+
+     def _timeIndex_process(self, _df, user_start=None, user_end=None, append_df=None):
+         """
+         Process time index, resample data, extract specified time range, and optionally append new data.
+
+         :param _df: Input DataFrame with time index
+         :param user_start: Start of user-specified time range (optional)
+         :param user_end: End of user-specified time range (optional)
+         :param append_df: DataFrame to append (optional)
+         :return: Processed DataFrame
+         """
+         # Round timestamps and remove duplicates
+         _df = _df.groupby(_df.index.round('1min')).first()
+
+         # Determine frequency
+         freq = _df.index.inferred_freq or self.meta['freq']
+
+         # Append new data if provided
+         if append_df is not None:
+             append_df.index = append_df.index.round('1min')
+             _df = pd.concat([append_df.dropna(how='all'), _df.dropna(how='all')])
+             _df = _df.loc[~_df.index.duplicated()]
+
+         # Determine time range
+         df_start, df_end = _df.index.sort_values()[[0, -1]]
+
+         # Create new time index
+         new_index = pd.date_range(user_start or df_start, user_end or df_end, freq=freq, name='time')
+
+         # Process data: convert to numeric, resample, and reindex
+         return (_df.apply(pd.to_numeric, errors='coerce')
+                 .resample(freq).mean()
+                 .reindex(new_index))
+
+     def _outlier_process(self, _df):
+         outlier_file = self.path / 'outlier.json'
+
+         if not outlier_file.exists():
              return _df

-         with (self.path / 'outlier.json').open('r', encoding='utf-8', errors='ignore') as f:
-             self.outlier = jsn.load(f)
+         with outlier_file.open('r', encoding='utf-8', errors='ignore') as f:
+             outliers = json.load(f)

-         for _st, _ed in self.outlier.values():
+         for _st, _ed in outliers.values():
              _df.loc[_st:_ed] = np.nan

          return _df

-     # save pickle file
      def _save_data(self, raw_data: DataFrame, qc_data: DataFrame) -> None:
-         self._safe_pickle_dump(self.pkl_nam, qc_data)
-         if self.csv:
-             qc_data.to_csv(self.csv_nam)
+         try:
+             raw_data.to_pickle(self.pkl_nam_raw)
+             raw_data.to_csv(self.csv_nam_raw)

-         if self.meta['deter_key'] is not None:
-             self._safe_pickle_dump(self.pkl_nam_raw, raw_data)
-             if self.csv:
-                 raw_data.to_csv(self.csv_nam_raw)
+             if self.meta['deter_key'] is not None:
+                 qc_data.to_pickle(self.pkl_nam)
+                 qc_data.to_csv(self.csv_nam)

-     @staticmethod
-     def _safe_pickle_dump(file_path: Path, data: Any) -> None:
-         while True:
-             try:
-                 with file_path.open('wb') as f:
-                     pkl.dump(data, f, protocol=pkl.HIGHEST_PROTOCOL)
-                 break
-             except PermissionError as err:
-                 print('\n', err)
-                 input('\t\t\33[41m Please close the file and press "Enter" \33[0m\n')
-
-     # read pickle file
-     def _read_pkl(self):
-         with self.pkl_nam.open('rb') as qc_data, self.pkl_nam_raw.open('rb') as raw_data:
-             return pkl.load(raw_data), pkl.load(qc_data)
+         except Exception as e:
+             raise IOError(f"Error saving data. {e}")

      def _read_raw_files(self) -> tuple[DataFrame | None, DataFrame | None]:
-         patterns = {self.meta['pattern'].lower(), self.meta['pattern'].upper(), self.meta['pattern']}
-         files = [f for pattern in patterns for f in self.path.glob(pattern)
+         files = [f
+                  for file_pattern in self.meta['pattern']
+                  for pattern in {file_pattern.lower(), file_pattern.upper(), file_pattern}
+                  for f in self.path.glob(pattern)
                   if f.name not in [self.csv_out.name, self.csv_nam.name, self.csv_nam_raw.name, f'{self.nam}.log']]

          if not files:
-             raise FileNotFoundError(f"\t\t\033[31mNo files in '{self.path}' could be read."
-                                     f"Please check the current path.\033[0m")
+             raise FileNotFoundError(f"No files in '{self.path}' could be read. Please check the current path.")

          df_list = []
-         for file in files:
-             print(f"\r\t\treading {file.name}", end='')
-
-             try:
-                 df = self._raw_reader(file)
+         with Progress(
+                 TextColumn("[bold blue]{task.description}", style="bold blue"),
+                 BarColumn(bar_width=18, complete_style="green", finished_style="bright_green"),
+                 TaskProgressColumn(),
+                 TimeRemainingColumn(),
+                 TextColumn("{task.fields[filename]}", style="yellow"),
+                 console=Console(force_terminal=True, color_system="auto"),
+                 expand=False
+         ) as progress:
+             task = progress.add_task(f"Reading {self.nam} files", total=len(files), filename="")
+             for file in files:
+                 progress.update(task, advance=1, filename=file.name)
+                 try:
+                     df = self._raw_reader(file)

-             if df is not None and not df.empty:
-                 df_list.append(df)
-             else:
-                 self.logger.warning(f"File {file.name} produced an empty DataFrame or None.")
+                     if df is not None and not df.empty:
+                         df_list.append(df)
+                     else:
+                         self.logger.warning(f"File {file.name} produced an empty DataFrame or None.")

-             except pd.errors.ParserError as e:
-                 self.logger.error(f"Error tokenizing data: {e}")
+                 except pd.errors.ParserError as e:
+                     self.logger.error(f"Error tokenizing data: {e}")

-             except Exception as e:
-                 self.logger.error(f"Error reading {file.name}: {e}")
+                 except Exception as e:
+                     self.logger.error(f"Error reading {file.name}: {e}")

          if not df_list:
              raise ValueError("All files were either empty or failed to read.")

-         raw_data = self._raw_process(concat(df_list))
+         raw_data = concat(df_list, axis=0).groupby(level=0).first()
+
+         raw_data = self._timeIndex_process(raw_data)
          qc_data = self._QC(raw_data)

          return raw_data, qc_data

-     # main flow
-     def _run(self, _start, _end):
-         _f_raw_done, _f_qc_done = None, None
-
+     def _run(self, user_start, user_end):
          # read pickle if pickle file exists and 'reset=False' or process raw data or append new data
-         if self.pkl_nam_raw.exists() and self.pkl_nam.exists() and (not self.reset or self.append):
-             print(f"\n\t{dtm.now().strftime('%m/%d %X')} : Reading \033[96mPICKLE\033[0m file of {self.nam}")
+         if self.pkl_nam_raw.exists() and self.pkl_nam.exists() and not self.reset:
+             print(f"\n{datetime.now().strftime('%m/%d %X')} : Reading {self.nam} \033[96mPICKLE\033[0m "
+                   f"from {user_start} to {user_end}\n")

-             _f_raw_done, _f_qc_done = self._read_pkl()
+             _f_raw_done, _f_qc_done = read_pickle(self.pkl_nam_raw), read_pickle(self.pkl_nam)

-             if not self.append:
-                 _f_raw_done, _start_raw, _end_raw = self._tmidx_process(_start, _end, _f_raw_done)
-                 _f_qc_done, _start_raw, _end_raw = self._tmidx_process(_start, _end, _f_qc_done)
+             if self.append:
+                 print(f"Appending new data from {user_start} to {user_end}")
+                 _f_raw_new, _f_qc_new = self._read_raw_files()
+                 _f_raw = self._timeIndex_process(_f_raw_done, append_df=_f_raw_new)
+                 _f_qc = self._timeIndex_process(_f_qc_done, append_df=_f_qc_new)
+             else:
+                 _f_raw, _f_qc = _f_raw_done, _f_qc_done
+             return _f_qc if self.qc else _f_raw

-             _f_qc_done = self._outlier_prcs(_f_qc_done)
+         else:
+             print(f"\n{datetime.now().strftime('%m/%d %X')} : Reading {self.nam} \033[96mRAW DATA\033[0m "
+                   f"from {user_start} to {user_end}\n")

-             if self.rate:
-                 self._rate_calculate(_f_raw_done, _f_qc_done, _start_raw, _end_raw)
+             _f_raw, _f_qc = self._read_raw_files()

-             return _f_qc_done if self.qc else _f_raw_done
-
-         # read raw data
-         print(f"\n\t{dtm.now().strftime('%m/%d %X')} : Reading \033[96mRAW DATA\033[0m of {self.nam} and process it")
-
-         _f_raw, _f_qc = self._read_raw_files()
-
-         # append new data and pickle data
-         if self.append and self.pkl_nam.exists():
-             _f_raw = self._apnd_prcs(_f_raw_done, _f_raw)
-             _f_qc = self._apnd_prcs(_f_qc_done, _f_qc)
+             # process time index
+             data_start, data_end = _f_raw.index.sort_values()[[0, -1]]

-         _f_qc = self._outlier_prcs(_f_qc)
+             _f_raw = self._timeIndex_process(_f_raw, user_start, user_end)
+             _f_qc = self._timeIndex_process(_f_qc, user_start, user_end)
+             _f_qc = self._outlier_process(_f_qc)

              # save
              self._save_data(_f_raw, _f_qc)

-         # process time index
-         # if (_start is not None)|(_end is not None):
-         _f_raw, _start_raw, _end_raw = self._tmidx_process(_start, _end, _f_raw)
-         _f_qc, _start_raw, _end_raw = self._tmidx_process(_start, _end, _f_qc)
+             self.logger.info(f"{'=' * 60}")
+             self.logger.info(f"Raw data time : {data_start} to {data_end}")
+             self.logger.info(f"Output time : {user_start} to {user_end}")
+             self.logger.info(f"{'-' * 60}")

-         self._rate_calculate(_f_raw, _f_qc, _start_raw, _end_raw)
+             if self.rate:
+                 self._rate_calculate(_f_raw, _f_qc)

          return _f_qc if self.qc else _f_raw
+
+     @staticmethod
+     def reorder_dataframe_columns(df, order_lists, others_col=False):
+         new_order = []
+
+         for order in order_lists:
+             # only add columns that exist in the DataFrame, without adding duplicates
+             new_order.extend([col for col in order if col in df.columns and col not in new_order])
+
+         if others_col:
+             # append the remaining original columns not yet in the new order, preserving their original order
+             new_order.extend([col for col in df.columns if col not in new_order])
+
+         return df[new_order]
+
+     @staticmethod
+     def n_sigma_QC(df: DataFrame, std_range: int = 5) -> DataFrame:
+         df_ave, df_std = df.mean(), df.std()
+         df_lowb, df_highb = df < (df_ave - df_std * std_range), df > (df_ave + df_std * std_range)
+
+         return df.mask(df_lowb | df_highb).copy()
+
+     # Inter-quartile Range (IQR) method
+     @staticmethod
+     def IQR_QC(df: DataFrame, log_dist=False) -> tuple[DataFrame, DataFrame]:
+         df = np.log10(df) if log_dist else df
+
+         _df_qua = df.quantile([.25, .75])
+         _df_q1, _df_q3 = _df_qua.loc[.25].copy(), _df_qua.loc[.75].copy()
+         _df_iqr = _df_q3 - _df_q1
+
+         _se = concat([_df_q1 - 1.5 * _df_iqr] * len(df), axis=1).T.set_index(df.index)
+         _le = concat([_df_q3 + 1.5 * _df_iqr] * len(df), axis=1).T.set_index(df.index)
+
+         return (10 ** _se, 10 ** _le) if log_dist else (_se, _le)
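Note: `basic_QC` (a fixed mean ± 1.5σ mask) has been replaced by the static `n_sigma_QC` with a configurable and much looser ±5σ default, and the IQR envelope helper that previously lived inline in IGAC.py is promoted to the shared `IQR_QC`. A synthetic sketch of both helpers (data values are made up):

    import numpy as np
    from pandas import DataFrame, date_range

    from AeroViz.rawDataReader.core import AbstractReader

    idx = date_range('2024-01-01', periods=100, freq='1h', name='time')
    df = DataFrame({'BC1': np.random.normal(800, 50, 100)}, index=idx)
    df.iloc[10] = 1e5  # inject an obvious outlier

    # mask values outside mean ± 5*std
    clean = AbstractReader.n_sigma_QC(df, std_range=5)

    # per-column Q1 - 1.5*IQR / Q3 + 1.5*IQR envelopes, aligned to df's index
    low, high = AbstractReader.IQR_QC(df)
    clean_iqr = df.mask((df < low) | (df > high))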
AeroViz/rawDataReader/script/AE33.py
@@ -27,4 +27,4 @@ class Reader(AbstractReader):
          _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7']].mask((_df < 0).copy())

          # QC data in 1h
-         return _df.resample('1h').apply(self.basic_QC).resample(self.meta.get("freq")).mean()
+         return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
AeroViz/rawDataReader/script/AE43.py
@@ -28,4 +28,4 @@
          _df = _df.mask((_df < 0).copy())

          # QC data in 1h
-         return _df.resample('1h').apply(self.basic_QC).resample(self.meta.get("freq")).mean()
+         return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
AeroViz/rawDataReader/script/APS_3321.py
@@ -1,4 +1,4 @@
- import numpy as n
+ import numpy as np
  from pandas import to_datetime, read_table

  from AeroViz.rawDataReader.core import AbstractReader
@@ -29,7 +29,7 @@ class Reader(AbstractReader):
      # QC data
      def _QC(self, _df):
          # mask out the data size lower than 7
-         _df['total'] = _df.sum(axis=1, min_count=1) * (n.diff(n.log(_df.keys().to_numpy(float)))).mean()
+         _df['total'] = _df.sum(axis=1, min_count=1) * (np.diff(np.log(_df.keys().to_numpy(float)))).mean()
          _df_size = _df['total'].dropna().resample('1h').size().resample(_df.index.freq).ffill()
          _df = _df.mask(_df_size < 7)

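Note: the `total` column integrates the size distribution: the column labels are bin diameters, so summing the dN/dlogDp values across bins and scaling by the mean logarithmic bin width approximates the total number concentration. A standalone sketch of the same arithmetic on synthetic bins:

    import numpy as np
    from pandas import DataFrame, date_range

    # columns are bin mid-diameters (as strings); values are dN/dlogDp
    bins = np.geomspace(0.5, 20.0, 52)
    idx = date_range('2024-01-01', periods=3, freq='6min', name='time')
    df = DataFrame(np.full((3, bins.size), 100.0), index=idx, columns=bins.astype(str))

    dlogD = np.diff(np.log(df.columns.to_numpy(float))).mean()  # mean (natural-)log bin width
    total = df.sum(axis=1, min_count=1) * dlogD                 # ≈ total concentration per timestamp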
AeroViz/rawDataReader/script/Aurora.py
@@ -37,4 +37,4 @@ class Reader(AbstractReader):
          _df = _df[(_df['BB'] < _df['B']) & (_df['BG'] < _df['G']) & (_df['BR'] < _df['R'])]

          # QC data in 1h
-         return _df.resample('1h').apply(self.basic_QC).resample(self.meta.get("freq")).mean()
+         return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
AeroViz/rawDataReader/script/BC1054.py
@@ -39,4 +39,4 @@ class Reader(AbstractReader):
          _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7', 'BC8', 'BC9', 'BC10']].mask((_df < 0).copy())

          # QC data in 1h
-         return _df.resample('1h').apply(self.basic_QC).resample(self.meta.get("freq")).mean()
+         return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
AeroViz/rawDataReader/script/EPA.py (new file)
@@ -0,0 +1,39 @@
+ from pandas import read_csv
+
+ from AeroViz.rawDataReader.core import AbstractReader
+
+ desired_order1 = ['SO2', 'NO', 'NOx', 'NO2', 'CO', 'O3', 'THC', 'NMHC',
+                   'CH4', 'PM10', 'PM2.5', 'PM1', 'WS', 'WD', 'AT', 'RH']
+
+ desired_order2 = ['Benzene', 'Toluene', 'EthylBenzene', 'm/p-Xylene', 'o-Xylene']
+
+
+ class Reader(AbstractReader):
+     nam = 'EPA'
+
+     def _raw_reader(self, file):
+         # accepts both 查詢小時值(測項).csv and 查詢小時值(直式).csv exports, with or without invalid values removed
+         df = read_csv(file, encoding='big5', encoding_errors='ignore', index_col=0, parse_dates=True,
+                       on_bad_lines='skip')
+
+         if len(df.groupby('測站')) > 1:
+             raise ValueError(f"Multiple stations found in the file: {df['測站'].unique()}")
+         else:
+             if '測站' in df.columns:
+                 df.drop(columns=['測站'], inplace=True)
+
+             if '測項' in df.columns:
+                 df = df.pivot(columns='測項', values='資料')
+
+         df.rename(columns={'AMB_TEMP': 'AT', 'WIND_SPEED': 'WS', 'WIND_DIREC': 'WD'}, inplace=True)
+         df.index.name = 'Time'
+
+         # if invalid values were not removed before export, replace strings flagged with '#' or 'L' by '#' or '_'
+         df = df.replace(to_replace=r'\d*[#]\b', value='#', regex=True)
+         df = df.replace(to_replace=r'\d*[L]\b', value='_', regex=True)
+
+         # reorder columns
+         return self.reorder_dataframe_columns(df, [desired_order1])
+
+     def _QC(self, _df):
+         return _df.resample('6h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
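Note: `reorder_dataframe_columns` (added to AbstractReader above) keeps only the listed columns that actually exist, in the listed order, and with `others_col=True` appends the leftovers in their original order. A small sketch with made-up columns:

    from pandas import DataFrame

    from AeroViz.rawDataReader.core import AbstractReader

    df = DataFrame(columns=['RH', 'PM2.5', 'SO2', 'NOx', 'extra'])
    ordered = AbstractReader.reorder_dataframe_columns(
        df, [['SO2', 'NOx', 'PM2.5'], ['RH']], others_col=True)
    print(list(ordered.columns))  # ['SO2', 'NOx', 'PM2.5', 'RH', 'extra']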
AeroViz/rawDataReader/script/GRIMM.py
@@ -25,4 +25,4 @@ class Reader(AbstractReader):

      def _QC(self, _df):
          # QC data in 1h
-         return _df.resample('1h').apply(self.basic_QC).resample(self.meta.get("freq")).mean()
+         return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
AeroViz/rawDataReader/script/IGAC.py
@@ -1,8 +1,7 @@
  # read meteorological data from google sheet


- import numpy as np
- from pandas import read_csv, concat, to_numeric
+ from pandas import read_csv, to_numeric

  from AeroViz.rawDataReader.core import AbstractReader

@@ -35,24 +34,8 @@ class Reader(AbstractReader):
              'SO42-': 0.08,
          }

-         # _mdl.update(self._oth_set.get('mdl', {}))
-
-         def _se_le(_df_, _log=False):
-             _df_ = np.log10(_df_) if _log else _df_
-
-             _df_qua = _df_.quantile([.25, .75])
-             _df_q1, _df_q3 = _df_qua.loc[.25].copy(), _df_qua.loc[.75].copy()
-             _df_iqr = _df_q3 - _df_q1
-
-             _se = concat([_df_q1 - 1.5 * _df_iqr] * len(_df_), axis=1).T.set_index(_df_.index)
-             _le = concat([_df_q3 + 1.5 * _df_iqr] * len(_df_), axis=1).T.set_index(_df_.index)
-
-             if _log:
-                 return 10 ** _se, 10 ** _le
-             return _se, _le
-
          _cation, _anion, _main = (['Na+', 'NH4+', 'K+', 'Mg2+', 'Ca2+'],
-                                   ['Cl-', 'NO2-', 'NO3-', 'SO42-', ],
+                                   ['Cl-', 'NO2-', 'NO3-', 'PO43-', 'SO42-', ],
                                    ['SO42-', 'NO3-', 'NH4+'])

          _df_salt = _df[_mdl.keys()].copy()
@@ -68,23 +51,23 @@ class Reader(AbstractReader):

          # calculate SE LE
          # salt < LE
-         _se, _le = _se_le(_df_salt, _log=True)
+         _se, _le = self.IQR_QC(_df_salt, log_dist=True)
          _df_salt = _df_salt.mask(_df_salt > _le).copy()

          # C/A, A/C
          _rat_CA = (_df_salt[_cation].sum(axis=1) / _df_salt[_anion].sum(axis=1)).to_frame()
          _rat_AC = (1 / _rat_CA).copy()

-         _se, _le = _se_le(_rat_CA, )
+         _se, _le = self.IQR_QC(_rat_CA, )
          _cond_CA = (_rat_CA < _le) & (_rat_CA > 0)

-         _se, _le = _se_le(_rat_AC, )
+         _se, _le = self.IQR_QC(_rat_AC, )
          _cond_AC = (_rat_AC < _le) & (_rat_AC > 0)

          _df_salt = _df_salt.where((_cond_CA * _cond_AC)[0]).copy()

          # conc. of main salt > SE
-         _se, _le = _se_le(_df_salt[_main], _log=True)
+         _se, _le = self.IQR_QC(_df_salt[_main], log_dist=True)
          _df_salt[_main] = _df_salt[_main].mask(_df_salt[_main] < _se).copy()

          return _df_salt.reindex(_df.index)
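Note: the ion-balance screen is unchanged in spirit but now leans on the shared `IQR_QC`: salt concentrations above the log-space upper envelope are masked, then samples whose cation/anion (and anion/cation) ratios exceed their own IQR upper bounds are dropped. A synthetic sketch of those steps (ion values are made up, and the final `.where(...)` mirrors the call in the hunk above):

    import numpy as np
    from pandas import DataFrame, date_range

    from AeroViz.rawDataReader.core import AbstractReader

    idx = date_range('2024-01-01', periods=50, freq='1h', name='time')
    rng = np.random.default_rng(0)
    salt = DataFrame({'NH4+': rng.lognormal(0, .3, 50),
                      'SO42-': rng.lognormal(.5, .3, 50)}, index=idx)

    # envelopes computed in log10 space; mask implausibly high concentrations
    _se, _le = AbstractReader.IQR_QC(salt, log_dist=True)
    salt = salt.mask(salt > _le)

    # cation/anion ratio screened against its own IQR upper bound
    ratio = (salt[['NH4+']].sum(axis=1) / salt[['SO42-']].sum(axis=1)).to_frame()
    _se, _le = AbstractReader.IQR_QC(ratio)
    salt = salt.where(((ratio < _le) & (ratio > 0))[0])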
AeroViz/rawDataReader/script/MA350.py
@@ -35,4 +35,4 @@ class Reader(AbstractReader):
          _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BB mass', 'FF mass', 'AAE', 'BB']].mask((_df < 0).copy())

          # QC data in 1h
-         return _df.resample('1h').apply(self.basic_QC).resample(self.meta.get("freq")).mean()
+         return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()