AeroViz 0.1.9.1__py3-none-any.whl → 0.1.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of AeroViz might be problematic. Click here for more details.

@@ -69,7 +69,11 @@ def _basic(df, hybrid, unit, bin_rg, input_type):
69
69
 
70
70
  df_oth[f'total_{_tp_nam}_{_md_nam}'], df_oth[f'GMD_{_tp_nam}_{_md_nam}'], df_oth[
71
71
  f'GSD_{_tp_nam}_{_md_nam}'] = _geometric_prop(_dia, _dt)
72
- df_oth[f'mode_{_tp_nam}_{_md_nam}'] = _dt.idxmax(axis=1)
72
+
73
+ mask = _dt.notna().any(axis=1)
74
+
75
+ df_oth.loc[mask, f'mode_{_tp_nam}_{_md_nam}'] = _dt.loc[mask].idxmax(axis=1)
76
+ df_oth.loc[~mask, f'mode_{_tp_nam}_{_md_nam}'] = np.nan
73
77
 
74
78
  ## out
75
79
  out_dic['other'] = df_oth
@@ -7,21 +7,29 @@ import pandas as pd
7
7
 
8
8
  from AeroViz.plot.utils import set_figure
9
9
 
10
- # TODO: Hybrid Single-Particle Lagrangian Integrated Trajectory (HYSPLIT) model
10
+ # Hybrid Single-Particle Lagrangian Integrated Trajectory (HYSPLIT) model
11
11
 
12
12
 
13
13
  __all__ = ['hysplit']
14
14
 
15
15
  # 設置默認文件路徑
16
- DEFAULT_FILE = Path(__file__).parent.parent.parent / 'data' / '240228_00.txt'
16
+ DEFAULT_FILE = Path(__file__).parent.parent.parent / 'data' / 'hysplit_example_data.txt'
17
17
 
18
18
 
19
19
  def read_hysplit_data(file: Path):
20
20
  data = pd.read_csv(file, skiprows=8, sep=r'\s+', names=range(0, 12), engine='python')
21
21
  data = data.reset_index(drop=False)
22
- data.columns = ['category', 'name', 'abc', 'year', 'month', 'hour', 'min', 'cont', 'backward', 'lat', 'lon',
22
+ data.columns = ['category', 'name', 'year', 'month', 'day', 'hour', 'minute', 'count', 'backward', 'lat', 'lon',
23
23
  'height', 'pressure']
24
24
 
25
+ time_cols = ['year', 'month', 'day', 'hour', 'minute']
26
+
27
+ data['time'] = pd.to_datetime(data[time_cols].astype(str).agg(''.join, axis=1), format='%y%m%d%H%M')
28
+
29
+ data = data.drop(columns=time_cols)
30
+
31
+ data = data[['time'] + [col for col in data.columns if col != 'time']]
32
+
25
33
  return data
26
34
 
27
35
 
@@ -74,7 +74,7 @@ def RawDataReader(instrument_name: str,
74
74
  if not isinstance(path, Path):
75
75
  path = Path(path)
76
76
  if not path.exists() or not path.is_dir():
77
- raise ValueError(f"The specified path '{path}' does not exist or is not a directory.")
77
+ raise FileNotFoundError(f"The specified path '{path}' does not exist or is not a directory.")
78
78
 
79
79
  # Validate the QC frequency
80
80
  if qc_freq is not None:
@@ -75,6 +75,8 @@ meta = {
75
75
  "Thermal EC": ["Thermal_EC"],
76
76
  "Optical OC": ["Optical_OC"],
77
77
  "Optical EC": ["Optical_EC"],
78
+ "Thermal OC & EC": ["Thermal_OC", "Thermal_EC"],
79
+ "Optical OC & EC": ["Optical_OC", "Optical_EC"],
78
80
  },
79
81
  },
80
82
 
@@ -93,53 +95,41 @@ meta = {
93
95
  "SO42-": ["SO42-"],
94
96
  "Main Salt (NH4+, NO3-, SO42-)": ["NO3-", "SO42-", "NH4+"],
95
97
  },
98
+ # https://www.yangyao-env.com/web/product/product_in2.jsp?pd_id=PD1640151884502
99
+
100
+ # HF: 0.08, F-: 0.08; PO43-: None (not measured)
101
+ "MDL": {
102
+ 'HF': None, 'HCl': 0.05, 'HNO2': 0.01, 'HNO3': 0.05, 'G-SO2': 0.05, 'NH3': 0.1,
103
+ 'Na+': 0.05, 'NH4+': 0.08, 'K+': 0.08, 'Mg2+': 0.05, 'Ca2+': 0.05,
104
+ 'F-': None, 'Cl-': 0.05, 'NO2-': 0.05, 'NO3-': 0.01, 'PO43-': None, 'SO42-': 0.05,
105
+ },
106
+
107
+ "MR": {
108
+ 'HF': 200, 'HCl': 200, 'HNO2': 200, 'HNO3': 200, 'G-SO2': 200, 'NH3': 300,
109
+ 'Na+': 300, 'NH4+': 300, 'K+': 300, 'Mg2+': 300, 'Ca2+': 300,
110
+ 'F-': 300, 'Cl-': 300, 'NO2-': 300, 'NO3-': 300, 'PO43-': None, 'SO42-': 300,
111
+ }
96
112
  },
97
113
 
98
114
  "XRF": {
99
115
  "pattern": ["*.csv"],
100
116
  "freq": "1h",
101
117
  "deter_key": {
102
- "Al": ["Al"],
103
- "Si": ["Si"],
104
- "P": ["P"],
105
- "S": ["S"],
106
- "Cl": ["Cl"],
107
- "K": ["K"],
108
- "Ca": ["Ca"],
109
- "Ti": ["Ti"],
110
- "V": ["V"],
111
- "Cr": ["Cr"],
112
- "Mn": ["Mn"],
113
- "Fe": ["Fe"],
114
- "Ni": ["Ni"],
115
- "Cu": ["Cu"],
116
- "Zn": ["Zn"],
117
- "As": ["As"],
118
- "Se": ["Se"],
119
- "Br": ["Br"],
120
- "Rb": ["Rb"],
121
- "Sr": ["Sr"],
122
- "Y": ["Y"],
123
- "Zr": ["Zr"],
124
- "Mo": ["Mo"],
125
- "Ag": ["Ag"],
126
- "Cd": ["Cd"],
127
- "In": ["In"],
128
- "Sn": ["Sn"],
129
- "Sb": ["Sb"],
130
- "Te": ["Te"],
131
- "Cs": ["Cs"],
132
- "Ba": ["Ba"],
133
- "La": ["La"],
134
- "Ce": ["Ce"],
135
- "W": ["W"],
136
- "Pt": ["Pt"],
137
- "Au": ["Au"],
138
- "Hg": ["Hg"],
139
- "Tl": ["Tl"],
140
- "Pb": ["Pb"],
141
- "Bi": ["Bi"],
118
+ "Several trace element (Al, Si, Ti, V, Cr, Mn, Fe)": ["Al", "Si", "Ti", "V", "Cr", "Mn", "Fe"],
119
+
142
120
  },
121
+ # based on Xact 625i Minimum Decision Limit (MDL) for XRF in ng/m3, 60 min sample time
122
+ "MDL": {
123
+ 'Al': 100, 'Si': 18, 'P': 5.2, 'S': 3.2, 'Cl': 1.7,
124
+ 'K': 1.2, 'Ca': 0.3, 'Ti': 1.6, 'V': 0.12, 'Cr': 0.12,
125
+ 'Mn': 0.14, 'Fe': 0.17, 'Co': 0.14, 'Ni': 0.096, 'Cu': 0.079,
126
+ 'Zn': 0.067, 'Ga': 0.059, 'Ge': 0.056, 'As': 0.063, 'Se': 0.081,
127
+ 'Br': 0.1, 'Rb': 0.19, 'Sr': 0.22, 'Y': 0.28, 'Zr': 0.33,
128
+ 'Nb': 0.41, 'Mo': 0.48, 'Pd': 2.2, 'Ag': 1.9, 'Cd': 2.5,
129
+ 'In': 3.1, 'Sn': 4.1, 'Sb': 5.2, 'Te': 0.6, 'Cs': 0.37,
130
+ 'Ba': 0.39, 'La': 0.36, 'Ce': 0.3, 'W': 0.0001, 'Pt': 0.12,
131
+ 'Au': 0.1, 'Hg': 0.12, 'Tl': 0.12, 'Pb': 0.13, 'Bi': 0.13
132
+ }
143
133
  },
144
134
 
145
135
  "VOC": {
@@ -174,7 +164,7 @@ meta = {
174
164
  "freq": "1h",
175
165
  "deter_key": {
176
166
  "Main Salt (Na+, NH4+, Cl-, NO3-, SO42-)": ["Na+", "NH4+", "Cl-", "NO3-", "SO42-"],
177
- "XRF (Al, Ti, V, Cr, Mn, Fe)": ["Al", "Ti", "V", "Cr", "Mn", "Fe"],
167
+ "Several trace element (Al, Ti, V, Cr, Mn, Fe)": ["Al", "Ti", "V", "Cr", "Mn", "Fe"],
178
168
  },
179
169
  },
180
170
  }
@@ -7,11 +7,12 @@ from typing import Optional
7
7
 
8
8
  import numpy as np
9
9
  import pandas as pd
10
- from pandas import DataFrame, concat, read_pickle
10
+ from pandas import DataFrame, concat, read_pickle, to_numeric
11
11
  from rich.console import Console
12
12
  from rich.progress import Progress, TextColumn, BarColumn, TimeRemainingColumn, TaskProgressColumn
13
13
 
14
14
  from AeroViz.rawDataReader.config.supported_instruments import meta
15
+ from AeroViz.rawDataReader.core.qc import DataQualityControl
15
16
 
16
17
  __all__ = ['AbstractReader']
17
18
 
@@ -75,18 +76,20 @@ class AbstractReader(ABC):
75
76
 
76
77
  @abstractmethod
77
78
  def _QC(self, df: DataFrame) -> DataFrame:
78
- return self.n_sigma_QC(df)
79
+ return df
79
80
 
80
81
  def _setup_logger(self) -> logging.Logger:
81
82
  logger = logging.getLogger(self.nam)
82
83
  logger.setLevel(logging.INFO)
83
84
 
84
85
  for handler in logger.handlers[:]:
86
+ handler.close()
85
87
  logger.removeHandler(handler)
86
88
 
87
89
  handler = logging.FileHandler(self.path / f'{self.nam}.log')
88
90
  handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S'))
89
91
  logger.addHandler(handler)
92
+
90
93
  return logger
91
94
 
92
95
  def _rate_calculate(self, raw_data, qc_data) -> None:
@@ -94,15 +97,20 @@ class AbstractReader(ABC):
94
97
  period_size = len(raw_data.resample('1h').mean().index)
95
98
 
96
99
  for _nam, _key in self.meta['deter_key'].items():
97
- _key, _drop_how = (qc_data.keys(), 'all') if _key is ['all'] else (_key, 'any')
100
+ _columns_key, _drop_how = (qc_data.keys(), 'all') if _key == ['all'] else (_key, 'any')
98
101
 
99
- sample_size = len(raw_data[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
100
- qc_size = len(qc_data[_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
102
+ sample_size = len(raw_data[_columns_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
103
+ qc_size = len(qc_data[_columns_key].resample('1h').mean().copy().dropna(how=_drop_how).index)
101
104
 
102
105
  # validate rate calculation
103
- if period_size < sample_size or sample_size < qc_size or period_size == 0 or sample_size == 0:
104
- _acq_rate, _yid_rate, _OEE_rate = 0, 0, 0
105
- # raise ValueError(f"Invalid sample sizes: period={period_size}, sample={sample_size}, QC={qc_size}")
106
+ if period_size == 0 or sample_size == 0 or qc_size == 0:
107
+ print(f'\t\t\033[91m No data for this period... skipping\033[0m')
108
+ continue
109
+
110
+ if period_size < sample_size or sample_size < qc_size:
111
+ print(
112
+ f'\t\tInvalid size relationship: period={period_size}, sample={sample_size}, QC={qc_size}... skipping')
113
+ continue
106
114
 
107
115
  else:
108
116
  _acq_rate = round((sample_size / period_size) * 100, 1)
@@ -116,8 +124,8 @@ class AbstractReader(ABC):
116
124
  self.logger.info(f"{'=' * 60}")
117
125
 
118
126
  print(f'\n\t{_nam} : ')
119
- print(f'\t\tacquisition rate | yield rate -> OEE rate :'
120
- f' \033[91m{_acq_rate}% | {_yid_rate}% -> {_OEE_rate}%\033[0m')
127
+ print(f'\t\tacquisition rate | yield rate -> OEE rate : '
128
+ f'\033[91m{_acq_rate}% | {_yid_rate}% -> {_OEE_rate}%\033[0m')
121
129
 
122
130
  if self.meta['deter_key'] is not None:
123
131
  # use qc_freq to calculate each period rate
@@ -165,9 +173,7 @@ class AbstractReader(ABC):
165
173
  new_index = pd.date_range(user_start or df_start, user_end or df_end, freq=freq, name='time')
166
174
 
167
175
  # Process data: convert to numeric, resample, and reindex
168
- return (_df.apply(pd.to_numeric, errors='coerce')
169
- .resample(freq).mean()
170
- .reindex(new_index))
176
+ return _df.reindex(new_index)
171
177
 
172
178
  def _outlier_process(self, _df):
173
179
  outlier_file = self.path / 'outlier.json'
@@ -237,8 +243,8 @@ class AbstractReader(ABC):
237
243
 
238
244
  raw_data = concat(df_list, axis=0).groupby(level=0).first()
239
245
 
240
- raw_data = self._timeIndex_process(raw_data)
241
- qc_data = self._QC(raw_data)
246
+ raw_data = self._timeIndex_process(raw_data).apply(to_numeric, errors='coerce').copy(deep=True)
247
+ qc_data = self._QC(raw_data).apply(to_numeric, errors='coerce').copy(deep=True)
242
248
 
243
249
  return raw_data, qc_data
244
250
 
@@ -281,6 +287,8 @@ class AbstractReader(ABC):
281
287
  self.logger.info(f"{'-' * 60}")
282
288
 
283
289
  if self.rate:
290
+ _f_raw = _f_raw.apply(to_numeric, errors='coerce')
291
+ _f_qc = _f_qc.apply(to_numeric, errors='coerce')
284
292
  self._rate_calculate(_f_raw, _f_qc)
285
293
 
286
294
  return _f_qc if self.qc else _f_raw
@@ -299,84 +307,6 @@ class AbstractReader(ABC):
299
307
 
300
308
  return df[new_order]
301
309
 
302
- @staticmethod
303
- def n_sigma_QC(df: pd.DataFrame, std_range: int = 5) -> pd.DataFrame:
304
- # 確保輸入是DataFrame
305
- df = df.to_frame() if isinstance(df, pd.Series) else df
306
-
307
- df_ave = df.mean()
308
- df_std = df.std()
309
-
310
- lower_bound = df < (df_ave - df_std * std_range)
311
- upper_bound = df > (df_ave + df_std * std_range)
312
-
313
- return df.mask(lower_bound | upper_bound)
314
-
315
- @staticmethod
316
- def IQR_QC(df: pd.DataFrame, log_dist=False) -> pd.DataFrame:
317
- # 確保輸入是DataFrame
318
- df = df.to_frame() if isinstance(df, pd.Series) else df
319
-
320
- df_transformed = np.log10(df) if log_dist else df
321
-
322
- _df_q1 = df_transformed.quantile(0.25)
323
- _df_q3 = df_transformed.quantile(0.75)
324
-
325
- _df_iqr = _df_q3 - _df_q1
326
-
327
- # Calculate lower and upper bounds
328
- lower_bound = df_transformed < (_df_q1 - 1.5 * _df_iqr)
329
- upper_bound = df_transformed > (_df_q3 + 1.5 * _df_iqr)
330
-
331
- # Apply the filter to the original dataframe
332
- return df.mask(lower_bound | upper_bound)
333
-
334
- @staticmethod
335
- def rolling_IQR_QC(df: pd.DataFrame, window_size=24, log_dist=False) -> pd.DataFrame:
336
- df = df.to_frame() if isinstance(df, pd.Series) else df
337
- df_transformed = np.log10(df) if log_dist else df
338
-
339
- def iqr_filter(x):
340
- q1, q3 = x.quantile(0.25), x.quantile(0.75)
341
- iqr = q3 - q1
342
- lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
343
- return (x >= lower) & (x <= upper)
344
-
345
- mask = df_transformed.rolling(window=window_size, center=True, min_periods=1).apply(iqr_filter)
346
- return df.where(mask, np.nan)
347
-
348
310
  @staticmethod
349
311
  def time_aware_IQR_QC(df: pd.DataFrame, time_window='1D', log_dist=False) -> pd.DataFrame:
350
- df = df.to_frame() if isinstance(df, pd.Series) else df
351
- df_transformed = np.log10(df) if log_dist else df
352
-
353
- def iqr_filter(group):
354
- q1, q3 = group.quantile(0.25), group.quantile(0.75)
355
- iqr = q3 - q1
356
- lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
357
- return (group >= lower) & (group <= upper)
358
-
359
- mask = df_transformed.groupby(pd.Grouper(freq=time_window)).transform(iqr_filter)
360
- return df.where(mask, np.nan)
361
-
362
- @staticmethod
363
- def mad_iqr_hybrid_QC(df: pd.DataFrame, mad_threshold=3.5, log_dist=False) -> pd.DataFrame:
364
- df = df.to_frame() if isinstance(df, pd.Series) else df
365
- df_transformed = np.log10(df) if log_dist else df
366
-
367
- # IQR 方法
368
- q1, q3 = df_transformed.quantile(0.25), df_transformed.quantile(0.75)
369
- iqr = q3 - q1
370
- iqr_lower, iqr_upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
371
-
372
- # MAD 方法
373
- median = df_transformed.median()
374
- mad = (df_transformed - median).abs().median()
375
- mad_lower, mad_upper = median - mad_threshold * mad, median + mad_threshold * mad
376
-
377
- # 结合两种方法
378
- lower = np.maximum(iqr_lower, mad_lower)
379
- upper = np.minimum(iqr_upper, mad_upper)
380
-
381
- mask = (df_transformed >= lower) & (df_transformed <= upper)
382
- return df.where(mask, np.nan)
312
+ return DataQualityControl().time_aware_iqr(df, time_window=time_window, log_dist=log_dist)
@@ -0,0 +1,184 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+
5
+ class DataQualityControl:
6
+ """A class providing various methods for data quality control and outlier detection"""
7
+
8
+ @staticmethod
9
+ def _ensure_dataframe(df: pd.DataFrame | pd.Series) -> pd.DataFrame:
10
+ """Ensure input data is in DataFrame format"""
11
+ return df.to_frame() if isinstance(df, pd.Series) else df
12
+
13
+ @staticmethod
14
+ def _transform_if_log(df: pd.DataFrame, log_dist: bool) -> pd.DataFrame:
15
+ """Transform data to log scale if required"""
16
+ return np.log10(df) if log_dist else df
17
+
18
+ @classmethod
19
+ def n_sigma(cls, df: pd.DataFrame, std_range: int = 5) -> pd.DataFrame:
20
+ """
21
+ Detect outliers using n-sigma method
22
+
23
+ Parameters
24
+ ----------
25
+ df : pd.DataFrame
26
+ Input data
27
+ std_range : int, default=5
28
+ Number of standard deviations to use as threshold
29
+
30
+ Returns
31
+ -------
32
+ pd.DataFrame
33
+ Cleaned DataFrame with outliers masked as NaN
34
+ """
35
+ df = cls._ensure_dataframe(df)
36
+ df_ave = df.mean()
37
+ df_std = df.std()
38
+
39
+ lower_bound = df < (df_ave - df_std * std_range)
40
+ upper_bound = df > (df_ave + df_std * std_range)
41
+
42
+ return df.mask(lower_bound | upper_bound)
43
+
44
+ @classmethod
45
+ def iqr(cls, df: pd.DataFrame, log_dist: bool = False) -> pd.DataFrame:
46
+ """
47
+ Detect outliers using Interquartile Range (IQR) method
48
+
49
+ Parameters
50
+ ----------
51
+ df : pd.DataFrame
52
+ Input data
53
+ log_dist : bool, default=False
54
+ Whether to apply log transformation to data
55
+
56
+ Returns
57
+ -------
58
+ pd.DataFrame
59
+ Cleaned DataFrame with outliers masked as NaN
60
+ """
61
+ df = cls._ensure_dataframe(df)
62
+ df_transformed = cls._transform_if_log(df, log_dist)
63
+
64
+ q1 = df_transformed.quantile(0.25)
65
+ q3 = df_transformed.quantile(0.75)
66
+ iqr = q3 - q1
67
+
68
+ lower_bound = df_transformed < (q1 - 1.5 * iqr)
69
+ upper_bound = df_transformed > (q3 + 1.5 * iqr)
70
+
71
+ return df.mask(lower_bound | upper_bound)
72
+
73
+ @classmethod
74
+ def rolling_iqr(cls, df: pd.DataFrame, window_size: int = 24,
75
+ log_dist: bool = False) -> pd.DataFrame:
76
+ """
77
+ Detect outliers using rolling window IQR method
78
+
79
+ Parameters
80
+ ----------
81
+ df : pd.DataFrame
82
+ Input data
83
+ window_size : int, default=24
84
+ Size of the rolling window
85
+ log_dist : bool, default=False
86
+ Whether to apply log transformation to data
87
+
88
+ Returns
89
+ -------
90
+ pd.DataFrame
91
+ Cleaned DataFrame with outliers masked as NaN
92
+ """
93
+ df = cls._ensure_dataframe(df)
94
+ df_transformed = cls._transform_if_log(df, log_dist)
95
+
96
+ def iqr_filter(x):
97
+ q1, q3 = x.quantile(0.25), x.quantile(0.75)
98
+ iqr = q3 - q1
99
+ lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
100
+ return (x >= lower) & (x <= upper)
101
+
102
+ mask = df_transformed.rolling(
103
+ window=window_size,
104
+ center=True,
105
+ min_periods=1
106
+ ).apply(iqr_filter)
107
+
108
+ return df.where(mask, np.nan)
109
+
110
+ @classmethod
111
+ def time_aware_iqr(cls, df: pd.DataFrame, time_window: str = '1D',
112
+ log_dist: bool = False) -> pd.DataFrame:
113
+ """
114
+ Detect outliers using time-aware IQR method
115
+
116
+ Parameters
117
+ ----------
118
+ df : pd.DataFrame
119
+ Input data
120
+ time_window : str, default='1D'
121
+ Time window size (e.g., '1D' for one day)
122
+ log_dist : bool, default=False
123
+ Whether to apply log transformation to data
124
+
125
+ Returns
126
+ -------
127
+ pd.DataFrame
128
+ Cleaned DataFrame with outliers masked as NaN
129
+ """
130
+ df = cls._ensure_dataframe(df)
131
+ df_transformed = cls._transform_if_log(df, log_dist)
132
+
133
+ def iqr_filter(group):
134
+ q1, q3 = group.quantile(0.25), group.quantile(0.75)
135
+ iqr = q3 - q1
136
+ lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
137
+ return (group >= lower) & (group <= upper)
138
+
139
+ mask = df_transformed.groupby(
140
+ pd.Grouper(freq=time_window)
141
+ ).transform(iqr_filter)
142
+
143
+ return df.where(mask, np.nan)
144
+
145
+ @classmethod
146
+ def mad_iqr_hybrid(cls, df: pd.DataFrame, mad_threshold: float = 3.5,
147
+ log_dist: bool = False) -> pd.DataFrame:
148
+ """
149
+ Detect outliers using a hybrid of MAD and IQR methods
150
+
151
+ Parameters
152
+ ----------
153
+ df : pd.DataFrame
154
+ Input data
155
+ mad_threshold : float, default=3.5
156
+ Threshold for MAD method
157
+ log_dist : bool, default=False
158
+ Whether to apply log transformation to data
159
+
160
+ Returns
161
+ -------
162
+ pd.DataFrame
163
+ Cleaned DataFrame with outliers masked as NaN
164
+ """
165
+ df = cls._ensure_dataframe(df)
166
+ df_transformed = cls._transform_if_log(df, log_dist)
167
+
168
+ # IQR method
169
+ q1, q3 = df_transformed.quantile(0.25), df_transformed.quantile(0.75)
170
+ iqr = q3 - q1
171
+ iqr_lower, iqr_upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
172
+
173
+ # MAD method
174
+ median = df_transformed.median()
175
+ mad = (df_transformed - median).abs().median()
176
+ mad_lower = median - mad_threshold * mad
177
+ mad_upper = median + mad_threshold * mad
178
+
179
+ # Combine both methods
180
+ lower = np.maximum(iqr_lower, mad_lower)
181
+ upper = np.minimum(iqr_upper, mad_upper)
182
+
183
+ mask = (df_transformed >= lower) & (df_transformed <= upper)
184
+ return df.where(mask, np.nan)
@@ -11,14 +11,14 @@ class Reader(AbstractReader):
11
11
  self.logger.info(f'\t {file} may not be a whole daily data. Make sure the file is correct.')
12
12
 
13
13
  _df = read_table(file, parse_dates={'time': [0, 1]}, index_col='time',
14
- delimiter=r'\s+', skiprows=5, usecols=range(67)).apply(to_numeric, errors='coerce')
14
+ delimiter=r'\s+', skiprows=5, usecols=range(67))
15
15
  _df.columns = _df.columns.str.strip(';')
16
16
 
17
17
  # remove data without Status=0, 128 (Not much filter tape), 256 (Not much filter tape)
18
18
  if self.meta.get('error_state', False):
19
19
  _df = _df.where(~_df['Status'].isin(self.meta['error_state'])).copy()
20
20
 
21
- _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7']]
21
+ _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7']].apply(to_numeric, errors='coerce')
22
22
 
23
23
  return _df.loc[~_df.index.duplicated() & _df.index.notna()]
24
24
 
@@ -7,7 +7,7 @@ class Reader(AbstractReader):
7
7
  nam = 'AE43'
8
8
 
9
9
  def _raw_reader(self, file):
10
- _df = read_csv(file, parse_dates={'time': ['StartTime']}, index_col='time').apply(to_numeric, errors='coerce')
10
+ _df = read_csv(file, parse_dates={'time': ['StartTime']}, index_col='time')
11
11
  _df_id = _df['SetupID'].iloc[-1]
12
12
 
13
13
  # get last SetupID data
@@ -18,7 +18,7 @@ class Reader(AbstractReader):
18
18
  if self.meta.get('error_state', False):
19
19
  _df = _df.where(~_df['Status'].isin(self.meta['error_state'])).copy()
20
20
 
21
- _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7']]
21
+ _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7']].apply(to_numeric, errors='coerce')
22
22
 
23
23
  return _df.loc[~_df.index.duplicated() & _df.index.notna()]
24
24
 
@@ -8,7 +8,7 @@ class Reader(AbstractReader):
8
8
 
9
9
  def _raw_reader(self, file):
10
10
  with file.open('r', encoding='utf-8-sig', errors='ignore') as f:
11
- _df = read_csv(f, low_memory=False, index_col=0).apply(to_numeric, errors='coerce')
11
+ _df = read_csv(f, low_memory=False, index_col=0)
12
12
 
13
13
  _df.index = to_datetime(_df.index, errors='coerce')
14
14
  _df.index.name = 'time'
@@ -24,7 +24,7 @@ class Reader(AbstractReader):
24
24
  'RH': 'RH'
25
25
  })
26
26
 
27
- _df = _df[['B', 'G', 'R', 'BB', 'BG', 'BR']]
27
+ _df = _df[['B', 'G', 'R', 'BB', 'BG', 'BR']].apply(to_numeric, errors='coerce')
28
28
 
29
29
  return _df.loc[~_df.index.duplicated() & _df.index.notna()]
30
30
 
@@ -8,7 +8,7 @@ class Reader(AbstractReader):
8
8
 
9
9
  def _raw_reader(self, file):
10
10
  with open(file, 'r', encoding='utf-8', errors='ignore') as f:
11
- _df = read_csv(f, parse_dates=True, index_col=0).apply(to_numeric, errors='coerce')
11
+ _df = read_csv(f, parse_dates=True, index_col=0)
12
12
 
13
13
  _df.columns = _df.columns.str.replace(' ', '')
14
14
 
@@ -29,7 +29,8 @@ class Reader(AbstractReader):
29
29
  if self.meta.get('error_state', False):
30
30
  _df = _df[~_df['Status'].isin(self.meta.get('error_state'))]
31
31
 
32
- _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7', 'BC8', 'BC9', 'BC10']]
32
+ _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7', 'BC8', 'BC9', 'BC10']].apply(to_numeric,
33
+ errors='coerce')
33
34
 
34
35
  return _df.loc[~_df.index.duplicated() & _df.index.notna()]
35
36
 
@@ -1,3 +1,4 @@
1
+ import numpy as np
1
2
  from pandas import read_csv, to_numeric
2
3
 
3
4
  from AeroViz.rawDataReader.core import AbstractReader
@@ -14,7 +15,7 @@ class Reader(AbstractReader):
14
15
  def _raw_reader(self, file):
15
16
  # 查詢小時值(測項).csv & 查詢小時值(直式).csv (有、無輸出有效值都可以)
16
17
  df = read_csv(file, encoding='big5', encoding_errors='ignore', index_col=0, parse_dates=True,
17
- on_bad_lines='skip').apply(to_numeric, errors='coerce')
18
+ on_bad_lines='skip')
18
19
 
19
20
  if len(df.groupby('測站')) > 1:
20
21
  raise ValueError(f'Multiple stations found in the file: {df['測站'].unique()}')
@@ -29,11 +30,12 @@ class Reader(AbstractReader):
29
30
  df.index.name = 'Time'
30
31
 
31
32
  # 如果沒有將無效值拿掉就輸出 請將包含 #、L 的字串替換成 # 或 _
32
- df = df.replace(to_replace=r'\d*[#]\b', value='#', regex=True)
33
- df = df.replace(to_replace=r'\d*[L]\b', value='_', regex=True)
33
+ df = df.replace(to_replace=r'\d*\.?\d*[#]\b', value='#', regex=True)
34
+ df = df.replace(to_replace=r'\d*\.?\d*[L]\b', value='_', regex=True)
34
35
 
35
36
  # 欄位排序
36
- return self.reorder_dataframe_columns(df, [desired_order1])
37
+ return self.reorder_dataframe_columns(df, [desired_order1]).apply(to_numeric, errors='coerce')
37
38
 
38
39
  def _QC(self, _df):
40
+ _df = _df.mask(_df < 0, np.nan)
39
41
  return _df
@@ -7,7 +7,6 @@ class Reader(AbstractReader):
7
7
  nam = 'GRIMM'
8
8
 
9
9
  def _raw_reader(self, file):
10
-
11
10
  _df = read_csv(file, header=233, delimiter='\t', index_col=0, parse_dates=[0], encoding='ISO-8859-1',
12
11
  dayfirst=True).rename_axis("Time")
13
12
  _df.index = to_datetime(_df.index, format="%d/%m/%Y %H:%M:%S", dayfirst=True)
@@ -12,11 +12,13 @@ class Reader(AbstractReader):
12
12
  def _raw_reader(self, file):
13
13
 
14
14
  with file.open('r', encoding='utf-8-sig', errors='ignore') as f:
15
- _df = read_csv(f, parse_dates=True, index_col=0, na_values='-').apply(to_numeric, errors='coerce')
15
+ _df = read_csv(f, parse_dates=True, index_col=0, na_values='-')
16
16
 
17
17
  _df.columns = _df.keys().str.strip(' ')
18
18
  _df.index.name = 'time'
19
19
 
20
+ _df = _df.apply(to_numeric, errors='coerce')
21
+
20
22
  return _df.loc[~_df.index.duplicated() & _df.index.notna()]
21
23
 
22
24
  def _QC(self, _df):
@@ -8,7 +8,7 @@ class Reader(AbstractReader):
8
8
 
9
9
  def _raw_reader(self, file):
10
10
  _df = read_csv(file, parse_dates=['Date / time local'], index_col='Date / time local').rename_axis(
11
- "Time").apply(to_numeric, errors='coerce')
11
+ "Time")
12
12
 
13
13
  _df = _df.rename(columns={
14
14
  'UV BCc': 'BC1',
@@ -26,7 +26,8 @@ class Reader(AbstractReader):
26
26
  # if self.meta.get('error_state', False):
27
27
  # _df = _df.where(~_df['Status'].isin(self.meta['error_state'])).copy()
28
28
 
29
- _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BB mass', 'FF mass', 'Delta-C', 'AAE', 'BB']]
29
+ _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BB mass', 'FF mass', 'Delta-C', 'AAE', 'BB']].apply(to_numeric,
30
+ errors='coerce')
30
31
 
31
32
  return _df.loc[~_df.index.duplicated() & _df.index.notna()]
32
33
 
@@ -2,8 +2,9 @@ from typing import Literal
2
2
 
3
3
  import numpy as np
4
4
  import pandas
5
- from pandas import read_excel, to_numeric
5
+ from pandas import DataFrame, read_excel
6
6
 
7
+ from AeroViz.rawDataReader.config.supported_instruments import meta
7
8
  from AeroViz.rawDataReader.core import AbstractReader
8
9
 
9
10
  pandas.set_option("future.no_silent_downcasting", True)
@@ -13,143 +14,181 @@ desired_order1 = ['SO2', 'NO', 'NOx', 'NO2', 'CO', 'O3', 'THC', 'NMHC',
13
14
 
14
15
  desired_order2 = ['Benzene', 'Toluene', 'EthylBenzene', 'm/p-Xylene', 'o-Xylene']
15
16
 
16
- desired_order3 = ['Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Ti', 'V', 'Cr', 'Mn', 'Fe',
17
- 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Rb', 'Sr',
18
- 'Y', 'Zr', 'Nb', 'Mo', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te',
19
- 'Cs', 'Ba', 'La', 'Ce', 'W', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi']
20
-
21
- desired_order4 = ['NH3', 'HF', 'HCl', 'HNO2', 'HNO3', 'G-SO2',
22
- 'Na+', 'NH4+', 'K+', 'Mg2+', 'Ca2+',
23
- 'F-', 'Cl-', 'NO2-', 'NO3-', 'PO43-', 'SO42-']
17
+ MDL_NUMBER = -999
24
18
 
25
19
 
26
20
  class Reader(AbstractReader):
27
21
  nam = 'Minion'
28
22
 
23
+ # 楠梓8月數據(環境部)(空品、重金屬和氣膠可用率) -> 楠梓8月數據_level1 -> NZ_minion_XXXX
29
24
  def _raw_reader(self, file):
30
- # 讀取 Excel 文件
31
25
  df = read_excel(file, index_col=0, parse_dates=True)
26
+ df.index.name = 'Time'
32
27
 
33
28
  # 重命名列,去除空白
34
29
  df = df.rename(columns=lambda x: x.strip())
35
30
 
36
- # 保存單位行並給它一個名稱
37
- units = df.iloc[0].copy()
31
+ # 保存單位
32
+ self.units = df.iloc[0].copy()
38
33
 
39
34
  # 刪除原始數據中的單位行
40
35
  df = df.iloc[1:]
41
36
 
42
37
  # 替換特定值
43
- df = df.replace({'維護校正': '*', np.nan: '-', '0L': '_', 'Nodata': '-'}, inplace=False)
44
- df = df.replace(to_replace=r'\d*[#]\b', value='#', regex=True)
45
- df = df.replace(to_replace=r'\d*[L]\b', value='_', regex=True)
38
+ df = df.replace({'維護校正': '*', np.nan: '-', 'Nodata': '-', '0L': MDL_NUMBER})
39
+ # df = df.replace(to_replace=r'\d*\.?\d*[#]\b', value='_', regex=True)
40
+ df = df.replace(to_replace=r'\d*\.?\d*[L]\b', value=MDL_NUMBER, regex=True)
41
+
42
+ # 處理除了'WD'列的 0 值 替換為 '_'
43
+ for col in [col for col in df.columns if col != 'WD']:
44
+ df[col] = df[col].replace({0: MDL_NUMBER})
45
+
46
+ # replace to numeric for estimating qc rate
47
+ df = df.replace({'_': MDL_NUMBER})
46
48
 
47
- # 處理除了'WD'列的 0 值
48
- non_wd_columns = [col for col in df.columns if col != 'WD']
49
- df.loc[:, non_wd_columns] = df.loc[:, non_wd_columns].replace({0: '_'})
49
+ XRF_col = list(meta.get('XRF').get('MDL').keys())
50
+ IGAC_col = list(meta.get('IGAC').get('MDL').keys())
50
51
 
51
52
  # 重新排序列
52
- df = self.reorder_dataframe_columns(df, [desired_order1, desired_order2, desired_order3, desired_order4])
53
+ df = self.reorder_dataframe_columns(df, [desired_order1, desired_order2, XRF_col, IGAC_col])
53
54
 
54
55
  # 將單位行添加回 DataFrame
55
56
  # df = concat([units.to_frame().T, df])
56
57
 
57
- df.index.name = 'Time'
58
+ # save Level1 data
59
+ output_folder = file.parent / 'Level1'
60
+ output_folder.mkdir(parents=True, exist_ok=True)
61
+ df.to_csv(output_folder / f'{file.stem}_Level1.csv')
58
62
 
59
63
  return df.loc[~df.index.duplicated() & df.index.notna()]
60
64
 
61
65
  def _QC(self, _df):
66
+ IGAC_col = list(meta.get('IGAC').get('MDL'))
67
+ XRF_col = list(meta.get('XRF').get('MDL'))
68
+
69
+ # IGAC MDL QC
70
+ _df[IGAC_col] = self.IGAC_QAQC(_df[IGAC_col])
71
+
72
+ # XRF MDL QC
73
+ _df[XRF_col] = self.XRF_QAQC(_df[XRF_col])
74
+
62
75
  # remove negative value
63
- _df = _df.mask((_df < 0))
76
+ # _df = _df.mask((_df < 0))
77
+ _df = _df.mask(_df == MDL_NUMBER, np.nan)
64
78
 
65
- # XRF QAQC
66
- _df = self.XRF_QAQC(_df)
79
+ col = [col for col in desired_order1 if col != 'WD']
80
+ _df[col] = self.time_aware_IQR_QC(_df[col])
67
81
 
68
- # ions balance
69
- _df = self.IGAC_QAQC(_df)
82
+ # Calculate the mass and ion balance
83
+ # mass tolerance = ± 1, ions balance tolerance = ± 1
70
84
 
71
- # QC data in 6h
72
- _df = self.time_aware_IQR_QC(_df)
85
+ # # conc. of main salt should be present at the same time (NH4+, SO42-, NO3-)
86
+ # _df_salt = df.mask(df.sum(axis=1, min_count=1) > df.PM25).dropna(subset=_main).copy()
87
+
88
+ ions_mass = _df[['Na+', 'NH4+', 'K+', 'Mg2+', 'Ca2+', 'Cl-', 'NO3-', 'SO42-']].sum(axis=1)
89
+ element_mass = _df[XRF_col].sum(axis=1)
90
+
91
+ estimated_mass = ions_mass + element_mass
92
+
93
+ valid_mask = 2 * _df['PM2.5'] > estimated_mass
94
+
95
+ _df.loc[~valid_mask, IGAC_col + XRF_col] = np.nan
73
96
 
74
97
  return _df
75
98
 
76
- # base on Xact 625i Minimum Decision Limit (MDL) for XRF in ng/m3, 60 min sample time
77
- def XRF_QAQC(self, df, MDL_replace: Literal['nan', '0.5 * MDL'] = 'nan'):
78
- MDL = {
79
- 'Al': 100, 'Si': 18, 'P': 5.2, 'S': 3.2,
80
- 'Cl': 1.7, 'K': 1.2, 'Ca': 0.3, 'Ti': 1.6,
81
- 'V': 0.12, 'Cr': 0.12, 'Mn': 0.14, 'Fe': 0.17,
82
- 'Co': 0.14, 'Ni': 0.096, 'Cu': 0.079, 'Zn': 0.067,
83
- 'Ga': 0.059, 'Ge': 0.056, 'As': 0.063, 'Se': 0.081,
84
- 'Br': 0.1, 'Rb': 0.19, 'Sr': 0.22, 'Y': 0.28,
85
- 'Zr': 0.33, 'Nb': 0.41, 'Mo': 0.48, 'Pd': 2.2,
86
- 'Ag': 1.9, 'Cd': 2.5, 'In': 3.1, 'Sn': 4.1,
87
- 'Sb': 5.2, 'Te': 0.6, 'Cs': 0.37, 'Ba': 0.39,
88
- 'La': 0.36, 'Ce': 0.3, 'W': 0.0001, 'Pt': 0.12,
89
- 'Au': 0.1, 'Hg': 0.12, 'Tl': 0.12, 'Pb': 0.13,
90
- 'Bi': 0.13
91
- }
92
-
93
- # Br Li internal standard
94
-
95
- # 將小於 MDL 值的數據替換為 nan or 1/2 MDL
96
- for element, threshold in MDL.items():
97
- if element in df.columns:
98
- rep = np.nan if MDL_replace == 'nan' else 0.5 * threshold
99
- df[element] = df[element].where(df[element] >= threshold, rep)
100
-
101
- self.logger.info(f"{'=' * 60}")
102
- self.logger.info(f"XRF QAQC summary:")
103
- self.logger.info("\t\ttransform values below MDL to NaN")
104
- self.logger.info(f"{'=' * 60}")
99
+ def mdlReplace_timeAware_qc(self, df: DataFrame, MDL: dict, MDL_replace) -> DataFrame:
100
+ # Step 1: Track MDL positions and values below threshold
101
+ mdl_mask = (df.eq(MDL_NUMBER) |
102
+ df.apply(lambda x: x < MDL.get(x.name, float('-inf'))))
103
+
104
+ # Step 2: Convert all values below MDL to MDL_NUMBER (-999)
105
+ df_mdl = df.mask(mdl_mask, MDL_NUMBER)
106
+
107
+ # Step 3: Apply time_aware_IQR_QC (excluding MDL_NUMBER values)
108
+ df_qc = self.time_aware_IQR_QC(df_mdl.mask(df_mdl == MDL_NUMBER))
109
+
110
+ # Step 4: Handle values below MDL according to specified method
111
+ if MDL_replace == '0.5 * MDL':
112
+ for column, threshold in MDL.items():
113
+ if column in df.columns and threshold is not None:
114
+ df_qc.loc[df_mdl[column] == MDL_NUMBER, column] = 0.5 * threshold
115
+ else:
116
+ df_qc.loc[df_mdl[column] == MDL_NUMBER, column] = np.nan
117
+ else: # 'nan'
118
+ df_qc = df_qc.mask(df_mdl == MDL_NUMBER, np.nan)
119
+
120
+ return df_qc
121
+
122
+ def XRF_QAQC(self,
123
+ df: DataFrame,
124
+ MDL_replace: Literal['nan', '0.5 * MDL'] = '0.5 * MDL'
125
+ ) -> DataFrame:
126
+ """
127
+ Perform Quality Assurance and Quality Control for XRF data
128
+
129
+ Parameters
130
+ ----------
131
+ df : pd.DataFrame
132
+ Input dataframe with XRF data
133
+ MDL_replace : {'nan', '0.5 * MDL'}, default='nan'
134
+ Method to handle values below MDL:
135
+ - 'nan': Replace with NaN
136
+ - '0.5 * MDL': Replace with half of MDL value
137
+
138
+ Returns
139
+ -------
140
+ pd.DataFrame
141
+ Processed dataframe with QC applied and MDL values handled
142
+ """
143
+ MDL = meta.get('XRF').get('MDL')
144
+
145
+ df = self.mdlReplace_timeAware_qc(df, MDL, MDL_replace)
105
146
 
106
147
  # 轉換單位 ng/m3 -> ug/m3
107
148
  if df.Al.max() > 10 and df.Fe.max() > 10:
108
- # 確保 MDL.keys() 中的所有列都存在於 _df 中
109
149
  columns_to_convert = [col for col in MDL.keys() if col in df.columns]
110
-
111
150
  df[columns_to_convert] = df[columns_to_convert].div(1000)
112
151
 
152
+ self.logger.info(f"XRF QAQC summary: transform values below MDL to {MDL_replace}")
153
+
113
154
  return df
114
155
 
115
- def IGAC_QAQC(self, df, tolerance=1):
156
+ def IGAC_QAQC(self,
157
+ df: DataFrame,
158
+ MDL_replace: Literal['nan', '0.5 * MDL'] = '0.5 * MDL',
159
+ tolerance: float = 1
160
+ ) -> DataFrame:
116
161
  """
117
- Calculate the balance of ions in the system
162
+ Perform Quality Assurance and Quality Control for IGAC data
163
+
164
+ Parameters
165
+ ----------
166
+ df : pd.DataFrame
167
+ Input dataframe with IGAC data
168
+ MDL_replace : {'nan', '0.5 * MDL'}, default='nan'
169
+ Method to handle values below MDL:
170
+ - 'nan': Replace with NaN
171
+ - '0.5 * MDL': Replace with half of MDL value
172
+ tolerance : float, default=1
173
+ Tolerance value for QC checks
174
+
175
+ Returns
176
+ -------
177
+ pd.DataFrame
178
+ Processed dataframe with QC applied and MDL values handled
118
179
  """
119
- # https://www.yangyao-env.com/web/product/product_in2.jsp?pd_id=PD1640151884502
120
- MDL = {
121
- 'HF': 0.08, 'HCl': 0.05, 'HNO2': 0.01, 'HNO3': 0.05, 'G-SO2': 0.05, 'NH3': 0.1,
122
- 'Na+': 0.05, 'NH4+': 0.08, 'K+': 0.08, 'Mg2+': 0.05, 'Ca2+': 0.05,
123
- 'F-': 0.08, 'Cl-': 0.05, 'NO2-': 0.05, 'NO3-': 0.01, 'PO43-': None, 'SO42-': 0.05,
124
- }
125
-
126
- MR = {
127
- 'HF': 200, 'HCl': 200, 'HNO2': 200, 'HNO3': 200, 'G-SO2': 200, 'NH3': 300,
128
- 'Na+': 300, 'NH4+': 300, 'K+': 300, 'Mg2+': 300, 'Ca2+': 300,
129
- 'F-': 300, 'Cl-': 300, 'NO2-': 300, 'NO3-': 300, 'PO43-': None, 'SO42-': 300,
130
- }
180
+ MDL = meta.get('IGAC').get('MDL')
131
181
 
182
+ df = self.mdlReplace_timeAware_qc(df, MDL, MDL_replace)
183
+
184
+ # Define the ions
185
+ _df = df.copy()
132
186
  _cation, _anion, _main = (['Na+', 'NH4+', 'K+', 'Mg2+', 'Ca2+'],
133
187
  ['Cl-', 'NO2-', 'NO3-', 'SO42-'],
134
188
  ['SO42-', 'NO3-', 'NH4+'])
135
- # QC: replace values below MDL with 0.5 * MDL -> ions balance -> PM2.5 > main salt
136
- # mass tolerance = 0.3, ions balance tolerance = 0.3
137
-
138
- # # conc. of main salt should be present at the same time (NH4+, SO42-, NO3-)
139
- # _df_salt = df.mask(df.sum(axis=1, min_count=1) > df.PM25).dropna(subset=_main).copy()
140
-
141
- # Define the ions
142
- item = ['Na+', 'NH4+', 'K+', 'Mg2+', 'Ca2+', 'Cl-', 'NO2-', 'NO3-', 'SO42-']
143
189
 
144
- # Calculate the balance
145
- _df = df[item].apply(to_numeric, errors='coerce')
146
-
147
- # for (_key, _df_col) in _df.items():
148
- # _df[_key] = _df_col.mask(_df_col < MDL[_key], MDL[_key] / 2)
149
-
150
- _df['+_mole'] = _df[['Na+', 'NH4+', 'K+', 'Mg2+', 'Ca2+']].div([23, 18, 39, (24 / 2), (40 / 2)]).sum(axis=1,
151
- skipna=True)
152
- _df['-_mole'] = _df[['Cl-', 'NO2-', 'NO3-', 'SO42-']].div([35.5, 46, 62, (96 / 2)]).sum(axis=1, skipna=True)
190
+ _df['+_mole'] = _df[_cation].div([23, 18, 39, (24 / 2), (40 / 2)]).sum(axis=1, skipna=True)
191
+ _df['-_mole'] = _df[_anion].div([35.5, 46, 62, (96 / 2)]).sum(axis=1, skipna=True)
153
192
 
154
193
  # Avoid division by zero
155
194
  _df['ratio'] = np.where(_df['-_mole'] != 0, _df['+_mole'] / _df['-_mole'], np.nan)
@@ -157,24 +196,19 @@ class Reader(AbstractReader):
157
196
  # Calculate bounds
158
197
  lower_bound, upper_bound = 1 - tolerance, 1 + tolerance
159
198
 
160
- # 根据ratio决定是否保留原始数据
199
+ # 根據ratio决定是否保留原始数据
161
200
  valid_mask = ((_df['ratio'] <= upper_bound) & (_df['ratio'] >= lower_bound) &
162
201
  ~np.isnan(_df['+_mole']) & ~np.isnan(_df['-_mole']))
163
202
 
164
- # 保留数据或将不符合条件的行设为NaN
165
- df.loc[~valid_mask, item] = np.nan
203
+ # 保留数據或將不符合的條件設為NaN
204
+ df.loc[~valid_mask] = np.nan
166
205
 
167
- # 计算保留的数据的百分比
206
+ # 計算保留的数據的百分比
168
207
  retained_percentage = (valid_mask.sum() / len(df)) * 100
169
208
 
170
- self.logger.info(f"{'=' * 60}")
171
- self.logger.info(f"Ions balance summary:")
172
- self.logger.info(f"\t\tretain {retained_percentage.__round__(0)}% data within tolerance {tolerance}")
173
- self.logger.info(f"{'=' * 60}")
209
+ self.logger.info(f"Ions balance summary: {retained_percentage.__round__(0)}% within tolerance ± {tolerance}")
174
210
 
175
211
  if retained_percentage < 70:
176
212
  self.logger.warning("Warning: The percentage of retained data is less than 70%")
177
213
 
178
- # print(f"\tretain {retained_percentage.__round__(0)}% data within tolerance {tolerance}")
179
-
180
214
  return df
@@ -1,4 +1,4 @@
1
- from pandas import to_datetime, read_csv, DataFrame
1
+ from pandas import to_datetime, read_csv, DataFrame, to_numeric
2
2
 
3
3
  from AeroViz.rawDataReader.core import AbstractReader
4
4
 
@@ -42,7 +42,7 @@ class Reader(AbstractReader):
42
42
 
43
43
  _df_out.mask(_df_out['status'] != 0) # 0000 -> numeric to 0
44
44
 
45
- _df = _df_out[['B', 'G', 'R', 'BB', 'BG', 'BR', 'RH']]
45
+ _df = _df_out[['B', 'G', 'R', 'BB', 'BG', 'BR', 'RH']].apply(to_numeric, errors='coerce')
46
46
 
47
47
  return _df.loc[~_df.index.duplicated() & _df.index.notna()]
48
48
 
@@ -8,7 +8,7 @@ class Reader(AbstractReader):
8
8
 
9
9
  def _raw_reader(self, file):
10
10
  with open(file, 'r', encoding='utf-8', errors='ignore') as f:
11
- _df = read_csv(f, skiprows=3, index_col=False).apply(to_numeric, errors='coerce')
11
+ _df = read_csv(f, skiprows=3, index_col=False)
12
12
 
13
13
  _df = _df.rename(columns={'Time Stamp': 'time',
14
14
  'System status': 'status',
@@ -27,7 +27,7 @@ class Reader(AbstractReader):
27
27
 
28
28
  _df = _df.where(_df['status'] < 1)
29
29
 
30
- _df = _df[['PM_NV', 'PM_Total', 'noise']]
30
+ _df = _df[['PM_NV', 'PM_Total', 'noise']].apply(to_numeric, errors='coerce')
31
31
 
32
32
  return _df.loc[~_df.index.duplicated() & _df.index.notna()]
33
33
 
@@ -1,11 +1,12 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: AeroViz
3
- Version: 0.1.9.1
3
+ Version: 0.1.9.2
4
4
  Summary: Aerosol science
5
5
  Home-page: https://github.com/Alex870521/AeroViz
6
6
  Author: alex
7
7
  Author-email: alex870521@gmail.com
8
8
  Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
9
10
  Classifier: License :: OSI Approved :: MIT License
10
11
  Classifier: Operating System :: OS Independent
11
12
  Requires-Python: >=3.12
@@ -21,6 +22,8 @@ Requires-Dist: windrose ==1.9.2
21
22
  Requires-Dist: cartopy ==0.24.1
22
23
  Requires-Dist: tabulate ==0.9.0
23
24
  Requires-Dist: rich ~=13.7.1
25
+ Provides-Extra: test
26
+ Requires-Dist: pytest >=7.0.0 ; extra == 'test'
24
27
 
25
28
  ## <div align="center">AeroViz for Aerosol Science Visualization</div>
26
29
 
@@ -1,7 +1,7 @@
1
1
  AeroViz/__init__.py,sha256=A5W6SR71uY_eW44Sh-Yk6blJQ_G1aHrkSzeP2YTPQc4,371
2
- AeroViz/data/240228_00.txt,sha256=DWfY83EW3fOcv9dW-Y4pudq8-M7BJlXD-tlMSYrAk2w,8946
3
2
  AeroViz/data/DEFAULT_DATA.csv,sha256=eeeyeh8vSLKkE5tAF0TYnUNOyQIH98VA41bJaAP204Y,2248526
4
3
  AeroViz/data/DEFAULT_PNSD_DATA.csv,sha256=imLvLA80oYwo_jzXZtlQn5hZ76d47HUIlK2jp0tZPrg,2636511
4
+ AeroViz/data/hysplit_example_data.txt,sha256=DWfY83EW3fOcv9dW-Y4pudq8-M7BJlXD-tlMSYrAk2w,8946
5
5
  AeroViz/dataProcess/__init__.py,sha256=D3rTVUiGfs_daGuaotVtbijOgLAp6HaRWchj-zoEnHw,828
6
6
  AeroViz/dataProcess/Chemistry/__init__.py,sha256=fyyomjxkQcUNWDx4R5jPrHafAftN-v2liUZii9OlaiU,2058
7
7
  AeroViz/dataProcess/Chemistry/_calculate.py,sha256=q7ojTFPok0vg8k_1PMECNdP5CPanR9NWQ4Rx5iTcHew,599
@@ -28,7 +28,7 @@ AeroViz/dataProcess/SizeDistr/_merge_v1.py,sha256=6Anb8DszoatK66tc9ccA6ZApbqtL7p
28
28
  AeroViz/dataProcess/SizeDistr/_merge_v2.py,sha256=8OzUKw7hTg-yuQBipuFKgBS_7c7zbApN_BNr00G8q9c,9046
29
29
  AeroViz/dataProcess/SizeDistr/_merge_v3.py,sha256=HN2ARFmeWOawOWRPPv_pHEGBBZNgXVbH4dDTxcN7rdY,18749
30
30
  AeroViz/dataProcess/SizeDistr/_merge_v4.py,sha256=b8RVAievGIOLrmJHJXRsKXQ1tkMkm6rx43S7XAfeXE4,16228
31
- AeroViz/dataProcess/SizeDistr/_size_distr.py,sha256=o5fTwLH7j9j5129e3uciSJrOR8AGCN3tkAyGffoPENg,3127
31
+ AeroViz/dataProcess/SizeDistr/_size_distr.py,sha256=ULhGKlxE9QmbDO_PS3HOSKzepeMfJZWabJvGXqsDEvE,3259
32
32
  AeroViz/dataProcess/VOC/__init__.py,sha256=8GNP0RMymTkJXK18pSgfLHqrKPWboN-3x1_Ke4UrI44,259
33
33
  AeroViz/dataProcess/VOC/_potential_par.py,sha256=h3rVdvtBvC6xHa_ZG4Oq5eXezeSZtHNy6T6I40maIcM,3863
34
34
  AeroViz/dataProcess/VOC/support_voc.json,sha256=tMYp_NERqhSriVRE2NavXh33CQ5CnsbJHtmMFlE5q_E,6804
@@ -44,7 +44,7 @@ AeroViz/plot/violin.py,sha256=pU2Z2yTWocEtImmCAmbtn0WvXtUOrnCGOdDOrLxjooU,2689
44
44
  AeroViz/plot/distribution/__init__.py,sha256=nhbIegWczkuEfWsE7-2jfF0dnpmPDzJJzjq8Fuh6q5k,28
45
45
  AeroViz/plot/distribution/distribution.py,sha256=sAjqtqKavFwQqI8PGPFnpvZFSU-w2UKjcTTC5L91f4E,20595
46
46
  AeroViz/plot/hysplit/__init__.py,sha256=VrEkha2OEFp_00Xj9R98C96niZ7fYqJzGPeYsbojtzA,23
47
- AeroViz/plot/hysplit/hysplit.py,sha256=yDIQuhlP3IPXRy0BCRkzqyJ_PfYPqIF-S1lpksW1dFk,2504
47
+ AeroViz/plot/hysplit/hysplit.py,sha256=gSCkemFLRvsk4m8zYbxbsjrdU14NkN9ZNfVRvdq69aM,2796
48
48
  AeroViz/plot/meteorology/__init__.py,sha256=hhGfQE3IUzS3Eaju_nO7LomPPHJnd-zAAZZweXOXs2M,27
49
49
  AeroViz/plot/meteorology/meteorology.py,sha256=6hk-5olgQTw2SB-GhEizLN19vRVBztgiXoruh8Q6Zns,11282
50
50
  AeroViz/plot/optical/PyMieScatt_update.py,sha256=g3vlzATjzYSYZd3LwmwxEmdkpo4cHJ3KY4rePY4jwZk,21065
@@ -68,24 +68,25 @@ AeroViz/plot/utils/fRH.json,sha256=t-2ux4TLOYAB-4jJ72LSM4jv1jk9XkaxKYNMDepMHIg,6
68
68
  AeroViz/plot/utils/plt_utils.py,sha256=7Au3r2-7AZQmzrO2OfcyTFomJRLHgu1Npb7wxQqUvzY,3438
69
69
  AeroViz/plot/utils/sklearn_utils.py,sha256=hKFfkVkYLRxkIDKvO9COHXwhjD_UWqQigdT3mDToni4,2098
70
70
  AeroViz/plot/utils/units.json,sha256=JKcqvLA6dkc8REV_NhX85Jl9LA4iAZxzw9RJp6JTla0,2965
71
- AeroViz/rawDataReader/__init__.py,sha256=0JUjzD54KRWQQ5C07zpVDwHlg02QoSaQ89rRFB4RwNM,4852
71
+ AeroViz/rawDataReader/__init__.py,sha256=FDUsJ_v6wG9DUcNOjmqaWJvVdMgatJrqGYPZdbaW7Wo,4859
72
72
  AeroViz/rawDataReader/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
- AeroViz/rawDataReader/config/supported_instruments.py,sha256=CGE34wsXyq-Za8IIYu2rt3JzoANrFBgrphqvl2FYTC0,5296
74
- AeroViz/rawDataReader/core/__init__.py,sha256=QeUDtDAz49a85jYdY838nKVCI7pqSwRDP1o1IMcHjmU,15205
75
- AeroViz/rawDataReader/script/AE33.py,sha256=FGET-JGW2H4cTRoduuIS8VGFYK7jEnPIhqlBJuNAGFQ,1278
76
- AeroViz/rawDataReader/script/AE43.py,sha256=H8lrI6jqLTRwbslVmbkB_YxEcPl42uEVIfKk6WGTTBI,1231
73
+ AeroViz/rawDataReader/config/supported_instruments.py,sha256=RR2TPome27O3ERNxQ043boDMfvWG9BHvhpGJ6VQC5gw,5988
74
+ AeroViz/rawDataReader/core/__init__.py,sha256=i5JTd8IhhTYI2bpEMQ27XmxPZojACUzWnKcw4gGofnE,12691
75
+ AeroViz/rawDataReader/core/qc.py,sha256=tFIVsfph8yZIK6NRKQxaZYHcruJclriKSvR0oC12T0Q,5698
76
+ AeroViz/rawDataReader/script/AE33.py,sha256=FbbFJ93aLVjA8k2QZ_fKcI9uXoux2k0AL3O73iY879I,1278
77
+ AeroViz/rawDataReader/script/AE43.py,sha256=GjcICBJ3nIANyMd4kovteBUtkyCGLTos07BczgSCuVE,1231
77
78
  AeroViz/rawDataReader/script/APS_3321.py,sha256=x75G72Xl0vElr6Njbv8SlOcosAHNozseaJzAxVmfXyI,1697
78
- AeroViz/rawDataReader/script/Aurora.py,sha256=HDLyHOw62tgfLjjwYCWVAJKsc4SB7aLnIjI6HI_WTRM,1491
79
- AeroViz/rawDataReader/script/BC1054.py,sha256=pvHnUA_gJIRf9jEUi8vzNyErXMHRQWLnOsExIZR0_OA,1574
80
- AeroViz/rawDataReader/script/EPA.py,sha256=lIdWx9roM1unyqSjTtd5aAOGoITcU5e-P0XYt0k8Mjg,1578
81
- AeroViz/rawDataReader/script/GRIMM.py,sha256=UyWeqZfOcbIVCmLk_0P8xSh6eQiq_U2Gse84O2mTnlQ,847
82
- AeroViz/rawDataReader/script/IGAC.py,sha256=ZdskNc65wVx2znmbjJp2J_rxVg5vuqxB1HWRoqxb7Ho,2364
83
- AeroViz/rawDataReader/script/MA350.py,sha256=w0QCoJxMIMwaLOLWLE65FM7MY9kcvpkRMAowRA5TaYk,1490
84
- AeroViz/rawDataReader/script/Minion.py,sha256=BjuJe2KWLJKgvVnV0WfilGh2DvaWqJDDNj8i0z3oeuU,7306
85
- AeroViz/rawDataReader/script/NEPH.py,sha256=6qs2oiS6zDOFkTNEu9T-8hrCuoZHPfjd5UoMacobAno,3168
79
+ AeroViz/rawDataReader/script/Aurora.py,sha256=2duNsK2WCWk21Rd2d4EugAA_yN27p2AjRFd9ClJ2aUA,1491
80
+ AeroViz/rawDataReader/script/BC1054.py,sha256=tuDyq8M5BPbmu1yJr9zXYS2piMGz08yTQXGT6tK9jxA,1675
81
+ AeroViz/rawDataReader/script/EPA.py,sha256=1ZXEcCnIMOhEXu0JwzeCgmhRtPzBNo2CfLhfhstOT4k,1649
82
+ AeroViz/rawDataReader/script/GRIMM.py,sha256=-D4U83ihjAqcvOAnk7NET59IZfV1JzPYKRQjrIQyBDM,846
83
+ AeroViz/rawDataReader/script/IGAC.py,sha256=i6WT3rX0n0e4hq7NfWN6tVwCuKAeV9ARxPkXZSbQj74,2387
84
+ AeroViz/rawDataReader/script/MA350.py,sha256=EfPTFhgDAjI7r0G6kW7pjog-4MBOnvW0cyFqIkCxEP8,1597
85
+ AeroViz/rawDataReader/script/Minion.py,sha256=9G_q-EhE3nfJoxWFwAnMYdY0teSYqcYxTkk0JW5lmY0,7793
86
+ AeroViz/rawDataReader/script/NEPH.py,sha256=x6HgnvpmmhOOvB4-nL-jTfoSo0x8FUxVBXPqAyfhZVk,3215
86
87
  AeroViz/rawDataReader/script/OCEC.py,sha256=jWWaNbCjP5MJDYrdWUhjrQLClaWqC8SGDVPIFJ9xljU,3413
87
88
  AeroViz/rawDataReader/script/SMPS.py,sha256=EtXmeukOIwqfMwMJqv99_STfVg0uPdVr96r-tfD95gk,2774
88
- AeroViz/rawDataReader/script/TEOM.py,sha256=Ew4JqDf_qpGFvvBLwX824kKIRXMM6QZLwEB4t4xkTSk,2103
89
+ AeroViz/rawDataReader/script/TEOM.py,sha256=jsxU4W46FmLjiIthmPOHo6CAYFZiPENhW80WjMCiIPA,2103
89
90
  AeroViz/rawDataReader/script/VOC.py,sha256=GUme72ZyjSzREsFNUgOV_OCESIVJBXY9KrKP1c9Av7I,1248
90
91
  AeroViz/rawDataReader/script/XRF.py,sha256=SU1-D94GkwdkjlNXcyXbwQG1tOYCpeL6GTVkaLBHc-s,187
91
92
  AeroViz/rawDataReader/script/__init__.py,sha256=s3c797Q8EAGcJCxVRTA-KdHie-vHLNYbMxwa5c0qz-I,214
@@ -94,8 +95,8 @@ AeroViz/tools/database.py,sha256=05VzjJyhlRrhsZdhfFQ__7CxGm4MdFekLjz3_Is5h9U,343
94
95
  AeroViz/tools/dataclassifier.py,sha256=_wpv0PlZ5EGkcNqHxfFtdEsYvHP5FVE8sMZXikhm_YE,4492
95
96
  AeroViz/tools/dataprinter.py,sha256=Jq2Yztpa9YCOeLDVTrRs7PhSdNIPhEAexVj1YSuJ7hY,2249
96
97
  AeroViz/tools/datareader.py,sha256=iTQ0U8hdNMjCdbiH7EiKW10UEoxzxXRHc4s5_1IikJo,1933
97
- AeroViz-0.1.9.1.dist-info/LICENSE,sha256=E-679GpGGkp3irmtuJXiT7R4cNUA4cmsH6Q7QUgPf5U,1069
98
- AeroViz-0.1.9.1.dist-info/METADATA,sha256=GVdWT5eQU-KOldHK8yzNq9rRN0DYBuVHJxqGsrvJefo,6253
99
- AeroViz-0.1.9.1.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
100
- AeroViz-0.1.9.1.dist-info/top_level.txt,sha256=BYsmTst_o4FZOKRP1XIvIMlN6mMTTXNfnSToL2_nVbQ,8
101
- AeroViz-0.1.9.1.dist-info/RECORD,,
98
+ AeroViz-0.1.9.2.dist-info/LICENSE,sha256=E-679GpGGkp3irmtuJXiT7R4cNUA4cmsH6Q7QUgPf5U,1069
99
+ AeroViz-0.1.9.2.dist-info/METADATA,sha256=qTnQ4ONlpadkTfYa5lhvSQ3DxuI4p_geNcv16f_bOjI,6373
100
+ AeroViz-0.1.9.2.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
101
+ AeroViz-0.1.9.2.dist-info/top_level.txt,sha256=BYsmTst_o4FZOKRP1XIvIMlN6mMTTXNfnSToL2_nVbQ,8
102
+ AeroViz-0.1.9.2.dist-info/RECORD,,