AeroViz 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of AeroViz might be problematic. Click here for more details.

@@ -105,15 +105,17 @@ class AbstractReader(ABC):
105
105
 
106
106
  _acq_rate = round((sample_size / period_size) * 100, 1)
107
107
  _yid_rate = round((qc_size / sample_size) * 100, 1)
108
+ _OEE_rate = round((qc_size / period_size) * 100, 1)
108
109
 
109
110
  self.logger.info(f'{_nam}:')
110
111
  self.logger.info(f"\tAcquisition rate: {_acq_rate}%")
111
112
  self.logger.info(f'\tYield rate: {_yid_rate}%')
113
+ self.logger.info(f'\tOEE rate: {_OEE_rate}%')
112
114
  self.logger.info(f"{'=' * 60}")
113
115
 
114
116
  print(f'\n\t{_nam} : ')
115
- print(f'\t\tacquisition rate : \033[91m{_acq_rate}%\033[0m')
116
- print(f'\t\tyield rate : \033[91m{_yid_rate}%\033[0m')
117
+ print(f'\t\tacquisition rate | yield rate | OEE rate :'
118
+ f' \033[91m{_acq_rate}% | {_yid_rate}% -> {_OEE_rate}%\033[0m')
117
119
 
118
120
  if self.meta['deter_key'] is not None:
119
121
  # use qc_freq to calculate each period rate
@@ -296,22 +298,83 @@ class AbstractReader(ABC):
296
298
  return df[new_order]
297
299
 
298
300
  @staticmethod
299
- def n_sigma_QC(df: DataFrame, std_range: int = 5) -> DataFrame:
300
- df_ave, df_std = df.mean(), df.std()
301
- df_lowb, df_highb = df < (df_ave - df_std * std_range), df > (df_ave + df_std * std_range)
301
+ def n_sigma_QC(df: pd.DataFrame, std_range: int = 5) -> pd.DataFrame:
302
+ # 確保輸入是DataFrame
303
+ df = df.to_frame() if isinstance(df, pd.Series) else df
302
304
 
303
- return df.mask(df_lowb | df_highb).copy()
305
+ df_ave = df.mean()
306
+ df_std = df.std()
307
+
308
+ lower_bound = df < (df_ave - df_std * std_range)
309
+ upper_bound = df > (df_ave + df_std * std_range)
310
+
311
+ return df.mask(lower_bound | upper_bound)
304
312
 
305
- # "四分位數範圍法"(Inter-quartile Range Method)
306
313
  @staticmethod
307
- def IQR_QC(df: DataFrame, log_dist=False) -> tuple[DataFrame, DataFrame]:
308
- df = np.log10(df) if log_dist else df
314
+ def IQR_QC(df: pd.DataFrame, log_dist=False) -> pd.DataFrame:
315
+ # 確保輸入是DataFrame
316
+ df = df.to_frame() if isinstance(df, pd.Series) else df
317
+
318
+ df_transformed = np.log10(df) if log_dist else df
319
+
320
+ _df_q1 = df_transformed.quantile(0.25)
321
+ _df_q3 = df_transformed.quantile(0.75)
309
322
 
310
- _df_qua = df.quantile([.25, .75])
311
- _df_q1, _df_q3 = _df_qua.loc[.25].copy(), _df_qua.loc[.75].copy()
312
323
  _df_iqr = _df_q3 - _df_q1
313
324
 
314
- _se = concat([_df_q1 - 1.5 * _df_iqr] * len(df), axis=1).T.set_index(df.index)
315
- _le = concat([_df_q3 + 1.5 * _df_iqr] * len(df), axis=1).T.set_index(df.index)
325
+ # Calculate lower and upper bounds
326
+ lower_bound = df_transformed < (_df_q1 - 1.5 * _df_iqr)
327
+ upper_bound = df_transformed > (_df_q3 + 1.5 * _df_iqr)
328
+
329
+ # Apply the filter to the original dataframe
330
+ return df.mask(lower_bound | upper_bound)
331
+
332
+ @staticmethod
333
+ def rolling_IQR_QC(df: pd.DataFrame, window_size=24, log_dist=False) -> pd.DataFrame:
334
+ df = df.to_frame() if isinstance(df, pd.Series) else df
335
+ df_transformed = np.log10(df) if log_dist else df
336
+
337
+ def iqr_filter(x):
338
+ q1, q3 = x.quantile(0.25), x.quantile(0.75)
339
+ iqr = q3 - q1
340
+ lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
341
+ return (x >= lower) & (x <= upper)
342
+
343
+ mask = df_transformed.rolling(window=window_size, center=True, min_periods=1).apply(iqr_filter)
344
+ return df.where(mask, np.nan)
345
+
346
+ @staticmethod
347
+ def time_aware_IQR_QC(df: pd.DataFrame, time_window='1D', log_dist=False) -> pd.DataFrame:
348
+ df = df.to_frame() if isinstance(df, pd.Series) else df
349
+ df_transformed = np.log10(df) if log_dist else df
316
350
 
317
- return (10 ** _se, 10 ** _le) if log_dist else (_se, _le)
351
+ def iqr_filter(group):
352
+ q1, q3 = group.quantile(0.25), group.quantile(0.75)
353
+ iqr = q3 - q1
354
+ lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
355
+ return (group >= lower) & (group <= upper)
356
+
357
+ mask = df_transformed.groupby(pd.Grouper(freq=time_window)).transform(iqr_filter)
358
+ return df.where(mask, np.nan)
359
+
360
+ @staticmethod
361
+ def mad_iqr_hybrid_QC(df: pd.DataFrame, mad_threshold=3.5, log_dist=False) -> pd.DataFrame:
362
+ df = df.to_frame() if isinstance(df, pd.Series) else df
363
+ df_transformed = np.log10(df) if log_dist else df
364
+
365
+ # IQR 方法
366
+ q1, q3 = df_transformed.quantile(0.25), df_transformed.quantile(0.75)
367
+ iqr = q3 - q1
368
+ iqr_lower, iqr_upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
369
+
370
+ # MAD 方法
371
+ median = df_transformed.median()
372
+ mad = (df_transformed - median).abs().median()
373
+ mad_lower, mad_upper = median - mad_threshold * mad, median + mad_threshold * mad
374
+
375
+ # 结合两种方法
376
+ lower = np.maximum(iqr_lower, mad_lower)
377
+ upper = np.minimum(iqr_upper, mad_upper)
378
+
379
+ mask = (df_transformed >= lower) & (df_transformed <= upper)
380
+ return df.where(mask, np.nan)
@@ -1,4 +1,4 @@
1
- from pandas import read_table
1
+ from pandas import read_table, to_numeric
2
2
 
3
3
  from AeroViz.rawDataReader.core import AbstractReader
4
4
 
@@ -8,10 +8,10 @@ class Reader(AbstractReader):
8
8
 
9
9
  def _raw_reader(self, file):
10
10
  if file.stat().st_size / 1024 < 550:
11
- print('\t It may not be a whole daily data.')
11
+ self.logger.info(f'\t {file} may not be a whole daily data. Make sure the file is correct.')
12
12
 
13
13
  _df = read_table(file, parse_dates={'time': [0, 1]}, index_col='time',
14
- delimiter=r'\s+', skiprows=5, usecols=range(67))
14
+ delimiter=r'\s+', skiprows=5, usecols=range(67)).apply(to_numeric, errors='coerce')
15
15
  _df.columns = _df.columns.str.strip(';')
16
16
 
17
17
  # remove data without Status=0, 128 (Not much filter tape), 256 (Not much filter tape)
@@ -23,8 +23,13 @@ class Reader(AbstractReader):
23
23
  return _df.loc[~_df.index.duplicated() & _df.index.notna()]
24
24
 
25
25
  def _QC(self, _df):
26
+ _index = _df.index.copy()
27
+
26
28
  # remove negative value
27
- _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7']].mask((_df < 0).copy())
29
+ _df = _df.mask((_df <= 0) | (_df > 20000))
30
+
31
+ # use IQR_QC
32
+ _df = self.time_aware_IQR_QC(_df, time_window='1h')
28
33
 
29
- # QC data in 1h
30
- return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
34
+ # make sure all columns have values, otherwise set to nan
35
+ return _df.dropna(how='any').reindex(_index)
@@ -1,4 +1,4 @@
1
- from pandas import read_csv
1
+ from pandas import read_csv, to_numeric
2
2
 
3
3
  from AeroViz.rawDataReader.core import AbstractReader
4
4
 
@@ -7,7 +7,7 @@ class Reader(AbstractReader):
7
7
  nam = 'AE43'
8
8
 
9
9
  def _raw_reader(self, file):
10
- _df = read_csv(file, parse_dates={'time': ['StartTime']}, index_col='time')
10
+ _df = read_csv(file, parse_dates={'time': ['StartTime']}, index_col='time').apply(to_numeric, errors='coerce')
11
11
  _df_id = _df['SetupID'].iloc[-1]
12
12
 
13
13
  # get last SetupID data
@@ -24,8 +24,13 @@ class Reader(AbstractReader):
24
24
 
25
25
  # QC data
26
26
  def _QC(self, _df):
27
+ _index = _df.index.copy()
28
+
27
29
  # remove negative value
28
- _df = _df.mask((_df < 0).copy())
30
+ _df = _df.mask((_df <= 0) | (_df > 20000))
31
+
32
+ # use IQR_QC
33
+ _df = self.time_aware_IQR_QC(_df, time_window='1h')
29
34
 
30
- # QC data in 1h
31
- return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
35
+ # make sure all columns have values, otherwise set to nan
36
+ return _df.dropna(how='any').reindex(_index)
@@ -1,4 +1,4 @@
1
- from pandas import to_datetime, read_csv
1
+ from pandas import to_datetime, read_csv, to_numeric
2
2
 
3
3
  from AeroViz.rawDataReader.core import AbstractReader
4
4
 
@@ -8,7 +8,7 @@ class Reader(AbstractReader):
8
8
 
9
9
  def _raw_reader(self, file):
10
10
  with file.open('r', encoding='utf-8-sig', errors='ignore') as f:
11
- _df = read_csv(f, low_memory=False, index_col=0)
11
+ _df = read_csv(f, low_memory=False, index_col=0).apply(to_numeric, errors='coerce')
12
12
 
13
13
  _df.index = to_datetime(_df.index, errors='coerce')
14
14
  _df.index.name = 'time'
@@ -24,17 +24,21 @@ class Reader(AbstractReader):
24
24
  'RH': 'RH'
25
25
  })
26
26
 
27
- _df = _df[['B', 'G', 'R', 'BB', 'BG', 'BR', 'RH']]
27
+ _df = _df[['B', 'G', 'R', 'BB', 'BG', 'BR']]
28
28
 
29
29
  return _df.loc[~_df.index.duplicated() & _df.index.notna()]
30
30
 
31
- # QC data
32
31
  def _QC(self, _df):
33
- # remove negative value
34
- _df = _df.mask((_df <= 0) | (_df > 2000)).copy()
32
+ _index = _df.index.copy()
35
33
 
36
- # total scattering is larger than back scattering
37
- _df = _df[(_df['BB'] < _df['B']) & (_df['BG'] < _df['G']) & (_df['BR'] < _df['R'])]
34
+ _df = _df.mask((_df <= 0) | (_df > 2000))
38
35
 
39
- # QC data in 1h
40
- return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
36
+ _df = _df.loc[(_df['BB'] < _df['B']) & (_df['BG'] < _df['G']) & (_df['BR'] < _df['R'])]
37
+
38
+ _df = _df.loc[(_df['B'] > _df['G']) & (_df['G'] > _df['R'])]
39
+
40
+ # use IQR_QC
41
+ _df = self.time_aware_IQR_QC(_df)
42
+
43
+ # make sure all columns have values, otherwise set to nan
44
+ return _df.dropna(how='any').reindex(_index)
@@ -1,4 +1,4 @@
1
- from pandas import read_csv
1
+ from pandas import read_csv, to_numeric
2
2
 
3
3
  from AeroViz.rawDataReader.core import AbstractReader
4
4
 
@@ -8,7 +8,7 @@ class Reader(AbstractReader):
8
8
 
9
9
  def _raw_reader(self, file):
10
10
  with open(file, 'r', encoding='utf-8', errors='ignore') as f:
11
- _df = read_csv(f, parse_dates=True, index_col=0)
11
+ _df = read_csv(f, parse_dates=True, index_col=0).apply(to_numeric, errors='coerce')
12
12
 
13
13
  _df.columns = _df.columns.str.replace(' ', '')
14
14
 
@@ -33,10 +33,14 @@ class Reader(AbstractReader):
33
33
 
34
34
  return _df.loc[~_df.index.duplicated() & _df.index.notna()]
35
35
 
36
- # QC data
37
36
  def _QC(self, _df):
37
+ _index = _df.index.copy()
38
+
38
39
  # remove negative value
39
- _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BC6', 'BC7', 'BC8', 'BC9', 'BC10']].mask((_df < 0).copy())
40
+ _df = _df.mask((_df <= 0) | (_df > 20000))
41
+
42
+ # use IQR_QC
43
+ _df = self.time_aware_IQR_QC(_df, time_window='1h')
40
44
 
41
- # QC data in 1h
42
- return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
45
+ # make sure all columns have values, otherwise set to nan
46
+ return _df.dropna(how='any').reindex(_index)
@@ -1,4 +1,4 @@
1
- from pandas import read_csv
1
+ from pandas import read_csv, to_numeric
2
2
 
3
3
  from AeroViz.rawDataReader.core import AbstractReader
4
4
 
@@ -14,7 +14,7 @@ class Reader(AbstractReader):
14
14
  def _raw_reader(self, file):
15
15
  # 查詢小時值(測項).csv & 查詢小時值(直式).csv (有、無輸出有效值都可以)
16
16
  df = read_csv(file, encoding='big5', encoding_errors='ignore', index_col=0, parse_dates=True,
17
- on_bad_lines='skip')
17
+ on_bad_lines='skip').apply(to_numeric, errors='coerce')
18
18
 
19
19
  if len(df.groupby('測站')) > 1:
20
20
  raise ValueError(f'Multiple stations found in the file: {df['測站'].unique()}')
@@ -36,4 +36,4 @@ class Reader(AbstractReader):
36
36
  return self.reorder_dataframe_columns(df, [desired_order1])
37
37
 
38
38
  def _QC(self, _df):
39
- return _df.resample('6h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
39
+ return _df
@@ -24,5 +24,4 @@ class Reader(AbstractReader):
24
24
  return _df / 0.035
25
25
 
26
26
  def _QC(self, _df):
27
- # QC data in 1h
28
- return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
27
+ return _df
@@ -1,4 +1,4 @@
1
- from pandas import read_csv
1
+ from pandas import read_csv, to_numeric
2
2
 
3
3
  from AeroViz.rawDataReader.core import AbstractReader
4
4
 
@@ -7,7 +7,8 @@ class Reader(AbstractReader):
7
7
  nam = 'MA350'
8
8
 
9
9
  def _raw_reader(self, file):
10
- _df = read_csv(file, parse_dates=['Date / time local'], index_col='Date / time local').rename_axis("Time")
10
+ _df = read_csv(file, parse_dates=['Date / time local'], index_col='Date / time local').rename_axis(
11
+ "Time").apply(to_numeric, errors='coerce')
11
12
 
12
13
  _df = _df.rename(columns={
13
14
  'UV BCc': 'BC1',
@@ -31,8 +32,14 @@ class Reader(AbstractReader):
31
32
 
32
33
  # QC data
33
34
  def _QC(self, _df):
35
+ _index = _df.index.copy()
36
+
34
37
  # remove negative value
35
- _df = _df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5', 'BB mass', 'FF mass', 'AAE', 'BB']].mask((_df < 0).copy())
38
+ _df = _df.mask(
39
+ (_df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5']] <= 0) | (_df[['BC1', 'BC2', 'BC3', 'BC4', 'BC5']] > 20000))
40
+
41
+ # use IQR_QC
42
+ _df = self.time_aware_IQR_QC(_df, time_window='1h')
36
43
 
37
- # QC data in 1h
38
- return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
44
+ # make sure all columns have values, otherwise set to nan
45
+ return _df.dropna(how='any').reindex(_index)
@@ -60,7 +60,7 @@ class Reader(AbstractReader):
60
60
 
61
61
  def _QC(self, _df):
62
62
  # remove negative value
63
- _df = _df.mask((_df < 0).copy())
63
+ _df = _df.mask((_df < 0))
64
64
 
65
65
  # XRF QAQC
66
66
  _df = self.XRF_QAQC(_df)
@@ -69,7 +69,9 @@ class Reader(AbstractReader):
69
69
  _df = self.IGAC_QAQC(_df)
70
70
 
71
71
  # QC data in 6h
72
- return _df.resample('6h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
72
+ _df = self.time_aware_IQR_QC(_df)
73
+
74
+ return _df
73
75
 
74
76
  # base on Xact 625i Minimum Decision Limit (MDL) for XRF in ng/m3, 60 min sample time
75
77
  def XRF_QAQC(self, df, MDL_replace: Literal['nan', '0.5 * MDL'] = 'nan'):
@@ -87,7 +89,10 @@ class Reader(AbstractReader):
87
89
  'Au': 0.1, 'Hg': 0.12, 'Tl': 0.12, 'Pb': 0.13,
88
90
  'Bi': 0.13
89
91
  }
90
- # 將小於 MDL 值的數據替換為 nan or 5/6 MDL
92
+
93
+ # Br Li internal standard
94
+
95
+ # 將小於 MDL 值的數據替換為 nan or 1/2 MDL
91
96
  for element, threshold in MDL.items():
92
97
  if element in df.columns:
93
98
  rep = np.nan if MDL_replace == 'nan' else 0.5 * threshold
@@ -137,7 +142,7 @@ class Reader(AbstractReader):
137
142
  item = ['Na+', 'NH4+', 'K+', 'Mg2+', 'Ca2+', 'Cl-', 'NO2-', 'NO3-', 'SO42-']
138
143
 
139
144
  # Calculate the balance
140
- _df = df[item].apply(lambda x: to_numeric(x, errors='coerce'))
145
+ _df = df[item].apply(to_numeric, errors='coerce')
141
146
 
142
147
  # for (_key, _df_col) in _df.items():
143
148
  # _df[_key] = _df_col.mask(_df_col < MDL[_key], MDL[_key] / 2)
@@ -58,13 +58,23 @@ class Reader(AbstractReader):
58
58
  print(f'\n\t\t\t Length mismatch in {file} data. Returning an empty DataFrame.')
59
59
  return _df_out
60
60
 
61
- # QC data
62
61
  def _QC(self, _df):
62
+ MDL_sensitivity = {'B': .1, 'G': .1, 'R': .3}
63
+
64
+ _index = _df.index.copy()
65
+
63
66
  # remove negative value
64
- _df = _df.mask((_df <= 5).copy())
67
+ _df = _df.mask((_df <= 0) | (_df > 2000))
65
68
 
66
69
  # total scattering is larger than back scattering
67
- _df = _df[(_df['BB'] < _df['B']) & (_df['BG'] < _df['G']) & (_df['BR'] < _df['R'])]
70
+ _df = _df.loc[(_df['BB'] < _df['B']) & (_df['BG'] < _df['G']) & (_df['BR'] < _df['R'])]
71
+
72
+ # blue scattering is larger than green scattering, green scattering is larger than red scattering
73
+ # due to the nephelometer's Green PMT in FS is already aged, this QC may delete too many data
74
+ # _df = _df.loc[(_df['B'] > _df['G']) & (_df['G'] > _df['R'])]
75
+
76
+ # use IQR_QC
77
+ _df = self.time_aware_IQR_QC(_df)
68
78
 
69
- # QC data in 1h
70
- return _df.resample('1h').apply(self.n_sigma_QC).resample(self.meta.get("freq")).mean()
79
+ # make sure all columns have values, otherwise set to nan
80
+ return _df.dropna(how='any').reindex(_index)
@@ -1,4 +1,5 @@
1
- from pandas import to_datetime, read_csv
1
+ import numpy as np
2
+ from pandas import to_datetime, read_csv, to_numeric
2
3
 
3
4
  from AeroViz.rawDataReader.core import AbstractReader
4
5
 
@@ -8,11 +9,18 @@ class Reader(AbstractReader):
8
9
 
9
10
  def _raw_reader(self, file):
10
11
  with open(file, 'r', encoding='utf-8', errors='ignore') as f:
11
- _df = read_csv(f, skiprows=3, nrows=25)
12
+ _df = read_csv(f, skiprows=3).apply(to_numeric, errors='coerce')
12
13
 
13
14
  _df['Start Date/Time'] = _df['Start Date/Time'].str.strip()
14
15
  _df['time'] = to_datetime(_df['Start Date/Time'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')
16
+
17
+ if _df['time'].isna().any():
18
+ _df['time'] = to_datetime(_df['Start Date/Time'], format='%m/%d/%Y %H:%M:%S', errors='coerce')
19
+
15
20
  _df = _df.set_index('time')
21
+
22
+ _df = _df.loc[~_df.index.duplicated() & _df.index.notna()]
23
+
16
24
  _df.index = _df.index.round('1h')
17
25
 
18
26
  _df = _df.rename(columns={
@@ -34,6 +42,8 @@ class Reader(AbstractReader):
34
42
  'OCPk2-ug C': 'OC2_raw',
35
43
  'OCPk3-ug C': 'OC3_raw',
36
44
  'OCPk4-ug C': 'OC4_raw',
45
+ 'Pyrolized C ug': 'PC_raw',
46
+
37
47
  'ECPk1-ug C': 'EC1_raw',
38
48
  'ECPk2-ug C': 'EC2_raw',
39
49
  'ECPk3-ug C': 'EC3_raw',
@@ -41,26 +51,40 @@ class Reader(AbstractReader):
41
51
  'ECPk5-ug C': 'EC5_raw',
42
52
  })
43
53
 
54
+ _df['OC1'] = _df['OC1_raw'] / _df['Sample_Volume']
55
+ _df['OC2'] = _df['OC2_raw'] / _df['Sample_Volume']
56
+ _df['OC3'] = _df['OC3_raw'] / _df['Sample_Volume']
57
+ _df['OC4'] = _df['OC4_raw'] / _df['Sample_Volume']
58
+
59
+ _df['PC'] = _df['Thermal_OC'] - _df['OC1'] - _df['OC2'] - _df['OC3'] - _df['OC4']
60
+
61
+ # _df['EC1'] = _df['EC1_raw'] / _df['Sample_Volume']
62
+ # _df['EC2'] = _df['EC2_raw'] / _df['Sample_Volume']
63
+ # _df['EC3'] = _df['EC3_raw'] / _df['Sample_Volume']
64
+ # _df['EC4'] = _df['EC4_raw'] / _df['Sample_Volume']
65
+ # _df['EC5'] = _df['EC5_raw'] / _df['Sample_Volume']
66
+
44
67
  _df = _df[['Thermal_OC', 'Optical_OC', 'Thermal_EC', 'Optical_EC', 'TC', 'Sample_Volume',
45
- 'OC1_raw', 'OC2_raw', 'OC3_raw', 'OC4_raw', 'EC1_raw', 'EC2_raw', 'EC3_raw', 'EC4_raw',
46
- 'EC5_raw']]
68
+ 'OC1', 'OC2', 'OC3', 'OC4', 'PC']]
47
69
 
48
70
  return _df.loc[~_df.index.duplicated() & _df.index.notna()]
49
71
 
50
72
  # QC data
51
73
  def _QC(self, _df):
52
- import numpy as np
74
+ MDL = {'Thermal_OC': 0.3,
75
+ 'Optical_OC': 0.3,
76
+ 'Thermal_EC': 0.015,
77
+ 'Optical_EC': 0.015
78
+ }
79
+
80
+ _index = _df.index.copy()
53
81
 
54
- _df = _df.mask((_df <= 0) | (_df > 100)).copy()
82
+ _df = _df.mask((_df <= -5) | (_df > 100))
55
83
 
56
- thresholds = {
57
- 'Thermal_OC': 0.3,
58
- 'Optical_OC': 0.3,
59
- 'Thermal_EC': 0.015,
60
- 'Optical_EC': 0.015
61
- }
84
+ for col, threshold in MDL.items():
85
+ _df.loc[_df[col] <= threshold, col] = np.nan
62
86
 
63
- for col, thresh in thresholds.items():
64
- _df.loc[_df[col] <= thresh, col] = np.nan
87
+ # use IQR_QC
88
+ _df = self.time_aware_IQR_QC(_df, time_window='1h')
65
89
 
66
- return _df
90
+ return _df.dropna(subset=['Thermal_OC', 'Optical_OC']).reindex(_index)
@@ -1,4 +1,4 @@
1
- from pandas import to_datetime, read_csv
1
+ from pandas import to_datetime, read_csv, Timedelta, to_numeric
2
2
 
3
3
  from AeroViz.rawDataReader.core import AbstractReader
4
4
 
@@ -8,7 +8,7 @@ class Reader(AbstractReader):
8
8
 
9
9
  def _raw_reader(self, file):
10
10
  with open(file, 'r', encoding='utf-8', errors='ignore') as f:
11
- _df = read_csv(f, skiprows=3, index_col=False)
11
+ _df = read_csv(f, skiprows=3, index_col=False).apply(to_numeric, errors='coerce')
12
12
 
13
13
  _df = _df.rename(columns={'Time Stamp': 'time',
14
14
  'System status': 'status',
@@ -33,16 +33,20 @@ class Reader(AbstractReader):
33
33
 
34
34
  # QC data
35
35
  def _QC(self, _df):
36
-
37
- _df_idx = _df.index.copy()
36
+ _index = _df.index.copy()
38
37
 
39
38
  # remove negative value
40
- _df = _df.where(_df.noise < 0.01)[['PM_NV', 'PM_Total']].mask((_df <= 0).copy())
39
+ _df = _df.where(_df.noise < 0.01)[['PM_NV', 'PM_Total']].mask((_df <= 0))
41
40
 
42
41
  # QC data in 1 hr
43
- # remove data where size < 8 in 1-hr
42
+ # use time_aware_IQR_QC
43
+ _df = self.time_aware_IQR_QC(_df, time_window='1h')
44
+
45
+ # remove data where size < 50% in 1-hr
46
+ points_per_hour = Timedelta('1h') / Timedelta(self.meta['freq'])
44
47
  for _key in ['PM_Total', 'PM_NV']:
45
- _size = _df[_key].dropna().resample('1h').size().reindex(_df_idx).ffill().copy()
46
- _df[_key] = _df[_key].mask(_size < 8)
48
+ _size = _df[_key].dropna().resample('1h').size().reindex(_index).ffill()
49
+ _df[_key] = _df[_key].mask(_size < points_per_hour * 0.5)
47
50
 
48
- return _df.reindex(_df_idx)
51
+ # make sure all columns have values, otherwise set to nan
52
+ return _df.dropna(how='any').reindex(_index)
@@ -26,7 +26,7 @@ class Reader(AbstractReader):
26
26
  if valid_keys:
27
27
  return _df[valid_keys].loc[~_df.index.duplicated() & _df.index.notna()]
28
28
  else:
29
- self.logger.warning("沒有找到匹配的鍵。返回原始DataFrame並移除含NaN的行。")
29
+ self.logger.warning("沒有找到匹配的鍵。返回原始DataFrame")
30
30
  return _df.loc[~_df.index.duplicated() & _df.index.notna()]
31
31
 
32
32
  def _QC(self, _df):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: AeroViz
3
- Version: 0.1.7
3
+ Version: 0.1.8
4
4
  Summary: Aerosol science
5
5
  Home-page: https://github.com/Alex870521/AeroViz
6
6
  Author: alex
@@ -56,14 +56,19 @@ Requires-Dist: rich ~=13.7.1
56
56
  pip install AeroViz
57
57
  ```
58
58
 
59
+ For Windows users: Run `install_windows.bat`
60
+
61
+ For Linux and Mac users: Run `install_unix.bat`
62
+
59
63
  ## <div align="center">Quick Start</div>
60
64
 
61
65
  ```python
62
- import AeroViz
66
+ from datetime import datetime
67
+ from pathlib import Path
63
68
  from AeroViz import RawDataReader, DataProcess, plot
64
69
 
65
70
  # Read data from a supported instrument
66
- data = RawDataReader('NEPH', '/path/to/data', start='2024-01-01', end='2024-01-31')
71
+ data = RawDataReader('NEPH', Path('/path/to/data'), start=datetime(2024, 2, 1), end=datetime(2024, 4, 30))
67
72
 
68
73
  # Create a visualization
69
74
  plot.timeseries(data, y='scattering_coefficient')
@@ -71,13 +76,13 @@ plot.timeseries(data, y='scattering_coefficient')
71
76
 
72
77
  For more detailed usage instructions, please refer to our [User Guide]().
73
78
 
74
- ## RawDataReader
79
+ ## <div align="center"> RawDataReader
75
80
 
76
81
  RawDataReader supports a wide range of aerosol instruments, including NEPH, SMPS, AE33, and many more. It handles
77
82
  various file types and time resolutions, making data processing efficient and standardized.
78
83
 
79
84
  For a detailed list of supported instruments, file types, and data columns, please refer to
80
- our [RawDataReader Usage Guide](docs/RawDataReader_Usage_Guide.md) in the `docs` folder.
85
+ our [RawDataReader Usage Guide](docs/user_guide/RawDataReader) in the `docs` folder.
81
86
 
82
87
  ### Key Features:
83
88
 
@@ -108,7 +113,7 @@ The AeroViz project currently supports data from the following instruments:
108
113
  > **Note:** We are continuously working to support more instruments. Please check back for updates or contribute to our
109
114
  > project on GitHub.
110
115
 
111
- ## <div align="center">DataProcess Supported Method</div>
116
+ ## <div align="center">DataProcess</div>
112
117
 
113
118
  The AeroViz project currently supports the following processing methods:
114
119
 
@@ -126,9 +131,6 @@ For detailed documentation, please refer to the `docs` folder, which includes:
126
131
  | Documentation | Description |
127
132
  |--------------------------------------------|----------------------------|
128
133
  | [User Guide](docs/user_guide) | Basic usage instructions |
129
- | [Developer Guide](docs/developer_guide.md) | Developer guidelines |
130
- | [API Reference](docs/api_reference.md) | API documentation |
131
- | [FAQ](docs/faq.md) | Frequently Asked Questions |
132
134
  | [Changelog](docs/changelog.md) | List of changes |
133
135
 
134
136
  </div>