AeroViz 0.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. AeroViz/__init__.py +13 -0
  2. AeroViz/__pycache__/__init__.cpython-312.pyc +0 -0
  3. AeroViz/data/DEFAULT_DATA.csv +1417 -0
  4. AeroViz/data/DEFAULT_PNSD_DATA.csv +1417 -0
  5. AeroViz/data/hysplit_example_data.txt +101 -0
  6. AeroViz/dataProcess/Chemistry/__init__.py +149 -0
  7. AeroViz/dataProcess/Chemistry/__pycache__/__init__.cpython-312.pyc +0 -0
  8. AeroViz/dataProcess/Chemistry/_calculate.py +557 -0
  9. AeroViz/dataProcess/Chemistry/_isoropia.py +150 -0
  10. AeroViz/dataProcess/Chemistry/_mass_volume.py +487 -0
  11. AeroViz/dataProcess/Chemistry/_ocec.py +172 -0
  12. AeroViz/dataProcess/Chemistry/isrpia.cnf +21 -0
  13. AeroViz/dataProcess/Chemistry/isrpia2.exe +0 -0
  14. AeroViz/dataProcess/Optical/PyMieScatt_update.py +577 -0
  15. AeroViz/dataProcess/Optical/_IMPROVE.py +452 -0
  16. AeroViz/dataProcess/Optical/__init__.py +281 -0
  17. AeroViz/dataProcess/Optical/__pycache__/PyMieScatt_update.cpython-312.pyc +0 -0
  18. AeroViz/dataProcess/Optical/__pycache__/__init__.cpython-312.pyc +0 -0
  19. AeroViz/dataProcess/Optical/__pycache__/mie_theory.cpython-312.pyc +0 -0
  20. AeroViz/dataProcess/Optical/_derived.py +518 -0
  21. AeroViz/dataProcess/Optical/_extinction.py +123 -0
  22. AeroViz/dataProcess/Optical/_mie_sd.py +912 -0
  23. AeroViz/dataProcess/Optical/_retrieve_RI.py +243 -0
  24. AeroViz/dataProcess/Optical/coefficient.py +72 -0
  25. AeroViz/dataProcess/Optical/fRH.pkl +0 -0
  26. AeroViz/dataProcess/Optical/mie_theory.py +260 -0
  27. AeroViz/dataProcess/README.md +271 -0
  28. AeroViz/dataProcess/SizeDistr/__init__.py +245 -0
  29. AeroViz/dataProcess/SizeDistr/__pycache__/__init__.cpython-312.pyc +0 -0
  30. AeroViz/dataProcess/SizeDistr/__pycache__/_size_dist.cpython-312.pyc +0 -0
  31. AeroViz/dataProcess/SizeDistr/_size_dist.py +810 -0
  32. AeroViz/dataProcess/SizeDistr/merge/README.md +93 -0
  33. AeroViz/dataProcess/SizeDistr/merge/__init__.py +20 -0
  34. AeroViz/dataProcess/SizeDistr/merge/_merge_v0.py +251 -0
  35. AeroViz/dataProcess/SizeDistr/merge/_merge_v0_1.py +246 -0
  36. AeroViz/dataProcess/SizeDistr/merge/_merge_v1.py +255 -0
  37. AeroViz/dataProcess/SizeDistr/merge/_merge_v2.py +244 -0
  38. AeroViz/dataProcess/SizeDistr/merge/_merge_v3.py +518 -0
  39. AeroViz/dataProcess/SizeDistr/merge/_merge_v4.py +422 -0
  40. AeroViz/dataProcess/SizeDistr/prop.py +62 -0
  41. AeroViz/dataProcess/VOC/__init__.py +14 -0
  42. AeroViz/dataProcess/VOC/__pycache__/__init__.cpython-312.pyc +0 -0
  43. AeroViz/dataProcess/VOC/_potential_par.py +108 -0
  44. AeroViz/dataProcess/VOC/support_voc.json +446 -0
  45. AeroViz/dataProcess/__init__.py +66 -0
  46. AeroViz/dataProcess/__pycache__/__init__.cpython-312.pyc +0 -0
  47. AeroViz/dataProcess/core/__init__.py +272 -0
  48. AeroViz/dataProcess/core/__pycache__/__init__.cpython-312.pyc +0 -0
  49. AeroViz/mcp_server.py +352 -0
  50. AeroViz/plot/__init__.py +13 -0
  51. AeroViz/plot/__pycache__/__init__.cpython-312.pyc +0 -0
  52. AeroViz/plot/__pycache__/bar.cpython-312.pyc +0 -0
  53. AeroViz/plot/__pycache__/box.cpython-312.pyc +0 -0
  54. AeroViz/plot/__pycache__/pie.cpython-312.pyc +0 -0
  55. AeroViz/plot/__pycache__/radar.cpython-312.pyc +0 -0
  56. AeroViz/plot/__pycache__/regression.cpython-312.pyc +0 -0
  57. AeroViz/plot/__pycache__/scatter.cpython-312.pyc +0 -0
  58. AeroViz/plot/__pycache__/violin.cpython-312.pyc +0 -0
  59. AeroViz/plot/bar.py +126 -0
  60. AeroViz/plot/box.py +69 -0
  61. AeroViz/plot/distribution/__init__.py +1 -0
  62. AeroViz/plot/distribution/__pycache__/__init__.cpython-312.pyc +0 -0
  63. AeroViz/plot/distribution/__pycache__/distribution.cpython-312.pyc +0 -0
  64. AeroViz/plot/distribution/distribution.py +576 -0
  65. AeroViz/plot/meteorology/CBPF.py +295 -0
  66. AeroViz/plot/meteorology/__init__.py +3 -0
  67. AeroViz/plot/meteorology/__pycache__/CBPF.cpython-312.pyc +0 -0
  68. AeroViz/plot/meteorology/__pycache__/__init__.cpython-312.pyc +0 -0
  69. AeroViz/plot/meteorology/__pycache__/hysplit.cpython-312.pyc +0 -0
  70. AeroViz/plot/meteorology/__pycache__/wind_rose.cpython-312.pyc +0 -0
  71. AeroViz/plot/meteorology/hysplit.py +93 -0
  72. AeroViz/plot/meteorology/wind_rose.py +77 -0
  73. AeroViz/plot/optical/__init__.py +1 -0
  74. AeroViz/plot/optical/__pycache__/__init__.cpython-312.pyc +0 -0
  75. AeroViz/plot/optical/__pycache__/optical.cpython-312.pyc +0 -0
  76. AeroViz/plot/optical/optical.py +388 -0
  77. AeroViz/plot/pie.py +210 -0
  78. AeroViz/plot/radar.py +184 -0
  79. AeroViz/plot/regression.py +200 -0
  80. AeroViz/plot/scatter.py +174 -0
  81. AeroViz/plot/templates/__init__.py +6 -0
  82. AeroViz/plot/templates/__pycache__/__init__.cpython-312.pyc +0 -0
  83. AeroViz/plot/templates/__pycache__/ammonium_rich.cpython-312.pyc +0 -0
  84. AeroViz/plot/templates/__pycache__/contour.cpython-312.pyc +0 -0
  85. AeroViz/plot/templates/__pycache__/corr_matrix.cpython-312.pyc +0 -0
  86. AeroViz/plot/templates/__pycache__/diurnal_pattern.cpython-312.pyc +0 -0
  87. AeroViz/plot/templates/__pycache__/koschmieder.cpython-312.pyc +0 -0
  88. AeroViz/plot/templates/__pycache__/metal_heatmap.cpython-312.pyc +0 -0
  89. AeroViz/plot/templates/ammonium_rich.py +34 -0
  90. AeroViz/plot/templates/contour.py +47 -0
  91. AeroViz/plot/templates/corr_matrix.py +267 -0
  92. AeroViz/plot/templates/diurnal_pattern.py +61 -0
  93. AeroViz/plot/templates/koschmieder.py +95 -0
  94. AeroViz/plot/templates/metal_heatmap.py +164 -0
  95. AeroViz/plot/timeseries/__init__.py +2 -0
  96. AeroViz/plot/timeseries/__pycache__/__init__.cpython-312.pyc +0 -0
  97. AeroViz/plot/timeseries/__pycache__/template.cpython-312.pyc +0 -0
  98. AeroViz/plot/timeseries/__pycache__/timeseries.cpython-312.pyc +0 -0
  99. AeroViz/plot/timeseries/template.py +47 -0
  100. AeroViz/plot/timeseries/timeseries.py +446 -0
  101. AeroViz/plot/utils/__init__.py +4 -0
  102. AeroViz/plot/utils/__pycache__/__init__.cpython-312.pyc +0 -0
  103. AeroViz/plot/utils/__pycache__/_color.cpython-312.pyc +0 -0
  104. AeroViz/plot/utils/__pycache__/_unit.cpython-312.pyc +0 -0
  105. AeroViz/plot/utils/__pycache__/plt_utils.cpython-312.pyc +0 -0
  106. AeroViz/plot/utils/__pycache__/sklearn_utils.cpython-312.pyc +0 -0
  107. AeroViz/plot/utils/_color.py +71 -0
  108. AeroViz/plot/utils/_unit.py +55 -0
  109. AeroViz/plot/utils/fRH.json +390 -0
  110. AeroViz/plot/utils/plt_utils.py +92 -0
  111. AeroViz/plot/utils/sklearn_utils.py +49 -0
  112. AeroViz/plot/utils/units.json +89 -0
  113. AeroViz/plot/violin.py +80 -0
  114. AeroViz/rawDataReader/FLOW.md +138 -0
  115. AeroViz/rawDataReader/__init__.py +220 -0
  116. AeroViz/rawDataReader/__pycache__/__init__.cpython-312.pyc +0 -0
  117. AeroViz/rawDataReader/config/__init__.py +0 -0
  118. AeroViz/rawDataReader/config/__pycache__/__init__.cpython-312.pyc +0 -0
  119. AeroViz/rawDataReader/config/__pycache__/supported_instruments.cpython-312.pyc +0 -0
  120. AeroViz/rawDataReader/config/supported_instruments.py +135 -0
  121. AeroViz/rawDataReader/core/__init__.py +658 -0
  122. AeroViz/rawDataReader/core/__pycache__/__init__.cpython-312.pyc +0 -0
  123. AeroViz/rawDataReader/core/__pycache__/logger.cpython-312.pyc +0 -0
  124. AeroViz/rawDataReader/core/__pycache__/pre_process.cpython-312.pyc +0 -0
  125. AeroViz/rawDataReader/core/__pycache__/qc.cpython-312.pyc +0 -0
  126. AeroViz/rawDataReader/core/__pycache__/report.cpython-312.pyc +0 -0
  127. AeroViz/rawDataReader/core/logger.py +171 -0
  128. AeroViz/rawDataReader/core/pre_process.py +308 -0
  129. AeroViz/rawDataReader/core/qc.py +961 -0
  130. AeroViz/rawDataReader/core/report.py +579 -0
  131. AeroViz/rawDataReader/script/AE33.py +173 -0
  132. AeroViz/rawDataReader/script/AE43.py +151 -0
  133. AeroViz/rawDataReader/script/APS.py +339 -0
  134. AeroViz/rawDataReader/script/Aurora.py +191 -0
  135. AeroViz/rawDataReader/script/BAM1020.py +90 -0
  136. AeroViz/rawDataReader/script/BC1054.py +161 -0
  137. AeroViz/rawDataReader/script/EPA.py +79 -0
  138. AeroViz/rawDataReader/script/GRIMM.py +68 -0
  139. AeroViz/rawDataReader/script/IGAC.py +140 -0
  140. AeroViz/rawDataReader/script/MA350.py +179 -0
  141. AeroViz/rawDataReader/script/Minion.py +218 -0
  142. AeroViz/rawDataReader/script/NEPH.py +199 -0
  143. AeroViz/rawDataReader/script/OCEC.py +173 -0
  144. AeroViz/rawDataReader/script/Q-ACSM.py +12 -0
  145. AeroViz/rawDataReader/script/SMPS.py +389 -0
  146. AeroViz/rawDataReader/script/TEOM.py +181 -0
  147. AeroViz/rawDataReader/script/VOC.py +106 -0
  148. AeroViz/rawDataReader/script/Xact.py +244 -0
  149. AeroViz/rawDataReader/script/__init__.py +28 -0
  150. AeroViz/rawDataReader/script/__pycache__/AE33.cpython-312.pyc +0 -0
  151. AeroViz/rawDataReader/script/__pycache__/AE43.cpython-312.pyc +0 -0
  152. AeroViz/rawDataReader/script/__pycache__/APS.cpython-312.pyc +0 -0
  153. AeroViz/rawDataReader/script/__pycache__/Aurora.cpython-312.pyc +0 -0
  154. AeroViz/rawDataReader/script/__pycache__/BAM1020.cpython-312.pyc +0 -0
  155. AeroViz/rawDataReader/script/__pycache__/BC1054.cpython-312.pyc +0 -0
  156. AeroViz/rawDataReader/script/__pycache__/EPA.cpython-312.pyc +0 -0
  157. AeroViz/rawDataReader/script/__pycache__/GRIMM.cpython-312.pyc +0 -0
  158. AeroViz/rawDataReader/script/__pycache__/IGAC.cpython-312.pyc +0 -0
  159. AeroViz/rawDataReader/script/__pycache__/MA350.cpython-312.pyc +0 -0
  160. AeroViz/rawDataReader/script/__pycache__/Minion.cpython-312.pyc +0 -0
  161. AeroViz/rawDataReader/script/__pycache__/NEPH.cpython-312.pyc +0 -0
  162. AeroViz/rawDataReader/script/__pycache__/OCEC.cpython-312.pyc +0 -0
  163. AeroViz/rawDataReader/script/__pycache__/Q-ACSM.cpython-312.pyc +0 -0
  164. AeroViz/rawDataReader/script/__pycache__/SMPS.cpython-312.pyc +0 -0
  165. AeroViz/rawDataReader/script/__pycache__/TEOM.cpython-312.pyc +0 -0
  166. AeroViz/rawDataReader/script/__pycache__/VOC.cpython-312.pyc +0 -0
  167. AeroViz/rawDataReader/script/__pycache__/Xact.cpython-312.pyc +0 -0
  168. AeroViz/rawDataReader/script/__pycache__/__init__.cpython-312.pyc +0 -0
  169. AeroViz/tools/__init__.py +2 -0
  170. AeroViz/tools/__pycache__/__init__.cpython-312.pyc +0 -0
  171. AeroViz/tools/__pycache__/database.cpython-312.pyc +0 -0
  172. AeroViz/tools/__pycache__/dataclassifier.cpython-312.pyc +0 -0
  173. AeroViz/tools/database.py +95 -0
  174. AeroViz/tools/dataclassifier.py +117 -0
  175. AeroViz/tools/dataprinter.py +58 -0
  176. aeroviz-0.1.21.dist-info/METADATA +294 -0
  177. aeroviz-0.1.21.dist-info/RECORD +180 -0
  178. aeroviz-0.1.21.dist-info/WHEEL +5 -0
  179. aeroviz-0.1.21.dist-info/licenses/LICENSE +21 -0
  180. aeroviz-0.1.21.dist-info/top_level.txt +1 -0
@@ -0,0 +1,389 @@
1
+ import csv
2
+
3
+ import numpy as np
4
+ from pandas import to_datetime, to_numeric, read_csv, Series, concat, DataFrame
5
+
6
+ from AeroViz.rawDataReader.core import AbstractReader, QCRule, QCFlagBuilder
7
+
8
+
9
class Reader(AbstractReader):
    """SMPS (Scanning Mobility Particle Sizer) Data Reader.

    A specialized reader for SMPS data files, which measure particle size
    distributions in the range of 11.8-593.5 nm.

    See full documentation at docs/source/instruments/SMPS.md for detailed
    information on supported formats and QC procedures.
    """
    nam = 'SMPS'

    # =========================================================================
    # QC Thresholds
    # =========================================================================
    MIN_HOURLY_COUNT = 5  # Minimum measurements per hour
    MIN_TOTAL_CONC = 2000  # Minimum total concentration (#/cm³)
    MAX_TOTAL_CONC = 1e7  # Maximum total concentration (#/cm³)
    MAX_LARGE_BIN_CONC = 4000  # Maximum concentration for >400nm bins (DMA water ingress indicator)
    LARGE_BIN_THRESHOLD = 400  # Size threshold for large bin filter (nm)

    # Status Flag column name
    STATUS_COLUMN = 'Status Flag'
    STATUS_OK = 'Normal Scan'  # Normal status text

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # dict of {name: dX/dlogDp DataFrame}; populated by _process(),
        # consumed by _save_distributions()
        self._distributions = None

    def __call__(self, start, end, mean_freq='1h'):
        """
        Process SMPS data and save size distributions to separate files.

        Overrides AbstractReader.__call__ to add distribution file saving
        and filter out size bins from main output.

        Parameters
        ----------
        start : datetime
            Start time for data processing
        end : datetime
            End time for data processing
        mean_freq : str, default='1h'
            Frequency for resampling the data

        Returns
        -------
        pd.DataFrame
            Processed and resampled data (statistics only, no size bins)
        """
        # Call parent __call__ for standard processing
        result = super().__call__(start, end, mean_freq)

        # Save distributions to separate files
        self._save_distributions(mean_freq)

        # Filter out size bins from main output, keep only statistics.
        # Size-bin columns are numeric (floats); statistics columns are strings.
        stat_cols = [col for col in result.columns if not isinstance(col, (int, float))]
        result_stats = result[stat_cols]

        # Re-save filtered output to CSV (overwrites the file written by the parent)
        result_stats.to_csv(self.csv_out)

        return result_stats

    def _raw_reader(self, file):
        """Read and parse a raw SMPS data file.

        Supports tab-delimited ``.txt`` exports and comma-delimited CSV
        exports, including transposed layouts (bins as rows). Returns a
        DataFrame indexed by time with float bin-diameter columns, or
        ``None`` when the file's size range does not match expectations.
        """

        def find_header_row(file_obj, delimiter):
            # Scan until the first cell names a known header marker;
            # its row offset becomes skiprows for read_csv.
            csv_reader = csv.reader(file_obj, delimiter=delimiter)
            for skip, row in enumerate(csv_reader):
                if row and (row[0] in ['Sample #', 'Scan Number']):
                    return skip
            raise ValueError("Header row not found")

        def parse_date(df, date_format):
            # Two known layouts: separate Date/Start Time columns, or a
            # single combined DateTime Sample Start column.
            if 'Date' in df.columns and 'Start Time' in df.columns:
                return to_datetime(df['Date'] + ' ' + df['Start Time'], format=date_format, errors='coerce')
            elif 'DateTime Sample Start' in df.columns:
                return to_datetime(df['DateTime Sample Start'], format=date_format, errors='coerce')
            else:
                raise ValueError("Expected date columns not found")

        with open(file, 'r', encoding='utf-8', errors='ignore') as f:
            if file.suffix.lower() == '.txt':
                delimiter, date_formats = '\t', ['%m/%d/%y %X', '%m/%d/%Y %X']
            else:  # csv
                delimiter, date_formats = ',', ['%d/%m/%Y %X']

            skip = find_header_row(f, delimiter)
            f.seek(0)

            _df = read_csv(f, sep=delimiter, skiprows=skip, low_memory=False)
            if 'Date' not in _df.columns and 'DateTime Sample Start' not in _df.columns:
                # No date columns found: assume a transposed export
                # (samples as columns) and flip it.
                try:
                    _df = _df.T
                    _df.columns = _df.iloc[0]
                    _df = _df.iloc[1:]
                    _df = _df.reset_index(drop=True)
                except Exception as exc:
                    # Narrowed from a bare `except:` so KeyboardInterrupt /
                    # SystemExit are not swallowed; chain the original cause.
                    raise NotImplementedError('Not supported date format') from exc

            for date_format in date_formats:
                _time_index = parse_date(_df, date_format)
                if not _time_index.isna().all():
                    break
            else:
                raise ValueError("Unable to parse dates with given formats")

            # Check for comma decimal separator in bin-diameter headers
            comma_decimal_cols = [col for col in _df.columns if ',' in col.strip()]
            if comma_decimal_cols:
                self.logger.warning(f"Detected {len(comma_decimal_cols)} columns using comma as decimal separator")
                _df.columns = _df.columns.str.replace(',', '.')

            # Filter numeric columns (bin diameters look like "11.8", "12.2", ...)
            numeric_cols = [col for col in _df.columns if col.strip().replace('.', '').isdigit()]
            numeric_cols.sort(key=lambda x: float(x.strip()))

            _df.index = _time_index
            _df.index.name = 'time'

            _df_smps = _df[numeric_cols]
            _df_smps = _df_smps.loc[_df_smps.index.dropna().copy()]

            # Rename columns to float values (strip spaces)
            _df_smps.columns = [float(col.strip()) for col in _df_smps.columns]

            size_range = self.kwargs.get('size_range') or (11.8, 593.5)

            # Reject files whose bin range does not match the configured one
            if _df_smps.columns[0] != size_range[0] or _df_smps.columns[-1] != size_range[1]:
                self.logger.warning(f'SMPS file: {file.name} size range mismatch. '
                                    f'Expected {size_range}, got ({_df_smps.columns[0]}, {_df_smps.columns[-1]})')
                return None

            _df_smps = _df_smps.apply(to_numeric, errors='coerce')

            # Include Status Flag column in _df (will be processed by core together)
            if self.STATUS_COLUMN in _df.columns:
                _df_smps[self.STATUS_COLUMN] = _df.loc[_df_smps.index, self.STATUS_COLUMN].astype(str).str.strip()

            return _df_smps

    def _QC(self, _df):
        """
        Perform quality control on SMPS particle size distribution data.

        QC Rules Applied
        ----------------
        1. Status Error : Non-empty status flag indicates instrument error
        2. Insufficient : Less than 5 measurements per hour
        3. Invalid Number Conc : Total number concentration outside valid range (2000-1e7 #/cm³)
        4. DMA Water Ingress : Bins >400nm with concentration > 4000 dN/dlogDp (indicates water in DMA)
        """
        _df = _df.copy()
        _index = _df.index.copy()

        # Apply size range filter
        size_range = self.kwargs.get('size_range') or (11.8, 593.5)
        numeric_cols = [col for col in _df.columns if isinstance(col, (int, float))]
        df_numeric = _df[numeric_cols]
        size_mask = (df_numeric.columns.astype(float) >= size_range[0]) & (df_numeric.columns.astype(float) <= size_range[1])
        df_numeric = df_numeric.loc[:, size_mask]

        # Calculate total concentration for QC checks.
        # NOTE(review): this uses the natural log (and drops the last bin
        # before diff), while _process() uses log10 for dlogDp — confirm
        # intended, since it scales total_conc relative to the thresholds.
        dlogDp = np.diff(np.log(df_numeric.columns[:-1].to_numpy(float))).mean()
        total_conc = df_numeric.sum(axis=1, min_count=1) * dlogDp

        # Calculate hourly data counts, broadcast back onto the 6-min scan grid
        hourly_counts = (total_conc
                         .dropna()
                         .resample('h')
                         .size()
                         .resample('6min')
                         .ffill()
                         .reindex(df_numeric.index, method='ffill', tolerance='6min'))

        # Get large bins (>400nm)
        large_bins = df_numeric.columns[df_numeric.columns.astype(float) >= self.LARGE_BIN_THRESHOLD]

        # Build QC rules declaratively
        qc = QCFlagBuilder()

        qc.add_rules([
            QCRule(
                name='Status Error',
                condition=lambda df: self.QC_control().filter_error_status(
                    _df, status_column=self.STATUS_COLUMN, status_type='text', ok_value=self.STATUS_OK
                ),
                description=f'Status flag is not "{self.STATUS_OK}"'
            ),
            QCRule(
                name='Insufficient',
                condition=lambda df: Series(hourly_counts < self.MIN_HOURLY_COUNT, index=df.index).fillna(True),
                description=f'Less than {self.MIN_HOURLY_COUNT} measurements per hour'
            ),
            QCRule(
                name='Invalid Number Conc',
                condition=lambda df, tc=total_conc: Series(
                    (tc < self.MIN_TOTAL_CONC) | (tc > self.MAX_TOTAL_CONC),
                    index=df.index
                ).fillna(True),
                description=f'Total number concentration outside valid range ({self.MIN_TOTAL_CONC}-{self.MAX_TOTAL_CONC:.0e} #/cm³)'
            ),
            QCRule(
                name='DMA Water Ingress',
                condition=lambda df: (df[large_bins] > self.MAX_LARGE_BIN_CONC).any(axis=1) if len(large_bins) > 0 else Series(False, index=df.index),
                description=f'Bins >{self.LARGE_BIN_THRESHOLD}nm with concentration > {self.MAX_LARGE_BIN_CONC} dN/dlogDp (water in DMA)'
            ),
        ])

        # Apply all QC rules
        df_qc = qc.apply(_df)

        # Store QC summary for combined output in _process()
        self._qc_summary = qc.get_summary(df_qc)

        return df_qc.reindex(_index)

    def _process(self, _df):
        """
        Calculate size distribution statistics from QC'd SMPS data.

        Processing Steps
        ----------------
        1. Calculate dlogDp from bin diameters
        2. Calculate number, surface, volume distributions (all in dX/dlogDp)
        3. Calculate total, GMD, GSD, mode for each weighting
        4. Calculate mode contributions (ultra, accum, coarse fractions)
        5. Store distributions for separate file output

        Parameters
        ----------
        _df : pd.DataFrame
            Quality-controlled DataFrame with size bin columns and QC_Flag

        Returns
        -------
        pd.DataFrame
            Original size bins (dN/dlogDp) + calculated statistics + QC_Flag
        """
        _index = _df.index.copy()

        # Separate QC_Flag from size bins
        qc_flag = _df['QC_Flag'].copy() if 'QC_Flag' in _df.columns else Series('Valid', index=_df.index)

        # Get numeric columns (size bins)
        bin_cols = [col for col in _df.columns if isinstance(col, (int, float))]
        df_bins = _df[bin_cols].copy()  # This is dN/dlogDp
        dp = np.array(bin_cols, dtype=float)

        # Input is already dN/dlogDp, calculate dS/dlogDp and dV/dlogDp
        dN_dlogDp = df_bins.copy()
        dS_dlogDp = dN_dlogDp * np.pi * dp ** 2  # Surface area distribution (nm²·cm⁻³)
        dV_dlogDp = dN_dlogDp * np.pi * (dp ** 3) / 6  # Volume distribution (nm³·cm⁻³)

        # Store distributions for separate file output (with QC_Flag)
        self._distributions = {
            'dNdlogDp': concat([dN_dlogDp, qc_flag], axis=1),
            'dSdlogDp': concat([dS_dlogDp, qc_flag], axis=1),
            'dVdlogDp': concat([dV_dlogDp, qc_flag], axis=1),
        }

        # For statistics calculation, convert to absolute values (dX = dX/dlogDp * dlogDp)
        dlogDp = np.diff(np.log10(dp))
        dlogDp = np.append(dlogDp, dlogDp[-1])  # repeat last width for final bin
        dN = dN_dlogDp * dlogDp
        dS = dS_dlogDp * dlogDp
        dV = dV_dlogDp * dlogDp

        # Calculate statistics for all particles
        stats = DataFrame(index=_df.index)

        # Calculate for each weighting type
        for weight_name, dist in [('num', dN), ('surf', dS), ('vol', dV)]:
            total, gmd, gsd = self._geometric_prop(dp, dist)
            stats[f'total_{weight_name}'] = total
            stats[f'GMD_{weight_name}'] = gmd
            stats[f'GSD_{weight_name}'] = gsd

            # Calculate mode (diameter with maximum concentration);
            # only on rows that have at least one non-NaN bin
            mask = dist.notna().any(axis=1)
            stats.loc[mask, f'mode_{weight_name}'] = dist.loc[mask].idxmax(axis=1)

            # Calculate mode contributions
            if weight_name == 'num':
                total_sum = dist.sum(axis=1)
                total_sum = total_sum.where(total_sum > 0)  # avoid divide-by-zero

                # Ultrafine: < 100 nm
                ultra_bins = [c for c in dist.columns if c < 100]
                if ultra_bins:
                    stats[f'ultra_{weight_name}'] = dist[ultra_bins].sum(axis=1) / total_sum

                # Accumulation: 100-1000 nm
                accum_bins = [c for c in dist.columns if 100 <= c < 1000]
                if accum_bins:
                    stats[f'accum_{weight_name}'] = dist[accum_bins].sum(axis=1) / total_sum

                # Coarse: >= 1000 nm (if available in SMPS range)
                coarse_bins = [c for c in dist.columns if c >= 1000]
                if coarse_bins:
                    stats[f'coarse_{weight_name}'] = dist[coarse_bins].sum(axis=1) / total_sum

        # Combine: size bins + statistics + QC_Flag
        # (bins are kept for rate calculation, filtered out when saving to CSV)
        df_out = concat([df_bins, stats, qc_flag], axis=1)

        # Log QC summary
        if hasattr(self, '_qc_summary') and self._qc_summary is not None:
            self.logger.info(f"{self.nam} QC Summary:")
            for _, row in self._qc_summary.iterrows():
                self.logger.info(f"  {row['Rule']}: {row['Count']} ({row['Percentage']})")

        return df_out.reindex(_index)

    def _save_distributions(self, mean_freq: str = '1h') -> None:
        """
        Save size distributions to separate CSV files.

        Output Files
        ------------
        - output_smps_dNdlogDp.csv : Number distribution (dN/dlogDp)
        - output_smps_dSdlogDp.csv : Surface distribution (dS/dlogDp)
        - output_smps_dVdlogDp.csv : Volume distribution (dV/dlogDp)

        Parameters
        ----------
        mean_freq : str, default='1h'
            Frequency for resampling the data
        """
        if not hasattr(self, '_distributions') or self._distributions is None:
            self.logger.warning("No distributions to save. Run _process() first.")
            return

        output_folder = self.csv_out.parent
        self.logger.info("")

        for dist_name, dist_df in self._distributions.items():
            # Work on a copy: the original code NaN'd invalid rows in the
            # stored frames in place, so a second call would see already
            # masked data. The copy keeps self._distributions intact.
            dist_df = dist_df.copy()

            # Process QC_Flag: set invalid rows to NaN
            if 'QC_Flag' in dist_df.columns:
                invalid_mask = dist_df['QC_Flag'] != 'Valid'
                numeric_cols = [c for c in dist_df.columns if c != 'QC_Flag']
                dist_df.loc[invalid_mask, numeric_cols] = np.nan
                dist_df = dist_df.drop(columns=['QC_Flag'])

            # Resample and save
            dist_resampled = dist_df.resample(mean_freq).mean().round(4)
            output_path = output_folder / f'output_{self.nam.lower()}_{dist_name}.csv'
            dist_resampled.to_csv(output_path)
            self.logger.info(f"Saved: {output_path.name}")

    @staticmethod
    def _geometric_prop(dp, dist):
        """
        Calculate geometric mean diameter and geometric standard deviation.

        Parameters
        ----------
        dp : np.ndarray
            Particle diameters (nm)
        dist : pd.DataFrame
            Distribution data (dN, dS, or dV)

        Returns
        -------
        tuple
            (total, GMD, GSD) as pandas Series
        """
        # Total concentration (NaN when every bin is NaN)
        total = dist.sum(axis=1, min_count=1)
        total_valid = total.where(total > 0)

        # GMD calculation (in log space)
        log_dp = np.log(dp)
        gmd_log = (dist * log_dp).sum(axis=1) / total_valid

        # GSD calculation: weighted std of log-diameters around the GMD
        dp_mesh, gmd_mesh = np.meshgrid(log_dp, gmd_log)
        gsd_log = np.sqrt(((dp_mesh - gmd_mesh) ** 2 * dist.values).sum(axis=1) / total_valid)

        return total, np.exp(gmd_log), np.exp(gsd_log)
@@ -0,0 +1,181 @@
1
+ from pandas import to_datetime, read_csv, to_numeric, Series, concat
2
+
3
+ from AeroViz.rawDataReader.core import AbstractReader, QCRule, QCFlagBuilder
4
+
5
+
6
class Reader(AbstractReader):
    """TEOM Output Data Formats Reader

    A specialized reader for TEOM (Tapered Element Oscillating Microbalance)
    particulate matter data files with support for multiple file formats and
    comprehensive quality control.

    See full documentation at docs/source/instruments/TEOM.md for detailed information
    on supported formats and QC procedures.
    """
    nam = 'TEOM'

    # =========================================================================
    # Column Definitions
    # =========================================================================
    PM_COLUMNS = ['PM_NV', 'PM_Total']
    OUTPUT_COLUMNS = ['PM_NV', 'PM_Total', 'Volatile_Fraction']

    # =========================================================================
    # QC Thresholds
    # =========================================================================
    MAX_NOISE = 0.01  # Maximum acceptable noise level

    # Status Flag
    STATUS_COLUMN = 'status'
    STATUS_OK = 0  # Status code 0 means normal operation

    # NOTE: the trivial `__init__(self, *args, **kwargs)` that only delegated
    # to super() was removed — the inherited constructor is identical.

    def _raw_reader(self, file):
        """
        Read and parse raw TEOM data files in various formats.

        Handles multiple TEOM data formats and standardizes them to a consistent
        structure with uniform column names and datetime index.

        Parameters
        ----------
        file : Path or str
            Path to the TEOM data file.

        Returns
        -------
        pandas.DataFrame
            Processed raw TEOM data with datetime index and standardized columns.

        Raises
        ------
        NotImplementedError
            If the file format is not recognized as a supported TEOM data format.
        """
        _df = read_csv(file, skiprows=3, index_col=False)

        # Chinese month name conversion dictionary
        _time_replace = {'十一月': '11', '十二月': '12', '一月': '01', '二月': '02', '三月': '03', '四月': '04',
                         '五月': '05', '六月': '06', '七月': '07', '八月': '08', '九月': '09', '十月': '10'}

        # Try both naming conventions (rename ignores columns that don't exist)
        _df = _df.rename(columns={
            # Remote download format
            'Time Stamp': 'time',
            'System status': 'status',
            'PM-2.5 base MC': 'PM_NV',
            'PM-2.5 MC': 'PM_Total',
            'PM-2.5 TEOM noise': 'noise',
            # USB/auto export format
            'time_stamp': 'time',
            'tmoStatusCondition_0': 'status',
            'tmoTEOMABaseMC_0': 'PM_NV',
            'tmoTEOMAMC_0': 'PM_Total',
            'tmoTEOMANoise_0': 'noise'
        })

        # Handle different time formats
        if 'time' in _df.columns:  # Remote download or auto export with time column
            _tm_idx = _df.time
            # Convert Chinese month names if present
            for _ori, _rpl in _time_replace.items():
                _tm_idx = _tm_idx.str.replace(_ori, _rpl)

            _df = _df.set_index(to_datetime(_tm_idx, errors='coerce', format='%d - %m - %Y %X'))

        elif 'Date' in _df.columns and 'Time' in _df.columns:  # USB download format
            _df['time'] = to_datetime(_df['Date'] + ' ' + _df['Time'],
                                      errors='coerce', format='%Y-%m-%d %H:%M:%S')
            _df.drop(columns=['Date', 'Time'], inplace=True)
            _df.set_index('time', inplace=True)

        else:
            raise NotImplementedError("Unsupported TEOM data format")

        _df = _df[['PM_NV', 'PM_Total', 'noise', 'status']].apply(to_numeric, errors='coerce')

        # Remove duplicates and NaN indices
        _df = _df.loc[~_df.index.duplicated() & _df.index.notna()]

        return _df

    def _QC(self, _df):
        """
        Perform quality control on TEOM particulate matter data.

        QC Rules Applied
        ----------------
        1. Status Error : Non-zero status code indicates instrument error
        2. High Noise : noise >= 0.01
        3. Non-positive : PM_NV <= 0 OR PM_Total <= 0
        4. NV > Total : PM_NV > PM_Total (physically impossible)
        5. Invalid Vol Frac : Volatile_Fraction outside valid range (0-1)
        6. Spike : Sudden value change (vectorized spike detection)
        7. Insufficient : Less than 50% hourly data completeness
        """
        _index = _df.index.copy()

        # Work on a copy so the caller's frame is not mutated when the
        # derived column is added (the original wrote into _df in place).
        _df = _df.copy()

        # Pre-process: calculate Volatile_Fraction
        # (.round(4) replaces the unidiomatic .__round__(4) dunder call)
        _df['Volatile_Fraction'] = ((_df['PM_Total'] - _df['PM_NV']) / _df['PM_Total']).round(4)
        df_qc = _df.copy()

        # Build QC rules declaratively
        qc = QCFlagBuilder()

        qc.add_rules([
            QCRule(
                name='Status Error',
                condition=lambda df: self.QC_control().filter_error_status(
                    _df, status_column=self.STATUS_COLUMN, status_type='numeric', ok_value=self.STATUS_OK
                ),
                description=f'Status code is not {self.STATUS_OK} (non-zero indicates error)'
            ),
            QCRule(
                name='High Noise',
                condition=lambda df: df['noise'] >= self.MAX_NOISE,
                description=f'Noise level >= {self.MAX_NOISE}'
            ),
            QCRule(
                name='Non-positive',
                condition=lambda df: (df[self.PM_COLUMNS] <= 0).any(axis=1),
                description='PM_NV or PM_Total <= 0 (non-positive value)'
            ),
            QCRule(
                name='NV > Total',
                condition=lambda df: df['PM_NV'] > df['PM_Total'],
                description='PM_NV exceeds PM_Total (physically impossible)'
            ),
            QCRule(
                name='Invalid Vol Frac',
                condition=lambda df: (df['Volatile_Fraction'] < 0) | (df['Volatile_Fraction'] > 1),
                description='Volatile_Fraction outside 0-1 range'
            ),
            QCRule(
                name='Spike',
                condition=lambda df: self.QC_control().spike_detection(
                    df[self.PM_COLUMNS], max_change_rate=3.0
                ),
                description='Sudden unreasonable value change detected'
            ),
            QCRule(
                name='Insufficient',
                condition=lambda df: self.QC_control().hourly_completeness_QC(
                    df[self.PM_COLUMNS], freq=self.meta['freq']
                ),
                description='Less than 50% hourly data completeness'
            ),
        ])

        # Apply all QC rules and get flagged DataFrame
        df_qc = qc.apply(df_qc)

        # Log QC summary
        summary = qc.get_summary(df_qc)
        self.logger.info(f"{self.nam} QC Summary:")
        for _, row in summary.iterrows():
            self.logger.info(f"  {row['Rule']}: {row['Count']} ({row['Percentage']})")

        return df_qc[self.OUTPUT_COLUMNS + ['QC_Flag']].reindex(_index)
@@ -0,0 +1,106 @@
1
+ from pandas import read_csv
2
+
3
+ from AeroViz.rawDataReader.core import AbstractReader
4
+
5
+
6
class Reader(AbstractReader):
    """Reader for Volatile Organic Compounds (VOC) concentration files.

    Parses CSV-formatted VOC data with a datetime first column, treating
    '-' and 'N.D.' (Not Detected) as missing values. Column names are
    stripped of surrounding spaces, and only species listed in
    ``self.meta["key"]`` are retained; any other columns trigger a warning
    pointing the user at ``support_voc.md``.

    Quality control is currently a no-op: ``_QC`` returns its input
    unchanged (see its docstring for candidate future checks).

    Returns
    -------
    DataFrame
        VOC data indexed by time, restricted to supported species when any
        are present; otherwise the full parsed frame is returned with a
        warning.
    """
    nam = 'VOC'

    def _raw_reader(self, file):
        """Parse one raw VOC CSV file into a time-indexed DataFrame.

        Parameters
        ----------
        file : Path
            Path to the VOC data file (opened with utf-8-sig, ignoring
            decode errors).

        Returns
        -------
        pandas.DataFrame
            Rows with duplicated or missing timestamps dropped. If any
            column matches a species in ``self.meta["key"]``, only those
            columns are kept; otherwise the whole frame is returned.
        """
        with file.open('r', encoding='utf-8-sig', errors='ignore') as f:
            raw = read_csv(f, parse_dates=True, index_col=0, na_values=('-', 'N.D.'))

        # Normalize header whitespace and give the index a canonical name.
        raw.columns = raw.keys().str.strip(' ')
        raw.index.name = 'time'

        supported = set(self.meta["key"])

        # Partition columns into supported vs. unrecognized species in one pass.
        matched, unmatched = [], []
        for col in raw.keys():
            (matched if col in supported else unmatched).append(col)

        if unmatched:
            self.logger.warning(f'{unmatched} are not supported keys.')
            print(f'\n\t{unmatched} are not supported keys.'
                  f'\n\tPlease check the\033[91m support_voc.md\033[0m file to use the correct name.')

        # Keep the first occurrence of each timestamp and drop NaT indices.
        keep = ~raw.index.duplicated() & raw.index.notna()

        if matched:
            return raw[matched].loc[keep]

        self.logger.warning("沒有找到匹配的鍵。返回原始DataFrame。")
        return raw.loc[keep]

    def _QC(self, _df):
        """Quality-control hook for VOC data — currently a pass-through.

        Parameters
        ----------
        _df : pandas.DataFrame
            Raw VOC data with datetime index and concentration columns.

        Returns
        -------
        pandas.DataFrame
            The input, unchanged.

        Notes
        -----
        No filters are applied yet. Future implementations could include:
        1. Minimum detection limit filtering
        2. Value range checks for each VOC species
        3. Time-based outlier detection
        4. Correlation checks between related VOC species
        """
        return _df