aeroviz-0.1.21-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- AeroViz/__init__.py +13 -0
- AeroViz/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/data/DEFAULT_DATA.csv +1417 -0
- AeroViz/data/DEFAULT_PNSD_DATA.csv +1417 -0
- AeroViz/data/hysplit_example_data.txt +101 -0
- AeroViz/dataProcess/Chemistry/__init__.py +149 -0
- AeroViz/dataProcess/Chemistry/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/dataProcess/Chemistry/_calculate.py +557 -0
- AeroViz/dataProcess/Chemistry/_isoropia.py +150 -0
- AeroViz/dataProcess/Chemistry/_mass_volume.py +487 -0
- AeroViz/dataProcess/Chemistry/_ocec.py +172 -0
- AeroViz/dataProcess/Chemistry/isrpia.cnf +21 -0
- AeroViz/dataProcess/Chemistry/isrpia2.exe +0 -0
- AeroViz/dataProcess/Optical/PyMieScatt_update.py +577 -0
- AeroViz/dataProcess/Optical/_IMPROVE.py +452 -0
- AeroViz/dataProcess/Optical/__init__.py +281 -0
- AeroViz/dataProcess/Optical/__pycache__/PyMieScatt_update.cpython-312.pyc +0 -0
- AeroViz/dataProcess/Optical/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/dataProcess/Optical/__pycache__/mie_theory.cpython-312.pyc +0 -0
- AeroViz/dataProcess/Optical/_derived.py +518 -0
- AeroViz/dataProcess/Optical/_extinction.py +123 -0
- AeroViz/dataProcess/Optical/_mie_sd.py +912 -0
- AeroViz/dataProcess/Optical/_retrieve_RI.py +243 -0
- AeroViz/dataProcess/Optical/coefficient.py +72 -0
- AeroViz/dataProcess/Optical/fRH.pkl +0 -0
- AeroViz/dataProcess/Optical/mie_theory.py +260 -0
- AeroViz/dataProcess/README.md +271 -0
- AeroViz/dataProcess/SizeDistr/__init__.py +245 -0
- AeroViz/dataProcess/SizeDistr/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/dataProcess/SizeDistr/__pycache__/_size_dist.cpython-312.pyc +0 -0
- AeroViz/dataProcess/SizeDistr/_size_dist.py +810 -0
- AeroViz/dataProcess/SizeDistr/merge/README.md +93 -0
- AeroViz/dataProcess/SizeDistr/merge/__init__.py +20 -0
- AeroViz/dataProcess/SizeDistr/merge/_merge_v0.py +251 -0
- AeroViz/dataProcess/SizeDistr/merge/_merge_v0_1.py +246 -0
- AeroViz/dataProcess/SizeDistr/merge/_merge_v1.py +255 -0
- AeroViz/dataProcess/SizeDistr/merge/_merge_v2.py +244 -0
- AeroViz/dataProcess/SizeDistr/merge/_merge_v3.py +518 -0
- AeroViz/dataProcess/SizeDistr/merge/_merge_v4.py +422 -0
- AeroViz/dataProcess/SizeDistr/prop.py +62 -0
- AeroViz/dataProcess/VOC/__init__.py +14 -0
- AeroViz/dataProcess/VOC/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/dataProcess/VOC/_potential_par.py +108 -0
- AeroViz/dataProcess/VOC/support_voc.json +446 -0
- AeroViz/dataProcess/__init__.py +66 -0
- AeroViz/dataProcess/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/dataProcess/core/__init__.py +272 -0
- AeroViz/dataProcess/core/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/mcp_server.py +352 -0
- AeroViz/plot/__init__.py +13 -0
- AeroViz/plot/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/bar.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/box.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/pie.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/radar.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/regression.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/scatter.cpython-312.pyc +0 -0
- AeroViz/plot/__pycache__/violin.cpython-312.pyc +0 -0
- AeroViz/plot/bar.py +126 -0
- AeroViz/plot/box.py +69 -0
- AeroViz/plot/distribution/__init__.py +1 -0
- AeroViz/plot/distribution/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/distribution/__pycache__/distribution.cpython-312.pyc +0 -0
- AeroViz/plot/distribution/distribution.py +576 -0
- AeroViz/plot/meteorology/CBPF.py +295 -0
- AeroViz/plot/meteorology/__init__.py +3 -0
- AeroViz/plot/meteorology/__pycache__/CBPF.cpython-312.pyc +0 -0
- AeroViz/plot/meteorology/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/meteorology/__pycache__/hysplit.cpython-312.pyc +0 -0
- AeroViz/plot/meteorology/__pycache__/wind_rose.cpython-312.pyc +0 -0
- AeroViz/plot/meteorology/hysplit.py +93 -0
- AeroViz/plot/meteorology/wind_rose.py +77 -0
- AeroViz/plot/optical/__init__.py +1 -0
- AeroViz/plot/optical/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/optical/__pycache__/optical.cpython-312.pyc +0 -0
- AeroViz/plot/optical/optical.py +388 -0
- AeroViz/plot/pie.py +210 -0
- AeroViz/plot/radar.py +184 -0
- AeroViz/plot/regression.py +200 -0
- AeroViz/plot/scatter.py +174 -0
- AeroViz/plot/templates/__init__.py +6 -0
- AeroViz/plot/templates/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/templates/__pycache__/ammonium_rich.cpython-312.pyc +0 -0
- AeroViz/plot/templates/__pycache__/contour.cpython-312.pyc +0 -0
- AeroViz/plot/templates/__pycache__/corr_matrix.cpython-312.pyc +0 -0
- AeroViz/plot/templates/__pycache__/diurnal_pattern.cpython-312.pyc +0 -0
- AeroViz/plot/templates/__pycache__/koschmieder.cpython-312.pyc +0 -0
- AeroViz/plot/templates/__pycache__/metal_heatmap.cpython-312.pyc +0 -0
- AeroViz/plot/templates/ammonium_rich.py +34 -0
- AeroViz/plot/templates/contour.py +47 -0
- AeroViz/plot/templates/corr_matrix.py +267 -0
- AeroViz/plot/templates/diurnal_pattern.py +61 -0
- AeroViz/plot/templates/koschmieder.py +95 -0
- AeroViz/plot/templates/metal_heatmap.py +164 -0
- AeroViz/plot/timeseries/__init__.py +2 -0
- AeroViz/plot/timeseries/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/timeseries/__pycache__/template.cpython-312.pyc +0 -0
- AeroViz/plot/timeseries/__pycache__/timeseries.cpython-312.pyc +0 -0
- AeroViz/plot/timeseries/template.py +47 -0
- AeroViz/plot/timeseries/timeseries.py +446 -0
- AeroViz/plot/utils/__init__.py +4 -0
- AeroViz/plot/utils/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/plot/utils/__pycache__/_color.cpython-312.pyc +0 -0
- AeroViz/plot/utils/__pycache__/_unit.cpython-312.pyc +0 -0
- AeroViz/plot/utils/__pycache__/plt_utils.cpython-312.pyc +0 -0
- AeroViz/plot/utils/__pycache__/sklearn_utils.cpython-312.pyc +0 -0
- AeroViz/plot/utils/_color.py +71 -0
- AeroViz/plot/utils/_unit.py +55 -0
- AeroViz/plot/utils/fRH.json +390 -0
- AeroViz/plot/utils/plt_utils.py +92 -0
- AeroViz/plot/utils/sklearn_utils.py +49 -0
- AeroViz/plot/utils/units.json +89 -0
- AeroViz/plot/violin.py +80 -0
- AeroViz/rawDataReader/FLOW.md +138 -0
- AeroViz/rawDataReader/__init__.py +220 -0
- AeroViz/rawDataReader/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/config/__init__.py +0 -0
- AeroViz/rawDataReader/config/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/config/__pycache__/supported_instruments.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/config/supported_instruments.py +135 -0
- AeroViz/rawDataReader/core/__init__.py +658 -0
- AeroViz/rawDataReader/core/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/core/__pycache__/logger.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/core/__pycache__/pre_process.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/core/__pycache__/qc.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/core/__pycache__/report.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/core/logger.py +171 -0
- AeroViz/rawDataReader/core/pre_process.py +308 -0
- AeroViz/rawDataReader/core/qc.py +961 -0
- AeroViz/rawDataReader/core/report.py +579 -0
- AeroViz/rawDataReader/script/AE33.py +173 -0
- AeroViz/rawDataReader/script/AE43.py +151 -0
- AeroViz/rawDataReader/script/APS.py +339 -0
- AeroViz/rawDataReader/script/Aurora.py +191 -0
- AeroViz/rawDataReader/script/BAM1020.py +90 -0
- AeroViz/rawDataReader/script/BC1054.py +161 -0
- AeroViz/rawDataReader/script/EPA.py +79 -0
- AeroViz/rawDataReader/script/GRIMM.py +68 -0
- AeroViz/rawDataReader/script/IGAC.py +140 -0
- AeroViz/rawDataReader/script/MA350.py +179 -0
- AeroViz/rawDataReader/script/Minion.py +218 -0
- AeroViz/rawDataReader/script/NEPH.py +199 -0
- AeroViz/rawDataReader/script/OCEC.py +173 -0
- AeroViz/rawDataReader/script/Q-ACSM.py +12 -0
- AeroViz/rawDataReader/script/SMPS.py +389 -0
- AeroViz/rawDataReader/script/TEOM.py +181 -0
- AeroViz/rawDataReader/script/VOC.py +106 -0
- AeroViz/rawDataReader/script/Xact.py +244 -0
- AeroViz/rawDataReader/script/__init__.py +28 -0
- AeroViz/rawDataReader/script/__pycache__/AE33.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/AE43.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/APS.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/Aurora.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/BAM1020.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/BC1054.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/EPA.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/GRIMM.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/IGAC.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/MA350.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/Minion.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/NEPH.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/OCEC.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/Q-ACSM.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/SMPS.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/TEOM.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/VOC.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/Xact.cpython-312.pyc +0 -0
- AeroViz/rawDataReader/script/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/tools/__init__.py +2 -0
- AeroViz/tools/__pycache__/__init__.cpython-312.pyc +0 -0
- AeroViz/tools/__pycache__/database.cpython-312.pyc +0 -0
- AeroViz/tools/__pycache__/dataclassifier.cpython-312.pyc +0 -0
- AeroViz/tools/database.py +95 -0
- AeroViz/tools/dataclassifier.py +117 -0
- AeroViz/tools/dataprinter.py +58 -0
- aeroviz-0.1.21.dist-info/METADATA +294 -0
- aeroviz-0.1.21.dist-info/RECORD +180 -0
- aeroviz-0.1.21.dist-info/WHEEL +5 -0
- aeroviz-0.1.21.dist-info/licenses/LICENSE +21 -0
- aeroviz-0.1.21.dist-info/top_level.txt +1 -0
AeroViz/rawDataReader/script/SMPS.py
@@ -0,0 +1,389 @@
```python
import csv

import numpy as np
from pandas import to_datetime, to_numeric, read_csv, Series, concat, DataFrame

from AeroViz.rawDataReader.core import AbstractReader, QCRule, QCFlagBuilder


class Reader(AbstractReader):
    """SMPS (Scanning Mobility Particle Sizer) Data Reader

    A specialized reader for SMPS data files, which measure particle size
    distributions in the range of 11.8-593.5 nm.

    See full documentation at docs/source/instruments/SMPS.md for detailed
    information on supported formats and QC procedures.
    """
    nam = 'SMPS'

    # =========================================================================
    # QC Thresholds
    # =========================================================================
    MIN_HOURLY_COUNT = 5       # Minimum measurements per hour
    MIN_TOTAL_CONC = 2000      # Minimum total concentration (#/cm³)
    MAX_TOTAL_CONC = 1e7       # Maximum total concentration (#/cm³)
    MAX_LARGE_BIN_CONC = 4000  # Maximum concentration for >=400 nm bins (DMA water ingress indicator)
    LARGE_BIN_THRESHOLD = 400  # Size threshold for large bin filter (nm)

    # Status Flag column name
    STATUS_COLUMN = 'Status Flag'
    STATUS_OK = 'Normal Scan'  # Normal status text

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._distributions = None  # Store distributions for separate file output

    def __call__(self, start, end, mean_freq='1h'):
        """
        Process SMPS data and save size distributions to separate files.

        Overrides AbstractReader.__call__ to add distribution file saving
        and filter out size bins from the main output.

        Parameters
        ----------
        start : datetime
            Start time for data processing
        end : datetime
            End time for data processing
        mean_freq : str, default='1h'
            Frequency for resampling the data

        Returns
        -------
        pd.DataFrame
            Processed and resampled data (statistics only, no size bins)
        """
        # Call parent __call__ for standard processing
        result = super().__call__(start, end, mean_freq)

        # Save distributions to separate files
        self._save_distributions(mean_freq)

        # Filter out size bins from main output, keep only statistics
        stat_cols = [col for col in result.columns if not isinstance(col, (int, float))]
        result_stats = result[stat_cols]

        # Re-save filtered output to CSV
        result_stats.to_csv(self.csv_out)

        return result_stats

    def _raw_reader(self, file):
        """Read and parse raw SMPS data files."""

        def find_header_row(file_obj, delimiter):
            csv_reader = csv.reader(file_obj, delimiter=delimiter)
            for skip, row in enumerate(csv_reader):
                if row and (row[0] in ['Sample #', 'Scan Number']):
                    return skip
            raise ValueError("Header row not found")

        def parse_date(df, date_format):
            if 'Date' in df.columns and 'Start Time' in df.columns:
                return to_datetime(df['Date'] + ' ' + df['Start Time'], format=date_format, errors='coerce')
            elif 'DateTime Sample Start' in df.columns:
                return to_datetime(df['DateTime Sample Start'], format=date_format, errors='coerce')
            else:
                raise ValueError("Expected date columns not found")

        with open(file, 'r', encoding='utf-8', errors='ignore') as f:
            if file.suffix.lower() == '.txt':
                delimiter, date_formats = '\t', ['%m/%d/%y %X', '%m/%d/%Y %X']
            else:  # csv
                delimiter, date_formats = ',', ['%d/%m/%Y %X']

            skip = find_header_row(f, delimiter)
            f.seek(0)

            _df = read_csv(f, sep=delimiter, skiprows=skip, low_memory=False)

        if 'Date' not in _df.columns and 'DateTime Sample Start' not in _df.columns:
            try:
                _df = _df.T
                _df.columns = _df.iloc[0]
                _df = _df.iloc[1:]
                _df = _df.reset_index(drop=True)
            except Exception:
                raise NotImplementedError('Unsupported date format')

        for date_format in date_formats:
            _time_index = parse_date(_df, date_format)
            if not _time_index.isna().all():
                break
        else:
            raise ValueError("Unable to parse dates with given formats")

        # Check for comma decimal separator
        comma_decimal_cols = [col for col in _df.columns if ',' in col.strip()]
        if comma_decimal_cols:
            self.logger.warning(f"Detected {len(comma_decimal_cols)} columns using comma as decimal separator")
            _df.columns = _df.columns.str.replace(',', '.')

        # Filter numeric columns
        numeric_cols = [col for col in _df.columns if col.strip().replace('.', '').isdigit()]
        numeric_cols.sort(key=lambda x: float(x.strip()))

        _df.index = _time_index
        _df.index.name = 'time'

        _df_smps = _df[numeric_cols]
        _df_smps = _df_smps.loc[_df_smps.index.dropna().copy()]

        # Rename columns to float values (strip spaces)
        _df_smps.columns = [float(col.strip()) for col in _df_smps.columns]

        size_range = self.kwargs.get('size_range') or (11.8, 593.5)

        if _df_smps.columns[0] != size_range[0] or _df_smps.columns[-1] != size_range[1]:
            self.logger.warning(f'SMPS file: {file.name} size range mismatch. '
                                f'Expected {size_range}, got ({_df_smps.columns[0]}, {_df_smps.columns[-1]})')
            return None

        _df_smps = _df_smps.apply(to_numeric, errors='coerce')

        # Include Status Flag column in _df (will be processed by core together)
        if self.STATUS_COLUMN in _df.columns:
            _df_smps[self.STATUS_COLUMN] = _df.loc[_df_smps.index, self.STATUS_COLUMN].astype(str).str.strip()

        return _df_smps

    def _QC(self, _df):
        """
        Perform quality control on SMPS particle size distribution data.

        QC Rules Applied
        ----------------
        1. Status Error        : Status flag is not "Normal Scan" (instrument error)
        2. Insufficient        : Less than 5 measurements per hour
        3. Invalid Number Conc : Total number concentration outside valid range (2000-1e7 #/cm³)
        4. DMA Water Ingress   : Bins >400 nm with concentration > 4000 dN/dlogDp (indicates water in DMA)
        """
        _df = _df.copy()
        _index = _df.index.copy()

        # Apply size range filter
        size_range = self.kwargs.get('size_range') or (11.8, 593.5)
        numeric_cols = [col for col in _df.columns if isinstance(col, (int, float))]
        df_numeric = _df[numeric_cols]
        size_mask = (df_numeric.columns.astype(float) >= size_range[0]) & (df_numeric.columns.astype(float) <= size_range[1])
        df_numeric = df_numeric.loc[:, size_mask]

        # Calculate total concentration for QC checks
        dlogDp = np.diff(np.log(df_numeric.columns[:-1].to_numpy(float))).mean()
        total_conc = df_numeric.sum(axis=1, min_count=1) * dlogDp

        # Calculate hourly data counts
        hourly_counts = (total_conc
                         .dropna()
                         .resample('h')
                         .size()
                         .resample('6min')
                         .ffill()
                         .reindex(df_numeric.index, method='ffill', tolerance='6min'))

        # Get large bins (>=400 nm)
        large_bins = df_numeric.columns[df_numeric.columns.astype(float) >= self.LARGE_BIN_THRESHOLD]

        # Build QC rules declaratively
        qc = QCFlagBuilder()

        qc.add_rules([
            QCRule(
                name='Status Error',
                condition=lambda df: self.QC_control().filter_error_status(
                    _df, status_column=self.STATUS_COLUMN, status_type='text', ok_value=self.STATUS_OK
                ),
                description=f'Status flag is not "{self.STATUS_OK}"'
            ),
            QCRule(
                name='Insufficient',
                condition=lambda df: Series(hourly_counts < self.MIN_HOURLY_COUNT, index=df.index).fillna(True),
                description=f'Less than {self.MIN_HOURLY_COUNT} measurements per hour'
            ),
            QCRule(
                name='Invalid Number Conc',
                condition=lambda df, tc=total_conc: Series(
                    (tc < self.MIN_TOTAL_CONC) | (tc > self.MAX_TOTAL_CONC),
                    index=df.index
                ).fillna(True),
                description=f'Total number concentration outside valid range ({self.MIN_TOTAL_CONC}-{self.MAX_TOTAL_CONC:.0e} #/cm³)'
            ),
            QCRule(
                name='DMA Water Ingress',
                condition=lambda df: (df[large_bins] > self.MAX_LARGE_BIN_CONC).any(axis=1) if len(large_bins) > 0 else Series(False, index=df.index),
                description=f'Bins >{self.LARGE_BIN_THRESHOLD}nm with concentration > {self.MAX_LARGE_BIN_CONC} dN/dlogDp (water in DMA)'
            ),
        ])

        # Apply all QC rules
        df_qc = qc.apply(_df)

        # Store QC summary for combined output in _process()
        self._qc_summary = qc.get_summary(df_qc)

        return df_qc.reindex(_index)

    def _process(self, _df):
        """
        Calculate size distribution statistics from QC'd SMPS data.

        Processing Steps
        ----------------
        1. Calculate dlogDp from bin diameters
        2. Calculate number, surface, volume distributions (all in dX/dlogDp)
        3. Calculate total, GMD, GSD, mode for each weighting
        4. Calculate mode contributions (ultra, accum, coarse fractions)
        5. Store distributions for separate file output

        Parameters
        ----------
        _df : pd.DataFrame
            Quality-controlled DataFrame with size bin columns and QC_Flag

        Returns
        -------
        pd.DataFrame
            Original size bins (dN/dlogDp) + calculated statistics + QC_Flag
        """
        _index = _df.index.copy()

        # Separate QC_Flag from size bins
        qc_flag = _df['QC_Flag'].copy() if 'QC_Flag' in _df.columns else Series('Valid', index=_df.index, name='QC_Flag')

        # Get numeric columns (size bins)
        bin_cols = [col for col in _df.columns if isinstance(col, (int, float))]
        df_bins = _df[bin_cols].copy()  # This is dN/dlogDp
        dp = np.array(bin_cols, dtype=float)

        # Input is already dN/dlogDp; calculate dS/dlogDp and dV/dlogDp
        dN_dlogDp = df_bins.copy()
        dS_dlogDp = dN_dlogDp * np.pi * dp ** 2        # Surface area distribution (nm²·cm⁻³)
        dV_dlogDp = dN_dlogDp * np.pi * (dp ** 3) / 6  # Volume distribution (nm³·cm⁻³)

        # Store distributions for separate file output (with QC_Flag)
        self._distributions = {
            'dNdlogDp': concat([dN_dlogDp, qc_flag], axis=1),
            'dSdlogDp': concat([dS_dlogDp, qc_flag], axis=1),
            'dVdlogDp': concat([dV_dlogDp, qc_flag], axis=1),
        }

        # For statistics calculation, convert to absolute values (dX = dX/dlogDp * dlogDp)
        dlogDp = np.diff(np.log10(dp))
        dlogDp = np.append(dlogDp, dlogDp[-1])
        dN = dN_dlogDp * dlogDp
        dS = dS_dlogDp * dlogDp
        dV = dV_dlogDp * dlogDp

        # Calculate statistics for all particles
        stats = DataFrame(index=_df.index)

        # Calculate for each weighting type
        for weight_name, dist in [('num', dN), ('surf', dS), ('vol', dV)]:
            total, gmd, gsd = self._geometric_prop(dp, dist)
            stats[f'total_{weight_name}'] = total
            stats[f'GMD_{weight_name}'] = gmd
            stats[f'GSD_{weight_name}'] = gsd

            # Calculate mode (diameter with maximum concentration)
            mask = dist.notna().any(axis=1)
            stats.loc[mask, f'mode_{weight_name}'] = dist.loc[mask].idxmax(axis=1)

            # Calculate mode contributions (number weighting only)
            if weight_name == 'num':
                total_sum = dist.sum(axis=1)
                total_sum = total_sum.where(total_sum > 0)

                # Ultrafine: < 100 nm
                ultra_bins = [c for c in dist.columns if c < 100]
                if ultra_bins:
                    stats[f'ultra_{weight_name}'] = dist[ultra_bins].sum(axis=1) / total_sum

                # Accumulation: 100-1000 nm
                accum_bins = [c for c in dist.columns if 100 <= c < 1000]
                if accum_bins:
                    stats[f'accum_{weight_name}'] = dist[accum_bins].sum(axis=1) / total_sum

                # Coarse: >= 1000 nm (if available in SMPS range)
                coarse_bins = [c for c in dist.columns if c >= 1000]
                if coarse_bins:
                    stats[f'coarse_{weight_name}'] = dist[coarse_bins].sum(axis=1) / total_sum

        # Combine: size bins + statistics + QC_Flag
        # (bins are kept for rate calculation, filtered out when saving to CSV)
        df_out = concat([df_bins, stats, qc_flag], axis=1)

        # Log QC summary
        if hasattr(self, '_qc_summary') and self._qc_summary is not None:
            self.logger.info(f"{self.nam} QC Summary:")
            for _, row in self._qc_summary.iterrows():
                self.logger.info(f"  {row['Rule']}: {row['Count']} ({row['Percentage']})")

        return df_out.reindex(_index)

    def _save_distributions(self, mean_freq: str = '1h') -> None:
        """
        Save size distributions to separate CSV files.

        Output Files
        ------------
        - output_smps_dNdlogDp.csv : Number distribution (dN/dlogDp)
        - output_smps_dSdlogDp.csv : Surface distribution (dS/dlogDp)
        - output_smps_dVdlogDp.csv : Volume distribution (dV/dlogDp)

        Parameters
        ----------
        mean_freq : str, default='1h'
            Frequency for resampling the data
        """
        if not hasattr(self, '_distributions') or self._distributions is None:
            self.logger.warning("No distributions to save. Run _process() first.")
            return

        output_folder = self.csv_out.parent
        self.logger.info("")

        for dist_name, dist_df in self._distributions.items():
            # Process QC_Flag: set invalid rows to NaN
            if 'QC_Flag' in dist_df.columns:
                invalid_mask = dist_df['QC_Flag'] != 'Valid'
                numeric_cols = [c for c in dist_df.columns if c != 'QC_Flag']
                dist_df.loc[invalid_mask, numeric_cols] = np.nan
                dist_df = dist_df.drop(columns=['QC_Flag'])

            # Resample and save
            dist_resampled = dist_df.resample(mean_freq).mean().round(4)
            output_path = output_folder / f'output_{self.nam.lower()}_{dist_name}.csv'
            dist_resampled.to_csv(output_path)
            self.logger.info(f"Saved: {output_path.name}")

    @staticmethod
    def _geometric_prop(dp, dist):
        """
        Calculate geometric mean diameter and geometric standard deviation.

        Parameters
        ----------
        dp : np.ndarray
            Particle diameters (nm)
        dist : pd.DataFrame
            Distribution data (dN, dS, or dV)

        Returns
        -------
        tuple
            (total, GMD, GSD) as pandas Series
        """
        # Total concentration
        total = dist.sum(axis=1, min_count=1)
        total_valid = total.where(total > 0)

        # GMD calculation (in log space)
        log_dp = np.log(dp)
        gmd_log = (dist * log_dp).sum(axis=1) / total_valid

        # GSD calculation
        dp_mesh, gmd_mesh = np.meshgrid(log_dp, gmd_log)
        gsd_log = np.sqrt(((dp_mesh - gmd_mesh) ** 2 * dist.values).sum(axis=1) / total_valid)

        return total, np.exp(gmd_log), np.exp(gsd_log)
```
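The GMD/GSD math in `_geometric_prop` is the standard log-space weighting: GMD is the exponential of the weighted mean of ln(dp), and GSD is the exponential of the weighted standard deviation of ln(dp). Below is a minimal standalone sketch of that calculation on a synthetic single-scan distribution; all numbers are made up for illustration, and only `numpy` is assumed.

```python
import numpy as np

dp = np.array([20.0, 50.0, 100.0, 200.0, 400.0])      # bin diameters (nm), illustrative
dN = np.array([1500.0, 4000.0, 2500.0, 800.0, 50.0])  # number per bin (#/cm³), illustrative

total = dN.sum()
log_dp = np.log(dp)

# GMD: exponential of the number-weighted mean of ln(dp)
gmd = np.exp((dN * log_dp).sum() / total)

# GSD: exponential of the number-weighted standard deviation of ln(dp)
gsd = np.exp(np.sqrt((dN * (log_dp - np.log(gmd)) ** 2).sum() / total))

print(f"total = {total:.0f} #/cm³, GMD = {gmd:.1f} nm, GSD = {gsd:.2f}")
```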
AeroViz/rawDataReader/script/TEOM.py
@@ -0,0 +1,181 @@
```python
from pandas import to_datetime, read_csv, to_numeric, Series, concat

from AeroViz.rawDataReader.core import AbstractReader, QCRule, QCFlagBuilder


class Reader(AbstractReader):
    """TEOM Output Data Formats Reader

    A specialized reader for TEOM (Tapered Element Oscillating Microbalance)
    particulate matter data files, with support for multiple file formats and
    comprehensive quality control.

    See full documentation at docs/source/instruments/TEOM.md for detailed
    information on supported formats and QC procedures.
    """
    nam = 'TEOM'

    # =========================================================================
    # Column Definitions
    # =========================================================================
    PM_COLUMNS = ['PM_NV', 'PM_Total']
    OUTPUT_COLUMNS = ['PM_NV', 'PM_Total', 'Volatile_Fraction']

    # =========================================================================
    # QC Thresholds
    # =========================================================================
    MAX_NOISE = 0.01  # Maximum acceptable noise level

    # Status Flag
    STATUS_COLUMN = 'status'
    STATUS_OK = 0  # Status code 0 means normal operation

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _raw_reader(self, file):
        """
        Read and parse raw TEOM data files in various formats.

        Handles multiple TEOM data formats and standardizes them to a consistent
        structure with uniform column names and a datetime index.

        Parameters
        ----------
        file : Path or str
            Path to the TEOM data file.

        Returns
        -------
        pandas.DataFrame
            Processed raw TEOM data with datetime index and standardized columns.

        Raises
        ------
        NotImplementedError
            If the file format is not recognized as a supported TEOM data format.
        """
        _df = read_csv(file, skiprows=3, index_col=False)

        # Chinese month name conversion dictionary
        _time_replace = {'十一月': '11', '十二月': '12', '一月': '01', '二月': '02', '三月': '03', '四月': '04',
                         '五月': '05', '六月': '06', '七月': '07', '八月': '08', '九月': '09', '十月': '10'}

        # Try both naming conventions (columns that don't exist are ignored)
        _df = _df.rename(columns={
            # Remote download format
            'Time Stamp': 'time',
            'System status': 'status',
            'PM-2.5 base MC': 'PM_NV',
            'PM-2.5 MC': 'PM_Total',
            'PM-2.5 TEOM noise': 'noise',
            # USB/auto export format
            'time_stamp': 'time',
            'tmoStatusCondition_0': 'status',
            'tmoTEOMABaseMC_0': 'PM_NV',
            'tmoTEOMAMC_0': 'PM_Total',
            'tmoTEOMANoise_0': 'noise'
        })

        # Handle different time formats
        if 'time' in _df.columns:  # Remote download or auto export with time column
            _tm_idx = _df.time
            # Convert Chinese month names if present
            for _ori, _rpl in _time_replace.items():
                _tm_idx = _tm_idx.str.replace(_ori, _rpl)

            _df = _df.set_index(to_datetime(_tm_idx, errors='coerce', format='%d - %m - %Y %X'))

        elif 'Date' in _df.columns and 'Time' in _df.columns:  # USB download format
            _df['time'] = to_datetime(_df['Date'] + ' ' + _df['Time'],
                                      errors='coerce', format='%Y-%m-%d %H:%M:%S')
            _df.drop(columns=['Date', 'Time'], inplace=True)
            _df.set_index('time', inplace=True)

        else:
            raise NotImplementedError("Unsupported TEOM data format")

        _df = _df[['PM_NV', 'PM_Total', 'noise', 'status']].apply(to_numeric, errors='coerce')

        # Remove duplicates and NaN indices
        _df = _df.loc[~_df.index.duplicated() & _df.index.notna()]

        return _df

    def _QC(self, _df):
        """
        Perform quality control on TEOM particulate matter data.

        QC Rules Applied
        ----------------
        1. Status Error     : Non-zero status code indicates instrument error
        2. High Noise       : noise >= 0.01
        3. Non-positive     : PM_NV <= 0 OR PM_Total <= 0
        4. NV > Total       : PM_NV > PM_Total (physically impossible)
        5. Invalid Vol Frac : Volatile_Fraction outside valid range (0-1)
        6. Spike            : Sudden value change (vectorized spike detection)
        7. Insufficient     : Less than 50% hourly data completeness
        """
        _index = _df.index.copy()

        # Pre-process: calculate Volatile_Fraction
        _df['Volatile_Fraction'] = ((_df['PM_Total'] - _df['PM_NV']) / _df['PM_Total']).round(4)
        df_qc = _df.copy()

        # Build QC rules declaratively
        qc = QCFlagBuilder()

        qc.add_rules([
            QCRule(
                name='Status Error',
                condition=lambda df: self.QC_control().filter_error_status(
                    _df, status_column=self.STATUS_COLUMN, status_type='numeric', ok_value=self.STATUS_OK
                ),
                description=f'Status code is not {self.STATUS_OK} (non-zero indicates error)'
            ),
            QCRule(
                name='High Noise',
                condition=lambda df: df['noise'] >= self.MAX_NOISE,
                description=f'Noise level >= {self.MAX_NOISE}'
            ),
            QCRule(
                name='Non-positive',
                condition=lambda df: (df[self.PM_COLUMNS] <= 0).any(axis=1),
                description='PM_NV or PM_Total <= 0 (non-positive value)'
            ),
            QCRule(
                name='NV > Total',
                condition=lambda df: df['PM_NV'] > df['PM_Total'],
                description='PM_NV exceeds PM_Total (physically impossible)'
            ),
            QCRule(
                name='Invalid Vol Frac',
                condition=lambda df: (df['Volatile_Fraction'] < 0) | (df['Volatile_Fraction'] > 1),
                description='Volatile_Fraction outside 0-1 range'
            ),
            QCRule(
                name='Spike',
                condition=lambda df: self.QC_control().spike_detection(
                    df[self.PM_COLUMNS], max_change_rate=3.0
                ),
                description='Sudden unreasonable value change detected'
            ),
            QCRule(
                name='Insufficient',
                condition=lambda df: self.QC_control().hourly_completeness_QC(
                    df[self.PM_COLUMNS], freq=self.meta['freq']
                ),
                description='Less than 50% hourly data completeness'
            ),
        ])

        # Apply all QC rules and get flagged DataFrame
        df_qc = qc.apply(df_qc)

        # Log QC summary
        summary = qc.get_summary(df_qc)
        self.logger.info(f"{self.nam} QC Summary:")
        for _, row in summary.iterrows():
            self.logger.info(f"  {row['Rule']}: {row['Count']} ({row['Percentage']})")

        return df_qc[self.OUTPUT_COLUMNS + ['QC_Flag']].reindex(_index)
```
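The TEOM `_QC` derives `Volatile_Fraction = (PM_Total - PM_NV) / PM_Total` and then flags rows where `PM_NV > PM_Total` or where the fraction falls outside 0-1. Here is a minimal sketch of those two checks on made-up values, using plain pandas and independent of `QCFlagBuilder`; a row where PM_NV exceeds PM_Total trips both flags at once.

```python
import pandas as pd

# Illustrative values only; the middle row is deliberately inconsistent
df = pd.DataFrame({'PM_NV': [8.0, 12.0, 5.0],
                   'PM_Total': [10.0, 11.0, 20.0]})

# Volatile fraction: share of mass lost on heating (semi-volatile material)
df['Volatile_Fraction'] = ((df['PM_Total'] - df['PM_NV']) / df['PM_Total']).round(4)

nv_gt_total = df['PM_NV'] > df['PM_Total']             # rule 4: physically impossible
bad_fraction = ~df['Volatile_Fraction'].between(0, 1)  # rule 5: outside 0-1

print(df.assign(nv_gt_total=nv_gt_total, bad_fraction=bad_fraction))
```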
AeroViz/rawDataReader/script/VOC.py
@@ -0,0 +1,106 @@
```python
from pandas import read_csv

from AeroViz.rawDataReader.core import AbstractReader


class Reader(AbstractReader):
    """Volatile Organic Compounds (VOC) Data Reader

    This class handles the reading and parsing of VOC measurement data files,
    which provide concentrations of various volatile organic compounds in air.

    File structure handling:
    - Processes CSV-formatted data files with a datetime index
    - Handles special values like '-' and 'N.D.' (Not Detected) as NA
    - Standardizes column names by stripping whitespace

    Data processing:
    - Filters VOC species based on a predefined list of supported compounds
    - Warns about unsupported VOC species in the data file
    - Handles duplicate timestamps and invalid indices

    Quality Control procedures:
    - Basic file validation
    - No additional QC is applied in the current implementation

    Returns
    -------
    DataFrame
        Processed VOC data with datetime index and supported VOC species as columns.
        If no supported species are found, returns the original dataframe.

    Notes
    -----
    VOC measurements are important for understanding air quality, photochemical
    reactions, and sources of secondary organic aerosols. This reader requires
    a predefined list of supported VOC species to be provided in the meta attribute.
    """
    nam = 'VOC'

    def _raw_reader(self, file):
        """
        Read and parse raw VOC measurement data files.

        Parameters
        ----------
        file : Path or str
            Path to the VOC data file.

        Returns
        -------
        pandas.DataFrame
            Processed VOC data with datetime index and supported VOC species as columns.

        Notes
        -----
        Requires self.meta["key"] to contain a list of supported VOC species names.
        If no supported species are found, returns the original dataframe with a warning.
        """
        with file.open('r', encoding='utf-8-sig', errors='ignore') as f:
            _df = read_csv(f, parse_dates=True, index_col=0, na_values=('-', 'N.D.'))

        _df.columns = _df.columns.str.strip(' ')
        _df.index.name = 'time'

        support_voc = set(self.meta["key"])

        valid_keys = [key for key in _df.keys() if key in support_voc]
        invalid_keys = [key for key in _df.keys() if key not in support_voc]

        if invalid_keys:
            self.logger.warning(f'{invalid_keys} are not supported keys.')
            print(f'\n\t{invalid_keys} are not supported keys.'
                  f'\n\tPlease check the\033[91m support_voc.md\033[0m file to use the correct name.')

        if valid_keys:
            return _df[valid_keys].loc[~_df.index.duplicated() & _df.index.notna()]
        else:
            self.logger.warning("No matching keys found. Returning the original DataFrame.")
            return _df.loc[~_df.index.duplicated() & _df.index.notna()]

    def _QC(self, _df):
        """
        Perform quality control on VOC measurement data.

        This method is a placeholder for future QC implementation. Currently,
        it returns the data unchanged.

        Parameters
        ----------
        _df : pandas.DataFrame
            Raw VOC data with datetime index and concentration columns.

        Returns
        -------
        pandas.DataFrame
            The input data unchanged.

        Notes
        -----
        No QC filters are currently applied. Future implementations could include:
        1. Minimum detection limit filtering
        2. Value range checks for each VOC species
        3. Time-based outlier detection
        4. Correlation checks between related VOC species
        """
        return _df
```
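The VOC reader's filtering step is a plain set-membership split of the CSV's columns against the supported-species list. A minimal sketch of that split, with hypothetical species names standing in for `self.meta["key"]` and a one-row frame in place of a real file:

```python
import pandas as pd

# Hypothetical supported-species set standing in for self.meta["key"]
support_voc = {'Benzene', 'Toluene', 'Ethylbenzene'}

df = pd.DataFrame({'Benzene': [0.31], 'Toluene': [1.12], 'Bogus_Species': [9.9]},
                  index=pd.to_datetime(['2024-01-01 00:00']))

valid_keys = [key for key in df.columns if key in support_voc]
invalid_keys = [key for key in df.columns if key not in support_voc]

if invalid_keys:
    print(f'{invalid_keys} are not supported keys.')  # the reader logs a warning here

# Keep only supported species; fall back to the full frame if none matched
filtered = df[valid_keys] if valid_keys else df
print(filtered)
```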