pypromice-1.3.3-py3-none-any.whl → pypromice-1.3.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pypromice/postprocess/bufr_to_csv.py +11 -0
- pypromice/postprocess/bufr_utilities.py +489 -0
- pypromice/postprocess/get_bufr.py +622 -284
- pypromice/postprocess/positions_seed.csv +5 -0
- pypromice/postprocess/real_time_utilities.py +241 -0
- pypromice/postprocess/station_configurations.toml +762 -0
- pypromice/process/L0toL1.py +4 -2
- pypromice/process/value_clipping.py +4 -13
- pypromice/process/variables.csv +13 -15
- pypromice/qc/github_data_issues.py +10 -40
- {pypromice-1.3.3.dist-info → pypromice-1.3.4.dist-info}/METADATA +2 -1
- {pypromice-1.3.3.dist-info → pypromice-1.3.4.dist-info}/RECORD +16 -13
- {pypromice-1.3.3.dist-info → pypromice-1.3.4.dist-info}/WHEEL +1 -1
- {pypromice-1.3.3.dist-info → pypromice-1.3.4.dist-info}/entry_points.txt +1 -1
- pypromice/postprocess/csv2bufr.py +0 -508
- pypromice/postprocess/wmo_config.py +0 -179
- {pypromice-1.3.3.dist-info → pypromice-1.3.4.dist-info}/LICENSE.txt +0 -0
- {pypromice-1.3.3.dist-info → pypromice-1.3.4.dist-info}/top_level.txt +0 -0
pypromice/postprocess/real_time_utilities.py (new file)
@@ -0,0 +1,241 @@
+"""
+Utility functions for processing real time / instantaneous AWS data.
+
+This includes:
+* Select latest data
+* Noise filtering data
+
+"""
+import logging
+from typing import Optional
+
+import numpy as np
+import pandas as pd
+from sklearn.linear_model import LinearRegression
+
+__all__ = ["get_latest_data"]
+
+
+logger = logging.getLogger(__name__)
+
+
+def get_latest_data(
+    df: pd.DataFrame,
+    lin_reg_time_limit: str,
+) -> Optional[pd.Series]:
+    """
+    Determine instantaneous values for the latest valid timestamp in the input dataframe
+
+    * A valid timestamp is a timestamp with relevant instantaneous variables. See source code.
+    * Location smoothing: Fit a linear regression model on gps coordinate over the period lin_reg_time_limit to determine latest values.
+    * z_boom: Apply rolling window median filter smooth data
+
+    The output series contains the same variables as the input dataframe plus smoothed variables:
+
+    * gps_lat_fit
+    * gps_lon_fit
+    * gps_alt_fit
+    * z_boom_u_smooth
+
+    Parameters
+    ----------
+    df
+        Input AWS l3 dataframe
+    lin_reg_time_limit
+        Previous time to limit dataframe before applying linear regression.
+
+    Returns
+    -------
+    pd.Series with the latest data.
+
+    """
+    # TODO: The data frames should be cropped with respect to a selected window
+    # Check that the last valid index for all instantaneous values match
+    # Note: we cannot always use the single most-recent timestamp in the dataframe
+    # e.g. for 6-hr transmissions, *_u will have hourly data while *_i is nan
+    # Need to check for last valid (non-nan) index instead
+    last_valid_index = df[["t_i", "p_i", "rh_i", "wspd_i", "wdir_i"]].last_valid_index()
+    if last_valid_index is None:
+        return None
+    logger.info(f"TIMESTAMP: {last_valid_index}")
+
+    # Find positions
+    # we only need to add positions to the BUFR file
+    df_limited = find_positions(
+        df,
+        lin_reg_time_limit,
+    )
+
+    # Apply smoothing to z_boom_u
+    # require at least 2 hourly obs? Sometimes seeing once/day data for z_boom_u
+    df_limited = rolling_window(df_limited, "z_boom_u", "72H", 2, 1)
+
+    # limit to single most recent valid row (convert to series)
+    s_current = df_limited.loc[last_valid_index]
+
+    return s_current
+
+
+def rolling_window(df, column, window, min_periods, decimals) -> pd.DataFrame:
+    """Apply a rolling window (smoothing) to the input column
+
+    Parameters
+    ----------
+    df : pandas.Dataframe
+        datetime-indexed df
+    column : str
+        The target column for applying rolling window
+    window : str
+        Window size (e.g. '24H' or '30D')
+    min_periods : int
+        Minimum number of observations in window required to have a value;
+        otherwise, result is np.nan.
+    decimals : int
+        How many decimal places to round the output smoothed values
+
+    Returns
+    -------
+    df : pandas.Dataframe
+        The original input df, with added column for the smoothed values
+    """
+    df["{}_smooth".format(column)] = (
+        df[column]
+        .rolling(
+            window,
+            min_periods=min_periods,
+            center=True,  # set the window labels as the center of the window
+            closed="both",  # no points in the window are excluded (first or last)
+        )
+        .median()
+        .round(decimals=decimals)
+    )  # could also round to whole meters (decimals=0)
+    return df
+
+
+def find_positions(df, time_limit):
+    """Driver function to run linear_fit() and set valid lat, lon, and alt
+    to df_limited, which is then used to set position data in BUFR.
+    If 'positions' is not None (must pass --positions arg), we also write to
+    the positions dict which will be written to AWS_latest_locations.csv for
+    all stations (whether processed or skipped)
+
+    Parameters
+    ----------
+    df : pandas dataframe
+        The full tx dataframe
+    stid : str
+        The station ID, such as NUK_L
+    time_limit : str
+        Previous time to limit dataframe before applying linear regression.
+        (e.g. '91d')
+
+    Returns
+    -------
+    df_limited : pandas dataframe
+        Dataframe limited to time_limit, and including position data
+    positions : dict
+        Modified dict storing most-recent station positions.
+    """
+    logger.info("finding positions")
+    time_delta = pd.Timedelta(time_limit)
+    last_index = df.index.max()
+    last_mask = df.index > last_index - time_delta
+    df_limited = df.loc[last_mask].copy()
+
+    logger.info(f"last transmission: {df_limited.index.max()}")
+
+    # Extrapolate recommended for altitude, optional for lat and lon.
+    df_limited, lat_valid = linear_fit(df_limited, "gps_lat", 6)
+    df_limited, lon_valid = linear_fit(df_limited, "gps_lon", 6)
+    df_limited, alt_valid = linear_fit(df_limited, "gps_alt", 1)
+
+    # If we have no valid lat, lon or alt data in the df_limited window, then interpolate
+    # using full tx dataset.
+    check_valid = {"gps_lat": lat_valid, "gps_lon": lon_valid, "gps_alt": alt_valid}
+    check_valid_again = {}
+    for k, v in check_valid.items():
+        if v is False:
+            logger.info(f"----> Using full history for linear extrapolation: {k}")
+            logger.info(f"first transmission: {df.index.min()}")
+            if k == "gps_alt":
+                df, valid = linear_fit(df, k, 1)
+            else:
+                df, valid = linear_fit(df, k, 6)
+            check_valid_again[k] = valid
+            if check_valid_again[k] is True:
+                df_limited[f"{k}_fit"] = df.loc[df_limited.index, f"{k}_fit"]
+            else:
+                logger.info(f"----> No data exists for {k}. Stubbing out with NaN.")
+                df_limited[f"{k}_fit"] = pd.Series(
+                    np.nan, index=df_limited.index
+                )
+
+    return df_limited
+
+
+def linear_fit(df, column, decimals):
+    """Apply a linear regression to the input column
+
+    Linear regression is following:
+    https://realpython.com/linear-regression-in-python/#simple-linear-regression-with-scikit-learn
+
+    Parameters
+    ----------
+    df : pandas.Dataframe
+        datetime-indexed df, limited to desired time length for linear fit
+    column : str
+        The target column for applying linear fit
+    decimals : int
+        How many decimals to round the output fit values
+    extrapolate : boolean
+        If False (default), only apply linear fit to timestamps with valid data
+        If True, then extrapolate positions based on linear fit model
+
+    Returns
+    -------
+    df : pandas.Dataframe
+        The original input df, with added column for the linear regression values
+    pos_valid : boolean
+        If True (default), sufficient valid data found in recent (limited) data.
+        If False, we need to return this status to find_positions and use full station history instead.
+    """
+    # print('=========== linear_fit ===========')
+    pos_valid = True
+    if column in df:
+        df_dropna = df[
+            df[column].notna()
+        ]  # limit to only non-nan for the target column
+        # if len(df_dropna[column].index.normalize().unique()) >= 10: # must have at least 10 unique days
+        if (
+            len(df_dropna[column]) >= 15
+        ):  # must have at least 15 data points (could be hourly or daily)
+            # Get datetime x values into epoch sec integers
+            x_epoch = df_dropna.index.values.astype(np.int64) // 10**9
+            x = x_epoch.reshape(-1, 1)
+            y = df_dropna[column].values  # can also reshape this, but not necessary
+            model = LinearRegression().fit(x, y)
+
+            # Adding prediction back to original df
+            x_all = df.index.values.astype(np.int64) // 10**9
+            df["{}_fit".format(column)] = model.predict(x_all.reshape(-1, 1)).round(
+                decimals=decimals
+            )
+
+            # Plot data if desired
+            # if stid == 'LYN_T':
+            #     if (column == 'gps_lat') or (column == 'gps_lon') or (column == 'gps_alt'):
+            #         import matplotlib.pyplot as plt
+            #         plt.figure()
+            #         df_dropna[column].plot(marker='o',ls='None')
+            #         df['{}_fit'.format(column)].plot(marker='o', ls='None', color='red')
+            #         plt.title('{} {}'.format(stid, column))
+            #         plt.xlim(df.index.min(),df.index.max())
+            #         plt.show()
+        else:
+            # Do not have 10 days of valid data, or all data is NaN.
+            logger.warning("----> Insufficient {} data!".format(column))
+            pos_valid = False
+    else:
+        logger.warning("----> {} not found in dataframe!".format(column))
+        pass
+    return df, pos_valid
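For context, a minimal sketch of calling the new get_latest_data helper on a datetime-indexed AWS dataframe. The synthetic station values, column set, and the 91-day window below are illustrative assumptions, not taken from this diff; the import path and the fitted/smoothed output columns follow the module added above.

    # Hypothetical usage sketch (not part of the diff): build a datetime-indexed
    # dataframe containing the instantaneous columns get_latest_data() checks,
    # then request the latest valid row with positions fitted over a 91-day window.
    import numpy as np
    import pandas as pd

    from pypromice.postprocess.real_time_utilities import get_latest_data

    index = pd.date_range("2023-01-01", periods=24 * 100, freq="H")
    rng = np.random.default_rng(0)
    df = pd.DataFrame(
        {
            "t_i": rng.normal(-10, 2, index.size),      # instantaneous air temperature
            "p_i": rng.normal(800, 5, index.size),       # instantaneous pressure
            "rh_i": rng.uniform(50, 90, index.size),     # instantaneous relative humidity
            "wspd_i": rng.uniform(0, 15, index.size),    # instantaneous wind speed
            "wdir_i": rng.uniform(0, 360, index.size),   # instantaneous wind direction
            "gps_lat": 66.48 + rng.normal(0, 1e-5, index.size),
            "gps_lon": -46.29 + rng.normal(0, 1e-5, index.size),
            "gps_alt": 2100 + rng.normal(0, 0.5, index.size),
            "z_boom_u": 2.5 + rng.normal(0, 0.01, index.size),
        },
        index=index,
    )

    latest = get_latest_data(df, lin_reg_time_limit="91d")
    if latest is not None:
        # Smoothed/fitted columns added by find_positions() and rolling_window()
        print(latest[["gps_lat_fit", "gps_lon_fit", "gps_alt_fit", "z_boom_u_smooth"]])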