pypromice 1.3.3__py3-none-any.whl → 1.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pypromice might be problematic. Click here for more details.

@@ -0,0 +1,5 @@
1
+ stid,timestamp,lat,lon,alt
2
+ TAS_U,2015-08-13 14:00:00,65.6978, -38.8668,570.0
3
+ QAS_A,2015-08-24 17:00:00,61.243, -46.7328,1000.
4
+ NUK_N,2014-07-25 11:00:00,64.9452, -49.885,920.0
5
+ KAN_B,2023-01-01 00:00:00,67.1252, -50.1832,350.0
@@ -0,0 +1,241 @@
1
+ """
2
+ Utility functions for processing real time / instantaneous AWS data.
3
+
4
+ This includes:
5
+ * Select latest data
6
+ * Noise filtering data
7
+
8
+ """
9
+ import logging
10
+ from typing import Optional
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+ from sklearn.linear_model import LinearRegression
15
+
16
+ __all__ = ["get_latest_data"]
17
+
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def get_latest_data(
    df: pd.DataFrame,
    lin_reg_time_limit: str,
) -> Optional[pd.Series]:
    """
    Determine instantaneous values for the latest valid timestamp in the input dataframe

    * A valid timestamp is the last row where at least one of the instantaneous
      variables ``t_i``, ``p_i``, ``rh_i``, ``wspd_i``, ``wdir_i`` is non-NaN.
    * Location smoothing: Fit a linear regression model on the gps coordinates over
      the period lin_reg_time_limit to determine latest values.
    * z_boom: Apply a centred 72 hour rolling median to smooth ``z_boom_u``.

    The output series contains the same variables as the input dataframe plus
    smoothed variables:

    * gps_lat_fit
    * gps_lon_fit
    * gps_alt_fit
    * z_boom_u_smooth

    Parameters
    ----------
    df
        Input AWS l3 dataframe (datetime-indexed)
    lin_reg_time_limit
        Previous time to limit dataframe before applying linear regression.

    Returns
    -------
    pd.Series with the latest data, or None when there is no valid
    instantaneous observation, or when the latest valid observation is older
    than the ``lin_reg_time_limit`` window.

    """
    # TODO: The data frames should be cropped with respect to a selected window
    # We cannot always use the single most-recent timestamp in the dataframe:
    # e.g. for 6-hr transmissions, *_u will have hourly data while *_i is nan.
    # Use the last valid (non-nan) index of the instantaneous variables instead.
    instantaneous_columns = ["t_i", "p_i", "rh_i", "wspd_i", "wdir_i"]
    last_valid_index = df[instantaneous_columns].last_valid_index()
    if last_valid_index is None:
        return None
    logger.info(f"TIMESTAMP: {last_valid_index}")

    # Positions are only needed for the BUFR file
    df_limited = find_positions(df, lin_reg_time_limit)

    # Apply smoothing to z_boom_u
    # require at least 2 hourly obs? Sometimes seeing once/day data for z_boom_u
    # Lowercase "h": the uppercase hour alias is deprecated in recent pandas.
    df_limited = rolling_window(df_limited, "z_boom_u", "72h", 2, 1)

    # find_positions crops the frame to the regression window, so a stale
    # instantaneous timestamp may no longer be present. Report "no data"
    # instead of failing with an opaque KeyError on the lookup below.
    if last_valid_index not in df_limited.index:
        logger.warning(
            f"Latest valid timestamp {last_valid_index} is outside the "
            f"{lin_reg_time_limit} window; no latest data available."
        )
        return None

    # limit to single most recent valid row (convert to series)
    return df_limited.loc[last_valid_index]
77
+
78
+
79
def rolling_window(df, column, window, min_periods, decimals) -> pd.DataFrame:
    """Smooth one column with a centred rolling median.

    The result is written to a new column named ``<column>_smooth`` on the
    input dataframe, which is modified in place and also returned.

    Parameters
    ----------
    df : pandas.Dataframe
        datetime-indexed df
    column : str
        Name of the column to smooth
    window : str
        Window size (e.g. '24H' or '30D')
    min_periods : int
        Minimum number of observations required inside a window for it to
        yield a value; windows with fewer observations give np.nan.
    decimals : int
        Number of decimal places the smoothed values are rounded to

    Returns
    -------
    df : pandas.Dataframe
        The input df, carrying the additional ``<column>_smooth`` column
    """
    roller = df[column].rolling(
        window,
        min_periods=min_periods,
        center=True,  # label each window by its centre
        closed="both",  # keep both the first and the last point of the window
    )
    # Could also round to whole meters (decimals=0)
    smoothed = roller.median().round(decimals=decimals)
    df[f"{column}_smooth"] = smoothed
    return df
113
+
114
+
115
def find_positions(df, time_limit):
    """Driver function to run linear_fit() and set fitted lat, lon, and alt
    on df_limited, which is then used to set position data in BUFR.

    The dataframe is first cropped to the most recent ``time_limit`` period
    and a linear fit is attempted for ``gps_lat``, ``gps_lon`` and ``gps_alt``
    within that window. For every coordinate that linear_fit() reports as
    invalid in the window, the fit is retried on the full history; if that
    also fails, the corresponding ``*_fit`` column is filled with NaN.

    Parameters
    ----------
    df : pandas dataframe
        The full tx dataframe (datetime-indexed). It is not modified.
    time_limit : str
        Previous time to limit dataframe before applying linear regression.
        (e.g. '91d')

    Returns
    -------
    df_limited : pandas dataframe
        Copy of df limited to time_limit, and including position data
    """
    logger.info("finding positions")
    time_delta = pd.Timedelta(time_limit)
    last_index = df.index.max()
    last_mask = df.index > last_index - time_delta
    df_limited = df.loc[last_mask].copy()

    logger.info(f"last transmission: {df_limited.index.max()}")

    # Rounding precision of the fitted values, per coordinate.
    fit_decimals = {"gps_lat": 6, "gps_lon": 6, "gps_alt": 1}

    # Extrapolate recommended for altitude, optional for lat and lon.
    needs_full_history = []
    for column, decimals in fit_decimals.items():
        df_limited, valid = linear_fit(df_limited, column, decimals)
        if not valid:
            needs_full_history.append(column)

    # If we have no valid lat, lon or alt data in the df_limited window, then
    # fit using the full tx dataset.
    for column in needs_full_history:
        fit_column = f"{column}_fit"
        logger.info(f"----> Using full history for linear extrapolation: {column}")
        logger.info(f"first transmission: {df.index.min()}")
        # linear_fit() adds the fit column to the frame it is given, so hand it
        # a single-column copy to keep the caller's dataframe untouched.
        history = df[[column]].copy() if column in df else df
        history, valid = linear_fit(history, column, fit_decimals[column])
        if valid:
            # Rows of df_limited are exactly the masked rows of df, in order;
            # copying positionally also works with repeated timestamps.
            df_limited[fit_column] = history.loc[last_mask, fit_column].to_numpy()
        else:
            logger.info(f"----> No data exists for {column}. Stubbing out with NaN.")
            df_limited[fit_column] = np.nan

    return df_limited
174
+
175
+
176
def _epoch_seconds(index):
    """Return a datetime index as whole seconds since the Unix epoch (int64)."""
    # Cast through datetime64[s] instead of dividing a nanosecond count by 1e9,
    # so the result does not depend on the resolution the index is stored in.
    return index.values.astype("datetime64[s]").astype(np.int64)


def linear_fit(df, column, decimals):
    """Apply a linear regression to the input column

    The model is fitted on the rows where ``column`` is not NaN and then
    evaluated at every timestamp of ``df``; the rounded predictions are
    written to a new ``<column>_fit`` column on ``df`` (modified in place).

    Linear regression is following:
    https://realpython.com/linear-regression-in-python/#simple-linear-regression-with-scikit-learn

    Parameters
    ----------
    df : pandas.Dataframe
        datetime-indexed df, limited to desired time length for linear fit
    column : str
        The target column for applying linear fit
    decimals : int
        How many decimals to round the output fit values

    Returns
    -------
    df : pandas.Dataframe
        The original input df, with added column for the linear regression
        values when a fit was possible
    pos_valid : boolean
        True if the fit column was produced. False if ``column`` is missing
        or has fewer than 15 non-NaN values; find_positions then falls back
        to the full station history.
    """
    if column not in df:
        # No fit column can be produced, so report the position as not valid
        # and let the caller fall back / stub the value out.
        logger.warning("----> {} not found in dataframe!".format(column))
        return df, False

    # limit to only non-nan for the target column
    df_dropna = df[df[column].notna()]

    # must have at least 15 data points (could be hourly or daily)
    if len(df_dropna) < 15:
        logger.warning("----> Insufficient {} data!".format(column))
        return df, False

    # scikit-learn expects a 2D feature array: one column of epoch seconds
    x = _epoch_seconds(df_dropna.index).reshape(-1, 1)
    y = df_dropna[column].values
    model = LinearRegression().fit(x, y)

    # Evaluate the model on every timestamp, including those without data
    x_all = _epoch_seconds(df.index).reshape(-1, 1)
    df["{}_fit".format(column)] = model.predict(x_all).round(decimals=decimals)
    return df, True