pypromice 1.3.2__py3-none-any.whl → 1.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pypromice might be problematic. Click here for more details.

@@ -2,290 +2,628 @@
2
2
 
3
3
  """
4
4
  Command-line script for running BUFR file generation
5
- Created: Dec 20, 2022
6
- Author: Patrick Wright, GEUS
5
+
6
+ Post-processing functions for AWS station data, such as converting PROMICE and GC-Net data files to WMO-compliant BUFR files
7
+
7
8
  """
8
- import pandas as pd
9
- import glob, os
10
9
  import argparse
10
+ import glob
11
+ import logging
12
+ import pickle
13
+ import sys
11
14
  from datetime import datetime, timedelta
12
- import pickle, unittest
13
-
14
- from pypromice.postprocess.wmo_config import ibufr_settings, stid_to_skip, positions_seed, positions_update_timestamp_only
15
- from pypromice.postprocess.csv2bufr import getBUFR, linear_fit, rolling_window, round_values, \
16
- find_positions, min_data_check
17
-
18
- # from IPython import embed
19
-
20
-
21
- def parse_arguments_bufr():
22
- parser = argparse.ArgumentParser()
23
-
24
- parser.add_argument('--dev',
25
- action='store_true',
26
- required=False,
27
- help='If included (True), run in dev mode. Useful for repeated runs of script between transmissions.')
28
-
29
- parser.add_argument('--positions',
30
- action='store_true',
31
- required=False,
32
- help='If included (True), make a positions dict and output AWS_latest_locations.csv file.')
33
-
34
- parser.add_argument('--positions-filepath',
35
- default='../aws-l3/AWS_latest_locations.csv',
36
- type=str,
37
- required=False,
38
- help='Path to write AWS_latest_locations.csv file.')
39
-
40
- parser.add_argument('--time-limit',
41
- default='3M',
42
- type=str,
43
- required=False,
44
- help='Previous time to limit dataframe before applying linear regression.')
45
-
46
- parser.add_argument('--l3-filepath',
47
- default='../aws-l3/tx/*/*_hour.csv',
48
- type=str,
49
- required=False,
50
- help='Path to l3 tx .csv files.')
51
-
52
- parser.add_argument('--bufr-out',
53
- default='src/pypromice/postprocess/BUFR_out/',
54
- type=str,
55
- required=False,
56
- help='Path to the BUFR out directory.')
57
-
58
- parser.add_argument('--timestamps-pickle-filepath',
59
- default='../pypromice/src/pypromice/postprocess/latest_timestamps.pickle',
60
- type=str,
61
- required=False,
62
- help='Path to the latest_timestamps.pickle file.')
63
-
64
- args = parser.parse_args()
65
- return args
66
-
67
- def get_bufr():
68
- args = parse_arguments_bufr()
69
-
70
- # Get list of relative file paths
71
- fpaths = glob.glob(args.l3_filepath)
72
-
73
- # Make out dir
74
- outFiles = args.bufr_out
75
- if os.path.exists(outFiles) is False:
76
- os.mkdir(outFiles)
77
-
78
- # Read existing timestamps pickle to dictionary
79
- if os.path.isfile(args.timestamps_pickle_filepath):
80
- with open(args.timestamps_pickle_filepath, 'rb') as handle:
81
- latest_timestamps = pickle.load(handle)
82
- else:
83
- print('latest_timestamps.pickle not found!')
84
- latest_timestamps = {}
85
-
86
- # Initiate a new dict for current timestamps
87
- current_timestamps = {}
88
-
89
- if args.positions is True:
90
- # Initiate a dict to store station positions
91
- # (seeded with initial positions from wmo_config.positions_seed)
92
- # Used to retrieve a static set of positions to register stations with DMI/WMO
93
- # Also used to write AWS_latest_locations.csv to aws-l3 repo
94
- positions = positions_seed
95
-
96
- # Define stations to skip
97
- to_skip = []
98
- for k, v in stid_to_skip.items():
99
- to_skip.extend(v)
100
- to_skip = set(to_skip) # Get rid of any duplicates
101
-
102
- # Setup diagnostic lists (print at end)
103
- skipped = []
104
- no_recent_data = []
105
- no_valid_data = []
106
- no_entry_latest_timestamps = []
107
- failed_min_data_wx = []
108
- failed_min_data_pos = []
109
-
110
- land_stids = ibufr_settings['land']['station']['stationNumber'].keys()
111
-
112
- # Iterate through csv files
113
- for f in fpaths:
114
- last_index = f.rfind('_')
115
- first_index = f.rfind('/')
116
- stid = f[first_index+1:last_index]
117
- # stid = f.split('/')[-1].split('.csv')[0][:-5]
118
-
119
- print('####### Processing {} #######'.format(stid))
120
- if ('Roof' not in f) and (stid not in to_skip):
121
- # if ('v3' not in f) and ('Roof' not in f) and (stid not in to_skip):
122
- bufrname = stid + '.bufr'
123
- print(f'Generating {bufrname} from {f}')
124
-
125
- if (args.positions is True) and (stid not in positions_update_timestamp_only):
126
- positions[stid] = {}
127
- # Optionally include source flag columns, useful to indicate if position
128
- # comes from current transmission, or older data. This could also be used
129
- # to differentiate GPS from modem, but using the combine_first method for
130
- # the modem positions currently prevents us from easily knowing which source
131
- # was used.
132
- # positions[stid]['lat_source'] = ''
133
- # positions[stid]['lon_source'] = ''
134
-
135
- # Read csv file
136
- df1 = pd.read_csv(f, delimiter=',')
137
- df1.set_index(pd.to_datetime(df1['time']), inplace=True)
138
- df1.sort_index(inplace=True) # make sure we are time-sorted
139
-
140
- # Check that the last valid index for all instantaneous values match
141
- # Note: we cannot always use the single most-recent timestamp in the dataframe
142
- # e.g. for 6-hr transmissions, *_u will have hourly data while *_i is nan
143
- # Need to check for last valid (non-nan) index instead
144
- lvi = {'t_i': df1['t_i'].last_valid_index(),
145
- 'p_i': df1['p_i'].last_valid_index(),
146
- 'rh_i': df1['rh_i'].last_valid_index(),
147
- 'wspd_i': df1['wspd_i'].last_valid_index(),
148
- 'wdir_i': df1['wdir_i'].last_valid_index()
149
- }
150
-
151
- two_days_ago = datetime.utcnow() - timedelta(days=2)
152
-
153
- if len(set(lvi.values())) != 1:
154
- # instantaneous vars have different timestamps
155
- recent = {}
156
- for k,v in lvi.items():
157
- if (v is not None) and (v >= two_days_ago):
158
- recent[k] = v
159
- if len(recent) == 0:
160
- print('No recent instantaneous timestamps!')
161
- no_recent_data.append(stid)
162
- if args.positions is True:
163
- df1_limited, positions = find_positions(df1, stid, args.time_limit, positions=positions)
164
- continue
165
- else:
166
- # we have partial data, just use the most recent row
167
- current_timestamp = max(recent.values())
168
- # We will throw this obset down the line, and there is a final min_data_check
169
- # to make sure we have minimum data requirements before writing to BUFR
170
- else:
171
- if all(i is None for i in lvi.values()) is True:
172
- print('All instantaneous timestamps are None!')
173
- no_valid_data.append(stid)
174
- if args.positions is True:
175
- df1_limited, positions = find_positions(df1, stid, args.time_limit, positions=positions)
176
- continue
177
- else:
178
- # all values are present, with matching timestamps, so just use t_i
179
- current_timestamp = df1['t_i'].last_valid_index()
180
-
181
- print(f'TIMESTAMP: {current_timestamp}')
182
-
183
- # set in dict, will be written to disk at end
184
- current_timestamps[stid] = current_timestamp
185
-
186
- if stid in latest_timestamps:
187
- latest_timestamp = latest_timestamps[stid]
188
-
189
- if args.dev is True:
190
- print('----> Running in dev mode!')
191
- # If we want to run repeatedly (before another transmission comes in), then don't
192
- # check the actual latest timestamp, and just set to two_days_ago
193
- latest_timestamp = two_days_ago
194
-
195
- if (current_timestamp > latest_timestamp) and (current_timestamp > two_days_ago):
196
- print('Time checks passed.')
197
-
198
- if args.positions is True:
199
- # return positions dict for writing to csv file after processing finished
200
- df1_limited, positions = find_positions(df1, stid, args.time_limit, current_timestamp, positions)
201
- else:
202
- # we only need to add positions to the BUFR file
203
- df1_limited = find_positions(df1, stid, args.time_limit, current_timestamp)
204
-
205
- # Apply smoothing to z_boom_u
206
- # require at least 2 hourly obs? Sometimes seeing once/day data for z_boom_u
207
- df1_limited = rolling_window(df1_limited, 'z_boom_u', '72H', 2, 1)
208
-
209
- # limit to single most recent valid row (convert to series)
210
- s1_current = df1_limited.loc[current_timestamp]
211
-
212
- # Convert air temp, C to Kelvin
213
- s1_current.t_i = s1_current.t_i + 273.15
214
-
215
- # Convert pressure, correct the -1000 offset, then hPa to Pa
216
- # note that instantaneous pressure has 0.1 hPa precision
217
- s1_current.p_i = (s1_current.p_i+1000.) * 100.
218
-
219
- s1_current = round_values(s1_current)
220
-
221
- # Check that we have minimum required valid data
222
- min_data_wx_result, min_data_pos_result = min_data_check(s1_current, stid)
223
- if min_data_wx_result is False:
224
- failed_min_data_wx.append(stid)
225
- continue
226
- elif min_data_pos_result is False:
227
- failed_min_data_pos.append(stid)
228
- continue
229
-
230
- # Construct and export BUFR file
231
- file_removed = getBUFR(s1_current, outFiles+bufrname, stid, land_stids)
232
-
233
- if file_removed is False:
234
- print(f'Successfully exported bufr file to {outFiles+bufrname}')
235
- else:
236
- print('----> Time checks failed for {}'.format(stid))
237
- print(' current:', current_timestamp)
238
- if args.dev is True:
239
- print(' latest (DEV):', latest_timestamp)
240
- else:
241
- print(' latest:', latest_timestamp)
242
- no_recent_data.append(stid)
243
- if args.positions is True:
244
- current_timestamp = None
245
- df1_limited, positions = find_positions(df1, stid, args.time_limit, current_timestamp, positions)
246
- else:
247
- print('{} not found in latest_timestamps'.format(stid))
248
- no_entry_latest_timestamps.append(stid)
249
- else:
250
- print('----> Skipping {} as per stid_to_skip config'.format(stid))
251
- skipped.append(stid)
252
- if args.positions is True and stid not in ('XXX',):
253
- # still will be useful to have all stations in AWS_station_location.csv,
254
- # regardless if they were skipped for the DMI upload
255
- if stid not in positions_update_timestamp_only:
256
- positions[stid] = {}
257
- df_skipped = pd.read_csv(f, delimiter=',')
258
- df_skipped.set_index(pd.to_datetime(df_skipped['time']), inplace=True)
259
- df_skipped.sort_index(inplace=True) # make sure we are time-sorted
260
- df_skipped_limited, positions = find_positions(df_skipped, stid, args.time_limit, positions=positions)
261
-
262
- # Write the most recent timestamps back to the pickle on disk
263
- print('writing latest_timestamps.pickle')
264
- with open(args.timestamps_pickle_filepath, 'wb') as handle:
265
- pickle.dump(current_timestamps, handle, protocol=pickle.HIGHEST_PROTOCOL)
266
-
267
- if args.positions is True:
268
- positions_df = pd.DataFrame.from_dict(
269
- positions,
270
- orient='index',
271
- # columns=['timestamp','lat','lon','alt','lat_source','lon_source']
272
- columns=['timestamp','lat','lon','alt']
273
- )
274
- positions_df.sort_index(inplace=True)
275
- positions_df.to_csv(args.positions_filepath, index_label='stid')
276
-
277
- print('--------------------------------')
278
- not_processed_wx_pos = set(failed_min_data_wx + failed_min_data_pos)
279
- not_processed_count = len(skipped) + len(no_recent_data) + len(no_valid_data) + len(no_entry_latest_timestamps) + len(not_processed_wx_pos)
280
- print('BUFR exported for {} of {} fpaths.'.format((len(fpaths) - not_processed_count),len(fpaths)))
281
- print('')
282
- print('skipped: {}'.format(skipped))
283
- print('no_recent_data: {}'.format(no_recent_data))
284
- print('no_valid_data: {}'.format(no_valid_data))
285
- print('no_entry_latest_timestamps: {}'.format(no_entry_latest_timestamps))
286
- print('failed_min_data_wx: {}'.format(failed_min_data_wx))
287
- print('failed_min_data_pos: {}'.format(failed_min_data_pos))
288
- print('--------------------------------')
289
-
290
- if __name__ == "__main__":
291
- get_bufr()
15
+ from pathlib import Path
16
+ from typing import List, Dict, Mapping, Optional, Collection, Sequence, Union, TextIO
17
+
18
+ import attrs
19
+ import numpy as np
20
+ import pandas as pd
21
+ import toml
22
+
23
+ from pypromice.postprocess.bufr_utilities import write_bufr_message, BUFRVariables
24
+ from pypromice.postprocess.real_time_utilities import get_latest_data
25
+
26
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

# Default resources are resolved relative to the directory containing this module.
DEFAULT_STATION_CONFIGURATION_PATH = Path(__file__).parent / "station_configurations.toml"
DEFAULT_POSITION_SEED_PATH = Path(__file__).parent / "positions_seed.csv"
# Default look-back window handed to the linear regression (see --time-limit).
DEFAULT_LIN_REG_TIME_LIMIT = "91d"
33
+
34
def parse_arguments_bufr() -> argparse.ArgumentParser:
    """
    Build the command-line parser for the BUFR export script.

    The parser itself is returned (not the parsed arguments), so the caller
    decides when and with which argv to call ``parse_args``.

    Returns
    -------
    argparse.ArgumentParser configured with all options of the script
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--store_positions",
        "--positions",
        action="store_true",
        required=False,
        default=False,
        help="If included (True), make a positions dict and output AWS_latest_locations.csv file.",
    )

    parser.add_argument(
        "--positions-filepath",
        "-p",
        type=Path,
        required=False,
        help="Path to write AWS_latest_locations.csv file.",
    )

    parser.add_argument(
        "--time-limit",
        default=DEFAULT_LIN_REG_TIME_LIMIT,
        type=str,
        required=False,
        help="Previous time to limit dataframe before applying linear regression.",
    )

    parser.add_argument(
        "--input_files",
        "--l3-filepath",
        "-i",
        type=Path,
        nargs="+",
        required=True,
        help="Path to L3 tx .csv files. Can be direct paths or glob patterns",
    )

    parser.add_argument(
        "--bufr-out",
        "-o",
        type=Path,
        required=True,
        help="Path to the BUFR out directory.",
    )

    parser.add_argument(
        "--timestamps-pickle-filepath",
        type=Path,
        required=False,
        help="Path to the latest_timestamps.pickle file.",
    )

    parser.add_argument(
        "--station_configuration_mapping",
        default=DEFAULT_STATION_CONFIGURATION_PATH,
        type=Path,
        required=False,
        # The configuration is read with toml.load, so the file is toml, not csv.
        help="Path to toml file with station meta data and BUFR export configuration",
    )

    parser.add_argument(
        "--position_seed",
        default=DEFAULT_POSITION_SEED_PATH,
        type=Path,
        required=False,
        help="Path to csv file with seed values for output positions.",
    )

    parser.add_argument(
        "--latest_timestamp",
        # NOTE: the default is evaluated once, when the parser is built,
        # not when the arguments are parsed.
        default=datetime.utcnow(),
        type=pd.Timestamp,
        help="Timestamp used to determine latest data. Default utcnow.",
    )

    parser.add_argument("--verbose", "-v", default=False, action="store_true")

    return parser
113
+
114
+
115
@attrs.define
class StationConfiguration:
    """
    Helper class for storing station specific configurations with respect to

    * Installation specific distance measurements such as height differences between instruments
    * Reference strings such as stid, station_site and wmo_id
    * BUFR export specific parameters

    TODO: The station related meta data should be fetched from a station specific
    configuration file in the future or from header data in the data source.
    """

    # Station identifier; the only required field.
    stid: str
    station_site: Optional[str] = None
    project: Optional[str] = None
    station_type: Optional[str] = None
    wmo_id: Optional[str] = None

    # Installation offsets. get_bufr_variables combines them with the fitted GPS
    # altitude ("gps_alt_fit") or the smoothed boom height ("z_boom_u_smooth").
    # A value of None leaves the derived height undefined (NaN).
    barometer_from_gps: Optional[float] = None
    anemometer_from_sonic_ranger: Optional[float] = None
    temperature_from_sonic_ranger: Optional[float] = None
    height_of_gps_from_station_ground: Optional[float] = None
    sonic_ranger_from_gps: Optional[float] = None

    # The station data will be exported to BUFR if True. Otherwise, it will only export latest position
    export_bufr: bool = False
    comment: Optional[str] = None

    # Skip specific variables for stations.
    # If a variable has known bad data, use this collection to skip the variable.
    # Note that a station is not exported when both air temperature and pressure
    # are missing, as currently implemented in min_data_check() in this module.
    # ['p_i'],  # EXAMPLE
    skipped_variables: List[str] = attrs.field(factory=list)

    # When True, get_bufr only refreshes the timestamp of the stored position
    # and keeps the previously known lat/lon/alt.
    positions_update_timestamp_only: bool = False

    def as_dict(self) -> Dict:
        """Return all fields as a plain dictionary (via attrs.asdict)."""
        return attrs.asdict(self)
154
+
155
+
156
def load_station_configuration_mapping(
    fp: Union[str, Path, TextIO]
) -> Mapping[str, StationConfiguration]:
    """
    Load station configurations from a toml source.

    Every top-level table of the toml document is turned into one
    StationConfiguration; the table's entries are passed as keyword arguments.

    Parameters
    ----------
    fp :
        Path to, or open handle of, the toml file

    Returns
    -------
    Mapping from the top-level toml key (stid) to its StationConfiguration

    """
    raw_tables = toml.load(fp)
    mapping = {}
    for station_id, fields in raw_tables.items():
        mapping[station_id] = StationConfiguration(**fields)
    return mapping
176
+
177
+
178
def write_station_configuration_mapping(
    config_mapping: Mapping[str, StationConfiguration], fp: TextIO
):
    """
    Serialize station configurations to toml.

    The output is keyed by each configuration's own ``stid`` attribute; the keys
    of ``config_mapping`` are not used.

    Parameters
    ----------
    config_mapping
        Mapping from stid to StationConfiguration
    fp
        Open, writable text handle receiving the toml document
    """
    serializable = {}
    for station_config in config_mapping.values():
        serializable[station_config.stid] = station_config.as_dict()
    toml.dump(serializable, fp)
195
+
196
+
197
def process_station(
    file_path: Path,
    output_path: Path,
    now_timestamp: datetime,
    latest_timestamp: Optional[datetime],
    time_limit: str,
    stid: str,
    station_configuration: StationConfiguration,
) -> Optional[Dict]:
    """
    Process one station file: determine its latest position and, if configured,
    write a BUFR message for the latest observation.

    Parameters
    ----------
    file_path
        L3 csv file of the station
    output_path
        File the BUFR message is written to
    now_timestamp
        Rows after this timestamp are ignored (see load_data)
    latest_timestamp
        Timestamp of the most recent observation already handled. A BUFR file is
        only written for strictly newer data. None disables this check.
    time_limit
        Passed to get_latest_data as lin_reg_time_limit
    stid
        Station id, used for log messages
    station_configuration
        Station specific configuration (skipped variables, export flag, offsets)

    Returns
    -------
    None if get_latest_data finds no usable row. Otherwise a dict with the keys
    "timestamp", "lon", "lat" and "alt"; the three coordinates are None when the
    position data is insufficient.
    """
    df = load_data(file_path, now_timestamp)

    # Select current data
    latest_data = get_latest_data(
        df,
        lin_reg_time_limit=time_limit,
    )

    if latest_data is None:
        logger.info("No valid instantaneous timestamps!")
        return None

    latest_data = filter_skipped_variables(
        latest_data, vars_to_skip=station_configuration.skipped_variables
    )

    # Check that we have minimum required valid data
    sufficient_wx_data, sufficient_position_data = min_data_check(latest_data)

    station_position = {"timestamp": latest_data.name}
    if not sufficient_position_data:
        logger.warning("Insufficient position data")
        # Don't use any position attributes from latest_data, and don't export:
        # the BUFR message needs a valid position.
        station_position["lon"] = None
        station_position["lat"] = None
        station_position["alt"] = None
        return station_position

    station_position["lon"] = latest_data.get("gps_lon_fit")
    station_position["lat"] = latest_data.get("gps_lat_fit")
    station_position["alt"] = latest_data.get("gps_alt_fit")

    if not station_configuration.export_bufr:
        # Position-only station
        return station_position

    if not sufficient_wx_data:
        logger.warning(f"Failed min data wx {stid}")
        return station_position

    # Only export observations newer than the last handled timestamp.
    # latest_timestamp is Optional; None means there is nothing to compare against.
    if latest_timestamp is not None and latest_data.name <= latest_timestamp:
        logger.info(f"No new data {latest_data.name} <= {latest_timestamp}")
        return station_position

    # Construct and export BUFR file
    bufr_variables = get_bufr_variables(
        data=latest_data,
        station_configuration=station_configuration,
    )
    with output_path.open("bw") as fp:
        write_bufr_message(variables=bufr_variables, file=fp)

    return station_position
258
+
259
+
260
def load_data(file_path: Path, now_timestamp: datetime) -> pd.DataFrame:
    """
    Load an AWS csv file as a time-indexed, time-sorted dataframe.

    The "time" column is parsed as dates and becomes the index. Rows with an
    index later than now_timestamp are dropped.

    Parameters
    ----------
    file_path
        csv file containing a "time" column
    now_timestamp
        Upper bound of the returned time range

    Returns
    -------
    Dataframe with all remaining columns from the csv file and time as index
    """
    raw = pd.read_csv(file_path, delimiter=",", parse_dates=["time"])
    by_time = raw.set_index("time")
    by_time = by_time.sort_index()
    # Slice on the time index up to now_timestamp
    return by_time[:now_timestamp]
281
+
282
+
283
def get_bufr(
    bufr_out: Path,
    input_files: Sequence[Path],
    positions_filepath: Optional[Path],
    timestamps_pickle_filepath: Optional[Path],
    station_configuration_path: Optional[Path],
    now_timestamp: Optional[datetime] = None,
    positions_seed_path: Optional[Path] = None,
    earliest_timestamp: Optional[datetime] = None,
    store_positions: bool = False,
    time_limit: str = "91d",
):
    """
    Main function for generating BUFR files and determine latest positions from a sequence of csv files

    The file timestamps_pickle_filepath is used to maintain a local state in the execution environment to ensure the
    same data is not processed multiple times.


    Parameters
    ----------
    bufr_out
        Path to the BUFR out directory.
    input_files
        List of L3 csv file paths. The station id is taken from the file name
        (everything before the last underscore of the stem).
    positions_filepath
        Path to write latest positions. Used to retrieve a static set of positions to register stations with DMI/WMO
    timestamps_pickle_filepath
        Path to pickle file used for storing latest timestamp
    station_configuration_path
        Path to toml file with configuration entries for each station
    now_timestamp
        get_bufr will export the latest data before now_timestamp. Default datetime.utcnow()
    positions_seed_path
        Path to csv file with position data used as default values for the output position.
    earliest_timestamp
        The earliest allowed timestamp for data to be included in the output. Default now_timestamp - 2 days
    store_positions
        Flag determine if latest positions are exported.
    time_limit
        Previous time to limit dataframe before applying linear regression.

    """
    if now_timestamp is None:
        now_timestamp = datetime.utcnow()

    if earliest_timestamp is None:
        earliest_timestamp = now_timestamp - timedelta(days=2)

    # Prepare (latest) positions
    positions = dict()
    if positions_seed_path:
        positions_seed = pd.read_csv(
            positions_seed_path, index_col=0, delimiter=",", parse_dates=["timestamp"]
        ).to_dict(orient="index")
        logger.info(f"Seed positions for {positions_seed.keys()}")
        positions.update(positions_seed)

    # Prepare station configurations
    if station_configuration_path is None:
        station_configuration_mapping = dict()
    else:
        station_configuration_mapping = load_station_configuration_mapping(
            station_configuration_path
        )

    # Prepare bufr output dir
    bufr_out.mkdir(parents=True, exist_ok=True)

    # Read existing timestamps pickle to dictionary
    if timestamps_pickle_filepath and timestamps_pickle_filepath.exists():
        # NOTE: pickle must only be used on trusted files; this is local state
        # written by a previous run of this function.
        with timestamps_pickle_filepath.open("rb") as handle:
            latest_timestamps = pickle.load(handle)
    else:
        logger.info("latest_timestamps.pickle not found!")
        latest_timestamps = {}

    # Timestamps of the data handled in this run; persisted at the end
    current_timestamps = {}

    # Setup diagnostic lists (logger.info at end)
    # NOTE(review): only `skipped` is filled in this function. process_station
    # does not report why a station was not exported, so the remaining lists stay
    # empty and the summary count below only accounts for skipped stations.
    skipped = []
    no_recent_data = []
    no_entry_latest_timestamps = []
    failed_min_data_wx = []
    failed_min_data_pos = []

    # Iterate through csv files
    for file_path in input_files:
        stid = file_path.stem.rsplit("_", 1)[0]
        logger.info("####### Processing {} #######".format(stid))

        if stid not in station_configuration_mapping:
            # Unknown stations fall back to the default configuration
            # (export_bufr=False), so only their position is tracked.
            logger.info(f"Station id {stid} not in configuration mapping.")
            station_configuration = StationConfiguration(stid=stid)
            skipped.append(stid)
        else:
            station_configuration = station_configuration_mapping[stid]

        output_path = bufr_out / f"{stid}.bufr"
        logger.info(f"Generating {output_path} from {file_path}")
        # Never accept data older than earliest_timestamp, even if the stored
        # state is older (or missing).
        latest_timestamp = latest_timestamps.get(stid, earliest_timestamp)
        latest_timestamp = max(earliest_timestamp, latest_timestamp)

        try:
            station_position = process_station(
                file_path=file_path,
                output_path=output_path,
                now_timestamp=now_timestamp,
                latest_timestamp=latest_timestamp,
                time_limit=time_limit,
                stid=stid,
                station_configuration=station_configuration,
            )
        except Exception:
            # One broken station must not stop the remaining stations
            logger.exception(f"Failed processing {stid}")
            continue

        if station_position is None:
            logger.warning(f"No position information available for {stid}")
            continue

        # Remember the timestamp of the handled observation. Without this the
        # pickle written below would always be empty and the same data would be
        # exported again on the next run.
        current_timestamps[stid] = station_position["timestamp"]

        if stid not in positions:
            positions[stid] = dict()

        if station_configuration.positions_update_timestamp_only:
            positions[stid]["timestamp"] = station_position["timestamp"]
        else:
            positions[stid].update(station_position)

    # Write the most recent timestamps back to the pickle on disk
    if timestamps_pickle_filepath:
        logger.info(f"writing latest_timestamps to {timestamps_pickle_filepath}")
        with timestamps_pickle_filepath.open("wb") as handle:
            pickle.dump(current_timestamps, handle, protocol=pickle.HIGHEST_PROTOCOL)

    if store_positions:
        if positions_filepath is None:
            # DataFrame.to_csv(None) would only build a string and discard it
            logger.warning(
                "store_positions is set but no positions_filepath was given; "
                "positions are not written."
            )
        else:
            positions_df = pd.DataFrame.from_dict(
                positions,
                orient="index",
                columns=["timestamp", "lat", "lon", "alt"],
            )
            positions_df.sort_index(inplace=True)
            positions_df.to_csv(positions_filepath, index_label="stid")

    logger.info("--------------------------------")
    not_processed_wx_pos = set(failed_min_data_wx + failed_min_data_pos)
    not_processed_count = (
        len(skipped)
        + len(no_recent_data)
        + len(no_entry_latest_timestamps)
        + len(not_processed_wx_pos)
    )
    logger.info(
        "BUFR exported for {} of {} fpaths.".format(
            (len(input_files) - not_processed_count), len(input_files)
        )
    )
    logger.info("")
    logger.info("skipped: {}".format(skipped))
    logger.info("no_recent_data: {}".format(no_recent_data))
    logger.info("no_entry_latest_timestamps: {}".format(no_entry_latest_timestamps))
    logger.info("failed_min_data_wx: {}".format(failed_min_data_wx))
    logger.info("failed_min_data_pos: {}".format(failed_min_data_pos))
    logger.info("--------------------------------")
449
+
450
+
451
def filter_skipped_variables(
    row: pd.Series, vars_to_skip: Collection[str]
) -> pd.Series:
    """
    Blank out the variables listed in vars_to_skip.

    The series is modified in place: every listed variable that is present in
    the series is set to np.nan. Names that are not in the series are ignored.

    Parameters
    ----------
    row
        Series to be modified
    vars_to_skip
        Variable names to be skipped

    Returns
    -------
    The same series object that was passed in

    """
    present_names = set(row.keys())
    for name in present_names.intersection(vars_to_skip):
        row[name] = np.nan
        logger.info("----> Skipping var: {}".format(name))
    return row
473
+
474
+
475
def _is_configured_offset(value) -> bool:
    """Return True if value is a usable numeric offset (int or float, not bool/None)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def get_bufr_variables(
    data: pd.Series,
    station_configuration: StationConfiguration,
) -> BUFRVariables:
    """
    Helper function for converting our variables to the variables needed for bufr export.

    Heights are derived from the fitted GPS altitude ("gps_alt_fit") or the
    smoothed boom height ("z_boom_u_smooth") plus the station specific offsets.
    A height whose offset is not configured is exported as NaN.

    Parameters
    ----------
    data
        Series with processed l3 variables from get_latest_data
    station_configuration
        Station specific offsets and identifiers

    Returns
    -------
    BUFRVariables used by bufr_utilities

    """
    # Offsets read from toml may be integers (e.g. `1` instead of `1.0`); both
    # must be accepted, otherwise the derived height silently becomes NaN.
    heightOfStationGroundAboveMeanSeaLevel = np.nan
    if _is_configured_offset(station_configuration.height_of_gps_from_station_ground):
        heightOfStationGroundAboveMeanSeaLevel = (
            data["gps_alt_fit"] - station_configuration.height_of_gps_from_station_ground
        )

    heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = np.nan
    if _is_configured_offset(station_configuration.temperature_from_sonic_ranger):
        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = (
            data["z_boom_u_smooth"] + station_configuration.temperature_from_sonic_ranger
        )

    heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = np.nan
    if _is_configured_offset(station_configuration.anemometer_from_sonic_ranger):
        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = (
            data["z_boom_u_smooth"] + station_configuration.anemometer_from_sonic_ranger
        )

    heightOfBarometerAboveMeanSeaLevel = np.nan
    if _is_configured_offset(station_configuration.barometer_from_gps):
        heightOfBarometerAboveMeanSeaLevel = (
            data["gps_alt_fit"] + station_configuration.barometer_from_gps
        )

    output_row = BUFRVariables(
        wmo_id=station_configuration.wmo_id,
        station_type=station_configuration.station_type,
        timestamp=data.name,
        # DMI wants non-corrected rh
        relativeHumidity=data.rh_i,
        # Convert air temp, C to Kelvin
        airTemperature=data.t_i + 273.15,
        # Convert pressure, correct the -1000 offset, then hPa to Pa
        # note that instantaneous pressure has 0.1 hPa precision
        pressure=(data.p_i + 1000.0) * 100.0,
        windDirection=data.wdir_i,
        windSpeed=data.wspd_i,
        latitude=data.gps_lat_fit,
        longitude=data.gps_lon_fit,
        # TODO: This might need to be relative to snow height instead.
        heightOfStationGroundAboveMeanSeaLevel=heightOfStationGroundAboveMeanSeaLevel,
        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH,
        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD,
        heightOfBarometerAboveMeanSeaLevel=heightOfBarometerAboveMeanSeaLevel,
    )
    return output_row
541
+
542
+
543
def min_data_check(s):
    """Check that we have minimum required fields to proceed with writing to BUFR

    For wx vars, the check only fails when both air temp and pressure are NaN;
    having just one of them is accepted.
    If you know a specific var is reporting bad data, you can ignore just that var
    using ``skipped_variables`` in the station configuration.

    For position, all three of lat, lon and elevation must be non-NaN.

    Parameters
    ----------
    s : pandas series
        The current obset we are working with (for BUFR submission)

    Returns
    -------
    min_data_wx_result : bool
        True (default), the test for min wx data passed. False, the test failed.
    min_data_pos_result : bool
        True (default), the test for min position data passed. False, the test failed.
    """
    min_data_wx_result = True
    min_data_pos_result = True

    # If both air temp and pressure are nan, do not submit.
    # This will allow the case of having only one or the other.
    if pd.isna(s["t_i"]) and pd.isna(s["p_i"]):
        logger.warning("----> Failed min_data_check for air temp and pressure!")
        min_data_wx_result = False

    # Require all three: lat, lon, elev
    position_fields = ("gps_lat_fit", "gps_lon_fit", "gps_alt_fit")
    if any(pd.isna(s[field]) for field in position_fields):
        logger.warning("----> Failed min_data_check for position!")
        min_data_pos_result = False

    return min_data_wx_result, min_data_pos_result
594
+
595
def main():
    """
    Command-line entry point: parse arguments, configure logging to stdout,
    resolve the input files and run get_bufr.
    """
    args = parse_arguments_bufr().parse_args()

    logging.basicConfig(
        stream=sys.stdout,
        format="%(asctime)s; %(levelname)s; %(name)s; %(message)s",
        level=logging.DEBUG if args.verbose else logging.INFO,
    )

    # Existing paths are used as they are; anything else is treated as a glob
    # pattern and replaced by its matches (possibly none).
    input_files: List[Path] = []
    for candidate in args.input_files:
        if candidate.exists():
            input_files.append(candidate)
            continue
        input_files.extend(Path(match) for match in glob.glob(candidate.as_posix()))

    get_bufr(
        bufr_out=args.bufr_out,
        input_files=input_files,
        positions_filepath=args.positions_filepath,
        timestamps_pickle_filepath=args.timestamps_pickle_filepath,
        station_configuration_path=args.station_configuration_mapping,
        now_timestamp=args.latest_timestamp,
        positions_seed_path=args.position_seed,
        store_positions=args.store_positions,
        time_limit=args.time_limit,
    )


if __name__ == "__main__":
    main()