pypromice 1.3.3__py3-none-any.whl → 1.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pypromice might be problematic.
- pypromice/postprocess/bufr_to_csv.py +11 -0
- pypromice/postprocess/bufr_utilities.py +489 -0
- pypromice/postprocess/get_bufr.py +622 -284
- pypromice/postprocess/positions_seed.csv +5 -0
- pypromice/postprocess/real_time_utilities.py +241 -0
- pypromice/postprocess/station_configurations.toml +762 -0
- pypromice/process/L0toL1.py +4 -2
- pypromice/process/L1toL2.py +1 -0
- pypromice/process/value_clipping.py +4 -13
- pypromice/process/variables.csv +13 -15
- pypromice/qc/github_data_issues.py +10 -40
- {pypromice-1.3.3.dist-info → pypromice-1.3.5.dist-info}/METADATA +2 -1
- {pypromice-1.3.3.dist-info → pypromice-1.3.5.dist-info}/RECORD +17 -14
- {pypromice-1.3.3.dist-info → pypromice-1.3.5.dist-info}/WHEEL +1 -1
- {pypromice-1.3.3.dist-info → pypromice-1.3.5.dist-info}/entry_points.txt +1 -1
- pypromice/postprocess/csv2bufr.py +0 -508
- pypromice/postprocess/wmo_config.py +0 -179
- {pypromice-1.3.3.dist-info → pypromice-1.3.5.dist-info}/LICENSE.txt +0 -0
- {pypromice-1.3.3.dist-info → pypromice-1.3.5.dist-info}/top_level.txt +0 -0
@@ -2,290 +2,628 @@
 
 """
 Command-line script for running BUFR file generation
-
-
+
+Post-processing functions for AWS station data, such as converting PROMICE and GC-Net data files to WMO-compliant BUFR files
+
 """
-import pandas as pd
-import glob, os
 import argparse
+import glob
+import logging
+import pickle
+import sys
 from datetime import datetime, timedelta
-import
[… the remainder of removed line 12 and removed lines 13-291 are not rendered in the source diff view …]
+from pathlib import Path
+from typing import List, Dict, Mapping, Optional, Collection, Sequence, Union, TextIO
+
+import attrs
+import numpy as np
+import pandas as pd
+import toml
+
+from pypromice.postprocess.bufr_utilities import write_bufr_message, BUFRVariables
+from pypromice.postprocess.real_time_utilities import get_latest_data
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_STATION_CONFIGURATION_PATH = Path(__file__).parent.joinpath(
+    "station_configurations.toml"
+)
+DEFAULT_POSITION_SEED_PATH = Path(__file__).parent.joinpath("positions_seed.csv")
+DEFAULT_LIN_REG_TIME_LIMIT = "91d"
+
+def parse_arguments_bufr() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--store_positions",
+        "--positions",
+        action="store_true",
+        required=False,
+        default=False,
+        help="If included (True), make a positions dict and output AWS_latest_locations.csv file.",
+    )
+
+    parser.add_argument(
+        "--positions-filepath",
+        "-p",
+        type=Path,
+        required=False,
+        help="Path to write AWS_latest_locations.csv file.",
+    )
+
+    parser.add_argument(
+        "--time-limit",
+        default=DEFAULT_LIN_REG_TIME_LIMIT,
+        type=str,
+        required=False,
+        help="Previous time to limit dataframe before applying linear regression.",
+    )
+
+    parser.add_argument(
+        "--input_files",
+        "--l3-filepath",
+        "-i",
+        type=Path,
+        nargs="+",
+        required=True,
+        help="Path to L3 tx .csv files. Can be direct paths or glob patterns",
+    )
+
+    parser.add_argument(
+        "--bufr-out",
+        "-o",
+        type=Path,
+        required=True,
+        help="Path to the BUFR out directory.",
+    )
+
+    parser.add_argument(
+        "--timestamps-pickle-filepath",
+        type=Path,
+        required=False,
+        help="Path to the latest_timestamps.pickle file.",
+    )
+
+    parser.add_argument(
+        "--station_configuration_mapping",
+        default=DEFAULT_STATION_CONFIGURATION_PATH,
+        type=Path,
+        required=False,
+        help="Path to csv file with station meta data and BUFR export configuration",
+    )
+
+    parser.add_argument(
+        "--position_seed",
+        default=DEFAULT_POSITION_SEED_PATH,
+        type=Path,
+        required=False,
+        help="Path to csv file with seed values for output positions.",
+    )
+
+    parser.add_argument(
+        '--latest_timestamp',
+        default=datetime.utcnow(),
+        type=pd.Timestamp,
+        help="Timestamp used to determine latest data. Default utcnow."
+    )
+
+    parser.add_argument("--verbose", "-v", default=False, action="store_true")
+
+    return parser
+
+
+@attrs.define
+class StationConfiguration:
+    """
+    Helper class for storing station specific configurations with respect to
+
+    * Installation specific distance measurements such as height differences between instruments
+    * Reference strings such as stid, station_site and wmo_id
+    * BUFR export specific parameters
+
+    # TODO: The station related meta data should be fetched from a station specific configuration files in the future or
+    # from header data in data source.
+    """
+
+    stid: str
+    station_site: str = None
+    project: Optional[str] = None
+    station_type: Optional[str] = None
+    wmo_id: Optional[str] = None
+    barometer_from_gps: Optional[float] = None
+    anemometer_from_sonic_ranger: Optional[float] = None
+    temperature_from_sonic_ranger: Optional[float] = None
+    height_of_gps_from_station_ground: Optional[float] = None
+    sonic_ranger_from_gps: Optional[float] = None
+
+    # The station data will be exported to BUFR if True. Otherwise, it will only export latest position
+    export_bufr: bool = False
+    comment: Optional[str] = None
+
+    # skip specific variables for stations
+    # If a variable has known bad data, use this collection to skip the variable
+    # Note that if a station is not reporting both air temp and pressure it will be skipped,
+    # as currently implemented in csv2bufr.min_data_check().
+    # ['p_i'], # EXAMPLE
+    skipped_variables: List[str] = attrs.field(factory=list)
+
+    positions_update_timestamp_only: bool = False
+
+    def as_dict(self) -> Dict:
+        return attrs.asdict(self)
+
+
+def load_station_configuration_mapping(
+    fp: Union[str, Path, TextIO]
+) -> Mapping[str, StationConfiguration]:
+    """
+    Read station configurations from toml file
+
+    Parameters
+    ----------
+    fp :
+        Path to or open toml file
+
+    Returns
+    -------
+    Mapping from stid to StationConfiguration
+
+    """
+    return {
+        stid: StationConfiguration(**config_dict)
+        for stid, config_dict in toml.load(fp).items()
+    }
+
+
+def write_station_configuration_mapping(
+    config_mapping: Mapping[str, StationConfiguration], fp: TextIO
+):
+    """
+    Write station configuration to toml file
+
+    Parameters
+    ----------
+    config_mapping
+        Mapping from stid to StationConfiguration
+    fp
+        open writable TextIO
+    """
+    config_mapping = {
+        config.stid: config.as_dict() for config in config_mapping.values()
+    }
+    toml.dump(config_mapping, fp)
+
+
+def process_station(
+    file_path: Path,
+    output_path: Path,
+    now_timestamp: datetime,
+    latest_timestamp: Optional[datetime],
+    time_limit: str,
+    stid: str,
+    station_configuration: StationConfiguration,
+) -> Optional[Dict]:
+    df = load_data(file_path, now_timestamp)
+
+    # Select current data
+    latest_data = get_latest_data(
+        df,
+        lin_reg_time_limit=time_limit,
+    )
+
+    if latest_data is None:
+        logger.info("No valid instantaneous timestamps!")
+        return None
+
+    latest_data = filter_skipped_variables(
+        latest_data, vars_to_skip=station_configuration.skipped_variables
+    )
+
+    # Check that we have minimum required valid data
+    sufficient_wx_data, sufficient_position_data = min_data_check(latest_data)
+
+    station_position = dict()
+    station_position["timestamp"] = latest_data.name
+    if sufficient_position_data:
+        station_position["lon"] = latest_data.get("gps_lon_fit")
+        station_position["lat"] = latest_data.get("gps_lat_fit")
+        station_position["alt"] = latest_data.get("gps_alt_fit")
+    else:
+        logger.warning("Insufficient position data")
+        # Don't use any position attributes from latest_data
+        station_position["lon"] = None
+        station_position["lat"] = None
+        station_position["alt"] = None
+        return station_position
+
+    if station_configuration.export_bufr:
+        if not sufficient_wx_data:
+            logger.warning(f"Failed min data wx {stid}")
+            return station_position
+
+        # Store current timestamp
+        if latest_data.name <= latest_timestamp:
+            logger.info(f"No new data {latest_data.name} <= {latest_timestamp}")
+            return station_position
+
+        # Construct and export BUFR file
+        bufr_variables = get_bufr_variables(
+            data=latest_data,
+            station_configuration=station_configuration,
+        )
+        with output_path.open("bw") as fp:
+            write_bufr_message(variables=bufr_variables, file=fp)
+
+    return station_position
+
+
+def load_data(file_path: Path, now_timestamp: datetime) -> pd.DataFrame:
+    """
+    Read AWS data from csv file using time as index and filter all rows after now_timestamp
+
+    Parameters
+    ----------
+    file_path
+    now_timestamp
+
+    Returns
+    -------
+    Dataframe with all columns from csv file and time as index
+    """
+    # Read csv file
+    df: pd.DataFrame = (
+        pd.read_csv(file_path, delimiter=",", parse_dates=["time"])
+        .set_index("time")
+        .sort_index()
+    )
+    df = df[:now_timestamp]
+    return df
+
+
+def get_bufr(
+    bufr_out: Path,
+    input_files: Sequence[Path],
+    positions_filepath: Optional[Path],
+    timestamps_pickle_filepath: Optional[Path],
+    station_configuration_path: Optional[Path],
+    now_timestamp: Optional[datetime] = None,
+    positions_seed_path: Optional[Path] = None,
+    earliest_timestamp: datetime = None,
+    store_positions: bool = False,
+    time_limit: str = "91d",
+):
+    """
+    Main function for generating BUFR files and determining latest positions from a sequence of csv files
+
+    The file timestamps_pickle_filepath is used to maintain a local state in the execution environment to ensure the
+    same data is not processed multiple times.
+
+
+    Parameters
+    ----------
+    bufr_out
+        Path to the BUFR out directory.
+    input_files
+        List of L3 csv file paths.
+    positions_filepath
+        Path to write latest positions. Used to retrieve a static set of positions to register stations with DMI/WMO
+    timestamps_pickle_filepath
+        Path to pickle file used for storing latest timestamp
+    station_configuration_path
+        Path to toml file with configuration entries for each station
+    now_timestamp
+        get_bufr will export the latest data before now_timestamp. Default datetime.utcnow()
+    positions_seed_path
+        Path to csv file with position data used as default values for the output position.
+    earliest_timestamp
+        The earliest allowed timestamp for data to be included in the output. Default now_timestamp - 2 days
+    store_positions
+        Flag determining whether latest positions are exported.
+    time_limit
+        Previous time to limit dataframe before applying linear regression.
+
+    """
+    if now_timestamp is None:
+        now_timestamp = datetime.utcnow()
+
+    if earliest_timestamp is None:
+        earliest_timestamp = now_timestamp - timedelta(days=2)
+
+    # Prepare (latest) positions
+    positions = dict()
+    if positions_seed_path:
+        positions_seed = pd.read_csv(
+            positions_seed_path, index_col=0, delimiter=",", parse_dates=["timestamp"]
+        ).to_dict(orient="index")
+        logger.info(f"Seed positions for {positions_seed.keys()}")
+        positions.update(positions_seed)
+
+    # Prepare station configurations
+    if station_configuration_path is None:
+        station_configuration_mapping = dict()
+    else:
+        station_configuration_mapping = load_station_configuration_mapping(
+            station_configuration_path
+        )
+
+    # Prepare bufr output dir
+    bufr_out.mkdir(parents=True, exist_ok=True)
+
+    # Read existing timestamps pickle to dictionary
+    if timestamps_pickle_filepath and timestamps_pickle_filepath.exists():
+        with timestamps_pickle_filepath.open("rb") as handle:
+            latest_timestamps = pickle.load(handle)
+    else:
+        logger.info("latest_timestamps.pickle not found!")
+        latest_timestamps = {}
+
+    # Initiate a new dict for current timestamps
+    current_timestamps = {}
+
+    # Setup diagnostic lists (logger.info at end)
+    skipped = []
+    no_recent_data = []
+    no_entry_latest_timestamps = []
+    failed_min_data_wx = []
+    failed_min_data_pos = []
+
+    # Iterate through csv files
+    for file_path in input_files:
+        stid = file_path.stem.rsplit("_", 1)[0]
+        logger.info("####### Processing {} #######".format(stid))
+
+        if stid not in station_configuration_mapping:
+            logger.info(f"Station id {stid} not in configuration mapping.")
+            station_configuration = StationConfiguration(stid=stid)
+            skipped.append(stid)
+        else:
+            station_configuration = station_configuration_mapping[stid]
+
+        output_path = bufr_out / f"{stid}.bufr"
+        logger.info(f"Generating {output_path} from {file_path}")
+        latest_timestamp = latest_timestamps.get(stid, earliest_timestamp)
+        latest_timestamp = max(earliest_timestamp, latest_timestamp)
+
+        try:
+            station_position = process_station(
+                file_path=file_path,
+                output_path=output_path,
+                now_timestamp=now_timestamp,
+                latest_timestamp=latest_timestamp,
+                time_limit=time_limit,
+                stid=stid,
+                station_configuration=station_configuration,
+            )
+        except Exception:
+            logger.exception(f"Failed processing {stid}")
+            continue
+
+        if station_position is None:
+            logger.warning(f"No position information available for {stid}")
+
+        else:
+            if stid not in positions:
+                positions[stid] = dict()
+
+            if station_configuration.positions_update_timestamp_only:
+                positions[stid]["timestamp"] = station_position["timestamp"]
+            else:
+                positions[stid].update(station_position)
+
+    # Write the most recent timestamps back to the pickle on disk
+    logger.info(f"writing latest_timestamps to {timestamps_pickle_filepath}")
+    if timestamps_pickle_filepath:
+        with timestamps_pickle_filepath.open("wb") as handle:
+            pickle.dump(current_timestamps, handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+    if store_positions:
+        positions_df = pd.DataFrame.from_dict(
+            positions,
+            orient="index",
+            # columns=['timestamp','lat','lon','alt','lat_source','lon_source']
+            columns=["timestamp", "lat", "lon", "alt"],
+        )
+        positions_df.sort_index(inplace=True)
+        positions_df.to_csv(positions_filepath, index_label="stid")
+
+    logger.info("--------------------------------")
+    not_processed_wx_pos = set(failed_min_data_wx + failed_min_data_pos)
+    not_processed_count = (
+        len(skipped)
+        + len(no_recent_data)
+        + len(no_entry_latest_timestamps)
+        + len(not_processed_wx_pos)
+    )
+    logger.info(
+        "BUFR exported for {} of {} fpaths.".format(
+            (len(input_files) - not_processed_count), len(input_files)
+        )
+    )
+    logger.info("")
+    logger.info("skipped: {}".format(skipped))
+    logger.info("no_recent_data: {}".format(no_recent_data))
+    logger.info("no_entry_latest_timestamps: {}".format(no_entry_latest_timestamps))
+    logger.info("failed_min_data_wx: {}".format(failed_min_data_wx))
+    logger.info("failed_min_data_pos: {}".format(failed_min_data_pos))
+    logger.info("--------------------------------")
+
+
+def filter_skipped_variables(
+    row: pd.Series, vars_to_skip: Collection[str]
+) -> pd.Series:
+    """
+    Mutate input series by setting var_to_skip to np.nan
+
+    Parameters
+    ----------
+    row
+    vars_to_skip
+        List of variable names to be skipped
+
+    Returns
+    -------
+    Input series
+
+    """
+    vars_to_skip = set(row.keys()) & set(vars_to_skip)
+    for var_key in vars_to_skip:
+        row[var_key] = np.nan
+        logger.info("----> Skipping var: {}".format(var_key))
+    return row
+
+
+def get_bufr_variables(
+    data: pd.Series,
+    station_configuration: StationConfiguration,
+) -> BUFRVariables:
+    """
+    Helper function for converting our variables to the variables needed for bufr export.
+
+    Parameters
+    ----------
+    data
+        Series with processed l3 variables from get_latest_data
+
+    station_configuration
+
+    Returns
+    -------
+    BUFRVariables used by bufr_utilities
+
+    """
+    heightOfStationGroundAboveMeanSeaLevel = np.nan
+    if isinstance(station_configuration.height_of_gps_from_station_ground, float):
+        heightOfStationGroundAboveMeanSeaLevel = (
+            data["gps_alt_fit"] - station_configuration.height_of_gps_from_station_ground
+        )
+
+    heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = np.nan
+    if isinstance(station_configuration.temperature_from_sonic_ranger, float):
+        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = (
+            data["z_boom_u_smooth"] + station_configuration.temperature_from_sonic_ranger
+        )
+
+    heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = np.nan
+    if isinstance(station_configuration.anemometer_from_sonic_ranger, float):
+        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = (
+            data["z_boom_u_smooth"] + station_configuration.anemometer_from_sonic_ranger
+        )
+
+    heightOfBarometerAboveMeanSeaLevel = np.nan
+    if isinstance(station_configuration.barometer_from_gps, float):
+        heightOfBarometerAboveMeanSeaLevel = (
+            data["gps_alt_fit"] + station_configuration.barometer_from_gps
+        )
+
+
+    output_row = BUFRVariables(
+        wmo_id=station_configuration.wmo_id,
+        station_type=station_configuration.station_type,
+        timestamp=data.name,
+        # DMI wants non-corrected rh
+        relativeHumidity=data.rh_i,
+        # Convert air temp, C to Kelvin
+        airTemperature=data.t_i + 273.15,
+        # Convert pressure, correct the -1000 offset, then hPa to Pa
+        # note that instantaneous pressure has 0.1 hPa precision
+        pressure=(data.p_i + 1000.0) * 100.0,
+        windDirection=data.wdir_i,
+        windSpeed=data.wspd_i,
+        latitude=data.gps_lat_fit,
+        longitude=data.gps_lon_fit,
+        # TODO: This might need to be relative to snow height instead.
+        heightOfStationGroundAboveMeanSeaLevel=heightOfStationGroundAboveMeanSeaLevel,
+        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH=heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH,
+        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD=heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD,
+        heightOfBarometerAboveMeanSeaLevel=heightOfBarometerAboveMeanSeaLevel,
+    )
+    return output_row
+
+
+def min_data_check(s):
+    """Check that we have minimum required fields to proceed with writing to BUFR
+    For wx vars, we currently require both air temp and pressure to be non-NaN.
+    If you know a specific var is reporting bad data, you can ignore just that var
+    using the vars_to_skip dict in wmo_config.
+
+    Parameters
+    ----------
+    s : pandas series
+        The current obset we are working with (for BUFR submission)
+
+    Returns
+    -------
+    min_data_wx_result : bool
+        True (default), the test for min wx data passed. False, the test failed.
+    min_data_pos_result : bool
+        True (default), the test for min position data passed. False, the test failed.
+    """
+    min_data_wx_result = True
+    min_data_pos_result = True
+
+    # Can use pd.isna() or math.isnan() below...
+
+    # Always require valid air temp and valid pressure (both must be non-nan)
+    # if (pd.isna(s['t_i']) is False) and (pd.isna(s['p_i']) is False):
+    #     pass
+    # else:
+    #     print('----> Failed min_data_check for air temp and pressure!')
+    #     min_data_wx_result = False
+
+    # If both air temp and pressure are nan, do not submit.
+    # This will allow the case of having only one or the other.
+    if (pd.isna(s["t_i"]) is True) and (pd.isna(s["p_i"]) is True):
+        logger.warning("----> Failed min_data_check for air temp and pressure!")
+        min_data_wx_result = False
+
+    # Missing just elevation OK
+    # if (pd.isna(s['gps_lat_fit']) is False) and (pd.isna(s['gps_lon_fit']) is False):
+    #     pass
+    # Require all three: lat, lon, elev
+    if (
+        (pd.isna(s["gps_lat_fit"]) is False)
+        and (pd.isna(s["gps_lon_fit"]) is False)
+        and (pd.isna(s["gps_alt_fit"]) is False)
+    ):
+        pass
+    else:
+        logger.warning("----> Failed min_data_check for position!")
+        min_data_pos_result = False
+
+    return min_data_wx_result, min_data_pos_result
+
+def main():
+    args = parse_arguments_bufr().parse_args()
+
+    log_level = logging.INFO
+    if args.verbose:
+        log_level = logging.DEBUG
+    logging.basicConfig(
+        stream=sys.stdout,
+        format="%(asctime)s; %(levelname)s; %(name)s; %(message)s",
+        level=log_level,
+    )
+
+    # Interpret all input file paths as glob patterns if they don't exist
+    input_files: List[Path] = list()
+    for path in args.input_files:
+        if path.exists():
+            input_files.append(path)
+        else:
+            # The input path might be a glob pattern
+            input_files += map(Path, glob.glob(path.as_posix()))
+
+    get_bufr(
+        bufr_out=args.bufr_out,
+        input_files=input_files,
+        store_positions=args.store_positions,
+        positions_filepath=args.positions_filepath,
+        time_limit=args.time_limit,
+        timestamps_pickle_filepath=args.timestamps_pickle_filepath,
+        now_timestamp=args.latest_timestamp,
+        station_configuration_path=args.station_configuration_mapping,
+        positions_seed_path=args.position_seed,
+    )
+
+if __name__ == "__main__":
+    main()
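
The rewritten module keeps an `if __name__ == "__main__"` guard, so the parser built by parse_arguments_bufr can be exercised by running the module directly (the installed console-script name lives in entry_points.txt and is not shown in this diff). A minimal sketch with hypothetical paths; note that --input_files values that do not exist on disk are expanded as glob patterns by main():

from pathlib import Path

from pypromice.postprocess.get_bufr import parse_arguments_bufr

# Hypothetical arguments for illustration only.
args = parse_arguments_bufr().parse_args(
    [
        "--input_files", "aws-l3/tx/*/*_hour.csv",
        "--bufr-out", "bufr_out",
        "--store_positions",
        "--positions-filepath", "AWS_latest_locations.csv",
        "--timestamps-pickle-filepath", "latest_timestamps.pickle",
    ]
)
assert args.store_positions is True
assert args.bufr_out == Path("bufr_out")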
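
The new station_configurations.toml maps each stid to keyword arguments of the StationConfiguration class above. A sketch of loading one hypothetical entry (the stid, wmo_id, and instrument offsets below are illustrative, not values from the shipped file; this assumes toml.load accepts a file-like object, as the TextIO type hint suggests):

import io

from pypromice.postprocess.get_bufr import load_station_configuration_mapping

# Keys mirror the attrs fields of StationConfiguration; values are made up.
config_file = io.StringIO(
    '[UPE_L]\n'
    'stid = "UPE_L"\n'
    'station_type = "mobile"\n'
    'wmo_id = "04423"\n'
    'export_bufr = true\n'
    'barometer_from_gps = 0.55\n'
    'anemometer_from_sonic_ranger = 0.4\n'
    'temperature_from_sonic_ranger = 0.0\n'
    'height_of_gps_from_station_ground = 0.9\n'
)
mapping = load_station_configuration_mapping(config_file)
assert mapping["UPE_L"].export_bufr is True
assert mapping["UPE_L"].skipped_variables == []  # attrs factory default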
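
On the unit handling in get_bufr_variables: rh_i is passed through uncorrected, t_i is converted from degrees Celsius to Kelvin, and p_i (stored as hPa with a -1000 offset) has the offset restored and is then converted to Pa. A worked example with illustrative readings:

t_i = -5.2                          # instantaneous air temperature, deg C
p_i = -15.3                         # instantaneous pressure, hPa with -1000 offset

airTemperature = t_i + 273.15       # -> 267.95 K
pressure = (p_i + 1000.0) * 100.0   # -> 984.7 hPa -> 98470.0 Pa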
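
min_data_check passes the wx check when at least one of t_i/p_i is present, but requires all three fitted position fields. A quick sketch of its behaviour on a synthetic observation row (values hypothetical):

import numpy as np
import pandas as pd

from pypromice.postprocess.get_bufr import min_data_check

obs = pd.Series(
    {
        "t_i": -5.2,            # air temp present
        "p_i": np.nan,          # pressure missing -> wx check still passes
        "gps_lat_fit": 72.57,
        "gps_lon_fit": -38.45,
        "gps_alt_fit": np.nan,  # missing altitude fails the position check
    },
    name=pd.Timestamp("2023-12-06 06:00:00"),
)
wx_ok, pos_ok = min_data_check(obs)
assert wx_ok is True and pos_ok is False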
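
get_bufr can also be driven programmatically. A sketch with hypothetical paths; passing station_configuration_path=None gives every station a default StationConfiguration (export_bufr=False), so only latest positions are tracked and no BUFR messages are written:

from datetime import datetime
from pathlib import Path

from pypromice.postprocess.get_bufr import get_bufr

get_bufr(
    bufr_out=Path("bufr_out"),
    input_files=sorted(Path("aws-l3/tx").glob("*/*_hour.csv")),
    positions_filepath=Path("AWS_latest_locations.csv"),
    timestamps_pickle_filepath=Path("latest_timestamps.pickle"),
    station_configuration_path=None,  # default config per station: export_bufr=False
    now_timestamp=datetime.utcnow(),
    store_positions=True,
)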