pypromice 1.3.6__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pypromice might be problematic.
- pypromice/postprocess/bufr_to_csv.py +15 -3
- pypromice/postprocess/bufr_utilities.py +91 -18
- pypromice/postprocess/create_bufr_files.py +178 -0
- pypromice/postprocess/get_bufr.py +248 -397
- pypromice/postprocess/make_metadata_csv.py +214 -0
- pypromice/postprocess/real_time_utilities.py +41 -11
- pypromice/process/L0toL1.py +12 -5
- pypromice/process/L1toL2.py +69 -14
- pypromice/process/L2toL3.py +1034 -186
- pypromice/process/aws.py +139 -808
- pypromice/process/get_l2.py +90 -0
- pypromice/process/get_l2tol3.py +111 -0
- pypromice/process/join_l2.py +112 -0
- pypromice/process/join_l3.py +551 -120
- pypromice/process/load.py +161 -0
- pypromice/process/resample.py +147 -0
- pypromice/process/utilities.py +68 -0
- pypromice/process/write.py +503 -0
- pypromice/qc/github_data_issues.py +10 -16
- pypromice/qc/persistence.py +52 -30
- pypromice/resources/__init__.py +28 -0
- pypromice/{process/metadata.csv → resources/file_attributes.csv} +0 -2
- pypromice/resources/variable_aliases_GC-Net.csv +78 -0
- pypromice/resources/variables.csv +106 -0
- pypromice/station_configuration.py +118 -0
- pypromice/tx/get_l0tx.py +7 -4
- pypromice/tx/payload_formats.csv +1 -0
- pypromice/tx/tx.py +27 -6
- pypromice/utilities/__init__.py +0 -0
- pypromice/utilities/git.py +62 -0
- {pypromice-1.3.6.dist-info → pypromice-1.4.1.dist-info}/METADATA +4 -4
- pypromice-1.4.1.dist-info/RECORD +53 -0
- {pypromice-1.3.6.dist-info → pypromice-1.4.1.dist-info}/WHEEL +1 -1
- pypromice-1.4.1.dist-info/entry_points.txt +13 -0
- pypromice/postprocess/station_configurations.toml +0 -762
- pypromice/process/get_l3.py +0 -46
- pypromice/process/variables.csv +0 -92
- pypromice/qc/persistence_test.py +0 -150
- pypromice/test/test_config1.toml +0 -69
- pypromice/test/test_config2.toml +0 -54
- pypromice/test/test_email +0 -75
- pypromice/test/test_payload_formats.csv +0 -4
- pypromice/test/test_payload_types.csv +0 -7
- pypromice/test/test_percentile.py +0 -229
- pypromice/test/test_raw1.txt +0 -4468
- pypromice/test/test_raw_DataTable2.txt +0 -11167
- pypromice/test/test_raw_SlimTableMem1.txt +0 -1155
- pypromice/test/test_raw_transmitted1.txt +0 -15411
- pypromice/test/test_raw_transmitted2.txt +0 -28
- pypromice-1.3.6.dist-info/RECORD +0 -53
- pypromice-1.3.6.dist-info/entry_points.txt +0 -8
- {pypromice-1.3.6.dist-info → pypromice-1.4.1.dist-info}/LICENSE.txt +0 -0
- {pypromice-1.3.6.dist-info → pypromice-1.4.1.dist-info}/top_level.txt +0 -0
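
The bulk of the diff below is the rewrite of pypromice/postprocess/get_bufr.py. Station metadata moves out of the bundled pypromice/postprocess/station_configurations.toml (deleted above) and into the new pypromice/station_configuration.py module, which loads one TOML file per station from a root directory supplied at run time (see the new --station_configurations_root flag in main() below). A minimal loading sketch, assuming only the call signature visible in this diff; the directory path and station id are illustrative:

    from pathlib import Path

    from pypromice.station_configuration import load_station_configuration_mapping

    # Hypothetical root directory holding one <stid>.toml file per station
    config_root = Path("metadata/station_configurations")
    station_configs = load_station_configuration_mapping(
        config_root,
        skip_unexpected_fields=True,  # tolerate extra keys, as main() below does
    )
    # Mapping from stid to StationConfiguration, e.g. check the BUFR export flag
    print(station_configs["EXAMPLE_STID"].export_bufr)  # hypothetical station id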
--- a/pypromice/postprocess/get_bufr.py
+++ b/pypromice/postprocess/get_bufr.py
@@ -1,11 +1,16 @@
-#!/usr/bin/env python
-
 """
 Command-line script for running BUFR file generation

 Post-processing functions for AWS station data, such as converting PROMICE and GC-Net data files to WMO-compliant BUFR files

 """
+__all__ = [
+    "get_bufr",
+    "main",
+    "DEFAULT_POSITION_SEED_PATH",
+    "DEFAULT_LIN_REG_TIME_LIMIT",
+]
+
 import argparse
 import glob
 import logging
@@ -13,258 +18,45 @@ import pickle
 import sys
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import List, Dict,
+from typing import List, Dict, Optional, Sequence, Mapping

-import attrs
 import numpy as np
 import pandas as pd
-import toml

 from pypromice.postprocess.bufr_utilities import write_bufr_message, BUFRVariables
 from pypromice.postprocess.real_time_utilities import get_latest_data

-logger = logging.getLogger(__name__)

-
-
+from pypromice.station_configuration import (
+    StationConfiguration,
+    load_station_configuration_mapping,
 )
-DEFAULT_POSITION_SEED_PATH = Path(__file__).parent.joinpath("positions_seed.csv")
-DEFAULT_LIN_REG_TIME_LIMIT = "91d"
-
-def parse_arguments_bufr() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--store_positions",
-        "--positions",
-        action="store_true",
-        required=False,
-        default=False,
-        help="If included (True), make a positions dict and output AWS_latest_locations.csv file.",
-    )
-
-    parser.add_argument(
-        "--positions-filepath",
-        "-p",
-        type=Path,
-        required=False,
-        help="Path to write AWS_latest_locations.csv file.",
-    )
-
-    parser.add_argument(
-        "--time-limit",
-        default=DEFAULT_LIN_REG_TIME_LIMIT,
-        type=str,
-        required=False,
-        help="Previous time to limit dataframe before applying linear regression.",
-    )
-
-    parser.add_argument(
-        "--input_files",
-        "--l3-filepath",
-        "-i",
-        type=Path,
-        nargs="+",
-        required=True,
-        help="Path to L3 tx .csv files. Can be direct paths or glob patterns",
-    )
-
-    parser.add_argument(
-        "--bufr-out",
-        "-o",
-        type=Path,
-        required=True,
-        help="Path to the BUFR out directory.",
-    )
-
-    parser.add_argument(
-        "--timestamps-pickle-filepath",
-        type=Path,
-        required=False,
-        help="Path to the latest_timestamps.pickle file.",
-    )
-
-    parser.add_argument(
-        "--station_configuration_mapping",
-        default=DEFAULT_STATION_CONFIGURATION_PATH,
-        type=Path,
-        required=False,
-        help="Path to csv file with station meta data and BUFR export configuration",
-    )
-
-    parser.add_argument(
-        "--position_seed",
-        default=DEFAULT_POSITION_SEED_PATH,
-        type=Path,
-        required=False,
-        help="Path to csv file with seed values for output positions.",
-    )
-
-    parser.add_argument(
-        '--latest_timestamp',
-        default=datetime.utcnow(),
-        type=pd.Timestamp,
-        help="Timestamp used to determine latest data. Default utcnow."
-    )
-
-    parser.add_argument("--verbose", "-v", default=False, action="store_true")
-
-    return parser
-
-
-@attrs.define
-class StationConfiguration:
-    """
-    Helper class for storing station specific configurations with respect to
-
-    * Installation specific distance measurements such as height differences between instruments
-    * Reference strings such as stid, station_site and wmo_id
-    * BUFR export specific parameters
-
-    # TODO: The station related meta data should be fetched from a station specific configuration files in the future or
-    # from header data in data source.
-    """

-
-    station_site: str = None
-    project: Optional[str] = None
-    station_type: Optional[str] = None
-    wmo_id: Optional[str] = None
-    barometer_from_gps: Optional[float] = None
-    anemometer_from_sonic_ranger: Optional[float] = None
-    temperature_from_sonic_ranger: Optional[float] = None
-    height_of_gps_from_station_ground: Optional[float] = None
-    sonic_ranger_from_gps: Optional[float] = None
-
-    # The station data will be exported to BUFR if True. Otherwise, it will only export latest position
-    export_bufr: bool = False
-    comment: Optional[str] = None
-
-    # skip specific variables for stations
-    # If a variable has known bad data, use this collection to skip the variable
-    # Note that if a station is not reporting both air temp and pressure it will be skipped,
-    # as currently implemented in csv2bufr.min_data_check().
-    # ['p_i'], # EXAMPLE
-    skipped_variables: List[str] = attrs.field(factory=list)
-
-    positions_update_timestamp_only: bool = False
-
-    def as_dict(self) -> Dict:
-        return attrs.asdict(self)
-
-
-def load_station_configuration_mapping(
-    fp: Union[str, Path, TextIO]
-) -> Mapping[str, StationConfiguration]:
-    """
-    Read station configurations from toml file
-
-    Parameters
-    ----------
-    fp :
-        Path to or open toml file
-
-    Returns
-    -------
-    Mapping from stid to StationConfiguration
-
-    """
-    return {
-        stid: StationConfiguration(**config_dict)
-        for stid, config_dict in toml.load(fp).items()
-    }
-
-
-def write_station_configuration_mapping(
-    config_mapping: Mapping[str, StationConfiguration], fp: TextIO
-):
-    """
-    Write station configuration to toml file
-
-    Parameters
-    ----------
-    config_mapping
-        Mapping from stid to StationConfiguration
-    fp
-        open writable TextIO
-    """
-    config_mapping = {
-        config.stid: config.as_dict() for config in config_mapping.values()
-    }
-    toml.dump(config_mapping, fp)
-
-
-def process_station(
-    file_path: Path,
-    output_path: Path,
-    now_timestamp: datetime,
-    latest_timestamp: Optional[datetime],
-    time_limit: str,
-    stid: str,
-    station_configuration: StationConfiguration,
-) -> Optional[Dict]:
-    df = load_data(file_path, now_timestamp)
-
-    # Select current data
-    latest_data = get_latest_data(
-        df,
-        lin_reg_time_limit=time_limit,
-    )
-
-    if latest_data is None:
-        logger.info("No valid instantaneous timestamps!")
-        return None
-
-    latest_data = filter_skipped_variables(
-        latest_data, vars_to_skip=station_configuration.skipped_variables
-    )
-
-    # Check that we have minimum required valid data
-    sufficient_wx_data, sufficient_position_data = min_data_check(latest_data)
-
-    station_position = dict()
-    station_position["timestamp"] = latest_data.name
-    if sufficient_position_data:
-        station_position["lon"] = latest_data.get("gps_lon_fit")
-        station_position["lat"] = latest_data.get("gps_lat_fit")
-        station_position["alt"] = latest_data.get("gps_alt_fit")
-    else:
-        logger.warning("Insufficient position data")
-        # Don't use any position attributes from latest_data
-        station_position["lon"] = None
-        station_position["lat"] = None
-        station_position["alt"] = None
-        return station_position
-
-    if station_configuration.export_bufr:
-        if not sufficient_wx_data:
-            logger.warning(f"Failed min data wx {stid}")
-            return station_position
-
-        # Store current timest
-        if latest_data.name <= latest_timestamp:
-            logger.info(f"No new data {latest_data.name} <= {latest_timestamp}")
-            return station_position
-
-        # Construct and export BUFR file
-        bufr_variables = get_bufr_variables(
-            data=latest_data,
-            station_configuration=station_configuration,
-        )
-        with output_path.open("bw") as fp:
-            write_bufr_message(variables=bufr_variables, file=fp)
+logger = logging.getLogger(__name__)

-
+DEFAULT_POSITION_SEED_PATH = Path(__file__).parent.joinpath("positions_seed.csv")
+DEFAULT_LIN_REG_TIME_LIMIT = "91d"
+REQUIRED_KEYS = (
+    "t_i",
+    "p_i",
+    "rh_i",
+    "wdir_i",
+    "wspd_i",
+    "gps_lat_fit",
+    "gps_lon_fit",
+    "gps_alt_fit",
+    "z_boom_u_smooth",
+)


-def load_data(file_path: Path,
+def load_data(file_path: Path, latest_timestamp: datetime) -> pd.DataFrame:
     """
-    Read AWS data from csv file using time as index and filter all rows after
+    Read AWS data from csv file using time as index and filter all rows after latest_timestamp

     Parameters
     ----------
     file_path
-
+    latest_timestamp

     Returns
     -------
@@ -276,7 +68,7 @@ def load_data(file_path: Path, now_timestamp: datetime) -> pd.DataFrame:
         .set_index("time")
         .sort_index()
     )
-    df = df[:
+    df = df[:latest_timestamp]
     return df


@@ -285,12 +77,13 @@ def get_bufr(
     input_files: Sequence[Path],
    positions_filepath: Optional[Path],
     timestamps_pickle_filepath: Optional[Path],
-
-
+    station_configuration_mapping: Mapping[str, StationConfiguration],
+    target_timestamp: Optional[datetime] = None,
     positions_seed_path: Optional[Path] = None,
-
+    time_window_length: timedelta = timedelta(days=2),
     store_positions: bool = False,
-
+    linear_regression_time_limit: str = "91d",
+    break_on_error: bool = False,
 ):
     """
     Main function for generating BUFR files and determine latest positions from a sequence of csv files
@@ -304,48 +97,42 @@ def get_bufr(
     bufr_out
         Path to the BUFR out directory.
     input_files
-        List of
+        List of csv file paths.
     positions_filepath
         Path to write latest positions. Used to retrieve a static set of positions to register stations with DMI/WMO
     timestamps_pickle_filepath
         Path to pickle file used for storing latest timestamp
-
-
-
-        get_bufr will export the latest data before
+    station_configuration_mapping
+        Mapping of station id to StationConfiguration object
+    target_timestamp
+        get_bufr will export the latest data before target_timestamp. Default datetime.utcnow()
     positions_seed_path
         Path to csv file with position data used as default values for the output position.
-
-        The
+    time_window_length
+        The length of the time window to consider for the latest data. Default 2 days
     store_positions
         Flag determine if latest positions are exported.
-
+    linear_regression_time_limit
         Previous time to limit dataframe before applying linear regression.
+    break_on_error
+        If True, the function will raise an exception if an error occurs during processing.

     """
-    if
-
-
-    if earliest_timestamp is None:
-        earliest_timestamp = now_timestamp - timedelta(days=2)
+    if target_timestamp is None:
+        target_timestamp = datetime.utcnow()

     # Prepare (latest) positions
     positions = dict()
     if positions_seed_path:
         positions_seed = pd.read_csv(
-            positions_seed_path,
+            positions_seed_path,
+            index_col="stid",
+            delimiter=",",
+            parse_dates=["timestamp"],
         ).to_dict(orient="index")
         logger.info(f"Seed positions for {positions_seed.keys()}")
         positions.update(positions_seed)

-    # Prepare station configurations
-    if station_configuration_path is None:
-        station_configuration_mapping = dict()
-    else:
-        station_configuration_mapping = load_station_configuration_mapping(
-            station_configuration_path
-        )
-
     # Prepare bufr output dir
     bufr_out.mkdir(parents=True, exist_ok=True)

@@ -357,18 +144,13 @@ def get_bufr(
         logger.info("latest_timestamps.pickle not found!")
         latest_timestamps = {}

-    # Initiate a new dict for current timestamps
-    current_timestamps = {}
-
     # Setup diagnostic lists (logger.info at end)
     skipped = []
     no_recent_data = []
-    no_entry_latest_timestamps = []
-    failed_min_data_wx = []
-    failed_min_data_pos = []

     # Iterate through csv files
     for file_path in input_files:
+        # TODO: This split is explicitly requiring the filename to have sampleate at suffix. This shuld be more robust
         stid = file_path.stem.rsplit("_", 1)[0]
         logger.info("####### Processing {} #######".format(stid))

@@ -381,40 +163,63 @@

         output_path = bufr_out / f"{stid}.bufr"
         logger.info(f"Generating {output_path} from {file_path}")
-
-
+
+        time_window_start = target_timestamp - time_window_length
+        # Use only newer data than the latest timestamp
+        if stid in latest_timestamps:
+            time_window_start = max(latest_timestamps[stid], time_window_start)

         try:
-
-                file_path=file_path,
-                output_path=output_path,
-                now_timestamp=now_timestamp,
-                latest_timestamp=latest_timestamp,
-                time_limit=time_limit,
-                stid=stid,
-                station_configuration=station_configuration,
-            )
-        except Exception:
-            logger.exception(f"Failed processing {stid}")
-            continue
+            input_data = load_data(file_path, target_timestamp)

-
-
+            # Select current data
+            latest_data = get_latest_data(
+                input_data,
+                lin_reg_time_limit=linear_regression_time_limit,
+                vars_to_skip=station_configuration.skipped_variables,
+            )
+            if latest_data is None:
+                logger.info("No valid instantaneous timestamps!")
+                skipped.append(stid)
+                continue

-
+            # Create station positions
+            station_position = get_station_positions(latest_data)
             if stid not in positions:
                 positions[stid] = dict()
-
             if station_configuration.positions_update_timestamp_only:
                 positions[stid]["timestamp"] = station_position["timestamp"]
             else:
                 positions[stid].update(station_position)

+            # Create BUFR File
+            if (
+                station_configuration.export_bufr
+                and latest_data.name > time_window_start
+            ):
+                latest_timestamps[stid] = latest_data.name
+                bufr_variables = get_bufr_variables(latest_data, station_configuration)
+                if bufr_variables:
+                    with output_path.open("bw") as output_file:
+                        write_bufr_message(bufr_variables, output_file)
+            else:
+                logger.info(f"No new data {latest_data.name} <= {time_window_start}")
+                no_recent_data.append(stid)
+
+        except Exception:
+            logger.exception(f"Failed processing {stid}")
+            if output_path.exists():
+                output_path.unlink()
+            if break_on_error:
+                raise
+            skipped.append(stid)
+            continue
+
     # Write the most recent timestamps back to the pickle on disk
     logger.info(f"writing latest_timestamps to {timestamps_pickle_filepath}")
     if timestamps_pickle_filepath:
         with timestamps_pickle_filepath.open("wb") as handle:
-            pickle.dump(
+            pickle.dump(latest_timestamps, handle, protocol=pickle.HIGHEST_PROTOCOL)

     if store_positions:
         positions_df = pd.DataFrame.from_dict(
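
The hunk above replaces the old process_station() flow with an inline loop: a BUFR message is written only when the newest observation is newer than both the previously exported timestamp and the start of a sliding window behind target_timestamp. A minimal sketch of that freshness check, with made-up timestamps and a hypothetical station id:

    from datetime import datetime, timedelta

    target_timestamp = datetime(2024, 6, 1, 12, 0)
    time_window_length = timedelta(days=2)  # default in the new get_bufr() signature
    latest_timestamps = {"EXAMPLE_STID": datetime(2024, 5, 31, 9, 0)}

    time_window_start = target_timestamp - time_window_length
    if "EXAMPLE_STID" in latest_timestamps:
        # Never re-export observations that were already sent
        time_window_start = max(latest_timestamps["EXAMPLE_STID"], time_window_start)

    latest_observation = datetime(2024, 6, 1, 6, 0)  # newest row in the input file
    print(latest_observation > time_window_start)  # True -> a BUFR file is written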
@@ -427,13 +232,7 @@ def get_bufr(
     positions_df.to_csv(positions_filepath, index_label="stid")

     logger.info("--------------------------------")
-
-    not_processed_count = (
-        len(skipped)
-        + len(no_recent_data)
-        + len(no_entry_latest_timestamps)
-        + len(not_processed_wx_pos)
-    )
+    not_processed_count = len(skipped) + len(no_recent_data)
     logger.info(
         "BUFR exported for {} of {} fpaths.".format(
             (len(input_files) - not_processed_count), len(input_files)
@@ -442,47 +241,46 @@ def get_bufr(
     logger.info("")
     logger.info("skipped: {}".format(skipped))
     logger.info("no_recent_data: {}".format(no_recent_data))
-    logger.info("no_entry_latest_timestamps: {}".format(no_entry_latest_timestamps))
-    logger.info("failed_min_data_wx: {}".format(failed_min_data_wx))
-    logger.info("failed_min_data_pos: {}".format(failed_min_data_pos))
     logger.info("--------------------------------")


-def
-
-
-    """
-
-
-
-
-
-
-
-
-
-
-
-
-
-    for var_key in vars_to_skip:
-        row[var_key] = np.nan
-        logger.info("----> Skipping var: {}".format(var_key))
-    return row
+def get_station_positions(latest_data: pd.Series) -> Dict:
+    station_position = dict()
+    station_position["timestamp"] = latest_data.name
+    station_position["lat"] = latest_data["gps_lat_fit"]
+    station_position["lon"] = latest_data["gps_lon_fit"]
+    station_position["alt"] = latest_data["gps_alt_fit"]
+    if any(
+        [
+            pd.isna(station_position["lat"]),
+            pd.isna(station_position["lon"]),
+            pd.isna(station_position["alt"]),
+        ]
+    ):
+        logger.warning("Insufficient position data")
+        station_position["lat"] = None
+        station_position["lon"] = None
+        station_position["alt"] = None
+    return station_position


 def get_bufr_variables(
     data: pd.Series,
     station_configuration: StationConfiguration,
-) -> BUFRVariables:
+) -> Optional[BUFRVariables]:
     """
-    Helper function for converting our
+    Helper function for converting our variables to the variables needed for bufr export.
+
+    Raises AttributeError if station_configuration don't have the minimum dimension fields since they are required to determine barometer heights.
+    * height_of_gps_from_station_ground
+    * barometer_from_gps
+
+

     Parameters
     ----------
     data
-        Series with processed
+        Series with processed variables from get_latest_datas

     station_configuration

@@ -491,30 +289,62 @@ def get_bufr_variables(
         BUFRVariables used by bufr_utilities

     """
-
-    if
-
-
+
+    if not all(key in data.index for key in REQUIRED_KEYS):
+        raise ValueError(
+            f"Failed to process BUFRVariables. Missing required keys: {REQUIRED_KEYS}"
         )

-
-
-
-
+    # Check that we have minimum required fields to proceed with writing to BUFR
+    # Always require minimum a valid air temp or a valid pressure.
+    # If both air temp and pressure are nan, do not submit.
+    # This will allow the case of having only one or the other.
+    if data[["t_i", "p_i"]].isna().all():
+        logger.warning("Failed to process BUFRVariables - insufficient data")
+        return None
+
+    # Always require a valid position data
+    if data[["gps_lat_fit", "gps_lon_fit", "gps_alt_fit"]].isna().any():
+        logger.warning("Failed to process BUFRVariables - insufficient position data")
+        return None
+
+    if station_configuration.height_of_gps_from_station_ground is None:
+        raise AttributeError(
+            "height_of_gps_from_station_ground is required for BUFR export"
         )
+    if station_configuration.barometer_from_gps is None:
+        raise AttributeError("barometer_from_gps is required for BUFR export")

-
-
-
-
+    if station_configuration.static_height_of_gps_from_mean_sea_level is None:
+        height_of_gps_above_mean_sea_level = data["gps_alt_fit"]
+    else:
+        height_of_gps_above_mean_sea_level = (
+            station_configuration.static_height_of_gps_from_mean_sea_level
         )

-
-
-
-
+    heightOfStationGroundAboveMeanSeaLevel = (
+        height_of_gps_above_mean_sea_level
+        - station_configuration.height_of_gps_from_station_ground
+    )
+
+    heightOfBarometerAboveMeanSeaLevel = (
+        height_of_gps_above_mean_sea_level + station_configuration.barometer_from_gps
+    )
+
+    if station_configuration.temperature_from_sonic_ranger is None:
+        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = np.nan
+    else:
+        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = (
+            data["z_boom_u_smooth"]
+            + station_configuration.temperature_from_sonic_ranger
         )

+    if station_configuration.anemometer_from_sonic_ranger is None:
+        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = np.nan
+    else:
+        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = (
+            data["z_boom_u_smooth"] + station_configuration.anemometer_from_sonic_ranger
+        )

     output_row = BUFRVariables(
         wmo_id=station_configuration.wmo_id,
@@ -526,7 +356,7 @@
         airTemperature=data.t_i + 273.15,
         # Convert pressure, correct the -1000 offset, then hPa to Pa
         # note that instantaneous pressure has 0.1 hPa precision
-
+        nonCoordinatePressure=(data.p_i + 1000.0) * 100.0,
         windDirection=data.wdir_i,
         windSpeed=data.wspd_i,
         latitude=data.gps_lat_fit,
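
The nonCoordinatePressure change above makes the unit handling explicit: p_i is stored in hPa with a -1000 offset, so the offset is restored before converting hPa to Pa. A quick worked check:

    p_i = 12.5  # illustrative stored value, i.e. 1012.5 hPa
    nonCoordinatePressure = (p_i + 1000.0) * 100.0
    assert nonCoordinatePressure == 101250.0  # Pa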
@@ -540,60 +370,75 @@
     return output_row


-def min_data_check(s):
-    """Check that we have minimum required fields to proceed with writing to BUFR
-    For wx vars, we currently require both air temp and pressure to be non-NaN.
-    If you know a specific var is reporting bad data, you can ignore just that var
-    using the vars_to_skip dict in wmo_config.
-
-    Parameters
-    ----------
-    s : pandas series
-        The current obset we are working with (for BUFR submission)
-
-    Returns
-    -------
-    min_data_wx_result : bool
-        True (default), the test for min wx data passed. False, the test failed.
-    min_data_pos_result : bool
-        True (default), the test for min position data passed. False, the test failed.
-    """
-    min_data_wx_result = True
-    min_data_pos_result = True
-
-    # Can use pd.isna() or math.isnan() below...
-
-    # Always require valid air temp and valid pressure (both must be non-nan)
-    # if (pd.isna(s['t_i']) is False) and (pd.isna(s['p_i']) is False):
-    #     pass
-    # else:
-    #     print('----> Failed min_data_check for air temp and pressure!')
-    #     min_data_wx_result = False
-
-    # If both air temp and pressure are nan, do not submit.
-    # This will allow the case of having only one or the other.
-    if (pd.isna(s["t_i"]) is True) and (pd.isna(s["p_i"]) is True):
-        logger.warning("----> Failed min_data_check for air temp and pressure!")
-        min_data_wx_result = False
-
-    # Missing just elevation OK
-    # if (pd.isna(s['gps_lat_fit']) is False) and (pd.isna(s['gps_lon_fit']) is False):
-    #     pass
-    # Require all three: lat, lon, elev
-    if (
-        (pd.isna(s["gps_lat_fit"]) is False)
-        and (pd.isna(s["gps_lon_fit"]) is False)
-        and (pd.isna(s["gps_alt_fit"]) is False)
-    ):
-        pass
-    else:
-        logger.warning("----> Failed min_data_check for position!")
-        min_data_pos_result = False
-
-    return min_data_wx_result, min_data_pos_result
-
 def main():
-
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--store_positions",
+        "--positions",
+        action="store_true",
+        required=False,
+        default=False,
+        help="If included (True), make a positions dict and output AWS_latest_locations.csv file.",
+    )
+    parser.add_argument(
+        "--positions-filepath",
+        "-p",
+        type=Path,
+        required=False,
+        help="Path to write AWS_latest_locations.csv file.",
+    )
+    parser.add_argument(
+        "--linear_regression_time_limit",
+        "--time-limit",
+        default=DEFAULT_LIN_REG_TIME_LIMIT,
+        type=str,
+        required=False,
+        help="Previous time to limit dataframe before applying linear regression.",
+    )
+    parser.add_argument(
+        "--input_files",
+        "-i",
+        type=Path,
+        nargs="+",
+        required=True,
+        help="Path to input files .csv files. Can be direct paths or glob patterns",
+    )
+    parser.add_argument(
+        "--bufr-out",
+        "-o",
+        type=Path,
+        required=True,
+        help="Path to the BUFR out directory.",
+    )
+    parser.add_argument(
+        "--timestamps-pickle-filepath",
+        type=Path,
+        required=False,
+        help="Path to the latest_timestamps.pickle file.",
+    )
+    parser.add_argument(
+        "--station_configurations_root",
+        type=Path,
+        required=True,
+        help="Path to root directory containing station configuration toml files",
+    )
+    parser.add_argument(
+        "--position_seed",
+        default=DEFAULT_POSITION_SEED_PATH,
+        type=Path,
+        required=False,
+        help="Path to csv file with seed values for output positions.",
+    )
+    parser.add_argument(
+        "--target_timestamp",
+        "--now-timestamp",
+        default=datetime.utcnow(),
+        type=pd.Timestamp,
+        help="Timestamp used to determine latest data. Default utcnow.",
+    )
+    parser.add_argument("--verbose", "-v", default=False, action="store_true")
+
+    args = parser.parse_args()

     log_level = logging.INFO
     if args.verbose:
@@ -613,17 +458,23 @@ def main():
         # The input path might be a glob pattern
         input_files += map(Path, glob.glob(path.as_posix()))

+    station_configuration_mapping = load_station_configuration_mapping(
+        args.station_configurations_root,
+        skip_unexpected_fields=True,
+    )
+
     get_bufr(
         bufr_out=args.bufr_out,
         input_files=input_files,
         store_positions=args.store_positions,
         positions_filepath=args.positions_filepath,
-
+        linear_regression_time_limit=args.linear_regression_time_limit,
         timestamps_pickle_filepath=args.timestamps_pickle_filepath,
-
-
+        target_timestamp=args.target_timestamp,
+        station_configuration_mapping=station_configuration_mapping,
         positions_seed_path=args.position_seed,
     )

+
 if __name__ == "__main__":
-    main()
+    main()
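
With these changes get_bufr() is driven entirely by explicit arguments: a station configuration mapping, a target timestamp, and tunable window and regression limits, instead of module-level state. A minimal calling sketch, assuming only the keyword names from the new signature above; all paths are illustrative:

    from datetime import datetime, timedelta
    from pathlib import Path

    from pypromice.postprocess.get_bufr import get_bufr
    from pypromice.station_configuration import load_station_configuration_mapping

    station_configs = load_station_configuration_mapping(
        Path("metadata/station_configurations"),  # hypothetical TOML root
        skip_unexpected_fields=True,
    )
    get_bufr(
        bufr_out=Path("bufr_out"),
        input_files=sorted(Path("l3").glob("*_hour.csv")),  # stid is taken from the filename stem
        positions_filepath=Path("AWS_latest_locations.csv"),
        timestamps_pickle_filepath=Path("latest_timestamps.pickle"),
        station_configuration_mapping=station_configs,
        target_timestamp=datetime.utcnow(),
        time_window_length=timedelta(days=2),
        store_positions=True,
        linear_regression_time_limit="91d",
    )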