pypromice 1.3.5__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (55)
  1. pypromice/get/get.py +19 -19
  2. pypromice/postprocess/bufr_to_csv.py +6 -1
  3. pypromice/postprocess/bufr_utilities.py +91 -18
  4. pypromice/postprocess/create_bufr_files.py +178 -0
  5. pypromice/postprocess/get_bufr.py +248 -397
  6. pypromice/postprocess/make_metadata_csv.py +214 -0
  7. pypromice/postprocess/real_time_utilities.py +41 -11
  8. pypromice/process/L0toL1.py +12 -5
  9. pypromice/process/L1toL2.py +159 -30
  10. pypromice/process/L2toL3.py +1034 -187
  11. pypromice/process/aws.py +131 -752
  12. pypromice/process/get_l2.py +90 -0
  13. pypromice/process/get_l2tol3.py +111 -0
  14. pypromice/process/join_l2.py +112 -0
  15. pypromice/process/join_l3.py +551 -120
  16. pypromice/process/load.py +161 -0
  17. pypromice/process/resample.py +128 -0
  18. pypromice/process/utilities.py +68 -0
  19. pypromice/process/write.py +503 -0
  20. pypromice/qc/github_data_issues.py +10 -16
  21. pypromice/qc/percentiles/thresholds.csv +2 -2
  22. pypromice/qc/persistence.py +71 -25
  23. pypromice/resources/__init__.py +28 -0
  24. pypromice/{process/metadata.csv → resources/file_attributes.csv} +0 -2
  25. pypromice/resources/variable_aliases_GC-Net.csv +78 -0
  26. pypromice/resources/variables.csv +106 -0
  27. pypromice/station_configuration.py +118 -0
  28. pypromice/tx/get_l0tx.py +7 -4
  29. pypromice/tx/payload_formats.csv +1 -0
  30. pypromice/tx/tx.py +27 -6
  31. pypromice/utilities/__init__.py +0 -0
  32. pypromice/utilities/git.py +61 -0
  33. {pypromice-1.3.5.dist-info → pypromice-1.4.0.dist-info}/METADATA +12 -21
  34. pypromice-1.4.0.dist-info/RECORD +53 -0
  35. {pypromice-1.3.5.dist-info → pypromice-1.4.0.dist-info}/WHEEL +1 -1
  36. pypromice-1.4.0.dist-info/entry_points.txt +13 -0
  37. pypromice/postprocess/station_configurations.toml +0 -762
  38. pypromice/process/get_l3.py +0 -46
  39. pypromice/process/variables.csv +0 -92
  40. pypromice/qc/persistence_test.py +0 -150
  41. pypromice/test/test_config1.toml +0 -69
  42. pypromice/test/test_config2.toml +0 -54
  43. pypromice/test/test_email +0 -75
  44. pypromice/test/test_payload_formats.csv +0 -4
  45. pypromice/test/test_payload_types.csv +0 -7
  46. pypromice/test/test_percentile.py +0 -229
  47. pypromice/test/test_raw1.txt +0 -4468
  48. pypromice/test/test_raw_DataTable2.txt +0 -11167
  49. pypromice/test/test_raw_SlimTableMem1.txt +0 -1155
  50. pypromice/test/test_raw_transmitted1.txt +0 -15411
  51. pypromice/test/test_raw_transmitted2.txt +0 -28
  52. pypromice-1.3.5.dist-info/RECORD +0 -53
  53. pypromice-1.3.5.dist-info/entry_points.txt +0 -8
  54. {pypromice-1.3.5.dist-info → pypromice-1.4.0.dist-info}/LICENSE.txt +0 -0
  55. {pypromice-1.3.5.dist-info → pypromice-1.4.0.dist-info}/top_level.txt +0 -0
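
Of these, the largest functional change is the rewrite of pypromice/postprocess/get_bufr.py (item 5), whose full diff is shown below. The StationConfiguration class and its TOML loader moved out of get_bufr.py into the new pypromice/station_configuration.py module (item 27), and the bundled station_configurations.toml (item 37) was replaced by a root directory of per-station TOML files. A minimal sketch of the new loading pattern, assuming a hypothetical directory name and station id; the function and its skip_unexpected_fields keyword are taken from the diff below:

    # Sketch only: the directory name and station id "KPC_L" are hypothetical.
    from pathlib import Path

    from pypromice.station_configuration import load_station_configuration_mapping

    config_mapping = load_station_configuration_mapping(
        Path("station_configurations"),  # root directory of per-station *.toml files
        skip_unexpected_fields=True,
    )
    config = config_mapping["KPC_L"]  # hypothetical station id
    print(config.wmo_id, config.export_bufr, config.skipped_variables)
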
@@ -1,11 +1,16 @@
-#!/usr/bin/env python
-
 """
 Command-line script for running BUFR file generation

 Post-processing functions for AWS station data, such as converting PROMICE and GC-Net data files to WMO-compliant BUFR files
 """
+__all__ = [
+    "get_bufr",
+    "main",
+    "DEFAULT_POSITION_SEED_PATH",
+    "DEFAULT_LIN_REG_TIME_LIMIT",
+]
+
 import argparse
 import glob
 import logging
@@ -13,258 +18,45 @@ import pickle
 import sys
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import List, Dict, Mapping, Optional, Collection, Sequence, Union, TextIO
+from typing import List, Dict, Optional, Sequence, Mapping

-import attrs
 import numpy as np
 import pandas as pd
-import toml

 from pypromice.postprocess.bufr_utilities import write_bufr_message, BUFRVariables
 from pypromice.postprocess.real_time_utilities import get_latest_data

-logger = logging.getLogger(__name__)

-DEFAULT_STATION_CONFIGURATION_PATH = Path(__file__).parent.joinpath(
-    "station_configurations.toml"
+from pypromice.station_configuration import (
+    StationConfiguration,
+    load_station_configuration_mapping,
 )
-DEFAULT_POSITION_SEED_PATH = Path(__file__).parent.joinpath("positions_seed.csv")
-DEFAULT_LIN_REG_TIME_LIMIT = "91d"
-
-def parse_arguments_bufr() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--store_positions",
-        "--positions",
-        action="store_true",
-        required=False,
-        default=False,
-        help="If included (True), make a positions dict and output AWS_latest_locations.csv file.",
-    )
-
-    parser.add_argument(
-        "--positions-filepath",
-        "-p",
-        type=Path,
-        required=False,
-        help="Path to write AWS_latest_locations.csv file.",
-    )
-
-    parser.add_argument(
-        "--time-limit",
-        default=DEFAULT_LIN_REG_TIME_LIMIT,
-        type=str,
-        required=False,
-        help="Previous time to limit dataframe before applying linear regression.",
-    )
-
-    parser.add_argument(
-        "--input_files",
-        "--l3-filepath",
-        "-i",
-        type=Path,
-        nargs="+",
-        required=True,
-        help="Path to L3 tx .csv files. Can be direct paths or glob patterns",
-    )
-
-    parser.add_argument(
-        "--bufr-out",
-        "-o",
-        type=Path,
-        required=True,
-        help="Path to the BUFR out directory.",
-    )
-
-    parser.add_argument(
-        "--timestamps-pickle-filepath",
-        type=Path,
-        required=False,
-        help="Path to the latest_timestamps.pickle file.",
-    )
-
-    parser.add_argument(
-        "--station_configuration_mapping",
-        default=DEFAULT_STATION_CONFIGURATION_PATH,
-        type=Path,
-        required=False,
-        help="Path to csv file with station meta data and BUFR export configuration",
-    )
-
-    parser.add_argument(
-        "--position_seed",
-        default=DEFAULT_POSITION_SEED_PATH,
-        type=Path,
-        required=False,
-        help="Path to csv file with seed values for output positions.",
-    )
-
-    parser.add_argument(
-        '--latest_timestamp',
-        default=datetime.utcnow(),
-        type=pd.Timestamp,
-        help="Timestamp used to determine latest data. Default utcnow."
-    )
-
-    parser.add_argument("--verbose", "-v", default=False, action="store_true")
-
-    return parser
-
-
-@attrs.define
-class StationConfiguration:
-    """
-    Helper class for storing station specific configurations with respect to
-
-    * Installation specific distance measurements such as height differences between instruments
-    * Reference strings such as stid, station_site and wmo_id
-    * BUFR export specific parameters
-
-    # TODO: The station related meta data should be fetched from a station specific configuration files in the future or
-    # from header data in data source.
-    """

-    stid: str
-    station_site: str = None
-    project: Optional[str] = None
-    station_type: Optional[str] = None
-    wmo_id: Optional[str] = None
-    barometer_from_gps: Optional[float] = None
-    anemometer_from_sonic_ranger: Optional[float] = None
-    temperature_from_sonic_ranger: Optional[float] = None
-    height_of_gps_from_station_ground: Optional[float] = None
-    sonic_ranger_from_gps: Optional[float] = None
-
-    # The station data will be exported to BUFR if True. Otherwise, it will only export latest position
-    export_bufr: bool = False
-    comment: Optional[str] = None
-
-    # skip specific variables for stations
-    # If a variable has known bad data, use this collection to skip the variable
-    # Note that if a station is not reporting both air temp and pressure it will be skipped,
-    # as currently implemented in csv2bufr.min_data_check().
-    # ['p_i'], # EXAMPLE
-    skipped_variables: List[str] = attrs.field(factory=list)
-
-    positions_update_timestamp_only: bool = False
-
-    def as_dict(self) -> Dict:
-        return attrs.asdict(self)
-
-
-def load_station_configuration_mapping(
-    fp: Union[str, Path, TextIO]
-) -> Mapping[str, StationConfiguration]:
-    """
-    Read station configurations from toml file
-
-    Parameters
-    ----------
-    fp :
-        Path to or open toml file
-
-    Returns
-    -------
-    Mapping from stid to StationConfiguration
-
-    """
-    return {
-        stid: StationConfiguration(**config_dict)
-        for stid, config_dict in toml.load(fp).items()
-    }
-
-
-def write_station_configuration_mapping(
-    config_mapping: Mapping[str, StationConfiguration], fp: TextIO
-):
-    """
-    Write station configuration to toml file
-
-    Parameters
-    ----------
-    config_mapping
-        Mapping from stid to StationConfiguration
-    fp
-        open writable TextIO
-    """
-    config_mapping = {
-        config.stid: config.as_dict() for config in config_mapping.values()
-    }
-    toml.dump(config_mapping, fp)
-
-
-def process_station(
-    file_path: Path,
-    output_path: Path,
-    now_timestamp: datetime,
-    latest_timestamp: Optional[datetime],
-    time_limit: str,
-    stid: str,
-    station_configuration: StationConfiguration,
-) -> Optional[Dict]:
-    df = load_data(file_path, now_timestamp)
-
-    # Select current data
-    latest_data = get_latest_data(
-        df,
-        lin_reg_time_limit=time_limit,
-    )
-
-    if latest_data is None:
-        logger.info("No valid instantaneous timestamps!")
-        return None
-
-    latest_data = filter_skipped_variables(
-        latest_data, vars_to_skip=station_configuration.skipped_variables
-    )
-
-    # Check that we have minimum required valid data
-    sufficient_wx_data, sufficient_position_data = min_data_check(latest_data)
-
-    station_position = dict()
-    station_position["timestamp"] = latest_data.name
-    if sufficient_position_data:
-        station_position["lon"] = latest_data.get("gps_lon_fit")
-        station_position["lat"] = latest_data.get("gps_lat_fit")
-        station_position["alt"] = latest_data.get("gps_alt_fit")
-    else:
-        logger.warning("Insufficient position data")
-        # Don't use any position attributes from latest_data
-        station_position["lon"] = None
-        station_position["lat"] = None
-        station_position["alt"] = None
-        return station_position
-
-    if station_configuration.export_bufr:
-        if not sufficient_wx_data:
-            logger.warning(f"Failed min data wx {stid}")
-            return station_position
-
-        # Store current timest
-        if latest_data.name <= latest_timestamp:
-            logger.info(f"No new data {latest_data.name} <= {latest_timestamp}")
-            return station_position
-
-        # Construct and export BUFR file
-        bufr_variables = get_bufr_variables(
-            data=latest_data,
-            station_configuration=station_configuration,
-        )
-        with output_path.open("bw") as fp:
-            write_bufr_message(variables=bufr_variables, file=fp)
+logger = logging.getLogger(__name__)

-    return station_position
+DEFAULT_POSITION_SEED_PATH = Path(__file__).parent.joinpath("positions_seed.csv")
+DEFAULT_LIN_REG_TIME_LIMIT = "91d"
+REQUIRED_KEYS = (
+    "t_i",
+    "p_i",
+    "rh_i",
+    "wdir_i",
+    "wspd_i",
+    "gps_lat_fit",
+    "gps_lon_fit",
+    "gps_alt_fit",
+    "z_boom_u_smooth",
+)


-def load_data(file_path: Path, now_timestamp: datetime) -> pd.DataFrame:
+def load_data(file_path: Path, latest_timestamp: datetime) -> pd.DataFrame:
     """
-    Read AWS data from csv file using time as index and filter all rows after now_timestamp
+    Read AWS data from csv file using time as index and filter all rows after latest_timestamp

     Parameters
     ----------
     file_path
-    now_timestamp
+    latest_timestamp

     Returns
     -------
@@ -276,7 +68,7 @@ def load_data(file_path: Path, now_timestamp: datetime) -> pd.DataFrame:
         .set_index("time")
         .sort_index()
     )
-    df = df[:now_timestamp]
+    df = df[:latest_timestamp]
     return df


@@ -285,12 +77,13 @@ def get_bufr(
     input_files: Sequence[Path],
     positions_filepath: Optional[Path],
     timestamps_pickle_filepath: Optional[Path],
-    station_configuration_path: Optional[Path],
-    now_timestamp: Optional[datetime] = None,
+    station_configuration_mapping: Mapping[str, StationConfiguration],
+    target_timestamp: Optional[datetime] = None,
     positions_seed_path: Optional[Path] = None,
-    earliest_timestamp: datetime = None,
+    time_window_length: timedelta = timedelta(days=2),
     store_positions: bool = False,
-    time_limit: str = "91d",
+    linear_regression_time_limit: str = "91d",
+    break_on_error: bool = False,
 ):
     """
     Main function for generating BUFR files and determine latest positions from a sequence of csv files
@@ -304,48 +97,42 @@ def get_bufr(
     bufr_out
         Path to the BUFR out directory.
     input_files
-        List of L3 csv file paths.
+        List of csv file paths.
     positions_filepath
         Path to write latest positions. Used to retrieve a static set of positions to register stations with DMI/WMO
     timestamps_pickle_filepath
         Path to pickle file used for storing latest timestamp
-    station_configuration_path
-        Path to toml file with configuration entries for each station
-    now_timestamp
-        get_bufr will export the latest data before now_timestamp. Default datetime.utcnow()
+    station_configuration_mapping
+        Mapping of station id to StationConfiguration object
+    target_timestamp
+        get_bufr will export the latest data before target_timestamp. Default datetime.utcnow()
     positions_seed_path
         Path to csv file with position data used as default values for the output position.
-    earliest_timestamp
-        The earliest allowed timestamp for data to be included in the output. Default now_timestamp - 2 days
+    time_window_length
+        The length of the time window to consider for the latest data. Default 2 days
     store_positions
         Flag determine if latest positions are exported.
-    time_limit
+    linear_regression_time_limit
         Previous time to limit dataframe before applying linear regression.
+    break_on_error
+        If True, the function will raise an exception if an error occurs during processing.

     """
-    if now_timestamp is None:
-        now_timestamp = datetime.utcnow()
-
-    if earliest_timestamp is None:
-        earliest_timestamp = now_timestamp - timedelta(days=2)
+    if target_timestamp is None:
+        target_timestamp = datetime.utcnow()

     # Prepare (latest) positions
     positions = dict()
     if positions_seed_path:
         positions_seed = pd.read_csv(
-            positions_seed_path, index_col=0, delimiter=",", parse_dates=["timestamp"]
+            positions_seed_path,
+            index_col="stid",
+            delimiter=",",
+            parse_dates=["timestamp"],
         ).to_dict(orient="index")
         logger.info(f"Seed positions for {positions_seed.keys()}")
         positions.update(positions_seed)

-    # Prepare station configurations
-    if station_configuration_path is None:
-        station_configuration_mapping = dict()
-    else:
-        station_configuration_mapping = load_station_configuration_mapping(
-            station_configuration_path
-        )
-
     # Prepare bufr output dir
     bufr_out.mkdir(parents=True, exist_ok=True)

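The positions seed file is now read with an explicit "stid" index column instead of the positional index_col=0. A sketch of a seed file accepted by the new reader; the station id and values are invented, and the lat/lon/alt column names follow the position keys used elsewhere in this file:

    # Demonstrates the new read_csv call (index_col="stid", parse_dates=["timestamp"]).
    import io

    import pandas as pd

    seed_csv = io.StringIO(
        "stid,timestamp,lat,lon,alt\n"
        "KPC_L,2023-01-01,79.91,-24.08,370.0\n"  # made-up row
    )
    positions_seed = pd.read_csv(
        seed_csv, index_col="stid", delimiter=",", parse_dates=["timestamp"]
    ).to_dict(orient="index")
    print(positions_seed["KPC_L"]["lat"])  # 79.91
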
@@ -357,18 +144,13 @@ def get_bufr(
         logger.info("latest_timestamps.pickle not found!")
         latest_timestamps = {}

-    # Initiate a new dict for current timestamps
-    current_timestamps = {}
-
     # Setup diagnostic lists (logger.info at end)
     skipped = []
     no_recent_data = []
-    no_entry_latest_timestamps = []
-    failed_min_data_wx = []
-    failed_min_data_pos = []

     # Iterate through csv files
     for file_path in input_files:
+        # TODO: This split is explicitly requiring the filename to have sampleate at suffix. This shuld be more robust
         stid = file_path.stem.rsplit("_", 1)[0]
         logger.info("####### Processing {} #######".format(stid))

@@ -381,40 +163,63 @@

         output_path = bufr_out / f"{stid}.bufr"
         logger.info(f"Generating {output_path} from {file_path}")
-        latest_timestamp = latest_timestamps.get(stid, earliest_timestamp)
-        latest_timestamp = max(earliest_timestamp, latest_timestamp)
+
+        time_window_start = target_timestamp - time_window_length
+        # Use only newer data than the latest timestamp
+        if stid in latest_timestamps:
+            time_window_start = max(latest_timestamps[stid], time_window_start)

         try:
-            station_position = process_station(
-                file_path=file_path,
-                output_path=output_path,
-                now_timestamp=now_timestamp,
-                latest_timestamp=latest_timestamp,
-                time_limit=time_limit,
-                stid=stid,
-                station_configuration=station_configuration,
-            )
-        except Exception:
-            logger.exception(f"Failed processing {stid}")
-            continue
+            input_data = load_data(file_path, target_timestamp)

-        if station_position is None:
-            logger.warning(f"No position information available for {stid}")
+            # Select current data
+            latest_data = get_latest_data(
+                input_data,
+                lin_reg_time_limit=linear_regression_time_limit,
+                vars_to_skip=station_configuration.skipped_variables,
+            )
+            if latest_data is None:
+                logger.info("No valid instantaneous timestamps!")
+                skipped.append(stid)
+                continue

-        else:
+            # Create station positions
+            station_position = get_station_positions(latest_data)
             if stid not in positions:
                 positions[stid] = dict()
-
             if station_configuration.positions_update_timestamp_only:
                 positions[stid]["timestamp"] = station_position["timestamp"]
             else:
                 positions[stid].update(station_position)

+            # Create BUFR File
+            if (
+                station_configuration.export_bufr
+                and latest_data.name > time_window_start
+            ):
+                latest_timestamps[stid] = latest_data.name
+                bufr_variables = get_bufr_variables(latest_data, station_configuration)
+                if bufr_variables:
+                    with output_path.open("bw") as output_file:
+                        write_bufr_message(bufr_variables, output_file)
+            else:
+                logger.info(f"No new data {latest_data.name} <= {time_window_start}")
+                no_recent_data.append(stid)
+
+        except Exception:
+            logger.exception(f"Failed processing {stid}")
+            if output_path.exists():
+                output_path.unlink()
+            if break_on_error:
+                raise
+            skipped.append(stid)
+            continue
+
     # Write the most recent timestamps back to the pickle on disk
     logger.info(f"writing latest_timestamps to {timestamps_pickle_filepath}")
     if timestamps_pickle_filepath:
         with timestamps_pickle_filepath.open("wb") as handle:
-            pickle.dump(current_timestamps, handle, protocol=pickle.HIGHEST_PROTOCOL)
+            pickle.dump(latest_timestamps, handle, protocol=pickle.HIGHEST_PROTOCOL)

     if store_positions:
         positions_df = pd.DataFrame.from_dict(
@@ -427,13 +232,7 @@ def get_bufr(
         positions_df.to_csv(positions_filepath, index_label="stid")

     logger.info("--------------------------------")
-    not_processed_wx_pos = set(failed_min_data_wx + failed_min_data_pos)
-    not_processed_count = (
-        len(skipped)
-        + len(no_recent_data)
-        + len(no_entry_latest_timestamps)
-        + len(not_processed_wx_pos)
-    )
+    not_processed_count = len(skipped) + len(no_recent_data)
    logger.info(
         "BUFR exported for {} of {} fpaths.".format(
             (len(input_files) - not_processed_count), len(input_files)
@@ -442,47 +241,46 @@
     logger.info("")
     logger.info("skipped: {}".format(skipped))
     logger.info("no_recent_data: {}".format(no_recent_data))
-    logger.info("no_entry_latest_timestamps: {}".format(no_entry_latest_timestamps))
-    logger.info("failed_min_data_wx: {}".format(failed_min_data_wx))
-    logger.info("failed_min_data_pos: {}".format(failed_min_data_pos))
     logger.info("--------------------------------")


-def filter_skipped_variables(
-    row: pd.Series, vars_to_skip: Collection[str]
-) -> pd.Series:
-    """
-    Mutate input series by setting var_to_skip to np.nan
-
-    Parameters
-    ----------
-    row
-    vars_to_skip
-        List of variable names to be skipped
-
-    Returns
-    -------
-    Input series
-
-    """
-    vars_to_skip = set(row.keys()) & set(vars_to_skip)
-    for var_key in vars_to_skip:
-        row[var_key] = np.nan
-        logger.info("----> Skipping var: {}".format(var_key))
-    return row
+def get_station_positions(latest_data: pd.Series) -> Dict:
+    station_position = dict()
+    station_position["timestamp"] = latest_data.name
+    station_position["lat"] = latest_data["gps_lat_fit"]
+    station_position["lon"] = latest_data["gps_lon_fit"]
+    station_position["alt"] = latest_data["gps_alt_fit"]
+    if any(
+        [
+            pd.isna(station_position["lat"]),
+            pd.isna(station_position["lon"]),
+            pd.isna(station_position["alt"]),
+        ]
+    ):
+        logger.warning("Insufficient position data")
+        station_position["lat"] = None
+        station_position["lon"] = None
+        station_position["alt"] = None
+    return station_position


 def get_bufr_variables(
     data: pd.Series,
     station_configuration: StationConfiguration,
-) -> BUFRVariables:
+) -> Optional[BUFRVariables]:
     """
-    Helper function for converting our variables to the variables needed for bufr export.
+    Helper function for converting our variables to the variables needed for bufr export.
+
+    Raises AttributeError if station_configuration don't have the minimum dimension fields since they are required to determine barometer heights.
+    * height_of_gps_from_station_ground
+    * barometer_from_gps
+

     Parameters
     ----------
     data
-        Series with processed l3 variables from get_latest_datas
+        Series with processed variables from get_latest_datas

     station_configuration

@@ -491,30 +289,62 @@ def get_bufr_variables(
     BUFRVariables used by bufr_utilities

     """
-    heightOfStationGroundAboveMeanSeaLevel = np.nan
-    if isinstance(station_configuration.height_of_gps_from_station_ground, float):
-        heightOfStationGroundAboveMeanSeaLevel = (
-            data["gps_alt_fit"] - station_configuration.height_of_gps_from_station_ground
+
+    if not all(key in data.index for key in REQUIRED_KEYS):
+        raise ValueError(
+            f"Failed to process BUFRVariables. Missing required keys: {REQUIRED_KEYS}"
         )

-    heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = np.nan
-    if isinstance(station_configuration.temperature_from_sonic_ranger, float):
-        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = (
-            data["z_boom_u_smooth"]+ station_configuration.temperature_from_sonic_ranger
+    # Check that we have minimum required fields to proceed with writing to BUFR
+    # Always require minimum a valid air temp or a valid pressure.
+    # If both air temp and pressure are nan, do not submit.
+    # This will allow the case of having only one or the other.
+    if data[["t_i", "p_i"]].isna().all():
+        logger.warning("Failed to process BUFRVariables - insufficient data")
+        return None
+
+    # Always require a valid position data
+    if data[["gps_lat_fit", "gps_lon_fit", "gps_alt_fit"]].isna().any():
+        logger.warning("Failed to process BUFRVariables - insufficient position data")
+        return None
+
+    if station_configuration.height_of_gps_from_station_ground is None:
+        raise AttributeError(
+            "height_of_gps_from_station_ground is required for BUFR export"
         )
+    if station_configuration.barometer_from_gps is None:
+        raise AttributeError("barometer_from_gps is required for BUFR export")

-    heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = np.nan
-    if isinstance(station_configuration.anemometer_from_sonic_ranger, float):
-        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = (
-            data["z_boom_u_smooth"] + station_configuration.anemometer_from_sonic_ranger
+    if station_configuration.static_height_of_gps_from_mean_sea_level is None:
+        height_of_gps_above_mean_sea_level = data["gps_alt_fit"]
+    else:
+        height_of_gps_above_mean_sea_level = (
+            station_configuration.static_height_of_gps_from_mean_sea_level
         )

-    heightOfBarometerAboveMeanSeaLevel = np.nan
-    if isinstance(station_configuration.barometer_from_gps, float):
-        heightOfBarometerAboveMeanSeaLevel = (
-            data["gps_alt_fit"] + station_configuration.barometer_from_gps
+    heightOfStationGroundAboveMeanSeaLevel = (
+        height_of_gps_above_mean_sea_level
+        - station_configuration.height_of_gps_from_station_ground
+    )
+
+    heightOfBarometerAboveMeanSeaLevel = (
+        height_of_gps_above_mean_sea_level + station_configuration.barometer_from_gps
+    )
+
+    if station_configuration.temperature_from_sonic_ranger is None:
+        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = np.nan
+    else:
+        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformTempRH = (
+            data["z_boom_u_smooth"]
+            + station_configuration.temperature_from_sonic_ranger
         )

+    if station_configuration.anemometer_from_sonic_ranger is None:
+        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = np.nan
+    else:
+        heightOfSensorAboveLocalGroundOrDeckOfMarinePlatformWSPD = (
+            data["z_boom_u_smooth"] + station_configuration.anemometer_from_sonic_ranger
+        )

     output_row = BUFRVariables(
         wmo_id=station_configuration.wmo_id,
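
The height bookkeeping in the hunk above replaces the old isinstance(..., float) guards with explicit None checks, and the new static_height_of_gps_from_mean_sea_level field can override the fitted GPS altitude. A worked example with invented numbers:

    # Invented values: fitted GPS altitude 2000.0 m, GPS antenna 1.0 m above
    # the station ground, barometer 0.3 m above the GPS antenna.
    gps_alt_fit = 2000.0
    height_of_gps_from_station_ground = 1.0
    barometer_from_gps = 0.3

    # No static override configured, so the fitted altitude is used directly.
    height_of_gps_above_mean_sea_level = gps_alt_fit

    station_ground_asl = height_of_gps_above_mean_sea_level - height_of_gps_from_station_ground  # 1999.0 m
    barometer_asl = height_of_gps_above_mean_sea_level + barometer_from_gps  # 2000.3 m
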
@@ -526,7 +356,7 @@ def get_bufr_variables(
         airTemperature=data.t_i + 273.15,
         # Convert pressure, correct the -1000 offset, then hPa to Pa
         # note that instantaneous pressure has 0.1 hPa precision
-        pressure=(data.p_i + 1000.0) * 100.0,
+        nonCoordinatePressure=(data.p_i + 1000.0) * 100.0,
         windDirection=data.wdir_i,
         windSpeed=data.wspd_i,
         latitude=data.gps_lat_fit,
@@ -540,60 +370,75 @@
     return output_row


-def min_data_check(s):
-    """Check that we have minimum required fields to proceed with writing to BUFR
-    For wx vars, we currently require both air temp and pressure to be non-NaN.
-    If you know a specific var is reporting bad data, you can ignore just that var
-    using the vars_to_skip dict in wmo_config.
-
-    Parameters
-    ----------
-    s : pandas series
-        The current obset we are working with (for BUFR submission)
-
-    Returns
-    -------
-    min_data_wx_result : bool
-        True (default), the test for min wx data passed. False, the test failed.
-    min_data_pos_result : bool
-        True (default), the test for min position data passed. False, the test failed.
-    """
-    min_data_wx_result = True
-    min_data_pos_result = True
-
-    # Can use pd.isna() or math.isnan() below...
-
-    # Always require valid air temp and valid pressure (both must be non-nan)
-    # if (pd.isna(s['t_i']) is False) and (pd.isna(s['p_i']) is False):
-    #     pass
-    # else:
-    #     print('----> Failed min_data_check for air temp and pressure!')
-    #     min_data_wx_result = False
-
-    # If both air temp and pressure are nan, do not submit.
-    # This will allow the case of having only one or the other.
-    if (pd.isna(s["t_i"]) is True) and (pd.isna(s["p_i"]) is True):
-        logger.warning("----> Failed min_data_check for air temp and pressure!")
-        min_data_wx_result = False
-
-    # Missing just elevation OK
-    # if (pd.isna(s['gps_lat_fit']) is False) and (pd.isna(s['gps_lon_fit']) is False):
-    #     pass
-    # Require all three: lat, lon, elev
-    if (
-        (pd.isna(s["gps_lat_fit"]) is False)
-        and (pd.isna(s["gps_lon_fit"]) is False)
-        and (pd.isna(s["gps_alt_fit"]) is False)
-    ):
-        pass
-    else:
-        logger.warning("----> Failed min_data_check for position!")
-        min_data_pos_result = False
-
-    return min_data_wx_result, min_data_pos_result
-
 def main():
-    args = parse_arguments_bufr().parse_args()
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--store_positions",
+        "--positions",
+        action="store_true",
+        required=False,
+        default=False,
+        help="If included (True), make a positions dict and output AWS_latest_locations.csv file.",
+    )
+    parser.add_argument(
+        "--positions-filepath",
+        "-p",
+        type=Path,
+        required=False,
+        help="Path to write AWS_latest_locations.csv file.",
+    )
+    parser.add_argument(
+        "--linear_regression_time_limit",
+        "--time-limit",
+        default=DEFAULT_LIN_REG_TIME_LIMIT,
+        type=str,
+        required=False,
+        help="Previous time to limit dataframe before applying linear regression.",
+    )
+    parser.add_argument(
+        "--input_files",
+        "-i",
+        type=Path,
+        nargs="+",
+        required=True,
+        help="Path to input files .csv files. Can be direct paths or glob patterns",
+    )
+    parser.add_argument(
+        "--bufr-out",
+        "-o",
+        type=Path,
+        required=True,
+        help="Path to the BUFR out directory.",
+    )
+    parser.add_argument(
+        "--timestamps-pickle-filepath",
+        type=Path,
+        required=False,
+        help="Path to the latest_timestamps.pickle file.",
+    )
+    parser.add_argument(
+        "--station_configurations_root",
+        type=Path,
+        required=True,
+        help="Path to root directory containing station configuration toml files",
+    )
+    parser.add_argument(
+        "--position_seed",
+        default=DEFAULT_POSITION_SEED_PATH,
+        type=Path,
+        required=False,
+        help="Path to csv file with seed values for output positions.",
+    )
+    parser.add_argument(
+        "--target_timestamp",
+        "--now-timestamp",
+        default=datetime.utcnow(),
+        type=pd.Timestamp,
+        help="Timestamp used to determine latest data. Default utcnow.",
+    )
+    parser.add_argument("--verbose", "-v", default=False, action="store_true")
+
+    args = parser.parse_args()

     log_level = logging.INFO
     if args.verbose:
@@ -613,17 +458,23 @@ def main():
         # The input path might be a glob pattern
         input_files += map(Path, glob.glob(path.as_posix()))

+    station_configuration_mapping = load_station_configuration_mapping(
+        args.station_configurations_root,
+        skip_unexpected_fields=True,
+    )
+
     get_bufr(
         bufr_out=args.bufr_out,
         input_files=input_files,
         store_positions=args.store_positions,
         positions_filepath=args.positions_filepath,
-        time_limit=args.time_limit,
+        linear_regression_time_limit=args.linear_regression_time_limit,
         timestamps_pickle_filepath=args.timestamps_pickle_filepath,
-        now_timestamp=args.latest_timestamp,
-        station_configuration_path=args.station_configuration_mapping,
+        target_timestamp=args.target_timestamp,
+        station_configuration_mapping=station_configuration_mapping,
         positions_seed_path=args.position_seed,
     )

+
 if __name__ == "__main__":
-    main()
+    main()
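
Putting the pieces together, the refactored get_bufr() can now be driven directly from Python as well as from the CLI, since the station configuration mapping is passed in as an object rather than a file path. A minimal sketch, assuming hypothetical file locations; the parameter names follow the new signature in this diff:

    # All paths are hypothetical; parameter names match the new get_bufr signature.
    from datetime import datetime, timedelta
    from pathlib import Path

    from pypromice.postprocess.get_bufr import get_bufr
    from pypromice.station_configuration import load_station_configuration_mapping

    get_bufr(
        bufr_out=Path("bufr_out"),
        input_files=sorted(Path("aws_l3").glob("*_hour.csv")),  # assumed layout
        positions_filepath=Path("AWS_latest_locations.csv"),
        timestamps_pickle_filepath=Path("latest_timestamps.pickle"),
        station_configuration_mapping=load_station_configuration_mapping(
            Path("station_configurations"), skip_unexpected_fields=True
        ),
        target_timestamp=datetime.utcnow(),
        time_window_length=timedelta(days=2),
        store_positions=True,
        linear_regression_time_limit="91d",
        break_on_error=False,
    )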