weatherdb 1.1.0__py3-none-any.whl
- docker/Dockerfile +30 -0
- docker/docker-compose.yaml +58 -0
- docker/docker-compose_test.yaml +24 -0
- docker/start-docker-test.sh +6 -0
- docs/requirements.txt +10 -0
- docs/source/Changelog.md +2 -0
- docs/source/License.rst +7 -0
- docs/source/Methode.md +161 -0
- docs/source/_static/custom.css +8 -0
- docs/source/_static/favicon.ico +0 -0
- docs/source/_static/logo.png +0 -0
- docs/source/api/api.rst +15 -0
- docs/source/api/cli.rst +8 -0
- docs/source/api/weatherDB.broker.rst +10 -0
- docs/source/api/weatherDB.config.rst +7 -0
- docs/source/api/weatherDB.db.rst +23 -0
- docs/source/api/weatherDB.rst +22 -0
- docs/source/api/weatherDB.station.rst +56 -0
- docs/source/api/weatherDB.stations.rst +46 -0
- docs/source/api/weatherDB.utils.rst +22 -0
- docs/source/conf.py +137 -0
- docs/source/index.rst +33 -0
- docs/source/setup/Configuration.md +127 -0
- docs/source/setup/Hosting.md +9 -0
- docs/source/setup/Install.md +49 -0
- docs/source/setup/Quickstart.md +183 -0
- docs/source/setup/setup.rst +12 -0
- weatherdb/__init__.py +24 -0
- weatherdb/_version.py +1 -0
- weatherdb/alembic/README.md +8 -0
- weatherdb/alembic/alembic.ini +80 -0
- weatherdb/alembic/config.py +9 -0
- weatherdb/alembic/env.py +100 -0
- weatherdb/alembic/script.py.mako +26 -0
- weatherdb/alembic/versions/V1.0.0_initial_database_creation.py +898 -0
- weatherdb/alembic/versions/V1.0.2_more_charachters_for_settings+term_station_ma_raster.py +88 -0
- weatherdb/alembic/versions/V1.0.5_fix-ma-raster-values.py +152 -0
- weatherdb/alembic/versions/V1.0.6_update-views.py +22 -0
- weatherdb/broker.py +667 -0
- weatherdb/cli.py +214 -0
- weatherdb/config/ConfigParser.py +663 -0
- weatherdb/config/__init__.py +5 -0
- weatherdb/config/config_default.ini +162 -0
- weatherdb/db/__init__.py +3 -0
- weatherdb/db/connections.py +374 -0
- weatherdb/db/fixtures/RichterParameters.json +34 -0
- weatherdb/db/models.py +402 -0
- weatherdb/db/queries/get_quotient.py +155 -0
- weatherdb/db/views.py +165 -0
- weatherdb/station/GroupStation.py +710 -0
- weatherdb/station/StationBases.py +3108 -0
- weatherdb/station/StationET.py +111 -0
- weatherdb/station/StationP.py +807 -0
- weatherdb/station/StationPD.py +98 -0
- weatherdb/station/StationT.py +164 -0
- weatherdb/station/__init__.py +13 -0
- weatherdb/station/constants.py +21 -0
- weatherdb/stations/GroupStations.py +519 -0
- weatherdb/stations/StationsBase.py +1021 -0
- weatherdb/stations/StationsBaseTET.py +30 -0
- weatherdb/stations/StationsET.py +17 -0
- weatherdb/stations/StationsP.py +128 -0
- weatherdb/stations/StationsPD.py +24 -0
- weatherdb/stations/StationsT.py +21 -0
- weatherdb/stations/__init__.py +11 -0
- weatherdb/utils/TimestampPeriod.py +369 -0
- weatherdb/utils/__init__.py +3 -0
- weatherdb/utils/dwd.py +350 -0
- weatherdb/utils/geometry.py +69 -0
- weatherdb/utils/get_data.py +285 -0
- weatherdb/utils/logging.py +126 -0
- weatherdb-1.1.0.dist-info/LICENSE +674 -0
- weatherdb-1.1.0.dist-info/METADATA +765 -0
- weatherdb-1.1.0.dist-info/RECORD +77 -0
- weatherdb-1.1.0.dist-info/WHEEL +5 -0
- weatherdb-1.1.0.dist-info/entry_points.txt +2 -0
- weatherdb-1.1.0.dist-info/top_level.txt +3 -0
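The hunk below shows the first part of the largest new file, apparently weatherdb/station/StationBases.py (the only entry with +3108 lines).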
@@ -0,0 +1,3108 @@
# libraries
import itertools
import logging
import re
import time
from datetime import datetime, timedelta, date
from pathlib import Path
import warnings
import numpy as np
import pandas as pd
from sqlalchemy.exc import OperationalError
from sqlalchemy import text as sqltxt
import sqlalchemy as sa
import rasterio as rio
import shapely.wkb
import shapely.ops
import pyproj
from rasterstats import zonal_stats
import textwrap
from functools import cached_property

from ..db.connections import db_engine
from ..utils.dwd import get_cdc_file_list, get_dwd_file
from ..utils.TimestampPeriod import TimestampPeriod
from ..config import config
from ..db.models import StationMARaster, MetaBase, RawFiles
from .constants import AGG_TO
from ..db.queries.get_quotient import _get_quotient

# set settings
# ############
__all__ = ["StationBase"]
log = logging.getLogger(__name__)

# class definitions
###################

class StationBase:
    """This is the Base class for one Station.

    It is not working on its own, because those parameters need to get defined in the real classes.
    """
    # those parameters need to get defined in the real classes:
    ################################################################

    # common settings
    # ---------------
    # The sqlalchemy model of the meta table
    _MetaModel = MetaBase
    # The parameter string "p", "t", "et" or "p_d"
    _para = None
    # The base parameter, without the "_d" suffix
    _para_base = None
    # The parameter as a long descriptive string
    _para_long = None
    # The Unit as str
    _unit = "None"
    # the factor to change data to integers for the database
    _decimals = 1
    # the kinds that should not get multiplied with the amount of decimals, e.g. "qn"
    _kinds_not_decimal = ["qn", "filled_by", "filled_share"]
    # The valid kinds to use. Must be a column in the timeseries tables.
    _valid_kinds = {"raw", "qc", "filled", "filled_by"}
    # the kind that is best for simulations
    _best_kind = "filled"

    # cdc dwd parameters
    # ------------------
    # the base folder on the CDC-FTP server
    _ftp_folder_base = ["None"]
    # a regex prefix to the default search regex to find the zip files on the CDC-FTP server
    _ftp_zip_regex_prefix = None
    # The name of the date column on the CDC server
    _cdc_date_col = None
    # the names of the CDC columns that get imported
    _cdc_col_names_imp = [None]
    # the corresponding column name in the DB of the raw import
    _db_col_names_imp = ["raw"]

    # timestamp configurations
    # ------------------------
    # The format string for the strftime for the database to be readable
    _tstp_format_db = None
    # the format of the timestamp to be human readable
    _tstp_format_human = "%Y-%m-%d %H:%M"
    # the postgresql data type of the timestamp column, e.g. "date" or "timestamp"
    _tstp_dtype = None
    # The interval of the timeseries, e.g. "1 day" or "10 min"
    _interval = None

    # aggregation
    # -----------
    # Similar to the interval, but in the same format as in AGG_TO
    _min_agg_to = None
    # the sql aggregating function to use
    _agg_fun = "sum"

    # for regionalisation
    # -------------------
    # the key names of the band names in the config file, without prefix "BAND_".
    # Specifies the term in the db to use to calculate the coefficients, 2 values: wi/so or one value: yearly
    _ma_terms = []
    # The sign to use to calculate the coefficient (first element) and to apply the coefficient (second element).
    _coef_sign = ["/", "*"]
    # The multi annual raster to use to calculate the multi annual values
    _ma_raster_key = "dwd"  # section name in the config file (data:rasters:...)

    # for the fillup
    # --------------
    # How many neighboring stations are used for the fillup procedure
    _filled_by_n = 1
    # The maximal distance in meters to use to get neighbor stations for the fillup.
    # Only relevant if multiple stations are considered for fillup.
    _fillup_max_dist = None
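
    # An illustrative sketch (not the actual subclass source) of how a real
    # parameter class is expected to override these attributes:
    #
    #     class StationP(StationBase):
    #         _para = "p"
    #         _para_base = "p"
    #         _para_long = "Precipitation"
    #         _unit = "mm"
    #         _decimals = 100   # i.e. store 0.01 mm resolution as integers
    #         _interval = "10 min"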

    def __init__(self, id, _skip_meta_check=False):
        """Create a Station object.

        Parameters
        ----------
        id : int
            The stations ID.
        _skip_meta_check : bool, optional
            Should the check if the station is in the database meta file get skipped?
            Pay attention when skipping this, because it can lead to problems.
            This is for computational reasons, because it makes the initialization faster.
            Is used by the stations classes, because they only initialize objects that are in the meta table.
            The default is False.

        Raises
        ------
        NotImplementedError
            If the class is initiated with a station ID that is not in the database.
            To prevent this error, set _skip_meta_check=True.
        """
        if type(self) is StationBase:
            raise NotImplementedError("""
The StationBase is only a wrapper class and is not working on its own.
Please use StationP, StationT or StationET instead""")
        self.id = int(id)
        self.id_str = str(id)

        if isinstance(self._ftp_folder_base, str):
            self._ftp_folder_base = [self._ftp_folder_base]

        # create ftp_folders in order of importance
        self._ftp_folders = list(itertools.chain(*[
            [base + "historical/", base + "recent/"]
            for base in self._ftp_folder_base]))

        self._db_unit = " ".join([str(self._decimals), self._unit])
        if not _skip_meta_check:
            self._check_isin_meta()

        # initiate the dictionary to store the last checked periods
        self._cached_periods = dict()

    @property
    def _ma_raster_conf(self):
        return config[f"data:rasters:{self._ma_raster_key}"]

    def _check_isin_meta(self):
        if self.isin_meta():
            return True
        else:
            raise NotImplementedError("""
The given {para_long} station with id {stid}
is not in the corresponding meta table in the DB""".format(
                stid=self.id, para_long=self._para_long
            ))

    @classmethod
    def _check_kind(cls, kind, valids=None):
        """Check if the given kind is valid.

        Parameters
        ----------
        kind : str
            The data kind to look for filled period.
            Must be a column in the timeseries DB.
            Must be one of "raw", "qc", "filled", "adj".
            If "best" is given, then depending on the parameter of the station the best kind is selected.
            For precipitation this is "corr" and for the others this is "filled".
            For the precipitation also "qn" and "corr" are valid.
        valids : set of str, optional
            Additional kinds that are valid.
            This is used to add additional kinds that are valid.
            The default is an empty set.

        Raises
        ------
        NotImplementedError
            If the given kind is not valid.
        ValueError
            If the given kind is not a string.
        """
        # check valids
        if valids is None:
            valids = cls._valid_kinds

        # check kind
        if not isinstance(kind, str):
            raise ValueError("The given kind is not a string.")

        if kind == "best":
            kind = cls._best_kind

        if kind not in valids:
            raise NotImplementedError("""
The given kind "{kind}" is not a valid kind.
Must be one of "{valid_kinds}"
""".format(
                kind=kind,
                valid_kinds='", "'.join(valids)))

        return kind
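
    # e.g. _check_kind("best") resolves to cls._best_kind, i.e. "filled" for
    # this base class and "corr" for the precipitation subclass (see docstring)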

    @classmethod
    def _check_kind_tstp_meta(cls, kind):
        """Check if the kind has a timestamp from and until in the meta table."""
        if kind != "last_imp":
            kind = cls._check_kind(kind)

        # compute the valid kinds if not already done
        if not hasattr(cls, "_valid_kinds_tstp_meta"):
            cls._valid_kinds_tstp_meta = ["last_imp"]
            for vk in cls._valid_kinds:
                if vk in {"raw", "qc", "filled", "corr"}:
                    cls._valid_kinds_tstp_meta.append(vk)

        if kind not in cls._valid_kinds_tstp_meta:
            raise NotImplementedError("""
The given kind "{kind}" is not a valid kind.
Must be one of "{valid_kinds}"
""".format(
                kind=kind,
                valid_kinds='", "'.join(cls._valid_kinds_tstp_meta)))

        return kind

    @classmethod
    def _check_kinds(cls, kinds, valids=None):
        """Check if the given kinds are valid.

        Parameters
        ----------
        kinds : list of str or str
            A list of the data kinds to check if they are valid kinds for this class implementation.
            Must be a column in the timeseries DB.
            Must be one of "raw", "qc", "filled", "adj".
            For the precipitation also "qn" and "corr" are valid.
            If "best" is given, then depending on the parameter of the station the best kind is selected.
            For precipitation this is "corr" and for the others this is "filled".
        valids : set of str or None, optional
            The kinds that are valid.
            This is used to check the kinds against.
            If None then the default valids are used. (cls._valid_kinds)
            The default is None.

        Raises
        ------
        NotImplementedError
            If the given kind is not valid.
        ValueError
            If the given kind is not a string.

        Returns
        -------
        kinds : list of str
            returns a list of strings of valid kinds
        """
        # check valids
        if valids is None:
            valids = cls._valid_kinds

        # check kinds
        if isinstance(kinds, str):
            kinds = [kinds]
        else:
            kinds = kinds.copy()  # because else the original variable is changed

        for i, kind_i in enumerate(kinds):
            if kind_i not in valids:
                kinds[i] = cls._check_kind(kind_i, valids)
        return kinds

    def _check_period(self, period, kinds, nas_allowed=False):
        """Correct a given period to a valid format.

        If the given Timestamp is None the maximum or minimum possible is given.

        Parameters
        ----------
        period : tuple or list of datetime.datetime or None, optional
            The minimum and maximum Timestamp for which to get the timeseries.
            If None is given, the maximum or minimal possible Timestamp is taken.
            The default is (None, None).
        kinds : str or list of str
            The data kinds to update.
            Must be a column in the timeseries DB.
            Must be one of "raw", "qc", "filled", "adj".
            For the precipitation also "qn" and "corr" are valid.
        nas_allowed : bool, optional
            Should NAs be allowed?
            If True, then the maximum possible period is returned, even if there are NAs in the timeserie.
            If False, then the minimal filled period is returned.
            The default is False.

        Returns
        -------
        list with 2 datetime.datetime
            The minimum and maximum Timestamp.
        """
        # check if period got recently checked
        self._clean_cached_periods()
        cache_key = str((kinds, period, nas_allowed))
        if cache_key in self._cached_periods:
            return self._cached_periods[cache_key]["return"]

        # remove filled_by kinds
        if "filled_by" in kinds:
            kinds = kinds.copy()
            kinds.remove("filled_by")
            if len(kinds) == 0:
                nas_allowed = True

        # get filled period or max period
        max_period = self.get_max_period(kinds=kinds, nas_allowed=nas_allowed)

        # check if filled_period is empty and throw error
        if max_period.is_empty():
            raise ValueError(
                "No maximum period was found for the {para_long} Station with ID {stid} and kinds '{kinds}'."
                .format(
                    para_long=self._para_long, stid=self.id, kinds="', '".join(kinds)))

        # get period if None provided
        if not isinstance(period, TimestampPeriod):
            period = TimestampPeriod(*period)
        else:
            period = period.copy()

        # do additional period checks in subclasses
        period = self._check_period_extra(period)

        # compare with filled_period
        if period.is_empty():
            period = max_period
        else:
            period = period.union(
                max_period,
                how="inner")

        # save for later
        self._cached_periods.update({
            cache_key: {
                "time": datetime.now(),
                "return": period}})

        return period
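
    # Note: the period returned above is always the requested period clipped
    # (union with how="inner") to the maximum period available in the DB, and
    # the result is cached per (kinds, period, nas_allowed) for one minute.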

    @staticmethod
    def _check_period_extra(period):
        """Additional checks on period to define in subclasses"""
        return period

    def _check_agg_to(self, agg_to):
        agg_to_valid = list(AGG_TO.keys())
        if agg_to not in agg_to_valid:
            raise ValueError(
                "The given agg_to Parameter \"{agg_to}\" is not a valid aggregating period. Please use one of:\n{agg_valid}".format(
                    agg_to=agg_to,
                    agg_valid=", ".join([str(item) for item in agg_to_valid])
                ))
        if agg_to_valid.index(agg_to) <= agg_to_valid.index(self._min_agg_to):
            return None
        else:
            return agg_to

    def _check_df_raw(self, df):
        """This is an empty function to get implemented in the subclasses if necessary.

        It applies extra checkups, like adjusting the timezone on the downloaded raw timeseries, and returns the dataframe."""
        # add Timezone as UTC
        df.index = df.index.tz_localize("UTC")

        return df

    def _clean_cached_periods(self):
        time_limit = datetime.now() - timedelta(minutes=1)
        for key in list(self._cached_periods):
            if self._cached_periods[key]["time"] < time_limit:
                self._cached_periods.pop(key)

    @db_engine.deco_update_privilege
    def _check_ma(self):
        if not self.isin_ma():
            self.update_ma_raster()

    @db_engine.deco_create_privilege
    def _check_isin_db(self):
        """Check if the station already has a timeserie and if not create one.
        """
        if not self.isin_db():
            self._create_timeseries_table()

    @db_engine.deco_update_privilege
    def _check_min_date(self, **kwargs):
        """Check the timeseries minimum timestamp against the configured min_date and correct the timeserie if necessary.
        """
        min_dt_config = config.get_datetime("weatherdb", "min_date")
        with db_engine.connect() as con:
            # get minimal timeseries timestamp
            try:
                min_dt_ts = con.execute(
                    sa.select(sa.func.min(self._table.c.timestamp))
                ).scalar()
            except sa.exc.ProgrammingError as e:
                if "relation" in str(e) and "does not exist" in str(e):
                    min_dt_ts = None
                else:
                    raise e
            if min_dt_ts is None:
                return None
            if isinstance(min_dt_ts, date):
                min_dt_ts = datetime.combine(min_dt_ts, datetime.min.time())
            min_dt_ts = min_dt_ts.replace(tzinfo=min_dt_config.tzinfo)

            # compare to config min_date and correct timeseries
            if min_dt_ts < min_dt_config:
                log.debug(f"The Station{self._para}({self.id})'s minimum timestamp of {min_dt_ts} is below the configuration's min_date of {min_dt_config.date()}. The timeserie will be reduced to the configuration value.")
                con.execute(
                    sa.delete(self._table)
                    .where(self._table.c.timestamp < min_dt_config)
                )
                con.commit()
            elif min_dt_ts > min_dt_config:
                log.debug(f"The Station{self._para}({self.id})'s minimum timestamp of {min_dt_ts} is above the configuration's min_date of {min_dt_config.date()}. The timeserie will be expanded to the configuration value and the raw_files for this station are getting deleted, to reload all possible dwd data.")
                zipfiles = self.get_zipfiles(
                    only_new=False,
                    ftp_file_list=kwargs.get("ftp_file_list", None))
                con.execute(sa.delete(RawFiles.__table__)\
                    .where(sa.and_(
                        RawFiles.filepath.in_(zipfiles.index.to_list()),
                        RawFiles.parameter == self._para)))
                con.commit()
                self._expand_timeserie_to_period()

    @property
    def _ma_terms_all(self):
        """Get all terms for the station. If "year" is not part of the terms it is added."""
        return set(self._ma_terms) | {"year"}

    @property
    def _ma_raster_bands(self):
        """Get the raster bands of the stations multi annual raster file."""
        return [self._ma_raster_conf[key]
                for key in self._ma_raster_band_conf_keys]

    @property
    def _ma_raster_band_conf_keys(self):
        """Get the raster band keys for the station. E.g. P_WIHY"""
        return [f"band_{self._para_base}_{term}"
                for term in self._ma_terms_all]

    @property
    def _ma_raster_factors(self):
        """Get the factor to convert the raster values to the real unit, e.g. °C or mm."""
        return [float(self._ma_raster_conf.get(key, 1))
                for key in self._ma_raster_factor_conf_keys]

    @property
    def _ma_raster_factor_conf_keys(self):
        """Get the raster factor keys for the station. E.g. factor_P_WIHY"""
        return [f"factor_{self._para_base}_{term}"
                for term in self._ma_terms_all]

    @cached_property
    def _table(self):
        raise NotImplementedError("The table property is not implemented in the base class.")

    @db_engine.deco_create_privilege
    def _create_timeseries_table(self):
        """Create the timeseries table in the DB if it is not yet existing."""
        pass

    @db_engine.deco_update_privilege
    def _expand_timeserie_to_period(self):
        """Expand the timeserie to the complete possible time range"""
        # The interval of 9h 30min is due to the fact that t and et data for the previous day is only updated around 9:00 on the following day
        # the 10 minutes interval is to get the previous day and not the same day
        sql = """
            WITH whole_ts AS (
                SELECT generate_series(
                    '{min_date} 00:00'::{tstp_dtype},
                    (SELECT
                        LEAST(
                            date_trunc(
                                'day',
                                min(start_tstp_last_imp) - '9h 30min'::INTERVAL
                            ) - '10 min'::INTERVAL,
                            min(CASE WHEN parameter='p' THEN max_tstp_last_imp
                                     ELSE max_tstp_last_imp + '23h 50min'::INTERVAL
                                END))
                     FROM parameter_variables)::{tstp_dtype},
                    '{interval}'::INTERVAL)::{tstp_dtype} AS timestamp)
            INSERT INTO timeseries."{stid}_{para}"(timestamp)
                (SELECT wts.timestamp
                 FROM whole_ts wts
                 LEFT JOIN timeseries."{stid}_{para}" ts
                     ON ts.timestamp=wts.timestamp
                 WHERE ts.timestamp IS NULL);
        """.format(
            stid=self.id,
            para=self._para,
            tstp_dtype=self._tstp_dtype,
            interval=self._interval,
            min_date=config.get("weatherdb", "min_date"))

        with db_engine.connect() as con:
            con.execute(sqltxt(sql))
            con.commit()

    @db_engine.deco_update_privilege
    def _update_db_timeserie(self, df, kinds):
        """Update the timeseries table on the database with a new DataFrame.

        Parameters
        ----------
        df : pandas.Series of integers
            A Series with a DatetimeIndex and the values to update in the database.
            The values need to be in the database unit. So you might have to multiply your values with self._decimals and convert to integers.
        kinds : str or list of str
            The data kinds to update.
            Must be a column in the timeseries DB.
            Must be one of "raw", "qc", "filled".
            For the precipitation also "qn" and "corr" are valid.

        Raises
        ------
        NotImplementedError
            If the given kind is not valid.
        ValueError
            If the given kind is not a string.
        """
        # check kinds
        kinds = self._check_kinds(kinds)

        # check if df is empty
        if len(df) == 0:
            log.debug(("The _update_db_timeserie method got an empty df " +
                       "for the {para_long} Station with ID {stid}"
                       ).format(
                para_long=self._para_long,
                stid=self.id))
            return None
        else:
            self._create_timeseries_table()

        with db_engine.connect() as con:
            # create groups of 1000 values to insert
            groups = np.array_split(df.index, (len(df)//1000)+1)
            for group in groups:
                df_i = df.loc[group]

                # make insert statement
                values_all = [
                    ind.strftime("('%Y%m%d %H:%M', ") + ", ".join(pair) + ")"
                    for ind, pair in zip(df_i.index, df_i.values.astype(str))]
                values = ", ".join(values_all)
                values = re.sub(r"(nan)|(<NA>)", "NULL", values)
                sql_insert = '''
                    INSERT INTO timeseries."{stid}_{para}"(timestamp, "{kinds}")
                    VALUES {values}
                    ON CONFLICT (timestamp) DO UPDATE SET
                '''.format(
                    stid=self.id, para=self._para,
                    kinds='", "'.join(kinds), values=values)

                for kind_i in kinds:
                    sql_insert += '"{kind}" = EXCLUDED."{kind}",'\
                        .format(kind=kind_i)
                sql_insert = sql_insert[:-1] + ";"

                # run sql command
                con.execute(sqltxt(sql_insert))
                con.commit()
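
    # A rendered batch statement looks roughly like this (illustrative values):
    #   INSERT INTO timeseries."1224_p"(timestamp, "raw", "qn")
    #   VALUES ('20230101 00:10', 5, 3), ('20230101 00:20', NULL, 3), ...
    #   ON CONFLICT (timestamp) DO UPDATE SET "raw" = EXCLUDED."raw", "qn" = EXCLUDED."qn";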

    @db_engine.deco_delete_privilege
    def _drop(self, why="No reason given"):
        """Drop this station from the database. (meta table and timeseries)
        """
        why = why.replace("'", "''")
        sql = f"""
            DROP TABLE IF EXISTS timeseries."{self.id}_{self._para}";
            DELETE FROM meta_{self._para} WHERE station_id={self.id};
            DELETE FROM station_ma_raster WHERE station_id={self.id} and parameter='{self._para}';
            DELETE FROM station_ma_timeseries WHERE station_id={self.id} and parameter='{self._para}';
            INSERT INTO dropped_stations(station_id, parameter, why, timestamp)
            VALUES ('{self.id}', '{self._para}', '{why}', NOW())
            ON CONFLICT (station_id, parameter)
                DO UPDATE SET
                    why = EXCLUDED.why,
                    timestamp = EXCLUDED.timestamp;
        """

        with db_engine.connect() as con:
            con.execute(sqltxt(sql))
            con.commit()
        log.debug(
            f"The {self._para_long} Station with ID {self.id} got dropped from the database, because \"{why}\".")

    @db_engine.deco_update_privilege
    def _update_meta(self, cols, values):
        sets = []
        for col, value in zip(cols, values):
            sets.append(
                "{col}='{value}'".format(
                    col=col, value=value))

        sql_update = """
            UPDATE meta_{para}
            SET {sets}
            WHERE station_id={stid};
        """.format(
            stid=self.id,
            para=self._para,
            sets=", ".join(sets)
        )

        with db_engine.connect() as con:
            con.execute(sqltxt(sql_update))
            con.commit()

    @db_engine.deco_all_privileges
    def _execute_long_sql(self, sql, description="treated"):
        done = False
        attempts = 0
        re_comp = re.compile("(the database system is in recovery mode)" +
                             "|(SSL SYSCALL error: EOF detected)" +  # login problem due to recovery mode
                             "|(SSL connection has been closed unexpectedly)" +  # sudden logoff
                             "|(the database system is shutting down)")  # to test the procedure by stopping postgresql
        # execute until done
        while not done:
            attempts += 1
            try:
                with db_engine.connect() as con:
                    con.execute(sqltxt(sql))
                    con.commit()
                done = True
            except OperationalError as err:
                log_msg = ("There was an operational error for the {para_long} Station (ID:{stid})" +
                           "\nHere is the complete error:\n" + str(err)).format(
                    stid=self.id, para_long=self._para_long)
                if any(filter(re_comp.search, err.args)):
                    if attempts > 10:
                        log.error(
                            log_msg +
                            "\nBecause there were already too many attempts, the execution of this process got completely stopped.")
                        break
                    else:
                        log.debug(
                            log_msg +
                            "\nThe execution is paused for 10 minutes and then retried.")
                        time.sleep(60*10)

        # log
        if done:
            log.info(
                "The {para_long} Station ({stid}) got successfully {desc}.".format(
                    stid=self.id,
                    para_long=self._para_long,
                    desc=description))
        else:
            raise Exception(
                "The {para_long} Station ({stid}) could not get {desc}.".format(
                    stid=self.id,
                    para_long=self._para_long,
                    desc=description)
            )
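
    # Note: an OperationalError whose message does not match re_comp is
    # swallowed and the statement is retried immediately; only the matching
    # recovery/connection errors are throttled with the 10 minute pause and
    # stopped after more than 10 attempts.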

    @db_engine.deco_update_privilege
    def _set_is_real(self, state=True):
        sql = """
            UPDATE meta_{para}
            SET is_real={state}
            WHERE station_id={stid};
        """.format(stid=self.id, para=self._para, state=state)

        with db_engine.connect() as con:
            con.execute(sqltxt(sql))

    def isin_db(self):
        """Check if Station is already in a timeseries table.

        Returns
        -------
        bool
            True if Station has a table in DB, no matter if it is filled or not.
        """

        sql = """
            select '{stid}_{para}' in (
                select table_name
                from information_schema.columns
                where table_schema='timeseries');
        """.format(para=self._para, stid=self.id)
        with db_engine.connect() as con:
            result = con.execute(sqltxt(sql)).first()[0]

        return result

    def isin_meta(self):
        """Check if Station is already in the meta table.

        Returns
        -------
        bool
            True if Station is in meta table.
        """
        with db_engine.connect() as con:
            result = con.execute(sqltxt("""
                SELECT EXISTS(SELECT station_id FROM meta_{para} WHERE station_id={stid});
                """.format(stid=self.id, para=self._para)))
            return result.first()[0]

    def isin_ma(self):
        """Check if Station is already in the multi annual table.

        Returns
        -------
        bool
            True if Station is in multi annual table.
        """
        sql_select = sa\
            .select(sa.func.count(StationMARaster.term) == len(self._ma_terms_all))\
            .where(sa.and_(
                StationMARaster.station_id == self.id,
                StationMARaster.raster_key == self._ma_raster_key,
                StationMARaster.parameter == self._para_base,
                StationMARaster.term.in_(self._ma_terms_all),
                StationMARaster.value.isnot(None)))
        with db_engine.connect() as conn:
            return conn.execute(sql_select).scalar()
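
    # The count()==len() comparison above yields true only if every expected
    # term (the _ma_terms plus "year") has a non-NULL value for this
    # station/raster/parameter combination.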

    def is_virtual(self):
        """Check if the station is a real station or only a virtual one.

        Real means that the DWD is measuring here.
        Virtual means that there are no measurements here, but the station got created to have timeseries for every parameter for every precipitation station.

        Returns
        -------
        bool
            True if the station is virtual, False if it is real.
        """
        return not self.is_real()

    def is_real(self):
        """Check if the station is a real station or only a virtual one.

        Real means that the DWD is measuring here.
        Virtual means that there are no measurements here, but the station got created to have timeseries for every parameter for every precipitation station.

        Returns
        -------
        bool
            True if the station is real, False if it is virtual.
        """
        sql = """
            SELECT is_real
            FROM meta_{para}
            WHERE station_id={stid}
        """.format(stid=self.id, para=self._para)
        with db_engine.connect() as con:
            res = con.execute(sqltxt(sql))
        return res.first()[0]

    def is_last_imp_done(self, kind):
        """Is the last import for the given kind already worked in?

        Parameters
        ----------
        kind : str
            The data kind to look for filled period.
            Must be a column in the timeseries DB.
            Must be one of "raw", "qc", "filled", "adj", "best".
            If "best" is given, then depending on the parameter of the station the best kind is selected.
            For precipitation this is "corr" and for the others this is "filled".
            For the precipitation also "qn" and "corr" are valid.

        Returns
        -------
        bool
            True if the last import of the given kind is already treated.
        """

        kind = self._check_kind(kind)
        sql = """
            SELECT last_imp_{kind}
            FROM meta_{para}
            WHERE station_id = {stid}
        """.format(stid=self.id, para=self._para, kind=kind)

        with db_engine.connect() as con:
            res = con.execute(sqltxt(sql))

        return res.first()[0]

    @db_engine.deco_update_privilege
    def update_period_meta(self, kind, **kwargs):
        """Update the time period in the meta file.

        Compute the filled period of a timeserie and save it in the meta table.

        Parameters
        ----------
        kind : str
            The data kind to look for filled period.
            Must be a column in the timeseries DB.
            Must be one of "raw", "qc", "filled".
            If "best" is given, then depending on the parameter of the station the best kind is selected.
            For precipitation this is "corr" and for the others this is "filled".
            For the precipitation also "corr" is valid.
        **kwargs : dict, optional
            Additional keyword arguments catch all, but unused here.
        """
        kind = self._check_kind_tstp_meta(kind)
        period = self.get_filled_period(kind=kind)

        sql = """
            UPDATE meta_{para}
            SET {kind}_from={min_tstp},
                {kind}_until={max_tstp}
            WHERE station_id={stid};
        """.format(
            stid=self.id, para=self._para,
            kind=kind,
            **period.get_sql_format_dict(
                format="'{}'".format(self._tstp_format_db))
        )

        with db_engine.connect() as con:
            con.execute(sqltxt(sql))
            con.commit()

    @db_engine.deco_update_privilege
    def update_ma_raster(self, skip_if_exist=True, drop_when_error=True, **kwargs):
        """Update the multi annual values in the station_ma_raster table.

        Get new values from the raster and put them in the table.

        Parameters
        ----------
        skip_if_exist : bool, optional
            Skip the update if the stations multi annual data is already in the table.
            The default is True.
        drop_when_error : bool, optional
            Drop the station from the database if there is an error.
            The default is True.
        **kwargs : dict, optional
            Additional keyword arguments catch all, but unused here.
        """
        if skip_if_exist and self.isin_ma():
            return None

        # get multi annual raster values, starting at point location up to 1000m
        dist = -50
        new_mas = None
        ma_raster_bands = self._ma_raster_bands

        while (new_mas is None or (new_mas is not None and not np.any(~np.isnan(new_mas)))) \
                and dist <= 1000:
            dist += 50
            new_mas = self._get_raster_value(
                raster_conf=self._ma_raster_conf,
                bands=ma_raster_bands,
                dist=dist)

        # write to station_ma_raster table
        if new_mas is not None and np.any(~np.isnan(new_mas)):
            # convert from raster unit to db unit
            new_mas = [int(np.round(val * fact * self._decimals))
                       for val, fact in zip(new_mas, self._ma_raster_factors)]

            # upload in database
            with db_engine.session() as session:
                stmnt = sa.dialects.postgresql\
                    .insert(StationMARaster)\
                    .values([
                        dict(station_id=self.id,
                             raster_key=self._ma_raster_key,
                             parameter=self._para_base,
                             term=term,
                             value=val,
                             distance=dist)
                        for term, val in zip(
                            self._ma_terms_all,
                            new_mas)])
                stmnt = stmnt\
                    .on_conflict_do_update(
                        index_elements=["station_id", "raster_key", "parameter", "term"],
                        set_=dict(value=stmnt.excluded.value,
                                  distance=stmnt.excluded.distance)
                    )
                session.execute(stmnt)
                session.commit()

        elif drop_when_error:
            # there was no multi annual data found from the raster
            self._drop(
                why=f"no multi-annual data was found from 'data:rasters:{self._ma_raster_key}'")

    @db_engine.deco_update_privilege
    def update_ma_timeseries(self, kind, **kwargs):
        """Update the mean annual value from the station timeserie.

        Parameters
        ----------
        kind : str or list of str
            The timeseries data kinds to update their multi annual value for.
            Must be a column in the timeseries DB.
            Must be one of "raw", "qc", "filled".
            For the precipitation also "corr" is valid.
        **kwargs : dict, optional
            Additional keyword arguments catch all, but unused here.
        """
        # check kind input
        valid_kinds = self._valid_kinds - {"qn", "filled_by"}
        if kind == "all":
            kind = list(valid_kinds)
        if isinstance(kind, list):
            for kind in self._check_kinds(kind, valids=valid_kinds):
                self.update_ma_timeseries(kind)
            return None
        self._check_kind(kind, valids=valid_kinds)

        # create the sql
        sql = f"""
            WITH ts_y AS (
                SELECT ({self._agg_fun}("{kind}")/count("{kind}")::float*count("timestamp"))::int AS val
                FROM timeseries."{self.id}_{self._para}"
                GROUP BY date_trunc('year', "timestamp")
                HAVING count("{kind}")/count("timestamp")::float>0.9
            )
            INSERT INTO station_ma_timeserie (station_id, parameter, kind, value)
                SELECT *
                FROM ( SELECT
                        {self.id} AS station_id,
                        '{self._para}' AS parameter,
                        '{kind}' AS kind,
                        avg(val)::int AS value
                    FROM ts_y) sq
                WHERE sq.value IS NOT NULL
            ON CONFLICT (station_id, parameter, kind) DO UPDATE
                SET value = EXCLUDED.value;
        """

        # check return_sql
        if kwargs.get("return_sql", False):
            return sql

        # execute the sql
        with db_engine.connect() as con:
            con.execute(sqltxt(sql))
            con.commit()
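
    # The ts_y CTE scales each year's aggregate by the share of non-NULL
    # values (agg/count * row count) and keeps only years with more than 90 %
    # coverage, before averaging those years into one multi-annual value.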

    @db_engine.deco_update_privilege
    def _update_last_imp_period_meta(self, period):
        """Update the meta timestamps for a new import."""
        # check period format
        if not isinstance(period, TimestampPeriod):
            period = TimestampPeriod(*period)

        # update meta file
        # ----------------
        # get last_imp valid kinds that are in the meta file
        last_imp_valid_kinds = self._valid_kinds.copy()
        last_imp_valid_kinds.remove("raw")
        for name in {"qn", "filled_by",
                     "raw_min", "raw_max", "filled_min", "filled_max"}:
            if name in last_imp_valid_kinds:
                last_imp_valid_kinds.remove(name)

        # create update sql
        sql_update_meta = '''
            UPDATE meta_{para} meta SET
                raw_from = LEAST (meta.raw_from, {min_tstp}),
                raw_until = GREATEST (meta.raw_until, {max_tstp}),
                last_imp_from = CASE WHEN {last_imp_test}
                                     THEN {min_tstp}
                                     ELSE LEAST(meta.last_imp_from, {min_tstp})
                                END,
                last_imp_until = CASE WHEN {last_imp_test}
                                      THEN {max_tstp}
                                      ELSE GREATEST(meta.last_imp_until, {max_tstp})
                                 END
                {last_imps}
            WHERE station_id = {stid};
        '''.format(
            para=self._para,
            stid=self.id,
            last_imps=(
                ", last_imp_" +
                " = FALSE, last_imp_".join(last_imp_valid_kinds) +
                " = FALSE"
            ) if len(last_imp_valid_kinds) > 0 else "",
            last_imp_test=(
                "meta.last_imp_" +
                " AND meta.last_imp_".join(last_imp_valid_kinds)
            ) if len(last_imp_valid_kinds) > 0 else "true",
            **period.get_sql_format_dict(format=f"'{self._tstp_format_db}'"))

        # execute meta update
        with db_engine.connect() as con:
            con.execute(sqltxt(sql_update_meta))
            con.commit()
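
    # last_imp_test expands to e.g. "meta.last_imp_qc AND meta.last_imp_filled":
    # only when every previous import was fully worked in is the last_imp
    # period replaced; otherwise it is merged with the previous one, and the
    # last_imp_* done-flags are reset to FALSE.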

    @db_engine.deco_update_privilege
    def update_raw(self, only_new=True, ftp_file_list=None, remove_nas=True, **kwargs):
        """Download data from CDC and upload to database.

        Parameters
        ----------
        only_new : bool, optional
            Get only the files that are not yet in the database?
            If False all the available files are loaded again.
            The default is True.
        ftp_file_list : list of (strings, datetime), optional
            A list of files on the FTP server together with their modification time.
            If None, then the list is fetched from the server.
            The default is None.
        remove_nas : bool, optional
            Remove the NAs from the downloaded data before updating it to the database.
            This has computational advantages.
            The default is True.
        **kwargs : dict
            Additional keyword arguments catch all, but unused here.

        Returns
        -------
        pandas.DataFrame
            The raw Dataframe of the Stations data.
        """
        self._check_min_date(ftp_file_list=ftp_file_list)

        zipfiles = self.get_zipfiles(
            only_new=only_new,
            ftp_file_list=ftp_file_list)

        # check for empty list of zipfiles
        if zipfiles is None or len(zipfiles) == 0:
            log.debug(
                f"raw_update of {self._para_long} Station {self.id}: " +
                f"No {'new ' if only_new else ''}zipfile was found and therefore no new data was imported."
            )
            self._update_last_imp_period_meta(period=(None, None))
            return None

        # download raw data
        df_all, max_hist_tstp_new = self._download_raw(zipfiles=zipfiles.index)

        # cut out valid time period
        min_date = config.get_datetime("weatherdb", "min_date")
        df_all = df_all.loc[df_all.index >= min_date]
        max_hist_tstp_old = self.get_meta(infos=["hist_until"])
        if max_hist_tstp_new is None:
            if max_hist_tstp_old is not None:
                df_all = df_all.loc[df_all.index >= max_hist_tstp_old]
        elif max_hist_tstp_old is None or max_hist_tstp_old <= max_hist_tstp_new:
            self._update_meta(cols=["hist_until"], values=[max_hist_tstp_new])

        # change to db format
        dict_cdc_db = dict(
            zip(self._cdc_col_names_imp, self._db_col_names_imp))
        cols_change = [
            name for name in self._cdc_col_names_imp
            if dict_cdc_db[name] not in self._kinds_not_decimal]
        selection = df_all[self._cdc_col_names_imp].copy()
        selection[cols_change] = (selection[cols_change] * self._decimals)\
            .round(0).astype("Int64")

        # remove NAs in raw column
        raw_col = self._cdc_col_names_imp[
            self._db_col_names_imp.index("raw")]
        selection_without_na = selection[~selection[raw_col].isna()]
        if remove_nas:
            selection = selection_without_na

        # upload to DB
        self._update_db_timeserie(
            selection,
            kinds=self._db_col_names_imp)

        # update raw_files db table
        update_values = \
            ", ".join(
                [f"('{self._para}', '{fp}', '{mod}')" for fp, mod in zip(
                    zipfiles.index,
                    zipfiles["modtime"].dt.strftime("%Y%m%d %H:%M").values)]
            )
        with db_engine.connect() as con:
            con.execute(sqltxt(f'''
                INSERT INTO raw_files(parameter, filepath, modtime)
                VALUES {update_values}
                ON CONFLICT (parameter, filepath) DO UPDATE SET modtime = EXCLUDED.modtime;'''))
            con.commit()

        # if empty skip updating meta filepath
        if len(selection_without_na) == 0:
            log_msg = ("raw_update of {para_long} Station {stid}: " +
                       "The downloaded new dataframe was empty and therefore no new data was imported.")\
                .format(para_long=self._para_long, stid=self.id)
            if not only_new and not self.is_virtual():
                # delete station from meta file because
                # there will never be data for this station
                self._drop(
                    why="while updating raw data with only_new=False the df was empty even though the station is not virtual")
                log_msg += "\nBecause only_new was False, the station got dropped from the meta file."

            # return empty df
            log.debug(log_msg)
            self._update_last_imp_period_meta(period=(None, None))
            return None
        else:
            self._set_is_real()

        # update multi annual mean
        self.update_ma_timeseries(kind="raw")

        # update meta file
        imp_period = TimestampPeriod(
            selection_without_na.index.min(), selection_without_na.index.max())
        self._update_last_imp_period_meta(period=imp_period)

        log.info(("The raw data for {para_long} station with ID {stid} got " +
                  "updated for the period {min_tstp} to {max_tstp}.").format(
            para_long=self._para_long,
            stid=self.id,
            **imp_period.get_sql_format_dict(format=self._tstp_format_human)))

    def get_zipfiles(self, only_new=True, ftp_file_list=None):
        """Get the zipfiles on the CDC server with the raw data.

        Parameters
        ----------
        only_new : bool, optional
            Get only the files that are not yet in the database?
            If False all the available files are loaded again.
            The default is True.
        ftp_file_list : list of (strings, datetime), optional
            A list of files on the FTP server together with their modification time.
            If None, then the list is fetched from the server.
            The default is None.

        Returns
        -------
        pandas.DataFrame or None
            A DataFrame of zipfiles and the corresponding modification time on the CDC server to import.
        """
        # check if file list provided
        if ftp_file_list is None:
            ftp_file_list = get_cdc_file_list(
                self._ftp_folders
            )

        # filter for station
        if self._ftp_zip_regex_prefix is not None:
            comp = re.compile(
                self._ftp_zip_regex_prefix + self.id_str + r"[_\.].*")
        else:
            comp = re.compile(r".*_" + self.id_str + r"[_\.].*")
        zipfiles_CDC = list(filter(
            lambda x: comp.match(x[0]),
            ftp_file_list
        ))
        zipfiles_CDC = pd.DataFrame(
            zipfiles_CDC,
            columns=["filepath", "modtime"]
        ).set_index("filepath")

        if only_new:
            # get the modification times already stored in the DB
            sql_db_modtimes = \
                """SELECT filepath, modtime
                   FROM raw_files
                   WHERE filepath in ({filepaths}) AND parameter='{para}';""".format(
                    filepaths="'" +
                        "', '".join(zipfiles_CDC.index.to_list())
                        + "'",
                    para=self._para)
            with db_engine.connect() as con:
                zipfiles_DB = pd.read_sql(
                    sqltxt(sql_db_modtimes),
                    con=con
                ).set_index("filepath")

            # check for updated files
            zipfiles = zipfiles_CDC.join(
                zipfiles_DB, rsuffix="_DB", lsuffix="_CDC")
            zipfiles = zipfiles[zipfiles["modtime_DB"] != zipfiles["modtime_CDC"]]\
                .drop("modtime_DB", axis=1)\
                .rename({"modtime_CDC": "modtime"}, axis=1)

        else:
            zipfiles = zipfiles_CDC

        # check for empty list of zipfiles
        if len(zipfiles) == 0:
            return None
        else:
            return zipfiles

    def _download_raw(self, zipfiles):
        # download raw data
        # import every file and merge data
        max_hist_tstp = None
        for zf in zipfiles:
            df_new = get_dwd_file(zf)
            df_new.set_index(self._cdc_date_col, inplace=True)
            df_new = self._check_df_raw(df_new)

            # check if hist is in query and get its max tstp
            if "historical" in zf:
                max_hist_tstp_new = df_new.index.max()
                if max_hist_tstp is not None:
                    max_hist_tstp = np.max([max_hist_tstp, max_hist_tstp_new])
                else:
                    max_hist_tstp = max_hist_tstp_new

            # merge with df_all
            if "df_all" not in locals():
                df_all = df_new.copy()
            else:
                # cut out if already in previous file
                df_new = df_new[~df_new.index.isin(df_all.index)]
                # concatenate the dfs
                df_all = pd.concat([df_all, df_new])

        # check for duplicates in date column
        if df_all.index.has_duplicates:
            df_all = df_all.groupby(df_all.index).mean()

        return df_all, max_hist_tstp
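
    # Because _ftp_folders lists each base folder as [historical/, recent/],
    # files are processed historical-first, so overlapping "recent" rows are
    # dropped before concatenation instead of overwriting the archive data.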
|
1244
|
+
|
1245
|
+
def download_raw(self, only_new=False):
|
1246
|
+
"""Download the timeserie from the CDC Server.
|
1247
|
+
|
1248
|
+
This function only returns the timeserie, but is not updating the database.
|
1249
|
+
|
1250
|
+
Parameters
|
1251
|
+
----------
|
1252
|
+
only_new : bool, optional
|
1253
|
+
Get only the files that are not yet in the database?
|
1254
|
+
If False all the available files are loaded again.
|
1255
|
+
The default is False.
|
1256
|
+
|
1257
|
+
Returns
|
1258
|
+
-------
|
1259
|
+
pandas.DataFrame
|
1260
|
+
The Timeseries as a DataFrame with a Timestamp Index.
|
1261
|
+
"""
|
1262
|
+
zipfiles = self.get_zipfiles(only_new=only_new)
|
1263
|
+
if len(zipfiles)>0:
|
1264
|
+
return self._download_raw(zipfiles=zipfiles.index)[0]
|
1265
|
+
else:
|
1266
|
+
return None
|
1267
|
+
|
1268
|
+
@db_engine.deco_update_privilege
|
1269
|
+
def _get_sql_new_qc(self, period=(None, None)):
|
1270
|
+
"""Create the SQL statement for the new quality checked data.
|
1271
|
+
|
1272
|
+
Needs to have one column timestamp and one column qc.
|
1273
|
+
|
1274
|
+
Parameters
|
1275
|
+
----------
|
1276
|
+
period : TimestampPeriod or (tuple or list of datetime.datetime or None), optional
|
1277
|
+
The minimum and maximum Timestamp for which to do the quality check.
|
1278
|
+
|
1279
|
+
Returns
|
1280
|
+
-------
|
1281
|
+
str
|
1282
|
+
The sql statement for the new quality controlled timeserie.
|
1283
|
+
"""
|
1284
|
+
pass # define in the specific classes
|
1285
|
+
|
1286
|
+
@db_engine.deco_update_privilege
|
1287
|
+
def quality_check(self, period=(None, None), **kwargs):
|
1288
|
+
"""Quality check the raw data for a given period.
|
1289
|
+
|
1290
|
+
Parameters
|
1291
|
+
----------
|
1292
|
+
period : TimestampPeriod or (tuple or list of datetime.datetime or None), optional
|
1293
|
+
The minimum and maximum Timestamp for which to get the timeseries.
|
1294
|
+
If None is given, the maximum or minimal possible Timestamp is taken.
|
1295
|
+
The default is (None, None).
|
1296
|
+
**kwargs : dict, optional
|
1297
|
+
Additional keyword arguments catch all, but unused here.
|
1298
|
+
"""
|
1299
|
+
period = self._check_period(period=period, kinds=["raw"], nas_allowed=True)
|
1300
|
+
|
1301
|
+
# create update sql
|
1302
|
+
sql_qc = """
|
1303
|
+
WITH new_qc as ({sql_new_qc})
|
1304
|
+
UPDATE timeseries."{stid}_{para}" ts
|
1305
|
+
SET "qc" = new."qc"
|
1306
|
+
FROM new_qc new
|
1307
|
+
WHERE ts.timestamp = new.timestamp
|
1308
|
+
AND ts."qc" IS DISTINCT FROM new."qc";
|
1309
|
+
""".format(
|
1310
|
+
sql_new_qc=self._get_sql_new_qc(period=period),
|
1311
|
+
stid=self.id, para=self._para)
|
1312
|
+
|
1313
|
+
# calculate the percentage of dropped values
|
1314
|
+
sql_qc += f"""
|
1315
|
+
UPDATE meta_{self._para}
|
1316
|
+
SET "qc_dropped" = ts."qc_dropped"
|
1317
|
+
FROM (
|
1318
|
+
SELECT ROUND(((count("raw")-count("qc"))::numeric/count("raw")), 4)*100 as qc_dropped
|
1319
|
+
FROM timeseries."{self.id}_{self._para}"
|
1320
|
+
) ts
|
1321
|
+
WHERE station_id = {self.id};"""
|
1322
|
+
|
1323
|
+
# run commands
|
1324
|
+
if "return_sql" in kwargs and kwargs["return_sql"]:
|
1325
|
+
return sql_qc
|
1326
|
+
self._execute_long_sql(
|
1327
|
+
sql=sql_qc,
|
1328
|
+
description="quality checked for the period {min_tstp} to {max_tstp}.".format(
|
1329
|
+
**period.get_sql_format_dict(
|
1330
|
+
format=self._tstp_format_human)
|
1331
|
+
))
|
1332
|
+
|
1333
|
+
# update multi annual mean
|
1334
|
+
self.update_ma_timeseries(kind="qc")
|
1335
|
+
|
1336
|
+
# update timespan in meta table
|
1337
|
+
self.update_period_meta(kind="qc")
|
1338
|
+
|
1339
|
+
# mark last import as done if in period
|
1340
|
+
last_imp_period = self.get_last_imp_period()
|
1341
|
+
if last_imp_period.inside(period):
|
1342
|
+
self._mark_last_imp_done(kind="qc")
|
1343
|
+
|
1344
|
+
    @db_engine.deco_update_privilege
    def fillup(self, period=(None, None), **kwargs):
        """Fill up missing data with measurements from nearby stations.

        Parameters
        ----------
        period : TimestampPeriod or (tuple or list of datetime.datetime or None), optional
            The minimum and maximum Timestamp for which to gap fill the timeseries.
            If None is given, the maximum or minimal possible Timestamp is taken.
            The default is (None, None).
        **kwargs : dict, optional
            Additional arguments for the fillup function.
            e.g. p_elev to consider the elevation to select nearest stations. (only for T and ET)
        """
        self._expand_timeserie_to_period()
        self._check_ma()

        sql_format_dict = dict(
            stid=self.id, para=self._para, para_base=self._para_base,
            para_escaped=self._para.replace("_", "\\_"),
            ma_terms=", ".join(self._ma_terms),
            coef_sign=self._coef_sign,
            ma_raster_key=self._ma_raster_key,
            base_col="qc" if "qc" in self._valid_kinds else "raw",
            cond_mas_not_null=" OR ".join([
                "ma_other.{ma_col} IS NOT NULL".format(ma_col=ma_col)
                for ma_col in self._ma_terms]),
            filled_by_col="NULL::smallint AS filled_by",
            exit_cond="SUM((filled IS NULL)::int) = 0",
            extra_unfilled_period_where="",
            add_meta_col="",
            max_fillup_dist=config.get("weatherdb:max_fillup_distance", "p", fallback=200000),
            **self._sql_fillup_extra_dict(**kwargs)
        )

        # make condition for period
        if not isinstance(period, TimestampPeriod):
            period = TimestampPeriod(*period)
        if not period.is_empty():
            sql_format_dict.update(dict(
                cond_period=" WHERE ts.timestamp BETWEEN {min_tstp} AND {max_tstp}".format(
                    **period.get_sql_format_dict(
                        format="'{}'".format(self._tstp_format_db)))
            ))
        else:
            sql_format_dict.update(dict(
                cond_period=""))

        # check if winter/summer or only yearly regionalisation
        if len(self._ma_terms) == 1:
            sql_format_dict.update(dict(
                is_winter_col="",
                coef_calc="ma_stat.{ma_term}{coef_sign[0]}ma_other.{ma_term}::float AS coef"
                    .format(
                        ma_term=self._ma_terms[0],
                        coef_sign=self._coef_sign),
                coef_format="i.coef",
                filled_calc="round(nb.{base_col} {coef_sign[1]} %3$s, 0)::int"
                    .format(**sql_format_dict)
            ))
        elif len(self._ma_terms) == 2:
            sql_format_dict.update(dict(
                is_winter_col=""",
                    CASE WHEN EXTRACT(MONTH FROM timestamp) IN (1, 2, 3, 10, 11, 12)
                        THEN true::bool
                        ELSE false::bool
                    END AS is_winter""",
                coef_calc=(
                    "ma_stat.{ma_terms[0]}{coef_sign[0]}ma_other.{ma_terms[0]}::float AS coef_wi, \n" + " "*24 +
                    "ma_stat.{ma_terms[1]}{coef_sign[0]}ma_other.{ma_terms[1]}::float AS coef_so"
                    ).format(
                        ma_terms=self._ma_terms,
                        coef_sign=self._coef_sign),
                coef_format="i.coef_wi, \n" + " " * 24 + "i.coef_so",
                filled_calc="""
                    CASE WHEN nf.is_winter
                        THEN round(nb.{base_col} {coef_sign[1]} %3$s, 0)::int
                        ELSE round(nb.{base_col} {coef_sign[1]} %4$s, 0)::int
                    END""".format(**sql_format_dict)
            ))
        else:
            raise ValueError(
                "There were too many multi annual columns selected. The fillup method is only implemented for yearly or half yearly regionalisations")

        # raster stats cols
        sql_format_dict["rast_val_cols"] = ", ".join(
            [f"avg(value) FILTER (WHERE \"term\"='{term}') AS \"{term}\""
             for term in self._ma_terms])

        # check if filled_by column is ARRAY or smallint
        if self._filled_by_n > 1:
            sql_array_init = "ARRAY[{0}]".format(
                ", ".join(["NULL::smallint"] * self._filled_by_n))

            # create execute sql command
            sql_exec_fillup = ""
            prev_check = ""
            for i in range(1, self._filled_by_n+1):
                sql_exec_fillup += f"""
                UPDATE new_filled_{self.id}_{self._para} nf
                SET nb_mean[{i}]=round(nb.qc + %3$s, 0)::int,
                    {sql_format_dict["extra_exec_cols"].format(i=i)}
                    filled_by[{i}]=%1$s
                FROM timeseries.%2$I nb
                WHERE nf.filled IS NULL AND nf.nb_mean[{i}] IS NULL {prev_check}
                    AND nf.timestamp = nb.timestamp;"""
                prev_check += f" AND nf.nb_mean[{i}] IS NOT NULL AND nf.filled_by[{i}] != %1$s"

            sql_format_dict.update(dict(
                filled_by_col="NULL::smallint[] AS filled_by",
                extra_new_temp_cols=sql_format_dict["extra_new_temp_cols"] +
                    f"{sql_array_init} AS nb_mean,",
                sql_exec_fillup=sql_exec_fillup,
                extra_unfilled_period_where="AND nb_mean[3] is NULL",
                extra_fillup_where=sql_format_dict["extra_fillup_where"] +
                    ' OR NOT (ts."filled_by" @> new."filled_by" AND ts."filled_by" <@ new."filled_by")'
            ))

            # create exit condition
            sql_format_dict.update(dict(
                exit_cond=f"SUM((filled IS NULL AND nb_mean[{self._filled_by_n}] is NULL)::int) = 0 "))
            if self._fillup_max_dist is not None:
                sql_format_dict.update(dict(
                    add_meta_col=", ST_DISTANCE(geometry_utm,(SELECT geometry_utm FROM stat_row)) as dist",
                    exit_cond=sql_format_dict["exit_cond"]
                        + f"OR ((i.dist > {self._fillup_max_dist}) AND SUM((filled IS NULL AND nb_mean[1] is NULL)::int) = 0)"
                ))

            # create sql after loop, to calculate the median of the regionalised neighbors
            sql_format_dict.update(dict(
                sql_extra_after_loop="""UPDATE new_filled_{stid}_{para} SET
                        filled=(SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v)
                                FROM unnest(nb_mean) as T(v)) {extra_after_loop_extra_col}
                    WHERE filled is NULL;
                    {sql_extra_after_loop}""".format(**sql_format_dict)))
        else:
            # create execute command if only 1 neighbor is considered
            sql_format_dict.update(dict(
                sql_exec_fillup="""
                UPDATE new_filled_{stid}_{para} nf
                SET filled={filled_calc}, {extra_cols_fillup_calc}
                    filled_by=%1$s
                FROM timeseries.%2$I nb
                WHERE nf.filled IS NULL AND nb.{base_col} IS NOT NULL
                    AND nf.timestamp = nb.timestamp;""".format(**sql_format_dict),
                extra_fillup_where=sql_format_dict["extra_fillup_where"] +
                    ' OR ts."filled_by" IS DISTINCT FROM new."filled_by"'))

        # Make SQL statement to fill the missing values with values from nearby stations
        sql = """
            CREATE TEMP TABLE new_filled_{stid}_{para}
                ON COMMIT DROP
                AS (SELECT timestamp, {base_col} AS filled,
                        {extra_new_temp_cols}{filled_by_col}{is_winter_col}
                    FROM timeseries."{stid}_{para}" ts {cond_period});
            ALTER TABLE new_filled_{stid}_{para} ADD PRIMARY KEY (timestamp);
            DO
            $do$
                DECLARE i RECORD;
                    unfilled_period RECORD;
                BEGIN
                    SELECT min(timestamp) AS min, max(timestamp) AS max
                    INTO unfilled_period
                    FROM new_filled_{stid}_{para}
                    WHERE "filled" IS NULL;
                    FOR i IN (
                        WITH stat_row AS (
                            SELECT * FROM meta_{para} WHERE station_id={stid}),
                        rast_vals as (
                            SELECT station_id, {rast_val_cols}
                            FROM station_ma_raster
                            WHERE parameter = '{para_base}' and raster_key='{ma_raster_key}'
                            GROUP BY station_id
                        ),
                        meta_dist as (
                            SELECT *, ST_DISTANCE(
                                    geometry_utm,
                                    (SELECT geometry_utm FROM stat_row)) AS dist_m
                            FROM meta_{para})
                        SELECT meta.station_id,
                            meta.raw_from, meta.raw_until,
                            meta.station_id || '_{para}' AS tablename,
                            {coef_calc}{add_meta_col}
                        FROM meta_dist meta
                        LEFT JOIN rast_vals ma_other
                            ON ma_other.station_id=meta.station_id
                        LEFT JOIN (SELECT {ma_terms}
                                   FROM rast_vals
                                   WHERE station_id = {stid}
                                   ) ma_stat
                            ON 1=1
                        WHERE meta.station_id != {stid}
                            AND meta.station_id || '_{para}' IN (
                                SELECT tablename
                                FROM pg_catalog.pg_tables
                                WHERE schemaname ='timeseries'
                                    AND tablename LIKE '%\\_{para_escaped}')
                            AND ({cond_mas_not_null})
                            AND (meta.raw_from IS NOT NULL AND meta.raw_until IS NOT NULL)
                            AND meta.dist_m <= {max_fillup_dist}
                        ORDER BY meta.dist_m {mul_elev_order} ASC)
                    LOOP
                        CONTINUE WHEN i.raw_from > unfilled_period.max
                            OR i.raw_until < unfilled_period.min
                            OR (i.raw_from IS NULL AND i.raw_until IS NULL);
                        EXECUTE FORMAT(
                            $$
                            {sql_exec_fillup}
                            $$,
                            i.station_id,
                            i.tablename,
                            {coef_format}
                        );
                        EXIT WHEN (SELECT {exit_cond}
                                   FROM new_filled_{stid}_{para});
                        SELECT min(timestamp) AS min, max(timestamp) AS max
                        INTO unfilled_period
                        FROM new_filled_{stid}_{para}
                        WHERE "filled" IS NULL {extra_unfilled_period_where};
                    END LOOP;
                    {sql_extra_after_loop}
                    UPDATE timeseries."{stid}_{para}" ts
                    SET filled = new.filled, {extra_cols_fillup}
                        filled_by = new.filled_by
                    FROM new_filled_{stid}_{para} new
                    WHERE ts.timestamp = new.timestamp
                        AND (ts."filled" IS DISTINCT FROM new."filled" {extra_fillup_where}) ;
                END
            $do$;
        """.format(**sql_format_dict)

        # execute
        if "return_sql" in kwargs and kwargs["return_sql"]:
            return sql
        self._execute_long_sql(
            sql=sql,
            description="filled for the period {min_tstp} - {max_tstp}".format(
                **period.get_sql_format_dict(format=self._tstp_format_human)))

        # update multi annual mean
        self.update_ma_timeseries(kind="filled")

        # update timespan in meta table
        self.update_period_meta(kind="filled")

        # mark last imp done
        if (("qc" not in self._valid_kinds) or
                (self.is_last_imp_done(kind="qc"))):
            if period.is_empty():
                self._mark_last_imp_done(kind="filled")
            elif period.contains(self.get_last_imp_period()):
                self._mark_last_imp_done(kind="filled")

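    # A minimal usage sketch (not part of the original module): filling up a
    # single temperature station and weighting the neighbor selection by
    # elevation via the p_elev keyword mentioned in the docstring. The import
    # path, station id and parameter values are assumptions for illustration:
    #
    #     from weatherdb.station.StationT import StationT
    #     stat = StationT(1234)
    #     stat.fillup(period=(None, None), p_elev=(250, 1.5))
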
    @db_engine.deco_update_privilege
    def _sql_fillup_extra_dict(self, **kwargs):
        """Get the sql statement for the fillup to calculate the filling of additional columns.

        This is mainly for the temperature Station, to fill up the max and min
        columns, and for the precipitation Station.
        For the other stations an empty string is returned for every entry.

        Returns
        -------
        dict
            A dictionary with the different additional sql_format_dict entries.
        """
        return {"sql_extra_after_loop": "",
                "extra_new_temp_cols": "",
                "extra_cols_fillup": "",
                "extra_cols_fillup_calc": "",
                "extra_fillup_where": "",
                "mul_elev_order": "",
                "extra_exec_cols": "",
                "extra_after_loop_extra_col": ""}

    @db_engine.deco_update_privilege
    def _mark_last_imp_done(self, kind):
        """Mark the last import for the given kind as done.

        Parameters
        ----------
        kind : str
            The data kind to look for filled period.
            Must be a column in the timeseries DB.
            Must be one of "raw", "qc", "filled", "adj".
            If "best" is given, then depending on the parameter of the station the best kind is selected.
            For Precipitation this is "corr" and for the others this is "filled".
            For the precipitation also "qn" and "corr" are valid.
        """
        kind = self._check_kind(kind)
        sql = """
            UPDATE meta_{para}
            SET last_imp_{kind} = TRUE
            WHERE station_id = {stid}
        """.format(stid=self.id, para=self._para, kind=kind)

        with db_engine.connect() as con:
            con.execute(sqltxt(sql))
            con.commit()

    @db_engine.deco_update_privilege
    def last_imp_quality_check(self, **kwargs):
        """Do the quality check of the last import.

        Parameters
        ----------
        **kwargs : dict, optional
            Additional keyword arguments passed to the quality_check function.
        """
        if not self.is_last_imp_done(kind="qc"):
            self.quality_check(period=self.get_last_imp_period(), **kwargs)

    @db_engine.deco_update_privilege
    def last_imp_qc(self, **kwargs):
        """Alias for last_imp_quality_check."""
        self.last_imp_quality_check(**kwargs)

    @db_engine.deco_update_privilege
    def last_imp_fillup(self, _last_imp_period=None, **kwargs):
        """Do the gap filling of the last import.

        Parameters
        ----------
        _last_imp_period : TimestampPeriod or (tuple or list of datetime.datetime or None), optional
            The minimum and maximum Timestamp for which to do the gap filling.
            If None is given, the last import period is taken.
            This is only for internal use, to speed up the process if run in a batch.
            The default is None.
        **kwargs : dict, optional
            Additional keyword arguments passed to the fillup function.
        """
        if not self.is_last_imp_done(kind="filled"):
            if _last_imp_period is None:
                period = self.get_last_imp_period(all=True)
            else:
                period = _last_imp_period

            self.fillup(period=period, **kwargs)

    @classmethod
    def get_meta_explanation(cls, infos="all"):
        """Get the explanations of the available meta fields.

        Parameters
        ----------
        infos : list or string, optional
            The infos you wish to get an explanation for.
            If "all", then all the available information is returned.
            The default is "all".

        Returns
        -------
        pd.Series
            A pandas Series with the information names as index and the explanations as values.
        """
        return pd.Series(
            {c.key: c.comment
             for c in sa.inspect(cls._MetaModel).c
             if infos == "all" or c.key in infos})

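    # A minimal usage sketch (not part of the original module): listing the
    # available meta fields and their explanations for a station class.
    # StationP and the selected field name are illustrative assumptions:
    #
    #     from weatherdb.station.StationP import StationP
    #     print(StationP.get_meta_explanation())                    # all fields
    #     print(StationP.get_meta_explanation(infos=["raw_from"]))  # one field
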
    def get_meta(self, infos="all"):
        """Get Information from the meta table.

        Parameters
        ----------
        infos : list of str or str, optional
            A list of the information to get from the database.
            If "all", then all the information is returned.
            The default is "all".

        Returns
        -------
        dict or int/string
            dict with the meta information, with one entry per information asked for.
            If only one information is asked for, then it is returned as a single value and not as a dict.
        """
        # check which information to get
        if isinstance(infos, str) and (infos == "all"):
            cols = self._MetaModel.__table__.columns
        else:
            if isinstance(infos, str):
                infos = [infos]
            cols = [self._MetaModel.__table__.columns[col]
                    for col in infos]

        # create query
        stmnt = sa.select(*cols).where(self._MetaModel.station_id == self.id)

        with db_engine.session() as con:
            res = con.execute(stmnt)
            keys = res.keys()
            values = res.fetchone()
            if len(keys) == 1:
                return values[0]
            else:
                return dict(zip(keys, values))

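    # A minimal usage sketch (not part of the original module; the station id
    # is a hypothetical example):
    #
    #     stat = StationP(1234)
    #     stat.get_meta(infos="stationshoehe")            # single value
    #     stat.get_meta(infos=["raw_from", "raw_until"])  # dict with two entries
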
    def get_geom(self, crs=None):
        """Get the point geometry of the station.

        Parameters
        ----------
        crs: str, int or None, optional
            The coordinate reference system of the geometry.
            If None, then the geometry is returned in WGS84 (EPSG:4326).
            If string, then it should be in a pyproj readable format.
            If int, then it should be the EPSG code.
            The default is None.

        Returns
        -------
        shapely.geometry.Point
            The location of the station as shapely Point in the given coordinate reference system.
        """
        # get the geom
        geom_wkb = self.get_meta(infos=["geometry"])
        geom_shp = shapely.wkb.loads(geom_wkb.data.tobytes())

        # transform
        if crs is not None:
            transformer = pyproj.Transformer.from_proj(
                geom_wkb.srid,
                crs, always_xy=True)
            geom_shp = shapely.ops.transform(
                transformer.transform, geom_shp)

        return geom_shp

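    # A minimal usage sketch (not part of the original module; the station id
    # and the EPSG code are illustrative assumptions):
    #
    #     stat = StationP(1234)
    #     pt_wgs84 = stat.get_geom()           # WGS84 (EPSG:4326)
    #     pt_utm32 = stat.get_geom(crs=25832)  # reprojected via pyproj
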
    def get_geom_shp(self, crs=None):
        """Get the geometry of the station as a shapely Point object.

        .. deprecated:: 1.0.0
            `get_geom_shp` is deprecated and will be removed in future releases.
            It is replaced by `get_geom`.

        Parameters
        ----------
        crs: str, int or None, optional
            If None, then the geometry is returned in WGS84 (EPSG:4326).
            If string, then it should be one of "WGS84" or "UTM".
            If int, then it should be the EPSG code.

        Returns
        -------
        shapely.geometry.Point
            The location of the station as shapely Point.
        """
        warnings.warn(
            "This function is deprecated and will disappear in future releases. Use get_geom instead.",
            PendingDeprecationWarning)
        return self.get_geom(crs=crs)

    def get_name(self):
        """Get the name of the station from the meta table."""
        return self.get_meta(infos="stationsname")

    def get_quotient(self, kinds_num, kinds_denom, return_as="df"):
        """Get the quotient of multi-annual means of two different kinds or of the timeserie and the multi annual raster value.

        $quotient = \\overline{ts}_{num} / \\overline{ts}_{denom}$

        Parameters
        ----------
        kinds_num : list of str or str
            The timeseries kinds of the numerators.
            Should be one of ['raw', 'qc', 'filled'].
            For precipitation also "corr" is possible.
        kinds_denom : list of str or str
            The timeseries kinds of the denominator or the multi annual raster key.
            If the denominator is a multi annual raster key, then the result is the quotient of the timeserie and the raster value.
            Possible values are:

            - for timeserie kinds: 'raw', 'qc', 'filled' or for precipitation also "corr".
            - for raster keys: 'hyras', 'dwd' or 'regnie', depending on your defined raster files.
        return_as : str, optional
            The format of the return value.
            If "df" then a pandas DataFrame is returned.
            If "json" then a list with dictionaries is returned.

        Returns
        -------
        pandas.DataFrame or list of dict
            The quotient of the two timeseries as DataFrame or list of dictionaries (JSON) depending on the return_as parameter.
            The default is pd.DataFrame.

        Raises
        ------
        ValueError
            If the input parameters were not correct.
        """
        # check kinds
        rast_keys = {"hyras", "regnie", "dwd"}
        kinds_num = self._check_kinds(kinds_num)
        kinds_denom = self._check_kinds(
            kinds_denom,
            valids=self._valid_kinds | rast_keys)

        # get the quotient from the database views
        with db_engine.session() as con:
            return _get_quotient(
                con=con,
                stids=self.id,
                paras=self._para,
                kinds_num=kinds_num,
                kinds_denom=kinds_denom,
                return_as=return_as)

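    # A minimal usage sketch (not part of the original module): comparing the
    # filled timeserie of a hypothetical station against the multi annual
    # HYRAS raster value:
    #
    #     stat = StationP(1234)
    #     df = stat.get_quotient(kinds_num="filled", kinds_denom="hyras")
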
    def count_holes(self,
            weeks=[2, 4, 8, 12, 16, 20, 24], kind="qc", period=(None, None),
            between_meta_period=True, crop_period=False, **kwargs):
        """Count holes in timeseries depending on their length.

        Parameters
        ----------
        weeks : list, optional
            A list of hole lengths to count.
            Every hole longer than the duration of weeks specified is counted.
            The default is [2, 4, 8, 12, 16, 20, 24].
        kind : str
            The kind of the timeserie to analyze.
            Should be one of ['raw', 'qc', 'filled'].
            For N also "corr" is possible.
            Normally only "raw" and "qc" make sense, because the other timeseries should not have holes.
        period : TimestampPeriod or (tuple or list of datetime.datetime or None), optional
            The minimum and maximum Timestamp for which to analyze the timeseries.
            If None is given, the maximum and minimal possible Timestamp is taken.
            The default is (None, None).
        between_meta_period : bool, optional
            Only check between the respective period that is defined in the meta table.
            If "qc" is chosen as kind, then the "raw" meta period is taken.
            The default is True.
        crop_period : bool, optional
            Should the period get cropped to the maximum filled period?
            This will result in holes being ignored when they are at the end or at the beginning of the timeserie.
            If period = (None, None) is given, then this parameter is set to True.
            The default is False.

        Returns
        -------
        pandas.DataFrame
            A Pandas Dataframe, with station_id as index and one column per week.
            The numbers in the table are the amount of NA-periods longer than the respective amount of weeks.

        Raises
        ------
        ValueError
            If the input parameters were not correct.
        """
        # check input parameters
        kind = self._check_kind(kind)
        kind_meta_period = "raw" if kind == "qc" else kind

        if period == (None, None):
            crop_period = True
        period = self._check_period(
            period, nas_allowed=not crop_period, kinds=[kind])

        if not isinstance(weeks, list):
            weeks = [weeks]
        if not all([isinstance(el, int) for el in weeks]):
            raise ValueError(
                "Not all the elements of the weeks input parameters were integers.")

        # create SQL statement
        sql_format_dict = dict(
            stid=self.id, para=self._para,
            kind=kind, kind_meta_period=kind_meta_period,
            count_weeks=",".join(
                [f"COUNT(*) FILTER (WHERE td.diff >= '{w} weeks'::INTERVAL) as \"holes>={w} weeks\""
                 for w in weeks]),
            where_between_raw_period="",
            union_from="",
            union_until="",
            **period.get_sql_format_dict()
        )
        if between_meta_period:
            sql_format_dict.update(dict(
                where_between_raw_period=\
                    f"AND ts.timestamp>=(SELECT {kind_meta_period}_from FROM meta) \
                    AND ts.timestamp<=(SELECT {kind_meta_period}_until FROM meta)",
                union_from=f"UNION (SELECT {kind_meta_period}_from FROM meta)",
                union_until=f"UNION (SELECT {kind_meta_period}_until FROM meta)"
            ))

        sql = """
            WITH meta AS (
                SELECT {kind_meta_period}_from, {kind_meta_period}_until FROM meta_{para} WHERE station_id={stid})
            SELECT {count_weeks}
            FROM (
                SELECT tst.timestamp-LAG(tst.timestamp) OVER (ORDER BY tst.timestamp) as diff
                FROM (
                    SELECT timestamp
                    FROM timeseries."{stid}_{para}" ts
                    WHERE (ts.timestamp BETWEEN {min_tstp} AND {max_tstp})
                        AND ts.{kind} IS NOT NULL
                        {where_between_raw_period}
                    UNION (SELECT {min_tstp} as timestamp {union_from})
                    UNION (SELECT {max_tstp} as timestamp {union_until})
                    ) tst
                ) td;
            """.format(**sql_format_dict)

        # get response from server
        if "return_sql" in kwargs:
            return sql
        with db_engine.connect() as con:
            res = pd.read_sql(sqltxt(sql), con)

        # set index
        res["station_id"] = self.id
        res.set_index("station_id", inplace=True)

        return res

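    # A minimal usage sketch (not part of the original module; station id and
    # hole lengths are illustrative assumptions):
    #
    #     stat = StationP(1234)
    #     holes = stat.count_holes(weeks=[4, 12], kind="raw")
    #     # -> one row with columns "holes>=4 weeks" and "holes>=12 weeks"
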
    def get_period_meta(self, kind, all=False):
        """Get a specific period from the meta information table.

        This function returns the information from the meta table.
        In this table there are several periods saved, like the period of the last import.

        Parameters
        ----------
        kind : str
            The kind of period to return.
            Should be one of ['filled', 'raw', 'last_imp'].
            filled: the maximum filled period of the filled timeserie.
            raw: the maximum filled timeperiod of the raw data.
            last_imp: the maximum filled timeperiod of the last import.
        all : bool, optional
            Should the maximum timespan over all of the stations be returned?
            If False, only the period for this station is returned.
            The default is False.

        Returns
        -------
        TimestampPeriod:
            The TimestampPeriod of the station or of all the stations if all=True.

        Raises
        ------
        ValueError
            If a wrong kind is handed in.
        """
        # check kind
        kind = self._check_kind_tstp_meta(kind)

        # create sql statement
        sql_format_dict = dict(para=self._para, stid=self.id, kind=kind)
        if all:
            sql = """
                SELECT min({kind}_from) as {kind}_from,
                    max({kind}_until) as {kind}_until
                FROM meta_{para};
            """.format(**sql_format_dict)
        else:
            sql = """
                SELECT {kind}_from, {kind}_until
                FROM meta_{para}
                WHERE station_id = {stid};
            """.format(**sql_format_dict)

        with db_engine.connect() as con:
            res = con.execute(sa.text(sql))

        return TimestampPeriod(*res.first())

    def get_filled_period(self, kind, from_meta=False):
        """Get the min and max Timestamp for which there is data in the corresponding timeserie.

        Computes the period from the timeserie or meta table.

        Parameters
        ----------
        kind : str
            The data kind to look for filled period.
            Must be a column in the timeseries DB.
            Must be one of "raw", "qc", "filled", "adj".
            If "best" is given, then depending on the parameter of the station the best kind is selected.
            For Precipitation this is "corr" and for the others this is "filled".
            For the precipitation also "qn" and "corr" are valid.
        from_meta : bool, optional
            Should the period be taken from the meta table?
            If True, the period is taken from the meta table and this function is only a wrapper for .get_period_meta.
            If False, the period is computed from the timeserie itself.
            The default is False.

        Raises
        ------
        NotImplementedError
            If the given kind is not valid.
        ValueError
            If the given kind is not a string.

        Returns
        -------
        TimestampPeriod
            A TimestampPeriod of the filled timeserie.
            (NaT, NaT) if the timeserie is all empty or not defined.
        """
        if from_meta:
            return self.get_period_meta(kind=kind, all=False)

        kind = self._check_kind(kind=kind)

        if self.isin_db():
            sql = """
                SELECT min(timestamp), max(timestamp)
                FROM timeseries."{stid}_{para}"
                WHERE "{kind}" is not NULL
            """.format(stid=self.id, kind=kind, para=self._para)
            with db_engine.connect() as con:
                respond = con.execute(sqltxt(sql)).first()

            return TimestampPeriod(*respond)
        else:
            return TimestampPeriod(None, None)

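    # A minimal usage sketch (not part of the original module; the station id
    # is a hypothetical example):
    #
    #     stat = StationP(1234)
    #     stat.get_filled_period(kind="raw")                  # from the timeserie
    #     stat.get_filled_period(kind="raw", from_meta=True)  # from the meta table
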
    def get_max_period(self, kinds, nas_allowed=False, **kwargs):
        """Get the maximum available period for this station's timeseries.

        If nas_allowed is True, then the maximum range of the timeserie is returned.
        Else the minimal filled period is returned.

        Parameters
        ----------
        kinds : str or list of str
            The data kinds to consider.
            Must be a column in the timeseries DB.
            Must be one of "raw", "qc", "filled", "adj".
            For the precipitation also "qn" and "corr" are valid.
        nas_allowed : bool, optional
            Should NAs be allowed?
            If True, then the maximum possible period is returned, even if there are NAs in the timeserie.
            If False, then the minimal filled period is returned.
            The default is False.

        Returns
        -------
        TimestampPeriod
            The maximum Timestamp Period.
        """
        if nas_allowed:
            sql_max_tstp = """
                SELECT MIN("timestamp"), MAX("timestamp")
                FROM timeseries."{stid}_{para}";
                """.format(
                stid=self.id, para=self._para)
            with db_engine.connect() as con:
                res = con.execute(sqltxt(sql_max_tstp))
            max_period = TimestampPeriod(*res.first())
        else:
            kinds = self._check_kinds(kinds)
            if len(kinds) > 0:
                max_period = self.get_filled_period(kind=kinds[0], **kwargs)
                for kind in kinds[1:]:
                    max_period = max_period.union(
                        self.get_filled_period(kind=kind, **kwargs),
                        how="outer" if nas_allowed else "inner")
            else:
                max_period = TimestampPeriod(None, None)

        return max_period

    def get_last_imp_period(self, all=False):
        """Get the last imported Period for this Station.

        Parameters
        ----------
        all : bool, optional
            Should the maximum timespan over all of the last imports be returned?
            If False, only the period for this station is returned.
            The default is False.

        Returns
        -------
        TimestampPeriod or tuple of datetime.datetime:
            (minimal datetime, maximal datetime)
        """
        return self.get_period_meta(kind="last_imp", all=all)

    def _get_sql_nbs_elev_order(self, p_elev=None):
        """Get the sql part for the elevation order.

        Needs to have stat_row defined, e.g. with the following statement:
        WITH stat_row AS (SELECT * FROM meta_{para} WHERE station_id={stid})
        """
        if p_elev is not None:
            if len(p_elev) != 2:
                raise ValueError("p_elev must be a tuple of length 2 or None")
            return f"""*(1+power(
                        abs(stationshoehe - (SELECT stationshoehe FROM stat_row))
                        /{p_elev[0]}::float,
                        {p_elev[1]}::float))"""
        else:
            return ""

    def get_neighboor_stids(self, n=5, only_real=True, p_elev=None, period=None, **kwargs):
        """Get a list with Station Ids of the nearest neighbor stations.

        Parameters
        ----------
        n : int, optional
            The number of stations to return.
            If None, then all the possible stations are returned.
            The default is 5.
        only_real: bool, optional
            Should only real stations be considered?
            If False, virtual stations are also part of the result.
            The default is True.
        p_elev : tuple of float or None, optional
            The parameters (P_1, P_2) to weight the height differences between stations.
            The elevation difference is considered with the formula from LARSIM (equation 3-18 & 3-19 from the LARSIM manual [1]_ ):

            .. math::

                L_{weighted} = L_{horizontal} * (1 + (\\frac{|\\delta H|}{P_1})^{P_2})

            If None, then the height difference is not considered and only the nearest stations are returned.
            The default is None.
        period : TimestampPeriod or None, optional
            The period for which the nearest neighbors are returned.
            The neighbor station needs to have raw data for at least one half of the period.
            If None, then the availability of the data is not checked.
            The default is None.

        Returns
        -------
        list of int
            A list of station Ids in order of distance.
            The closest station is the first in the list.

        References
        ----------
        .. [1] LARSIM Dokumentation, last check on 06.04.2023, online available under https://www.larsim.info/dokumentation/LARSIM-Dokumentation.pdf
        """
        self._check_isin_meta()

        sql_dict = dict(
            cond_only_real="AND is_real" if only_real else "",
            stid=self.id, para=self._para, n=n,
            add_meta_rows="", cond_period="", mul_elev_order="")

        # Elevation parts
        if p_elev is not None:
            if len(p_elev) != 2:
                raise ValueError("p_elev must be a tuple of length 2 or None")
            sql_dict.update(dict(
                add_meta_rows=", stationshoehe",
                mul_elev_order=self._get_sql_nbs_elev_order(p_elev=p_elev)
            ))

        # period parts
        if period is not None:
            if not isinstance(period, TimestampPeriod):
                period = TimestampPeriod(*period)
            days = period.get_interval().days
            tmstp_mid = period.get_middle()
            sql_dict.update(dict(
                cond_period=f""" AND (raw_until - raw_from > '{np.round(days/2)} days'::INTERVAL
                    AND (raw_from <= '{tmstp_mid.strftime("%Y%m%d")}'::timestamp
                        AND raw_until >= '{tmstp_mid.strftime("%Y%m%d")}'::timestamp)) """
            ))

        # create sql statement
        sql_nearest_stids = """
            WITH stat_row AS (
                SELECT geometry_utm {add_meta_rows}
                FROM meta_{para} WHERE station_id={stid}
            )
            SELECT station_id
            FROM meta_{para}
            WHERE station_id != {stid} {cond_only_real} {cond_period}
            ORDER BY ST_DISTANCE(geometry_utm,(SELECT geometry_utm FROM stat_row))
                {mul_elev_order}
            LIMIT {n};
            """.format(**sql_dict)

        if "return_sql" in kwargs and kwargs["return_sql"]:
            return sql_nearest_stids

        with db_engine.connect() as con:
            result = con.execute(sqltxt(sql_nearest_stids))
            nearest_stids = [res[0] for res in result.all()]
        return nearest_stids

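    # A minimal usage sketch (not part of the original module): the five
    # closest stations, weighting the elevation difference with the LARSIM
    # formula above (station id and P_1/P_2 values are illustrative
    # assumptions):
    #
    #     stat = StationT(1234)
    #     nbs = stat.get_neighboor_stids(n=5, p_elev=(250, 1.5))
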
    def get_multi_annual_raster(self):
        """Get the multi annual raster value(s) for this station.

        Returns
        -------
        list or None
            The corresponding multi annual value.
            For T and ET the yearly value is returned.
            For N the winter and summer half yearly sums are returned as a tuple.
            The returned unit is mm or °C.
        """
        sql_select = sa\
            .select(StationMARaster.term,
                    StationMARaster.value)\
            .where(sa.and_(
                StationMARaster.station_id == self.id,
                StationMARaster.raster_key == self._ma_raster_key,
                StationMARaster.parameter == self._para_base,
                StationMARaster.term.in_(self._ma_terms)))
        with db_engine.session() as session:
            res = session.execute(sql_select).all()

        # update ma values if no result returned
        if len(res) == 0:
            self.update_ma_raster()
            with db_engine.session() as session:
                res = session.execute(sql_select).all()

        if len(res) == 0:
            return None
        else:
            res_dict = dict(res)
            return [float(np.round(res_dict[term] / self._decimals, self._decimals//10))
                    for term in self._ma_terms]

    def get_ma_raster(self):
        """Wrapper for `get_multi_annual_raster`."""
        return self.get_multi_annual_raster()

    def _get_raster_value(self, raster_conf, bands="all", dist=0):
        """Get the value of a raster file for this station.

        Parameters
        ----------
        raster_conf : dict or configparser.SectionProxy
            The configuration of the raster file.
        bands : str or int or list of str or int, optional
            The band to get the value from.
            If "all" then all bands are returned.
            If int, then the band with the respective number is returned.
            If str, then the band is first checked to be a key in the raster_conf and then this band is returned.
            If not a key in raster_conf, then the name is checked against the raster band names and this band is returned.
            The default is "all".
        dist : int, optional
            The distance to the station in the rasters CRS.
            Only works for rasters with projected CRS.
            The default is 0.

        Returns
        -------
        numpy.array of int or float or np.nan
            The rasters value at the stations position.

        Raises
        ------
        ValueError
            If the raster is not in a projected coordinate system and dist > 0.
        """
        file = Path(raster_conf["file"])
        with rio.open(file) as src:
            # get the CRS
            src_srid = src.crs.to_epsg()
            if src_srid is None:
                src_srid = raster_conf["srid"]

            # get the station geom
            stat_geom = self.get_geom(crs=src_srid)

            # get the bands indexes
            if isinstance(bands, str) and bands.lower() == "all":
                indexes = src.indexes
            else:
                if not isinstance(bands, list):
                    bands = [bands]
                indexes = []
                for band in bands:
                    if isinstance(band, int) and (band in src.indexes):
                        indexes.append(band)
                    else:
                        if band in raster_conf:
                            band = raster_conf[band]
                        if band in src.descriptions:
                            indexes.append(src.descriptions.index(band)+1)
                        else:
                            raise ValueError(
                                f"The band {band} is not in the raster file {file}.")

            # get the value
            if dist == 0:
                return list(
                    src.sample(stat_geom.coords, indexes=indexes, masked=True)
                )[0].astype(np.float32).filled(np.nan)
            else:
                proj_srid = pyproj.CRS.from_epsg(
                    config.get("weatherdb", "RASTER_BUFFER_CRS"))
                if not proj_srid.is_projected:
                    raise ValueError(textwrap.dedent(
                        """Buffering the stations position to get raster values for nearby raster cells is only allowed for projected rasters.
                        Please update the RASTER_BUFFER_CRS in the weatherdb section of the config file to a projected CRS."""))
                tr_to = pyproj.Transformer.from_proj(src_srid, proj_srid, always_xy=True)
                tr_back = pyproj.Transformer.from_proj(proj_srid, src_srid, always_xy=True)
                buf_geom = shapely.ops.transform(
                    tr_back.transform,
                    shapely.ops.transform(tr_to.transform, stat_geom).buffer(dist),
                )

                return np.array([zonal_stats(
                        buf_geom,
                        file,
                        band_num=band_num,
                        stats=["mean"],
                        all_touched=True
                        )[0]["mean"]
                    for band_num in indexes], dtype=np.float32)

    def get_coef(self, other_stid, in_db_unit=False):
        """Get the regionalisation coefficients relative to another station.

        Those are based on the values from the dwd grid, HYRAS or REGNIE grids.

        Parameters
        ----------
        other_stid : int
            The Station Id of the other station from which to regionalise for the own station.
        in_db_unit : bool, optional
            Should the coefficients be returned in the unit as stored in the database?
            This is only relevant for the temperature.
            The default is False.

        Returns
        -------
        list of floats or None
            A list of coefficients.
            For T, ET and N-daily only the yearly coefficient is returned.
            For N the winter and summer half yearly coefficients are returned as a tuple.
            None is returned if either the own or the other station's multi-annual value is not available.
        """
        ma_values = self.get_multi_annual_raster()
        other_stat = self.__class__(other_stid)
        other_ma_values = other_stat.get_multi_annual_raster()

        if other_ma_values is None or ma_values is None:
            return None
        else:
            if self._coef_sign[0] == "/":
                return [own/other for own, other in zip(ma_values, other_ma_values)]
            elif self._coef_sign[0] == "-":
                if in_db_unit:
                    return [int(np.round((own-other)*self._decimals))
                            for own, other in zip(ma_values, other_ma_values)]
                else:
                    return [own-other for own, other in zip(ma_values, other_ma_values)]
            else:
                return None

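    # A minimal usage sketch (not part of the original module; station ids are
    # hypothetical): for temperature the coefficients are differences of the
    # multi annual raster values, for precipitation they are ratios:
    #
    #     stat = StationT(1234)
    #     coefs = stat.get_coef(other_stid=5678)  # list with one yearly coefficient, or None
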
    def get_df(self, kinds, period=(None, None), agg_to=None,
               nas_allowed=True, add_na_share=False, db_unit=False,
               sql_add_where=None, **kwargs):
        """Get a timeseries DataFrame from the database.

        Parameters
        ----------
        kinds : str or list of str
            The data kinds to get.
            Must be a column in the timeseries DB.
            Must be one of "raw", "qc", "filled", "adj", "filled_by", "filled_share".
            For the precipitation also "qn" and "corr" are valid.
            If "filled_by" is given together with an aggregation step, the "filled_by" is replaced by the "filled_share".
            The "filled_share" gives the share of filled values in the aggregation group in percent.
        period : TimestampPeriod or (tuple or list of datetime.datetime or None), optional
            The minimum and maximum Timestamp for which to get the timeseries.
            If None is given, the maximum or minimal possible Timestamp is taken.
            The default is (None, None).
        agg_to : str or None, optional
            Aggregate to a given timespan.
            If more than 20% of missing values are in the aggregation group, the aggregated value will be None.
            Can be anything smaller than the maximum timespan of the saved data.
            If a timeperiod smaller than the saved data is given, then the maximum possible timeperiod is returned.
            For T and ET it can be "month", "year".
            For N it can also be "hour".
            If None, then the maximum timeperiod is taken.
            The default is None.
        nas_allowed : bool, optional
            Should NAs be allowed?
            If True, then the maximum possible period is returned, even if there are NAs in the timeserie.
            If False, then the minimal filled period is returned.
            The default is True.
        add_na_share : bool, optional
            Should one or several columns be added to the Dataframe with the share of NAs in the data?
            This is especially important when the stations data get aggregated, because the aggregation doesn't make sense if there are a lot of NAs in the original data.
            If True, one column per asked kind is added with the respective share of NAs, if the aggregation step is not the smallest.
            The "kind"_na_share column is in percentage.
            The default is False.
        db_unit : bool, optional
            Should the result be in the Database unit?
            If False the unit is getting converted to a normal unit, like mm or °C.
            The numbers are saved as integers in the database and are therefore multiplied by 10 or 100 to get to an integer.
            The default is False.
        sql_add_where : str or None, optional
            additional sql where statement to filter the output.
            E.g. "EXTRACT(MONTH FROM timestamp) = 2"
            The default is None.

        Returns
        -------
        pandas.DataFrame
            The timeserie Dataframe with a DatetimeIndex.
        """
        # check if existing
        if not self.isin_db():
            return None

        # check if adj
        if "adj" in kinds:
            adj_df = self.get_adj(
                period=period, agg_to=agg_to,
                nas_allowed=nas_allowed, add_na_share=add_na_share)
            if len(kinds) == 1:
                return adj_df
            else:
                kinds.remove("adj")

        # check kinds and period
        if "filled_share" in kinds:
            add_filled_share = True
            kinds.remove("filled_share")
        else:
            add_filled_share = False
        kinds = self._check_kinds(kinds=kinds)
        if not ("_skip_period_check" in kwargs and kwargs["_skip_period_check"]):
            period = self._check_period(
                period=period, kinds=kinds, nas_allowed=nas_allowed)

        if period.is_empty() and not nas_allowed:
            return None

        # aggregating?
        timestamp_col = "timestamp"
        group_by = ""
        agg_to = self._check_agg_to(agg_to)
        if agg_to is not None:
            if "filled_by" in kinds:
                warnings.warn(
                    f"""You selected a filled_by column, but did not select the smallest aggregation (agg_to={self._min_agg_to}).
                    The filled_by information is only reasonable when using the original time frequency.
                    Therefore the filled_by column is not returned, but instead the filled_share.
                    This column gives the percentage of the filled fields in the aggregation group.""")
                kinds.remove("filled_by")
                add_filled_share = True

            # create sql parts
            kinds_before = kinds.copy()
            kinds = []
            for kind in kinds_before:
                if re.search(r".*(_min)|(_max)", kind):
                    agg_fun = "MIN" if re.search(r".*_min", kind) else "MAX"
                else:
                    agg_fun = self._agg_fun
                kinds.append(
                    f"CASE WHEN (COUNT(\"{kind}\")/COUNT(*)::float)>0.8 " +
                    f"THEN ROUND({agg_fun}({kind}), 0) ELSE NULL END AS {kind}")

            timestamp_col = "date_trunc('{agg_to}', timestamp)".format(
                agg_to=agg_to)
            group_by = "GROUP BY " + timestamp_col
            if agg_to in ["day", "month", "year", "decade"]:
                timestamp_col += "::date"

            # add the filled_share if needed
            if add_filled_share:
                kinds.append(
                    'COUNT("filled_by")::float/COUNT(*)::float*100 as filled_share')

            # raise warning, when NA_share should get added
            if any([kind in ["raw", "qc"] for kind in kinds]) and not add_na_share:
                warnings.warn(
                    "You aggregate a column that can contain NAs (e.g. \"raw\" or \"qc\")\n" +
                    "This can result in strange values, because one aggregation group can contain many NAs.\n" +
                    "To suppress this warning and to consider this effect please use add_na_share=True in the parameters.")

            # create na_share columns
            if add_na_share:
                for kind in kinds_before:
                    kinds.append(f"(COUNT(*)-COUNT(\"{kind}\"))/COUNT(*)::float * 100 AS {kind}_na_share")

        # sql_add_where
        if sql_add_where:
            if "and" not in sql_add_where.lower():
                sql_add_where = " AND " + sql_add_where
        else:
            sql_add_where = ""

        # create base sql
        sql = """
            SELECT {timestamp_col} as timestamp, {kinds}
            FROM timeseries."{stid}_{para}"
            WHERE timestamp BETWEEN {min_tstp} AND {max_tstp}{sql_add_where}
            {group_by}
            ORDER BY timestamp ASC;
            """.format(
            stid=self.id,
            para=self._para,
            kinds=', '.join(kinds),
            group_by=group_by,
            timestamp_col=timestamp_col,
            sql_add_where=sql_add_where,
            **period.get_sql_format_dict(
                format="'{}'".format(self._tstp_format_db))
        )

        if "return_sql" in kwargs and kwargs["return_sql"]:
            return sql

        with db_engine.connect() as con:
            df = pd.read_sql(
                sqltxt(sql),
                con=con,
                index_col="timestamp")

        # convert filled_by to Int16, pandas Integer with NA support
        if "filled_by" in kinds and df["filled_by"].dtype != object:
            df["filled_by"] = df["filled_by"].astype("Int16")

        # change index to pandas DatetimeIndex if necessary
        if not isinstance(df.index, pd.DatetimeIndex):
            df.set_index(pd.DatetimeIndex(df.index), inplace=True)

        # set timezone to UTC
        df.index = df.index.tz_localize("UTC")

        # change to normal unit
        if not db_unit:
            change_cols = [
                col for col in df.columns
                if col not in self._kinds_not_decimal and "_na_share" not in col]
            df[change_cols] = df[change_cols] / self._decimals

        # check if adj should be added:
        if "adj_df" in locals():
            df = df.join(adj_df)

        return df

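    # A minimal usage sketch (not part of the original module; station id and
    # period are illustrative assumptions):
    #
    #     stat = StationP(1234)
    #     df = stat.get_df(
    #         kinds=["raw", "filled"],
    #         period=("2010-01-01", "2020-12-31"),
    #         agg_to="month",
    #         add_na_share=True)
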
    def get_raw(self, **kwargs):
        """Get the raw timeserie.

        Parameters
        ----------
        **kwargs : dict, optional
            The keyword arguments get passed to the get_df function.
            Possible parameters are "period", "agg_to" or "nas_allowed".

        Returns
        -------
        pd.DataFrame
            The raw timeserie for this station and the given period.
        """
        return self.get_df(kinds="raw", **kwargs)

    def get_qc(self, **kwargs):
        """Get the quality checked timeserie.

        Parameters
        ----------
        **kwargs : dict, optional
            The keyword arguments get passed to the get_df function.
            Possible parameters are "period", "agg_to" or "nas_allowed".

        Returns
        -------
        pd.DataFrame
            The quality checked timeserie for this station and the given period.
        """
        return self.get_df(kinds="qc", **kwargs)

    def get_dist(self, period=(None, None)):
        """Get the timeserie with the information from which station the data got filled and the corresponding distance to this station.

        Parameters
        ----------
        period : TimestampPeriod or (tuple or list of datetime.datetime or None), optional
            The minimum and maximum Timestamp for which to get the timeserie.
            If None is given, the maximum or minimal possible Timestamp is taken.
            The default is (None, None).

        Returns
        -------
        pd.DataFrame
            The timeserie for this station and the given period with the station_id and the distance in meters to the station from which the data got filled.
        """
        period = self._check_period(period, kinds=["filled"])

        sql = """
            WITH dist AS (
                SELECT
                    station_id,
                    round(ST_DISTANCE(
                        geometry_utm,
                        (SELECT geometry_utm FROM meta_{para}
                         WHERE station_id = {stid})
                    )) AS distance
                FROM meta_{para}
                WHERE station_id!={stid}
            )
            SELECT timestamp, filled_by, distance
            FROM timeseries."{stid}_{para}"
            LEFT JOIN dist ON filled_by=station_id
            WHERE timestamp BETWEEN {min_tstp} AND {max_tstp};""".format(
            stid=self.id,
            para=self._para,
            **period.get_sql_format_dict(
                format="'{}'".format(self._tstp_format_db)
            )
        )

        with db_engine.connect() as con:
            df = pd.read_sql(
                sqltxt(sql),
                con=con,
                index_col="timestamp")

        # change index to pandas DatetimeIndex if necessary
        if not isinstance(df.index, pd.DatetimeIndex):
            df.set_index(pd.DatetimeIndex(df.index), inplace=True)

        return df

    def get_filled(self, period=(None, None), with_dist=False, **kwargs):
        """Get the filled timeserie.

        Either only the timeserie is returned or also the id of the station from which the station data got filled, together with the distance to this station in m.

        Parameters
        ----------
        period : TimestampPeriod or (tuple or list of datetime.datetime or None), optional
            The minimum and maximum Timestamp for which to get the timeserie.
            If None is given, the maximum or minimal possible Timestamp is taken.
            The default is (None, None).
        with_dist : bool, optional
            Should the distance to the stations from which the timeseries got filled be added?
            The default is False.

        Returns
        -------
        pd.DataFrame
            The filled timeserie for this station and the given period.
        """
        df = self.get_df(period=period, kinds="filled", **kwargs)

        # should the distance information get added
        if with_dist:
            df = df.join(self.get_dist(period=period))

        return df

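    # A minimal usage sketch (not part of the original module; the station id
    # is a hypothetical example):
    #
    #     stat = StationP(1234)
    #     df = stat.get_filled(with_dist=True)
    #     # -> columns "filled", "filled_by" and "distance" (in m)
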
    def get_adj(self, **kwargs):
        """Get the adjusted timeserie.

        The timeserie is adjusted to the multi annual mean.
        So the overall mean of the given period will be the same as the multi annual mean.

        Parameters
        ----------
        **kwargs : dict, optional
            The keyword arguments are passed to the get_df function.
            Possible parameters are "period", "agg_to" or "nas_allowed".

        Returns
        -------
        pandas.DataFrame
            A timeserie with the adjusted data.
        """
        # this is only the first part of the method
        # get basic values
        main_df = self.get_df(
            kinds=["filled"], # not best, as the ma values are not richter corrected
            **kwargs)
        ma = self.get_multi_annual_raster()

        # create empty adj_df
        adj_df = pd.DataFrame(
            columns=["adj"],
            index=main_df.index,
            dtype=main_df["filled"].dtype)

        return main_df, adj_df, ma # the rest must get implemented in the subclasses

def plot(self, period=(None, None), kind="filled", agg_to=None, **kwargs):
|
2723
|
+
"""Plot the data of this station.
|
2724
|
+
|
2725
|
+
Parameters
|
2726
|
+
----------
|
2727
|
+
period : TimestampPeriod or (tuple or list of datetime.datetime or None), optional
|
2728
|
+
The minimum and maximum Timestamp for which to get the timeseries.
|
2729
|
+
If None is given, the maximum or minimal possible Timestamp is taken.
|
2730
|
+
The default is (None, None).
|
2731
|
+
kind : str, optional
|
2732
|
+
The data kind to plot.
|
2733
|
+
Must be a column in the timeseries DB.
|
2734
|
+
Must be one of "raw", "qc", "filled", "adj".
|
2735
|
+
For the precipitation also "qn" and "corr" are valid.
|
2736
|
+
The default is "filled.
|
2737
|
+
agg_to : str or None, optional
|
2738
|
+
Aggregate to a given timespan.
|
2739
|
+
Can be anything smaller than the maximum timespan of the saved data.
|
2740
|
+
If a Timeperiod smaller than the saved data is given, than the maximum possible timeperiod is returned.
|
2741
|
+
For T and ET it can be "month", "year".
|
2742
|
+
For N it can also be "hour".
|
2743
|
+
If None than the maximum timeperiod is taken.
|
2744
|
+
The default is None.
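
        Examples
        --------
        A minimal usage sketch (hypothetical station id; assumes a configured
        database connection and an installed plotting backend)::

            from weatherdb.station import StationT

            StationT(3770).plot(
                period=("2000-01-01", "2005-12-31"),
                kind="filled", agg_to="month")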
        """
        kinds = []
        if "kinds" in kwargs:
            for kind in kwargs["kinds"]:
                if kind not in kinds:
                    kinds.append(kind)
            kwargs.pop("kinds")
        else:
            kinds = [kind]

        df = self.get_df(kinds=kinds, period=period, db_unit=False, agg_to=agg_to)

        df.plot(
            xlabel="Datum", ylabel=self._unit,
            title="{para_long} Station {stid}".format(
                para_long=self._para_long,
                stid=self.id),
            **kwargs
        )


class StationCanVirtualBase(StationBase):
    """A class to add the methods for stations that can also be virtual.

    Virtual means that there is no real DWD station with measurements.
    But to have data for every parameter at every 10 min precipitation station location, it is necessary to add stations and fill the gaps with data from neighbors."""

    def _check_isin_meta(self):
        """Check if the station is in the meta table and if not create a virtual station.

        Raises:
            NotImplementedError:
                If the station ID is neither a real station nor in the precipitation meta table.

        Returns:
            bool: True if the station check was successful.
        """
        if self.isin_meta():
            if self.isin_db():
                return True
            else:
                self._create_timeseries_table()
                return True
        elif self.isin_meta_p():
            self._create_meta_virtual()
            self._create_timeseries_table()
            return True
        raise NotImplementedError(f"""
            The given {self._para_long} station with id {self.id} is not in the corresponding meta table
            and not in the precipitation meta table in the DB""")

    def _create_meta_virtual(self):
        """Create a virtual station in the meta table, for stations that have no real data.

        This only works if a corresponding station is in the precipitation stations meta table.
        """
        sql = """
            INSERT INTO meta_{para}(
                station_id, is_real, geometry, geometry_utm,
                stationshoehe, stationsname, bundesland)
            (SELECT station_id, false, geometry, geometry_utm,
                    stationshoehe, stationsname, bundesland
             FROM meta_p
             WHERE station_id = {stid})
            """.format(stid=self.id, para=self._para)

        with db_engine.connect() as con:
            con.execute(sqltxt(sql))
            con.commit()

    def isin_meta_p(self):
        """Check if the station is in the precipitation meta table.

        Returns
        -------
        bool
            True if the station is in the precipitation meta table.
        """
        with db_engine.connect() as con:
            result = con.execute(sqltxt(
                f"SELECT {self.id} in (SELECT station_id FROM meta_p);"))
            return result.first()[0]

    def quality_check(self, period=(None, None), **kwargs):
        if not self.is_virtual():
            return super().quality_check(period=period, **kwargs)
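        # virtual stations are skipped here (implicitly returning None),
        # as they have no raw measurements to check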


class StationTETBase(StationCanVirtualBase):
    """A base class for T and ET.

    This class adds methods that are only used by temperature and evapotranspiration stations.
    """
    # timestamp configurations
    _tstp_format_db = "%Y%m%d"
    _tstp_format_human = "%Y-%m-%d"
    _tstp_dtype = "date"
    _interval = "1 day"

    # aggregation
    _min_agg_to = "day"

    def get_neighboor_stids(self, p_elev=(250, 1.5), **kwargs):
        """Get the 5 nearest stations to this station.

        Parameters
        ----------
        p_elev : tuple, optional
            The two parameters $(P_1, P_2)$ that weight the elevation difference against the horizontal distance when ordering the neighbor stations.
            In LARSIM those parameters are defined as $P_1 = 500$ and $P_2 = 1$.
            Stoelzle et al. (2016) found that $P_1 = 100$ and $P_2 = 4$ is better for Baden-Württemberg to account for the quick changes in topography.
            For all of Germany, those parameter values give too much weight to the elevation difference, which can result in getting neighbor stations from the border of the Czech Republic for the Feldberg station. Therefore the values $P_1 = 250$ and $P_2 = 1.5$ are used as default values.
            A sketch of the weighting idea is given in the Examples section below.
            literature:
                - Stoelzle, Michael & Weiler, Markus & Steinbrich, Andreas. (2016) Starkregengefährdung in Baden-Württemberg – von der Methodenentwicklung zur Starkregenkartierung. Tag der Hydrologie.
                - LARSIM Dokumentation, as of 06.04.2023, available online at https://www.larsim.info/dokumentation/LARSIM-Dokumentation.pdf
            The default is (250, 1.5).

        Returns
        -------
        list of int
            The station ids of the 5 nearest stations.
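
        Examples
        --------
        A sketch of the elevation weighting idea (hypothetical helper for
        illustration only; the real ordering is built as SQL in
        ``_get_sql_nbs_elev_order`` and may differ in its exact form):

        >>> def effective_dist(dist_m, delta_elev_m, p_elev=(250, 1.5)):
        ...     P1, P2 = p_elev
        ...     # horizontal distance plus a penalty growing with the elevation difference
        ...     return dist_m + P1 * (abs(delta_elev_m) / 100) ** P2
        >>> # a station 10 km away but 500 m higher ranks worse than one 12 km away
        >>> effective_dist(10_000, 500) > effective_dist(12_000, 0)
        True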
        """
        # define the P1 and P2 default values for T and ET
        return super().get_neighboor_stids(p_elev=p_elev, **kwargs)

    def _get_sql_near_median(self, period, only_real=True,
                             extra_cols=None, add_is_winter=False):
        """Get the SQL statement for the median of the 5 nearest stations.

        The result has the columns timestamp, nbs_median and raw (the original raw value).

        Parameters
        ----------
        period : TimestampPeriod or (tuple or list of datetime.datetime or None), optional
            The minimum and maximum Timestamp for which to get the median of the nearest stations.
        only_real : bool, optional
            Should only real stations get considered?
            If False, virtual stations are also part of the result.
            The default is True.
        extra_cols : str or None, optional
            Should there be additional columns in the result?
            Should be a sql-string for the SELECT part.
            If None, then there are no additional columns.
            The default is None.
        add_is_winter : bool, optional
            Should there be a column ("winter") that indicates if the value is in winter?
            The default is False.

        Returns
        -------
        str
            SQL statement for the regionalised median of the 5 nearest stations.
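
        Notes
        -----
        One per-group subquery roughly takes the following shape (a sketch
        with hypothetical station ids 11 and 12; the regionalisation terms
        depend on ``self._coef_sign`` and ``get_coef``, so the ``*1.1`` and
        ``*0.9`` factors are assumptions for illustration)::

            SELECT timestamp,
                (SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY T.c)
                 FROM (VALUES (ts1.raw*1.1), (ts2.raw*0.9)) T (c)
                ) as nbs_median
            FROM timeseries."11_t" ts1
            FULL OUTER JOIN timeseries."12_t" ts2 USING (timestamp)
            WHERE timestamp BETWEEN '20000101'::date AND '20001231'::date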
        """
        # get the neighboring stations for every year
        start_year = period.start.year
        end_year = period.end.year
        nbs = pd.DataFrame(
            index=pd.Index(range(start_year, end_year+1), name="years"),
            columns=["near_stids"], dtype=object)
        nbs_stids_all = set()
        now = pd.Timestamp.now()
        for year in nbs.index:
            if year == now.year:
                y_period = TimestampPeriod(f"{year}-01-01", now.date())
            else:
                y_period = TimestampPeriod(f"{year}-01-01", f"{year}-12-31")
            nbs_i = self.get_neighboor_stids(period=y_period, only_real=only_real)
            nbs_stids_all = nbs_stids_all.union(nbs_i)
            nbs.loc[year, "near_stids"] = nbs_i

        # add a grouping column if the stids are the same as in the year before
        before = None
        group_i = 1
        for year, row in nbs.iterrows():
            if before is None:
                before = row["near_stids"]
            if before != row["near_stids"]:
                group_i += 1
                before = row["near_stids"]
            nbs.loc[year, "group"] = group_i

        # aggregate if the neighbors are the same
        nbs["start"] = nbs.index
        nbs["end"] = nbs.index
        nbs = nbs.groupby(nbs["group"])\
            .agg({"near_stids": "first", "start": "min", "end": "max"})\
            .set_index(["start", "end"])
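        # e.g. if the years 2000-2002 share the neighbors [42, 7] and 2003-2004
        # share [42, 13] (hypothetical ids), nbs now has two rows indexed
        # (2000, 2002) and (2003, 2004), so only two subqueries get built below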

        # get the coefficients for the regionalisation from the neighbor stations
        coefs = pd.Series(
            index=nbs_stids_all,
            data=[self.get_coef(other_stid=near_stid, in_db_unit=True)
                  for near_stid in nbs_stids_all]
            ).fillna("NULL")\
            .apply(lambda x: x[0] if isinstance(x, list) else x)\
            .astype(str)

        # check the extra cols to be in the right format
        if extra_cols and len(extra_cols) > 0:
            if extra_cols[0] != ",":
                extra_cols = ", " + extra_cols
        else:
            extra_cols = ""

        # create the sql for the winter column
        if add_is_winter:
            sql_is_winter_col = ", EXTRACT(MONTH FROM ts.timestamp) in (1,2,3,10,11,12) AS winter"
        else:
            sql_is_winter_col = ""

        # create the year subqueries for the near stations median
        sql_near_median_parts = []
        for (start, end), row in nbs.iterrows():
            period_part = TimestampPeriod(f"{start}-01-01", f"{end}-12-31")
            near_stids = row["near_stids"]

            # create the sql for the median of the near stations
            sql_near_median_parts.append("""
                SELECT timestamp,
                    (SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY T.c)
                     FROM (VALUES {reg_vals}) T (c)
                    ) as nbs_median
                FROM timeseries."{near_stids[0]}_{para}" ts1
                {near_joins}
                WHERE timestamp BETWEEN {min_tstp}::{tstp_dtype} AND {max_tstp}::{tstp_dtype}
                """.format(
                    para=self._para,
                    near_stids=near_stids,
                    reg_vals=", ".join(
                        [f"(ts{i+1}.raw{self._coef_sign[1]}{coef})"
                         for i, coef in enumerate(coefs[near_stids].values)]),
                    near_joins="\n".join(
                        [f"FULL OUTER JOIN timeseries.\"{near_stid}_{self._para}\" ts{i+1} USING (timestamp)"
                         for i, near_stid in enumerate(near_stids)
                         if i > 0]),
                    tstp_dtype=self._tstp_dtype,
                    **period_part.get_sql_format_dict()))

        # create the sql for the median of the near stations and the raw value itself for the total period
        sql_near_median = """SELECT ts.timestamp, nbs_median, ts.raw as raw {extra_cols}{is_winter_col}
            FROM timeseries."{stid}_{para}" AS ts
            LEFT JOIN ({sql_near_parts}) nbs
                ON ts.timestamp=nbs.timestamp
            WHERE ts.timestamp BETWEEN {min_tstp}::{tstp_dtype} AND {max_tstp}::{tstp_dtype}
            ORDER BY timestamp ASC"""\
            .format(
                stid=self.id,
                para=self._para,
                sql_near_parts=" UNION ".join(sql_near_median_parts),
                tstp_dtype=self._tstp_dtype,
                extra_cols=extra_cols,
                is_winter_col=sql_is_winter_col,
                **period.get_sql_format_dict())

        return sql_near_median

    def _get_sql_nbs_elev_order(self, p_elev=(250, 1.5)):
        """Set the default P values. See get_neighboor_stids for more information."""
        return super()._get_sql_nbs_elev_order(p_elev=p_elev)

    def fillup(self, p_elev=(250, 1.5), **kwargs):
        """Set the default P values. See get_neighboor_stids for more information."""
        return super().fillup(p_elev=p_elev, **kwargs)

    def _sql_fillup_extra_dict(self, **kwargs):
        sql_extra_dict = super()._sql_fillup_extra_dict(**kwargs)
        if "p_elev" in kwargs:
            sql_extra_dict.update(dict(
                mul_elev_order=self._get_sql_nbs_elev_order(p_elev=kwargs["p_elev"])))
        else:
            sql_extra_dict.update(dict(
                mul_elev_order=self._get_sql_nbs_elev_order()))
        return sql_extra_dict

    def get_adj(self, **kwargs):
        """Get the adjusted timeseries.

        The timeseries gets adjusted to match the multi-annual value over the given period.
        So the yearly variability is kept and only the whole period is adjusted.

        Returns
        -------
        pd.DataFrame
            The adjusted timeseries with the timestamp as index.
        """
        # this is only the second part of the method
        main_df, adj_df, ma = super().get_adj(**kwargs)

        # truncate to full years
        tstp_min = main_df.index.min()
        if tstp_min > pd.Timestamp(year=tstp_min.year, month=1, day=15, tz="UTC"):
            tstp_min = pd.Timestamp(
                year=tstp_min.year+1, month=1, day=1, tz="UTC")

        tstp_max = main_df.index.max()
        if tstp_max < pd.Timestamp(year=tstp_max.year, month=12, day=15, tz="UTC"):
            tstp_max = pd.Timestamp(
                year=tstp_max.year-1, month=12, day=31, tz="UTC")

        main_df_tr = main_df.truncate(tstp_min, tstp_max)
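        # e.g. a series starting 1991-03-04 and ending 2020-10-20 gets truncated
        # to 1992-01-01 through 2019-12-31, so only (nearly) complete years
        # enter the adjustment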

        # the rest must get implemented in the subclasses
        return main_df, adj_df, ma, main_df_tr


class StationPBase(StationBase):
    # common settings
    _decimals = 100

    # cdc dwd parameters
    _cdc_date_col = "MESS_DATUM"

    # for regionalisation
    _ma_terms = ["wihy", "suhy"]
    _ma_raster_key = "hyras"

    def get_adj(self, **kwargs):
        """Get the adjusted timeseries.

        The timeseries gets adjusted to match the multi-annual value over the given period.
        So the yearly variability is kept and only the whole period is adjusted.

        The basis for the adjusted timeseries is the filled data and not the Richter corrected data,
        as the ma values are also uncorrected values.

        Returns
        -------
        pd.DataFrame
            The adjusted timeseries with the timestamp as index.
        """
        main_df, adj_df, ma = super().get_adj(**kwargs)

        # calculate the half-yearly mean
        # suhy (summer half-year)
        suhy_months = [4, 5, 6, 7, 8, 9]
        mask_suhy = main_df.index.month.isin(suhy_months)

        main_df_suhy = main_df[mask_suhy]

        # get the minimum count of elements in the half-year
        min_count = (365//2 - 10)  # days
        if "agg_to" not in kwargs:
            if self._interval == "10 min":
                min_count = min_count * 24 * 6  # 10-minute values
        else:
            if kwargs["agg_to"] == "month":
                min_count = 6
            elif kwargs["agg_to"] == "hour":
                min_count = min_count * 24
            elif kwargs["agg_to"] == "year" or kwargs["agg_to"] == "decade":
                raise ValueError("The get_adj method does not work on yearly or decadal values.")
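        # e.g. for daily data at least 172 values and for hourly data at least
        # 172*24 = 4128 values must be present for a half-year sum to count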

        main_df_suhy_y = main_df_suhy.groupby(main_df_suhy.index.year)\
            .sum(min_count=min_count).mean()

        adj_df[mask_suhy] = (main_df_suhy * (ma[1] / main_df_suhy_y)).round(2)
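        # e.g. if the multi-annual summer value is 450 mm but the mean summer
        # sum of the filled series is 500 mm, every summer value gets scaled
        # by 450/500 = 0.9 (numbers are hypothetical)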

        # wihy (winter half-year)
        mask_wihy = ~mask_suhy
        main_df_wihy = main_df[mask_wihy]
        main_df_wihy_y = main_df_wihy.groupby(main_df_wihy.index.year)\
            .sum(min_count=min_count).mean()
        adj_df[mask_wihy] = (main_df_wihy * (ma[0] / main_df_wihy_y)).round(2)

        return adj_df