weatherdb 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. docker/Dockerfile +30 -0
  2. docker/docker-compose.yaml +58 -0
  3. docker/docker-compose_test.yaml +24 -0
  4. docker/start-docker-test.sh +6 -0
  5. docs/requirements.txt +10 -0
  6. docs/source/Changelog.md +2 -0
  7. docs/source/License.rst +7 -0
  8. docs/source/Methode.md +161 -0
  9. docs/source/_static/custom.css +8 -0
  10. docs/source/_static/favicon.ico +0 -0
  11. docs/source/_static/logo.png +0 -0
  12. docs/source/api/api.rst +15 -0
  13. docs/source/api/cli.rst +8 -0
  14. docs/source/api/weatherDB.broker.rst +10 -0
  15. docs/source/api/weatherDB.config.rst +7 -0
  16. docs/source/api/weatherDB.db.rst +23 -0
  17. docs/source/api/weatherDB.rst +22 -0
  18. docs/source/api/weatherDB.station.rst +56 -0
  19. docs/source/api/weatherDB.stations.rst +46 -0
  20. docs/source/api/weatherDB.utils.rst +22 -0
  21. docs/source/conf.py +137 -0
  22. docs/source/index.rst +33 -0
  23. docs/source/setup/Configuration.md +127 -0
  24. docs/source/setup/Hosting.md +9 -0
  25. docs/source/setup/Install.md +49 -0
  26. docs/source/setup/Quickstart.md +183 -0
  27. docs/source/setup/setup.rst +12 -0
  28. weatherdb/__init__.py +24 -0
  29. weatherdb/_version.py +1 -0
  30. weatherdb/alembic/README.md +8 -0
  31. weatherdb/alembic/alembic.ini +80 -0
  32. weatherdb/alembic/config.py +9 -0
  33. weatherdb/alembic/env.py +100 -0
  34. weatherdb/alembic/script.py.mako +26 -0
  35. weatherdb/alembic/versions/V1.0.0_initial_database_creation.py +898 -0
  36. weatherdb/alembic/versions/V1.0.2_more_charachters_for_settings+term_station_ma_raster.py +88 -0
  37. weatherdb/alembic/versions/V1.0.5_fix-ma-raster-values.py +152 -0
  38. weatherdb/alembic/versions/V1.0.6_update-views.py +22 -0
  39. weatherdb/broker.py +667 -0
  40. weatherdb/cli.py +214 -0
  41. weatherdb/config/ConfigParser.py +663 -0
  42. weatherdb/config/__init__.py +5 -0
  43. weatherdb/config/config_default.ini +162 -0
  44. weatherdb/db/__init__.py +3 -0
  45. weatherdb/db/connections.py +374 -0
  46. weatherdb/db/fixtures/RichterParameters.json +34 -0
  47. weatherdb/db/models.py +402 -0
  48. weatherdb/db/queries/get_quotient.py +155 -0
  49. weatherdb/db/views.py +165 -0
  50. weatherdb/station/GroupStation.py +710 -0
  51. weatherdb/station/StationBases.py +3108 -0
  52. weatherdb/station/StationET.py +111 -0
  53. weatherdb/station/StationP.py +807 -0
  54. weatherdb/station/StationPD.py +98 -0
  55. weatherdb/station/StationT.py +164 -0
  56. weatherdb/station/__init__.py +13 -0
  57. weatherdb/station/constants.py +21 -0
  58. weatherdb/stations/GroupStations.py +519 -0
  59. weatherdb/stations/StationsBase.py +1021 -0
  60. weatherdb/stations/StationsBaseTET.py +30 -0
  61. weatherdb/stations/StationsET.py +17 -0
  62. weatherdb/stations/StationsP.py +128 -0
  63. weatherdb/stations/StationsPD.py +24 -0
  64. weatherdb/stations/StationsT.py +21 -0
  65. weatherdb/stations/__init__.py +11 -0
  66. weatherdb/utils/TimestampPeriod.py +369 -0
  67. weatherdb/utils/__init__.py +3 -0
  68. weatherdb/utils/dwd.py +350 -0
  69. weatherdb/utils/geometry.py +69 -0
  70. weatherdb/utils/get_data.py +285 -0
  71. weatherdb/utils/logging.py +126 -0
  72. weatherdb-1.1.0.dist-info/LICENSE +674 -0
  73. weatherdb-1.1.0.dist-info/METADATA +765 -0
  74. weatherdb-1.1.0.dist-info/RECORD +77 -0
  75. weatherdb-1.1.0.dist-info/WHEEL +5 -0
  76. weatherdb-1.1.0.dist-info/entry_points.txt +2 -0
  77. weatherdb-1.1.0.dist-info/top_level.txt +3 -0
@@ -0,0 +1,1021 @@
1
+ # libraries
2
+ import warnings
3
+ import traceback
4
+ import pandas as pd
5
+ import geopandas as gpd
6
+ from shapely import wkt
7
+ import multiprocessing as mp
8
+ from multiprocessing.pool import ThreadPool
9
+ import time
10
+ import progressbar as pb
11
+ import logging
12
+ import itertools
13
+ import datetime
14
+ from sqlalchemy import text as sqltxt
15
+ import sqlalchemy as sa
16
+ import textwrap
17
+
18
+ from ..db.connections import db_engine
19
+ from ..utils.dwd import get_dwd_meta, get_cdc_file_list
20
+ from ..station.StationBases import StationBase
21
+ from ..db import models
22
+ from ..db.queries.get_quotient import _get_quotient
23
+
24
+ # set settings
25
+ # ############
26
+ try:# else I get strange errors with linux
27
+ mp.set_start_method('spawn')
28
+ except RuntimeError:
29
+ pass
30
+
31
+ __all__ = ["StationsBase"]
32
+ log = logging.getLogger(__name__)
33
+
34
+ # Base class definitions
35
+ ########################
36
+
37
+ class StationsBase:
38
+ _StationClass = StationBase
39
+ _timeout_raw_imp = 240
40
+
41
def __init__(self):
    """Initialise the stations wrapper from the attributes of its station class.

    The FTP base folder(s), the parameter name and the long parameter name
    are copied from ``self._StationClass``. For every base folder the
    "historical/" and the "recent/" sub folder are registered, in this order.

    Raises
    ------
    NotImplementedError
        If StationsBase itself is instantiated instead of one of its subclasses.
    """
    if type(self) is StationsBase:
        raise NotImplementedError(
            "The StationsBase is only a wrapper class and is not working on its own.\n"
            "Please use StationsP, StationsPD, StationsT or StationsET instead")

    # the station class may define one base folder (str) or several (list)
    ftp_folder_base = self._StationClass._ftp_folder_base
    if isinstance(ftp_folder_base, str):
        ftp_folder_base = [ftp_folder_base]
    self._ftp_folder_base = ftp_folder_base

    # create ftp_folders in order of importance
    self._ftp_folders = list(itertools.chain.from_iterable(
        (base + "historical/", base + "recent/")
        for base in self._ftp_folder_base))

    self._para = self._StationClass._para
    self._para_long = self._StationClass._para_long
57
+
58
def download_meta(self):
    """Fetch the station meta information from the CDC server.

    The meta file of every folder in ``self._ftp_folders`` is downloaded.
    Stations that are only listed in a later folder are appended. If the
    meta data has a "bis_datum" column, the period columns "von_datum" and
    "bis_datum" are widened whenever a later file reports an earlier start
    or a later end for a station.

    Returns
    -------
    geopandas.GeoDataFrame
        The meta file from the CDC server.
        If there are several meta files on the server, they are joined together.
    """
    folders = self._ftp_folders

    # the first folder is the reference, the others only complement it
    merged = get_dwd_meta(folders[0])

    for folder in folders[1:]:
        addition = get_dwd_meta(ftp_folder=folder)

        # append the stations that are not yet known
        unknown = ~addition.index.isin(merged.index)
        merged = pd.concat([merged, addition[unknown]])
        if isinstance(addition, gpd.GeoDataFrame):
            merged = gpd.GeoDataFrame(merged, crs=addition.crs)

        # without period columns there is nothing to widen
        if "bis_datum" not in merged.columns:
            continue

        merged = merged.join(
            addition[["bis_datum", "von_datum"]],
            how="left", rsuffix="_new")

        starts_earlier = merged["von_datum"] > merged["von_datum_new"]
        merged.loc[starts_earlier, "von_datum"] = \
            addition.loc[starts_earlier, "von_datum"]

        ends_later = merged["bis_datum"] < merged["bis_datum_new"]
        merged.loc[ends_later, "bis_datum"] = \
            addition.loc[ends_later, "bis_datum"]

        # remove the helper columns of the join again
        merged.drop(["von_datum_new", "bis_datum_new"], axis=1, inplace=True)

    return merged
94
+
95
@db_engine.deco_update_privilege
def update_meta(self, stids="all", **kwargs):
    """Refresh the meta table of this parameter from the CDC server.

    The "von_datum" and "bis_datum" columns of the CDC meta file are not
    written, because it is better to set the period by the filled period of
    the stations in the database. Often the CDC period is not correct.

    Parameters
    ----------
    stids: string or list of int, optional
        The Stations for which to compute.
        Can either be "all", for all possible stations
        or a list with the Station IDs.
        The default is "all".
    **kwargs : dict, optional
        Not used by this method.
    """
    log.info(f"The {self._para_long} meta table gets updated.")
    meta = self.download_meta()

    # the "Abgabe" column is discarded if the meta file has one
    if "Abgabe" in meta.columns:
        meta.drop("Abgabe", axis=1, inplace=True)

    # remove the stations that are registered as dropped for this parameter
    dropped_query = sa\
        .select(models.DroppedStations.station_id)\
        .where(models.DroppedStations.parameter == self._para)
    with db_engine.connect() as con:
        dropped_rows = con.execute(dropped_query).all()
    to_remove = [row[0] for row in dropped_rows if row[0] in meta.index]
    meta.drop(to_remove, inplace=True)

    # keep only the requested stations
    if stids != "all":
        wanted = stids if isinstance(stids, list) else [stids,]
        not_wanted = [stid for stid in meta.index if stid not in wanted]
        meta.drop(not_wanted, inplace=True)

    # to have a meta entry for every station before looping over them
    period_cols = ["von_datum", "bis_datum"]
    if all(col in meta.columns for col in period_cols):
        meta = meta.drop(period_cols, axis=1)
    self._update_db_meta(meta=meta)

    log.info(f"The {self._para_long} meta table got successfully updated.")
145
+
146
@db_engine.deco_update_privilege
def _update_db_meta(self, meta):
    """Update a meta table on the database with new DataFrame.

    Every row of ``meta`` is inserted into the ``meta_<para>`` table. If the
    station_id already exists, the given columns of this row are overwritten.
    Columns that do not exist in the database table are skipped with a warning.

    The values are handed over as bound parameters, so that characters like
    apostrophes or colons inside of the meta data can not break the statement.

    Parameters
    ----------
    meta : pandas.DataFrame
        A DataFrame with station_id as index.
    """
    # an empty DataFrame would result in an invalid INSERT statement
    if len(meta) == 0:
        log.debug("The given meta DataFrame is empty, nothing is written to the database.")
        return

    # get the columns of meta
    meta = meta.rename_axis("station_id").reset_index()
    columns = [col.lower() for col in meta.columns]
    columns = columns + ['geometry_utm'] if 'geometry' in columns else columns
    meta.rename(dict(zip(meta.columns, columns)), axis=1, inplace=True)

    # check if columns are initiated in DB
    with db_engine.connect() as con:
        columns_db = con.execute(sqltxt(
            """
            SELECT column_name
            FROM information_schema.columns
            WHERE table_name='meta_{para}';
            """.format(para=self._para)
            )).all()
        columns_db = [col[0] for col in columns_db]

    problem_cols = [col for col in columns if col not in columns_db]
    if len(problem_cols) > 0:
        warnings.warn(textwrap.dedent("""
            The meta_{para} column '{cols}' is not initiated in the database.
            This column is therefore skipped.
            Please review the DB or the code.
            """).format(
                para=self._para,
                cols=", ".join(problem_cols))
            )
        columns = [col for col in columns if col in columns_db]

    # change date columns
    for colname, col in \
            meta.select_dtypes(include="datetime64").items():
        meta.loc[:,colname] = col.dt.strftime("%Y%m%d %H:%M")

    # change geometry
    if "geometry" in meta.columns:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            meta["geometry_utm"] = meta.geometry.to_crs(25832).to_wkt()
            meta["geometry"] = meta.geometry.to_crs(4326).to_wkt()

    # change all to strings
    meta = meta.astype(str)

    # one parameter set per station,
    # the string representations of missing values become NULL
    records = [
        {col: (None if value in ("nan", "<NA>") else value)
         for col, value in zip(columns, row)}
        for row in meta.loc[:,columns].values]

    # create sql
    sql = '''
        INSERT INTO meta_{para}({columns})
        VALUES ({placeholders})
        ON CONFLICT (station_id) DO UPDATE SET
            {updates};
        '''.format(
            para=self._para,
            columns=", ".join(columns),
            placeholders=", ".join(":" + col for col in columns),
            updates=", ".join(
                '"{col}" = EXCLUDED."{col}"'.format(col=col)
                for col in columns))

    # run sql command, once for every parameter set
    with db_engine.connect() as con:
        con.execute(sqltxt(sql), records)
        con.commit()
222
+
223
@db_engine.deco_update_privilege
def update_period_meta(self, stids="all", **kwargs):
    """Update the period in the meta table of the raw data.

    Parameters
    ----------
    stids: string or list of int, optional
        The Stations for which to compute.
        Can either be "all", for all possible stations
        or a list with the Station IDs.
        The default is "all".
    **kwargs : dict, optional
        The additional keyword arguments are passed to the get_stations method
        and to the update_period_meta method of every station.

    Raises
    ------
    ValueError
        If the given stids (Station_IDs) are not all valid.
    """
    # _run_simple_loop takes the keyword arguments for the station method
    # as "kwds", handing them over as "kwargs" raised a TypeError
    self._run_simple_loop(
        stations=self.get_stations(only_real=True, stids=stids, **kwargs),
        method="update_period_meta",
        name="update period in meta",
        kwds=kwargs
    )
249
+
250
@classmethod
def get_meta_explanation(cls, infos="all"):
    """Explain the available meta fields.

    The request is delegated to the station class of this stations class.

    Parameters
    ----------
    infos : list or string, optional
        The infos you wish to get an explanation for.
        If "all" then all the available information get returned.
        The default is "all"

    Returns
    -------
    pd.Series
        a pandas Series with the information names as index and the explanation as values.
    """
    station_class = cls._StationClass
    return station_class.get_meta_explanation(infos=infos)
267
+
268
def get_meta(self,
             infos=["station_id", "filled_from", "filled_until", "geometry"],
             stids="all",
             only_real=True):
    """Get the meta Dataframe from the Database.

    Parameters
    ----------
    infos : list or str, optional
        A list of information from the meta file to return
        If "all" than all possible columns are returned, but only one geometry column.
        The default is: ["Station_id", "filled_from", "filled_until", "geometry"]
    stids: string, int or iterable of int, optional
        The Stations to return.
        Can either be "all", for all possible stations,
        a single Station ID or a list (or other iterable) with the Station IDs.
        The default is "all".
    only_real: bool, optional
        Whether only real stations are returned or also virtual ones.
        True: only stations with own data are returned.
        The default is True.

    Returns
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
        The meta DataFrame, with the station_id as index.
    """
    # make sure columns is of type list
    if isinstance(infos, str):
        if infos=="all":
            infos = self.get_meta_explanation(infos="all").index.to_list()
            if "geometry_utm" in infos:
                infos.remove("geometry_utm")
        else:
            infos = [infos]

    # check infos
    # (this builds a new list, so the default argument is never modified)
    infos = [col.lower() for col in infos]
    if "station_id" not in infos:
        infos.insert(0, "station_id")
    if "geometry" in infos and "geometry_utm" in infos:
        warnings.warn(textwrap.dedent("""\
            You selected 2 geometry columns.
            Only the geometry column with EPSG 4326 is returned"""))
        infos.remove("geometry_utm")

    # create geometry select statement
    infos_select = []
    for info in infos:
        if info in ["geometry", "geometry_utm"]:
            infos_select.append(
                f"ST_AsText({info}) as {info}")
        else:
            infos_select.append(info)

    # create sql statement
    sql = "SELECT {cols} FROM meta_{para}"\
        .format(cols=", ".join(infos_select), para=self._para)

    # collect the filter conditions
    conditions = []
    if only_real:
        conditions.append("is_real=true")
    if not (isinstance(stids, str) and stids == "all"):
        # a single ID is wrapped, every other iterable is used as it is
        if isinstance(stids, str) or not hasattr(stids, "__iter__"):
            stids = [stids,]
        conditions.append("station_id in ({stids})".format(
            stids=", ".join([str(stid) for stid in stids])))
    if conditions:
        sql += " WHERE " + " AND ".join(conditions)

    # execute queries to db
    with db_engine.connect() as con:
        meta = pd.read_sql(
            sqltxt(sql),
            con,
            index_col="station_id")

    # make datetime columns timezone aware
    meta = meta.apply(
        lambda col: col.dt.tz_localize(datetime.timezone.utc) \
            if hasattr(col, "dt") and not col.dt.tz else col)

    # change to GeoDataFrame if geometry column was selected
    for geom_col, srid in zip(["geometry", "geometry_utm"],
                              ["4326", "25832"]):
        if geom_col in infos:
            meta[geom_col] = meta[geom_col].apply(wkt.loads)
            meta = gpd.GeoDataFrame(
                meta, crs="EPSG:" + srid, geometry=geom_col)

    # strip whitespaces in string columns
    for col in meta.columns[meta.dtypes == "object"]:
        try:
            meta[col] = meta[col].str.strip()
        except (AttributeError, TypeError):
            # object columns without string values have nothing to strip
            pass

    return meta
363
+
364
def get_stations(self, only_real=True, stids="all", skip_missing_stids=False, **kwargs):
    """Get a list with all the stations as Station-objects.

    Parameters
    ----------
    only_real: bool, optional
        Whether only real stations are returned or also virtual ones.
        True: only stations with own data are returned.
        The default is True.
    stids: string, int or list of int, optional
        The Stations to return.
        Can either be "all", for all possible stations,
        a single Station ID or a list with the Station IDs.
        The default is "all".
    skip_missing_stids: bool, optional
        Should the method skip the missing stations from input stids?
        If False, then a ValueError is raised if a station is not found.
        The default is False.
    **kwargs : dict, optional
        The additional keyword arguments aren't used in this method.

    Returns
    -------
    list of Station-objects
        returns a list with the corresponding station objects,
        in the order of the meta table.

    Raises
    ------
    ValueError
        If the given stids (Station_IDs) are not all valid.
    """
    meta = self.get_meta(
        infos=["station_id"], only_real=only_real, stids=stids)

    if isinstance(stids, str) and (stids == "all"):
        return [
            self._StationClass(stid, _skip_meta_check=True)
            for stid in meta.index]

    # a single station ID is accepted here, like in get_meta
    if isinstance(stids, str) or not hasattr(stids, "__iter__"):
        stids = [stids]
    else:
        stids = list(stids)

    stations = [
        self._StationClass(stid, _skip_meta_check=True)
        for stid in meta.index
        if stid in stids]

    if not skip_missing_stids:
        # compare the IDs and not the lengths,
        # so that duplicated input IDs don't count as missing stations
        found_ids = {stat.id for stat in stations}
        missing_ids = [stid for stid in stids if stid not in found_ids]
        if missing_ids:
            raise ValueError(
                "It was not possible to create a {para_long} Station with the following IDs: {stids}".format(
                    para_long=self._para_long,
                    stids = ", ".join([str(stid) for stid in missing_ids])
                ))

    return stations
417
+
418
def get_quotient(self, kinds_num, kinds_denom, stids="all", return_as="df", **kwargs):
    """Compute the quotient of multi-annual means of two kinds, or of a timeserie and a multi annual raster value.

    $quotient = \\overline{ts}_{kind_num} / \\overline{ts}_{denom}$

    Parameters
    ----------
    kinds_num : list of str or str
        The timeseries kinds of the numerators.
        Should be one of ['raw', 'qc', 'filled'].
        For precipitation also "corr" is possible.
    kinds_denom : list of str or str
        The timeseries kinds of the denominator or the multi annual raster key.
        If the denominator is a multi annual raster key, then the result is the quotient of the timeserie and the raster value.
        Possible values are:
            - for timeserie kinds: 'raw', 'qc', 'filled' or for precipitation also "corr".
            - for raster keys: 'hyras', 'dwd' or 'regnie', depending on your defined raster files.
    stids : list of Integer or "all", optional
        The stations IDs for which to compute the quotient.
        With "all" no station filter is handed to the query.
        The default is "all".
    return_as : str, optional
        The format of the return value.
        If "df" then a pandas DataFrame is returned.
        If "json" then a list with dictionaries is returned.
        The default is "df".
    **kwargs : dict, optional
        Not used by this method.

    Returns
    -------
    pandas.DataFrame or list of dict
        The quotient of the two timeseries as DataFrame or list of dictionaries (JSON) depending on the return_as parameter.

    Raises
    ------
    ValueError
        If the input parameters were not correct.
    """
    station_class = self._StationClass

    # "all" is handed to the query as "no station filter"
    query_stids = None if stids == "all" else stids

    # validate the kinds, the denominator may additionally be a raster key
    raster_keys = {"hyras", "regnie", "dwd"}
    checked_num = station_class._check_kinds(kinds_num)
    checked_denom = station_class._check_kinds(
        kinds_denom,
        valids=station_class._valid_kinds | raster_keys)

    # query the quotient
    with db_engine.connect() as con:
        quotient = _get_quotient(
            con=con,
            stids=query_stids,
            paras=self._para,
            kinds_num=checked_num,
            kinds_denom=checked_denom,
            return_as=return_as)
    return quotient
475
+
476
def count_holes(self, stids="all", **kwargs):
    """Count holes in timeseries depending on there length.

    Parameters
    ----------
    stids: string or list of int, optional
        The Stations to return.
        Can either be "all", for all possible stations
        or a list with the Station IDs.
        The default is "all".
    **kwargs : dict, optional
        This is a list of parameters, that is supported by the StationBase.count_holes method.

        Furthermore the kwargs are passed to the get_stations method.

        possible values are:

        - weeks : list, optional
            A list of hole length to count.
            Every hole longer than the duration of weeks specified is counted.
            The default is [2, 4, 8, 12, 16, 20, 24]
        - kind : str
            The kind of the timeserie to analyze.
            Should be one of ['raw', 'qc', 'filled'].
            For N also "corr" is possible.
            Normally only "raw" and "qc" make sense, because the other timeseries should not have holes.
        - period : TimestampPeriod or (tuple or list of datetime.datetime or None), optional
            The minimum and maximum Timestamp for which to analyze the timeseries.
            If None is given, the maximum and minimal possible Timestamp is taken.
            The default is (None, None).
        - between_meta_period : bool, optional
            Only check between the respective period that is defined in the meta table.
            If "qc" is chosen as kind, then the "raw" meta period is taken.
            The default is True.
        - crop_period : bool, optional
            should the period get cropped to the maximum filled period.
            This will result in holes being ignored when they are at the end or at the beginning of the timeserie.
            If period = (None, None) is given, then this parameter is set to True.
            The default is False.

    Returns
    -------
    pandas.DataFrame
        A Pandas Dataframe, with station_id as index and one column per week.
        The numbers in the table are the amount of NA-periods longer than the respective amount of weeks.
        If no station is selected, an empty DataFrame is returned.

    Raises
    ------
    ValueError
        If the input parameters were not correct.
    """
    # check input parameters
    stations = self.get_stations(stids=stids, only_real=True, **kwargs)

    # iter stations and collect the single results
    counts = [
        station.count_holes(**kwargs)
        for station in pb.progressbar(stations, line_breaks=False)]

    # pd.concat can't handle an empty list
    if not counts:
        return pd.DataFrame(index=pd.Index([], name="station_id"))

    # concatenate once, instead of growing the DataFrame in every iteration
    return pd.concat(counts, axis=0)
542
+
543
@staticmethod
def _get_progressbar(max_value, name):
    """Create a progressbar that is already set to the value 0.

    Parameters
    ----------
    max_value : int
        The value at which the progressbar is complete.
    name : str
        The descriptive name that is shown in front of the percentage.

    Returns
    -------
    progressbar.ProgressBar
        The progressbar with a "last_station" variable to show the last treated station ID.
    """
    counter = pb.widgets.SimpleProgress(
        format=("('%(value_s)s/%(max_value_s)s')"))
    last_station = pb.widgets.DynamicMessage(
        "last_station",
        format=", last id: {formatted_value}",
        precision=4)
    widgets = [
        pb.widgets.RotatingMarker(),
        " " + name ,
        pb.widgets.Percentage(), ' ',
        counter, ' ',
        pb.widgets.Bar(min_width=80), ' ',
        pb.widgets.Timer(format='%(elapsed)s'), ' | ',
        pb.widgets.ETA(),
        last_station,
    ]

    progress = pb.ProgressBar(
        widgets=widgets,
        max_value=max_value,
        variables={"last_station": "None"},
        term_width=100,
        is_terminal=True
    )
    progress.update(0)

    return progress
568
+
569
def _run_method(self, stations, method, name, kwds=dict(),
                do_mp=True, processes=mp.cpu_count()-1, **kwargs):
    """Run methods of the given stations objects in multiprocessing/threading mode.

    Parameters
    ----------
    stations : list of station objects
        A list of station objects. Those must be children of the StationBase class.
    method : str
        The name of the method to call.
    name : str
        A descriptive name of the method to show in the progressbar.
    kwds : dict
        The keyword arguments to give to the methods
    do_mp : bool, optional
        Should the method be done in multiprocessing mode?
        If False the methods will be called in threading mode.
        Multiprocessing needs more memory and a bit more initiating time. Therefor it is only usefull for methods with a lot of computation effort in the python code.
        If the most computation of a method is done in the postgresql database, then threading is enough to speed the process up.
        The default is True.
    processes : int, optional
        The number of processes that should get started simultaneously.
        If 1 or less, then the process is computed as a simple loop, so there is no multiprocessing or threading done.
        The default is the cpu count -1.
    **kwargs : dict, optional
        Not used by this method.
    """
    log.info(
        f"{self._para_long} Stations async loop over method '{method}' started." +
        "\n" +"-"*80
    )

    if processes<=1:
        log.info(f"As the number of processes is 1 or lower, the method '{method}' is started as a simple loop.")
        self._run_simple_loop(
            stations=stations, method=method, name=name, kwds=kwds)
        return

    # progressbar
    num_stations = len(stations)
    pbar = self._get_progressbar(max_value=num_stations, name=name)

    # create pool
    if do_mp:
        try:
            pool = mp.Pool(processes=processes)
            log.debug("the multiprocessing Pool is started")
        except AssertionError:
            log.debug('daemonic processes are not allowed to have children, therefor threads are used')
            pool = ThreadPool(processes=processes)
    else:
        log.debug("the threading Pool is started")
        pool = ThreadPool(processes=processes)

    try:
        # start processes
        results = [
            pool.apply_async(getattr(stat, method), kwds=kwds)
            for stat in stations]
        pool.close()

        # check results until all finished
        # the unfinished results are kept together with their station index
        pending = dict(enumerate(results))
        while pending:
            ready = [index for index, result in pending.items()
                     if result.ready()]
            for index in ready:
                result = pending.pop(index)
                station_id = stations[index].id
                pbar.variables["last_station"] = station_id
                # get stdout and log
                header = f"""The {name} of the {self._para_long} Station with ID {station_id} finished with """
                try:
                    stdout = result.get(10)
                    if stdout is not None:
                        log.debug(f"{header}stdout:\n{stdout}")
                except Exception:
                    # an error of one station must not stop the other stations
                    log.error(f"{header}stderr:\n{traceback.format_exc()}")

            pbar.update(num_stations - len(pending))
            # only wait if there is still something to wait for
            if pending:
                time.sleep(2)

        pbar.update(num_stations)
        pool.join()
    finally:
        # stop the workers, also if the loop got interrupted by an error
        pool.terminate()
655
+
656
def _run_simple_loop(self, stations, method, name, kwds=dict()):
    """Call a method of every given station, one station after the other.

    Parameters
    ----------
    stations : list of station objects
        The station objects on which the method gets called.
    method : str
        The name of the method to call.
    name : str
        A descriptive name of the method to show in the progressbar.
    kwds : dict, optional
        The keyword arguments to give to the methods.
        The default is an empty dictionary.
    """
    separator = "-"*79
    log.info(
        f"{separator}\n{self._para_long} Stations simple loop over method '{method}' started.")

    # progressbar
    pbar = self._get_progressbar(max_value=len(stations), name=name)

    # call the method station by station
    for station in stations:
        station_method = getattr(station, method)
        station_method(**kwds)
        pbar.variables["last_station"] = station.id
        pbar.update(pbar.value + 1)
672
+
673
@db_engine.deco_update_privilege
def update_raw(self, only_new=True, only_real=True, stids="all",
               remove_nas=True, do_mp=True, **kwargs):
    """Download all stations data from CDC and upload to database.

    If the update covered all real stations, the start time of this import
    and the maximum "raw_until" of the meta table are saved in the
    parameter_variables table.

    Parameters
    ----------
    only_new : bool, optional
        Get only the files that are not yet in the database?
        If False all the available files are loaded again.
        The default is True
    only_real: bool, optional
        Whether only real stations are tried to download.
        True: only stations with a date in raw_from in meta are downloaded.
        The default is True.
    stids: string or list of int, optional
        The Stations to return.
        Can either be "all", for all possible stations
        or a list with the Station IDs.
        The default is "all".
    do_mp : bool, optional
        Should the method be done in multiprocessing mode?
        If False the methods will be called in threading mode.
        Multiprocessing needs more memory and a bit more initiating time. Therefor it is only usefull for methods with a lot of computation effort in the python code.
        If the most computation of a method is done in the postgresql database, then threading is enough to speed the process up.
        The default is True.
    remove_nas : bool, optional
        Remove the NAs from the downloaded data before updating it to the database.
        This has computational advantages.
        The default is True.
    **kwargs : dict, optional
        The additional keyword arguments for the _run_method and get_stations method

    Raises
    ------
    ValueError
        If the given stids (Station_IDs) are not all valid.
    """
    start_tstp = datetime.datetime.now()

    # get FTP file list
    ftp_file_list = get_cdc_file_list(
        ftp_folders=self._ftp_folders)

    # run the tasks in multiprocessing mode
    self._run_method(
        stations=self.get_stations(only_real=only_real, stids=stids, **kwargs),
        method="update_raw",
        name="download raw {para} data".format(para=self._para.upper()),
        kwds=dict(
            only_new=only_new,
            ftp_file_list=ftp_file_list,
            remove_nas=remove_nas),
        do_mp=do_mp, **kwargs)

    # the import time is only saved if all the real stations got updated
    do_update_period = isinstance(stids, str) and (stids == "all")
    if not do_update_period and isinstance(stids, list):
        all_stids = self.get_meta(["station_id"], stids="all", only_real=True).index
        # subset test on sets, instead of one list search per station
        do_update_period = set(all_stids).issubset(stids)

    # save start time as variable to db
    if do_update_period:
        with db_engine.connect() as con:
            con.execute(sqltxt("""
                INSERT INTO parameter_variables (parameter, start_tstp_last_imp, max_tstp_last_imp)
                VALUES ('{para}',
                        '{start_tstp}'::timestamp,
                        (SELECT max(raw_until) FROM meta_{para}))
                ON CONFLICT (parameter) DO UPDATE SET
                    start_tstp_last_imp=EXCLUDED.start_tstp_last_imp,
                    max_tstp_last_imp=EXCLUDED.max_tstp_last_imp;
            """.format(
                para=self._para,
                start_tstp=start_tstp.strftime("%Y%m%d %H:%M"))))
            con.commit()
748
+
749
@db_engine.deco_update_privilege
def last_imp_quality_check(self, stids="all", do_mp=False, **kwargs):
    """Quality check the data of the last import for the selected stations.

    Parameters
    ----------
    do_mp : bool, optional
        Should the method be done in multiprocessing mode?
        If False the methods will be called in threading mode.
        Multiprocessing needs more memory and a bit more initiating time. Therefor it is only usefull for methods with a lot of computation effort in the python code.
        If the most computation of a method is done in the postgresql database, then threading is enough to speed the process up.
        The default is False.
    stids: string or list of int, optional
        The Stations for which to compute.
        Can either be "all", for all possible stations
        or a list with the Station IDs.
        The default is "all".
    **kwargs : dict, optional
        The additional keyword arguments for the _run_method and get_stations method
    """
    real_stations = self.get_stations(only_real=True, stids=stids, **kwargs)
    task_name = f"quality check {self._para.upper()} data"

    self._run_method(
        stations=real_stations,
        method="last_imp_quality_check",
        name=task_name,
        do_mp=do_mp,
        **kwargs)
774
+
775
@db_engine.deco_update_privilege
def last_imp_fillup(self, stids="all", do_mp=False, **kwargs):
    """Do the gap filling of the last import.

    The period of the last import is requested once from the first station
    and then handed over to the last_imp_fillup method of every station.

    Parameters
    ----------
    do_mp : bool, optional
        Should the method be done in multiprocessing mode?
        If False the methods will be called in threading mode.
        Multiprocessing needs more memory and a bit more initiating time. Therefor it is only usefull for methods with a lot of computation effort in the python code.
        If the most computation of a method is done in the postgresql database, then threading is enough to speed the process up.
        The default is False.
    stids: string or list of int, optional
        The Stations for which to compute.
        Can either be "all", for all possible stations
        or a list with the Station IDs.
        The default is "all".
    **kwargs : dict, optional
        The additional keyword arguments for the _run_method and get_stations method
    """
    stations = self.get_stations(only_real=False, stids=stids, **kwargs)

    # without stations there is no period to request and nothing to fill
    if len(stations) == 0:
        log.info("The {para_long} Stations fillup of the last import is skipped, because no station was selected.".format(
            para_long=self._para_long))
        return

    period = stations[0].get_last_imp_period(all=True)
    period_log = period.strftime("%Y-%m-%d %H:%M")
    log.info("The {para_long} Stations fillup of the last import is started for the period {min_tstp} - {max_tstp}".format(
        para_long=self._para_long,
        min_tstp=period_log[0],
        max_tstp=period_log[1]))
    self._run_method(
        stations=stations,
        method="last_imp_fillup",
        name="fillup {para} data".format(para=self._para.upper()),
        kwds=dict(_last_imp_period=period),
        do_mp=do_mp,
        **kwargs)
809
+
810
@db_engine.deco_update_privilege
def quality_check(self, period=(None, None), only_real=True, stids="all",
                  do_mp=False, **kwargs):
    """Quality check the raw data of the stations for a given period.

    Parameters
    ----------
    period : tuple or list of datetime.datetime or None, optional
        The minimum and maximum timestamp of the period to check.
        If None is given, the maximum or minimal possible timestamp is taken.
        The default is (None, None).
    only_real : bool, optional
        Passed on to get_stations to restrict the station selection.
        The default is True.
    stids : str or list of int, optional
        The stations to process.
        Either "all" for every possible station
        or a list of Station IDs.
        The default is "all".
    do_mp : bool, optional
        Whether to run the station methods with multiprocessing.
        If False, threading is used instead.
        Multiprocessing needs more memory and a longer start-up time,
        so it only pays off for methods whose computation happens mostly in the Python code.
        If most of the work is done inside the PostgreSQL database, threading is enough to speed things up.
        The default is False.
    **kwargs : dict, optional
        Additional keyword arguments, forwarded to both the get_stations
        and the _run_method method.
    """
    selected = self.get_stations(only_real=only_real, stids=stids, **kwargs)
    task_name = f"quality check {self._para.upper()} data"
    # The period is handed to the station method through kwds.
    self._run_method(
        stations=selected,
        method="quality_check",
        name=task_name,
        kwds={"period": period},
        do_mp=do_mp,
        **kwargs)
842
+
843
@db_engine.deco_update_privilege
def update_ma_raster(self, stids="all", do_mp=False, **kwargs):
    """Update the multi annual raster values of the stations.

    A multi annual value is taken from the corresponding raster and saved
    to the multi annual table in the database.

    Parameters
    ----------
    stids : str or list of int, optional
        The stations to process.
        Either "all" for every possible station
        or a list of Station IDs.
        The default is "all".
    do_mp : bool, optional
        Whether to run the station methods with multiprocessing.
        If False, threading is used instead.
        Multiprocessing needs more memory and a longer start-up time,
        so it only pays off for methods whose computation happens mostly in the Python code.
        If most of the work is done inside the PostgreSQL database, threading is enough to speed things up.
        The default is False.
    **kwargs : dict, optional
        Additional keyword arguments, forwarded to both the get_stations
        and the _run_method method.

    Raises
    ------
    ValueError
        If the given stids (Station_IDs) are not all valid.
    """
    # The station selection is always requested with only_real=False here.
    selected = self.get_stations(only_real=False, stids=stids, **kwargs)
    task_name = f"update ma-raster-values for {self._para.upper()}"
    self._run_method(
        stations=selected,
        method="update_ma_raster",
        name=task_name,
        do_mp=do_mp,
        **kwargs)
876
+
877
@db_engine.deco_update_privilege
def update_ma_timeseries(self, kind, stids="all", do_mp=False, **kwargs):
    """Update the multi annual timeseries values of the stations.

    A multi annual value is taken from the corresponding timeseries and
    saved to the database.

    Parameters
    ----------
    kind : str or list of str
        The timeseries data kind(s) whose multi annual value gets updated.
        Must be a column in the timeseries DB.
        Must be one of "raw", "qc", "filled".
        For the precipitation also "corr" is valid.
    stids : str or list of int, optional
        The stations to process.
        Either "all" for every possible station
        or a list of Station IDs.
        The default is "all".
    do_mp : bool, optional
        Whether to run the station methods with multiprocessing.
        If False, threading is used instead.
        Multiprocessing needs more memory and a longer start-up time,
        so it only pays off for methods whose computation happens mostly in the Python code.
        If most of the work is done inside the PostgreSQL database, threading is enough to speed things up.
        The default is False.
    **kwargs : dict, optional
        Additional keyword arguments, forwarded to both the get_stations
        and the _run_method method.

    Raises
    ------
    ValueError
        If the given stids (Station_IDs) are not all valid.
    """
    # The station selection is always requested with only_real=False here.
    selected = self.get_stations(only_real=False, stids=stids, **kwargs)
    task_name = f"update ma-ts-values for {self._para.upper()}"
    # The kind is handed to the station method through kwds.
    self._run_method(
        stations=selected,
        method="update_ma_timeseries",
        name=task_name,
        do_mp=do_mp,
        kwds={"kind": kind},
        **kwargs)
916
+
917
@db_engine.deco_update_privilege
def fillup(self, only_real=False, stids="all", do_mp=False, **kwargs):
    """Fill up the quality checked data with data from nearby stations to get complete timeseries.

    Parameters
    ----------
    only_real : bool, optional
        Whether only real stations are computed or also virtual ones.
        True: only stations with own data are returned.
        The default is False.
    stids : str or list of int, optional
        The stations to process.
        Either "all" for every possible station
        or a list of Station IDs.
        The default is "all".
    do_mp : bool, optional
        Whether to run the station methods with multiprocessing.
        If False, threading is used instead.
        Multiprocessing needs more memory and a longer start-up time,
        so it only pays off for methods whose computation happens mostly in the Python code.
        If most of the work is done inside the PostgreSQL database, threading is enough to speed things up.
        The default is False.
    **kwargs : dict, optional
        Additional keyword arguments, forwarded to both the get_stations
        and the _run_method method.

    Raises
    ------
    ValueError
        If the given stids (Station_IDs) are not all valid.
    """
    selected = self.get_stations(only_real=only_real, stids=stids, **kwargs)
    task_name = f"fillup {self._para.upper()} data"
    self._run_method(
        stations=selected,
        method="fillup",
        name=task_name,
        do_mp=do_mp,
        **kwargs)
952
+
953
@db_engine.deco_update_privilege
def update(self, only_new=True, **kwargs):
    """Make a complete update of the stations.

    Runs update_raw first, followed by a quality check and a fillup step.

    Parameters
    ----------
    only_new : bool, optional
        Should only new values be computed?
        If True, the quality check and fillup are done with the
        last_imp_quality_check and last_imp_fillup methods.
        If False, the quality_check and fillup methods are used,
        so the stations are updated for the whole possible period.
        The default is True.
    **kwargs : dict, optional
        Additional keyword arguments, forwarded unchanged to every step.
    """
    self.update_raw(only_new=only_new, **kwargs)

    # Pick the follow-up steps depending on the update mode.
    if only_new:
        follow_up_steps = (self.last_imp_quality_check, self.last_imp_fillup)
    else:
        follow_up_steps = (self.quality_check, self.fillup)

    for step in follow_up_steps:
        step(**kwargs)
974
+
975
def get_df(self, stids, **kwargs):
    """Get a DataFrame with the timeseries of several stations.

    Parameters
    ----------
    stids : str or list of int
        The stations for which to get the data.
        Either "all" for every possible station
        or a list of Station IDs.
    **kwargs : optional keyword arguments
        Either ``kind`` (a single kind) or ``kinds`` (several kinds) has to be given.
        The remaining keyword arguments are passed to the get_df method of
        the station class. Possible parameters are period, agg_to, kinds.
        Furthermore the kwargs are passed to the get_stations method.
        If ``only_real`` is not given, it is set to False.

    Returns
    -------
    pd.DataFrame or None
        A DataFrame with the timeseries for the selected stations, kind(s) and the given period.
        If a station returns a single column, that column is named after the station ID.
        If multiple columns are returned, the columns are a MultiIndex
        with the station IDs as first level and the kind as second level.
        None is returned if no station delivered any data.

    Raises
    ------
    ValueError
        If both ``kind`` and ``kinds`` are given.
    KeyError
        If neither ``kind`` nor ``kinds`` is given.
    """
    if "kinds" in kwargs and "kind" in kwargs:
        raise ValueError("Either enter kind or kinds, not both.")
    if "kind" in kwargs:
        kinds = [kwargs.pop("kind")]
    elif "kinds" in kwargs:
        kinds = kwargs.pop("kinds")
    else:
        # Same exception type as the former bare kwargs.pop("kinds"),
        # but with a message that tells the caller what is missing.
        raise KeyError("Either 'kind' or 'kinds' needs to be given.")

    kwargs.setdefault("only_real", False)
    stats = self.get_stations(stids=stids, **kwargs)

    # Collect the station frames and concatenate them once at the end;
    # concatenating inside the loop copies the growing frame every time.
    frames = []
    for stat in pb.progressbar(stats, line_breaks=False):
        df = stat.get_df(kinds=kinds, **kwargs)
        if df is None:
            warnings.warn(
                f"There was no data for {stat._para_long} station {stat.id}!")
            continue
        if len(df.columns) == 1:
            # A single column is labelled with the station ID.
            df.rename(
                dict(zip(df.columns, [stat.id])),
                axis=1, inplace=True)
        else:
            df.columns = pd.MultiIndex.from_product(
                [[stat.id], df.columns],
                names=["Station ID", "kind"])
        frames.append(df)

    if not frames:
        return None
    return pd.concat(frames, axis=1)