weatherdb 1.1.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (77) hide show
  1. docker/Dockerfile +30 -0
  2. docker/docker-compose.yaml +58 -0
  3. docker/docker-compose_test.yaml +24 -0
  4. docker/start-docker-test.sh +6 -0
  5. docs/requirements.txt +10 -0
  6. docs/source/Changelog.md +2 -0
  7. docs/source/License.rst +7 -0
  8. docs/source/Methode.md +161 -0
  9. docs/source/_static/custom.css +8 -0
  10. docs/source/_static/favicon.ico +0 -0
  11. docs/source/_static/logo.png +0 -0
  12. docs/source/api/api.rst +15 -0
  13. docs/source/api/cli.rst +8 -0
  14. docs/source/api/weatherDB.broker.rst +10 -0
  15. docs/source/api/weatherDB.config.rst +7 -0
  16. docs/source/api/weatherDB.db.rst +23 -0
  17. docs/source/api/weatherDB.rst +22 -0
  18. docs/source/api/weatherDB.station.rst +56 -0
  19. docs/source/api/weatherDB.stations.rst +46 -0
  20. docs/source/api/weatherDB.utils.rst +22 -0
  21. docs/source/conf.py +137 -0
  22. docs/source/index.rst +33 -0
  23. docs/source/setup/Configuration.md +127 -0
  24. docs/source/setup/Hosting.md +9 -0
  25. docs/source/setup/Install.md +49 -0
  26. docs/source/setup/Quickstart.md +183 -0
  27. docs/source/setup/setup.rst +12 -0
  28. weatherdb/__init__.py +24 -0
  29. weatherdb/_version.py +1 -0
  30. weatherdb/alembic/README.md +8 -0
  31. weatherdb/alembic/alembic.ini +80 -0
  32. weatherdb/alembic/config.py +9 -0
  33. weatherdb/alembic/env.py +100 -0
  34. weatherdb/alembic/script.py.mako +26 -0
  35. weatherdb/alembic/versions/V1.0.0_initial_database_creation.py +898 -0
  36. weatherdb/alembic/versions/V1.0.2_more_charachters_for_settings+term_station_ma_raster.py +88 -0
  37. weatherdb/alembic/versions/V1.0.5_fix-ma-raster-values.py +152 -0
  38. weatherdb/alembic/versions/V1.0.6_update-views.py +22 -0
  39. weatherdb/broker.py +667 -0
  40. weatherdb/cli.py +214 -0
  41. weatherdb/config/ConfigParser.py +663 -0
  42. weatherdb/config/__init__.py +5 -0
  43. weatherdb/config/config_default.ini +162 -0
  44. weatherdb/db/__init__.py +3 -0
  45. weatherdb/db/connections.py +374 -0
  46. weatherdb/db/fixtures/RichterParameters.json +34 -0
  47. weatherdb/db/models.py +402 -0
  48. weatherdb/db/queries/get_quotient.py +155 -0
  49. weatherdb/db/views.py +165 -0
  50. weatherdb/station/GroupStation.py +710 -0
  51. weatherdb/station/StationBases.py +3108 -0
  52. weatherdb/station/StationET.py +111 -0
  53. weatherdb/station/StationP.py +807 -0
  54. weatherdb/station/StationPD.py +98 -0
  55. weatherdb/station/StationT.py +164 -0
  56. weatherdb/station/__init__.py +13 -0
  57. weatherdb/station/constants.py +21 -0
  58. weatherdb/stations/GroupStations.py +519 -0
  59. weatherdb/stations/StationsBase.py +1021 -0
  60. weatherdb/stations/StationsBaseTET.py +30 -0
  61. weatherdb/stations/StationsET.py +17 -0
  62. weatherdb/stations/StationsP.py +128 -0
  63. weatherdb/stations/StationsPD.py +24 -0
  64. weatherdb/stations/StationsT.py +21 -0
  65. weatherdb/stations/__init__.py +11 -0
  66. weatherdb/utils/TimestampPeriod.py +369 -0
  67. weatherdb/utils/__init__.py +3 -0
  68. weatherdb/utils/dwd.py +350 -0
  69. weatherdb/utils/geometry.py +69 -0
  70. weatherdb/utils/get_data.py +285 -0
  71. weatherdb/utils/logging.py +126 -0
  72. weatherdb-1.1.0.dist-info/LICENSE +674 -0
  73. weatherdb-1.1.0.dist-info/METADATA +765 -0
  74. weatherdb-1.1.0.dist-info/RECORD +77 -0
  75. weatherdb-1.1.0.dist-info/WHEEL +5 -0
  76. weatherdb-1.1.0.dist-info/entry_points.txt +2 -0
  77. weatherdb-1.1.0.dist-info/top_level.txt +3 -0
@@ -0,0 +1,1021 @@
1
+ # libraries
2
+ import warnings
3
+ import traceback
4
+ import pandas as pd
5
+ import geopandas as gpd
6
+ from shapely import wkt
7
+ import multiprocessing as mp
8
+ from multiprocessing.pool import ThreadPool
9
+ import time
10
+ import progressbar as pb
11
+ import logging
12
+ import itertools
13
+ import datetime
14
+ from sqlalchemy import text as sqltxt
15
+ import sqlalchemy as sa
16
+ import textwrap
17
+
18
+ from ..db.connections import db_engine
19
+ from ..utils.dwd import get_dwd_meta, get_cdc_file_list
20
+ from ..station.StationBases import StationBase
21
+ from ..db import models
22
+ from ..db.queries.get_quotient import _get_quotient
23
+
24
+ # set settings
25
+ # ############
26
# Request the "spawn" start method explicitly,
# without this there were strange errors on Linux.
try:
    mp.set_start_method('spawn')
except RuntimeError:
    # NOTE(review): presumably raised when the start method was already set
    # somewhere else; the existing setting is then kept.
    pass

# public names of this module and the module-level logger
__all__ = ["StationsBase"]
log = logging.getLogger(__name__)
33
+
34
+ # Base class definitions
35
+ ########################
36
+
37
+ class StationsBase:
38
+ _StationClass = StationBase
39
+ _timeout_raw_imp = 240
40
+
41
def __init__(self):
    """Take over the FTP folders and parameter names from the station class.

    Raises
    ------
    NotImplementedError
        If the StationsBase class itself gets instantiated, instead of one of its subclasses.
    """
    # the base class is only a template for the parameter specific classes
    if type(self) is StationsBase:
        raise NotImplementedError("""
            The StationsBase is only a wrapper class an is not working on its own.
            Please use StationP, StationPD, StationT or StationET instead""")

    station_cls = self._StationClass

    # the station class can define one base folder (str) or several (list)
    base_folders = station_cls._ftp_folder_base
    if isinstance(base_folders, str):
        base_folders = [base_folders]
    self._ftp_folder_base = base_folders

    # create ftp_folders in order of importance:
    # for every base folder first the historical, then the recent folder
    self._ftp_folders = [
        base + sub_folder
        for base in base_folders
        for sub_folder in ("historical/", "recent/")]

    self._para = station_cls._para
    self._para_long = station_cls._para_long
57
+
58
def download_meta(self):
    """Download the meta file(s) from the CDC server and merge them.

    The meta file of the first FTP folder is the starting point.
    For every further FTP folder the stations that are not yet known are appended
    and, if the meta file has a "bis_datum" column, the period ("von_datum" - "bis_datum")
    of the known stations is widened to the wider one of both files.

    Returns
    -------
    geopandas.GeoDataFrame
        The meta file from the CDC server.
        If there are several meta files on the server, they are joined together.
    """
    folders = self._ftp_folders

    # the first folder delivers the basis
    merged = get_dwd_meta(folders[0])

    for folder in folders[1:]:
        candidate = get_dwd_meta(ftp_folder=folder)

        # append the stations that are only listed in the additional meta file
        already_known = candidate.index.isin(merged.index)
        merged = pd.concat([merged, candidate[~already_known]])
        if isinstance(candidate, gpd.GeoDataFrame):
            merged = gpd.GeoDataFrame(merged, crs=candidate.crs)

        # without period columns there is nothing more to compare
        if "bis_datum" not in merged.columns:
            continue

        # bring the period of the additional file next to the known period
        merged = merged.join(
            candidate[["bis_datum", "von_datum"]],
            how="left", rsuffix="_new")

        # take the earlier start
        starts_earlier = merged["von_datum"] > merged["von_datum_new"]
        merged.loc[starts_earlier, "von_datum"] = \
            candidate.loc[starts_earlier, "von_datum"]

        # take the later end
        ends_later = merged["bis_datum"] < merged["bis_datum_new"]
        merged.loc[ends_later, "bis_datum"] = \
            candidate.loc[ends_later, "bis_datum"]

        merged.drop(["von_datum_new", "bis_datum_new"], axis=1, inplace=True)

    return merged
94
+
95
@db_engine.deco_update_privilege
def update_meta(self, stids="all", **kwargs):
    """Update the meta table by comparing to the CDC server.

    The "von_datum" and "bis_datum" columns are not written to the database,
    because it is better to set the period by the filled period of the stations in the database.
    Often the CDC period is not correct.

    Parameters
    ----------
    stids: string or list of int, optional
        The Stations for which to compute.
        Can either be "all", for all possible stations
        or a list with the Station IDs.
        The default is "all".
    **kwargs : dict, optional
        The additional keyword arguments aren't used in this method.
    """
    log.info("The {para_long} meta table gets updated.".format(
        para_long=self._para_long))
    meta = self.download_meta()

    # drop the "Abgabe" column if the meta file has one
    if "Abgabe" in meta.columns:
        meta.drop("Abgabe", axis=1, inplace=True)

    # stations listed as dropped for this parameter are removed from the meta file
    dropped_query = sa\
        .select(models.DroppedStations.station_id)\
        .where(models.DroppedStations.parameter == self._para)
    with db_engine.connect() as con:
        dropped_rows = con.execute(dropped_query).all()
    known_dropped = [row[0] for row in dropped_rows if row[0] in meta.index]
    meta.drop(known_dropped, inplace=True)

    # keep only the selected stations, if a selection was given
    if stids != "all":
        wanted = stids if isinstance(stids, list) else [stids,]
        unwanted = [stid for stid in meta.index if stid not in wanted]
        meta.drop(unwanted, inplace=True)

    # to have a meta entry for every station before looping over them
    has_period = "von_datum" in meta.columns and "bis_datum" in meta.columns
    if has_period:
        meta_to_store = meta.drop(["von_datum", "bis_datum"], axis=1)
    else:
        meta_to_store = meta
    self._update_db_meta(meta=meta_to_store)

    log.info("The {para_long} meta table got successfully updated.".format(
        para_long=self._para_long))
145
+
146
@db_engine.deco_update_privilege
def _update_db_meta(self, meta):
    """Insert or update the rows of a meta DataFrame in the meta table of the database.

    Rows of stations that already exist in the table are overwritten
    (``INSERT ... ON CONFLICT (station_id) DO UPDATE``).
    Columns of the DataFrame that do not exist in the database table are skipped with a warning.
    An empty DataFrame leaves the database untouched.

    Parameters
    ----------
    meta : pandas.DataFrame
        A DataFrame with station_id as index.
        If it has a "geometry" column, the geometry is written as WKT in EPSG:4326
        and additionally in EPSG:25832 to the "geometry_utm" column.
    """
    # nothing to write -> don't build an invalid "INSERT ... Values ('')" statement
    if len(meta.index) == 0:
        log.debug("The given meta DataFrame is empty, so nothing gets written to the database.")
        return

    def _to_sql_literal(value):
        # missing values become NULL, everything else a quoted SQL string literal
        if value in ("nan", "<NA>"):
            return "NULL"
        # a single quote inside a value would end the literal early,
        # therefore it is doubled (the SQL way to escape it)
        return "'" + value.replace("'", "''") + "'"

    # get the columns of meta
    meta = meta.rename_axis("station_id").reset_index()
    columns = [col.lower() for col in meta.columns]
    if 'geometry' in columns:
        columns = columns + ['geometry_utm']
    meta.rename(dict(zip(meta.columns, columns)), axis=1, inplace=True)

    # check if columns are initiated in DB
    with db_engine.connect() as con:
        columns_db = con.execute(sqltxt(
            """
            SELECT column_name
            FROM information_schema.columns
            WHERE table_name='meta_{para}';
            """.format(para=self._para)
            )).all()
    columns_db = [col[0] for col in columns_db]

    problem_cols = [col for col in columns if col not in columns_db]
    if len(problem_cols) > 0:
        warnings.warn("""
            The meta_{para} column '{cols}' is not initiated in the database.
            This column is therefor skiped.
            Please review the DB or the code.
            """.format(
                para=self._para,
                cols=", ".join(problem_cols))
            )
        columns = [col for col in columns if col in columns_db]

    # change date columns to strings
    # the column is replaced as a whole, so the formatted strings can't get
    # coerced back into the datetime dtype of the old column
    for colname, col in \
            meta.select_dtypes(include="datetime64").items():
        meta[colname] = col.dt.strftime("%Y%m%d %H:%M")

    # change geometry to WKT strings
    if "geometry" in meta.columns:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            meta["geometry_utm"] = meta.geometry.to_crs(25832).to_wkt()
            meta["geometry"] = meta.geometry.to_crs(4326).to_wkt()

    # change all to strings
    meta = meta.astype(str)

    # get values
    # NOTE(review): sqlalchemy.text treats ":name" patterns as bind parameters,
    # a value containing such a pattern would still break the statement - confirm if relevant
    rows = [
        ", ".join(_to_sql_literal(value) for value in row)
        for row in meta.loc[:, columns].values]
    values = "(" + "), (".join(rows) + ")"

    # create sql
    sql = '''
        INSERT INTO meta_{para}({columns})
        Values {values}
        ON CONFLICT (station_id) DO UPDATE SET
        '''.format(
            columns=", ".join(columns),
            values=values,
            para=self._para)
    sql += ", ".join(
        ' "{col}" = EXCLUDED."{col}"'.format(col=col)
        for col in columns)
    sql += ";"

    # run sql command
    with db_engine.connect() as con:
        con.execute(sqltxt(sql))
        con.commit()
222
+
223
@db_engine.deco_update_privilege
def update_period_meta(self, stids="all", **kwargs):
    """Update the period in the meta table of the raw data.

    The update_period_meta method of every selected real station is called in a simple loop.

    Parameters
    ----------
    stids: string or list of int, optional
        The Stations for which to compute.
        Can either be "all", for all possible stations
        or a list with the Station IDs.
        The default is "all".
    **kwargs : dict, optional
        The additional keyword arguments are passed to the get_stations method
        and to the update_period_meta method of every station.

    Raises
    ------
    ValueError
        If the given stids (Station_IDs) are not all valid.
    """
    # _run_simple_loop takes the keyword arguments for the station method
    # as "kwds"; it has no "kwargs" parameter
    self._run_simple_loop(
        stations=self.get_stations(only_real=True, stids=stids, **kwargs),
        method="update_period_meta",
        name="update period in meta",
        kwds=kwargs
    )
249
+
250
@classmethod
def get_meta_explanation(cls, infos="all"):
    """Get the explanations of the available meta fields.

    The request is answered by the get_meta_explanation method of the station class.

    Parameters
    ----------
    infos : list or string, optional
        The infos you wish to get an explanation for.
        If "all" then all the available information get returned.
        The default is "all"

    Returns
    -------
    pd.Series
        a pandas Series with the information names as index and the explanation as values.
    """
    station_cls = cls._StationClass
    return station_cls.get_meta_explanation(infos=infos)
267
+
268
def get_meta(self,
        infos=["station_id", "filled_from", "filled_until", "geometry"],
        stids="all",
        only_real=True):
    """Get the meta Dataframe from the Database.

    Parameters
    ----------
    infos : list or str, optional
        A list of information from the meta file to return
        If "all" than all possible columns are returned, but only one geometry column.
        The default is: ["Station_id", "filled_from", "filled_until", "geometry"]
    stids: string, int or list of int, optional
        The Stations to return.
        Can either be "all", for all possible stations,
        a single Station ID or a list (or other iterable) with the Station IDs.
        The default is "all".
    only_real: bool, optional
        Whether only real stations are returned or also virtual ones.
        True: only stations with own data are returned.
        The default is True.

    Returns
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
        The meta DataFrame with the station_id as index.
    """
    # make sure columns is of type list
    if isinstance(infos, str):
        if infos == "all":
            infos = self.get_meta_explanation(infos="all").index.to_list()
            if "geometry_utm" in infos:
                infos.remove("geometry_utm")
        else:
            infos = [infos]

    # check infos
    # a new list is created here, so the default argument list is never modified
    infos = [col.lower() for col in infos]
    if "station_id" not in infos:
        infos.insert(0, "station_id")
    if "geometry" in infos and "geometry_utm" in infos:
        warnings.warn(textwrap.dedent("""\
            You selected 2 geometry columns.
            Only the geometry column with EPSG 4326 is returned"""))
        infos.remove("geometry_utm")

    # create select statement, the geometries are requested as WKT
    infos_select = [
        f"ST_AsText({info}) as {info}"
        if info in ("geometry", "geometry_utm") else info
        for info in infos]

    # create sql statement
    sql = "SELECT {cols} FROM meta_{para}"\
        .format(cols=", ".join(infos_select), para=self._para)

    conditions = []
    if only_real:
        conditions.append("is_real=true")
    if not (isinstance(stids, str) and stids == "all"):
        # accept a single ID as well as any iterable of IDs
        if isinstance(stids, (str, bytes)) or not hasattr(stids, "__iter__"):
            stids = [stids,]
        else:
            stids = list(stids)
        if len(stids) > 0:
            conditions.append("station_id in ({stids})".format(
                stids=", ".join([str(stid) for stid in stids])))
        else:
            # "in ()" is no valid SQL, an empty selection matches no station
            conditions.append("false")
    if conditions:
        sql += " WHERE " + " AND ".join(conditions)

    # execute queries to db
    with db_engine.connect() as con:
        meta = pd.read_sql(
            sqltxt(sql),
            con,
            index_col="station_id")

    # make datetime columns timezone aware
    meta = meta.apply(
        lambda col: col.dt.tz_localize(datetime.timezone.utc) \
            if hasattr(col, "dt") and not col.dt.tz else col)

    # change to GeoDataFrame if geometry column was selected
    for geom_col, srid in zip(["geometry", "geometry_utm"],
                              ["4326",     "25832"]):
        if geom_col in infos:
            meta[geom_col] = meta[geom_col].apply(wkt.loads)
            meta = gpd.GeoDataFrame(
                meta, crs="EPSG:" + srid, geometry=geom_col)

    # strip whitespaces in string columns
    for col in meta.columns[meta.dtypes == "object"]:
        try:
            meta[col] = meta[col].str.strip()
        except AttributeError:
            # the column holds no strings, so there is nothing to strip
            pass

    return meta
363
+
364
def get_stations(self, only_real=True, stids="all", skip_missing_stids=False, **kwargs):
    """Get a list with all the stations as Station-objects.

    Parameters
    ----------
    only_real: bool, optional
        Whether only real stations are returned or also virtual ones.
        True: only stations with own data are returned.
        The default is True.
    stids: string, int or list of int, optional
        The Stations to return.
        Can either be "all", for all possible stations,
        a single Station ID or a list (or other iterable) with the Station IDs.
        The default is "all".
    skip_missing_stids: bool, optional
        Should the method skip the missing stations from input stids?
        If False, then a ValueError is raised if a station is not found.
        The default is False.
    **kwargs : dict, optional
        The additional keyword arguments aren't used in this method.

    Returns
    -------
    list of Station-objects
        returns a list with the corresponding station objects,
        in the order of the meta table.

    Raises
    ------
    ValueError
        If the given stids (Station_IDs) are not all valid.
    """
    select_all = isinstance(stids, str) and (stids == "all")

    if not select_all:
        # accept a single ID as well as any iterable of IDs
        if isinstance(stids, (str, bytes)) or not hasattr(stids, "__iter__"):
            stids = [stids]
        else:
            stids = list(stids)

        # an empty selection needs no database query
        if not stids:
            return []

    meta = self.get_meta(
        infos=["station_id"], only_real=only_real, stids=stids)

    if select_all:
        return [
            self._StationClass(stid, _skip_meta_check=True)
            for stid in meta.index]

    wanted = set(stids)
    stations = [
        self._StationClass(stid, _skip_meta_check=True)
        for stid in meta.index
        if stid in wanted]

    if not skip_missing_stids:
        # compare the IDs and not the lengths,
        # so that duplicated input IDs don't raise a misleading error
        found = {stat.id for stat in stations}
        missing = [stid for stid in stids if stid not in found]
        if missing:
            raise ValueError(
                "It was not possible to create a {para_long} Station with the following IDs: {stids}".format(
                    para_long=self._para_long,
                    stids=", ".join([str(stid) for stid in missing])
                ))

    return stations
417
+
418
def get_quotient(self, kinds_num, kinds_denom, stids="all", return_as="df", **kwargs):
    """Get the quotient of multi-annual means of two different kinds or the timeserie and the multi annual raster value.

    $quotient = \\overline{ts}_{kind_num} / \\overline{ts}_{denom}$

    Parameters
    ----------
    kinds_num : list of str or str
        The timeseries kinds of the numerators.
        Should be one of ['raw', 'qc', 'filled'].
        For precipitation also "corr" is possible.
    kinds_denom : list of str or str
        The timeseries kinds of the denominator or the multi annual raster key.
        If the denominator is a multi annual raster key, then the result is the quotient of the timeserie and the raster value.
        Possible values are:
            - for timeserie kinds: 'raw', 'qc', 'filled' or for precipitation also "corr".
            - for raster keys: 'hyras', 'dwd' or 'regnie', depending on your defined raster files.
    stids : list of Integer or "all", optional
        The stations IDs for which to compute the quotient.
        With "all" no station filter is given to the query.
        The default is "all".
    return_as : str, optional
        The format of the return value.
        If "df" then a pandas DataFrame is returned.
        If "json" then a list with dictionaries is returned.
        The default is "df".
    **kwargs : dict, optional
        The additional keyword arguments aren't used in this method.

    Returns
    -------
    pandas.DataFrame or list of dict
        The quotient of the two timeseries as DataFrame or list of dictionaries (JSON) depending on the return_as parameter.

    Raises
    ------
    ValueError
        If the input parameters were not correct.
    """
    station_cls = self._StationClass

    # "all" means no restriction on the stations
    stids = None if stids == "all" else stids

    # validate the kinds, the denominator may also be a raster key
    raster_keys = {"hyras", "regnie", "dwd"}
    valid_denoms = station_cls._valid_kinds | raster_keys
    kinds_num = station_cls._check_kinds(kinds_num)
    kinds_denom = station_cls._check_kinds(kinds_denom, valids=valid_denoms)

    # compute the quotient on the database
    query_args = dict(
        stids=stids,
        paras=self._para,
        kinds_num=kinds_num,
        kinds_denom=kinds_denom,
        return_as=return_as)
    with db_engine.connect() as con:
        return _get_quotient(con=con, **query_args)
475
+
476
def count_holes(self, stids="all", **kwargs):
    """Count holes in timeseries depending on there length.

    Parameters
    ----------
    stids: string or list of int, optional
        The Stations to return.
        Can either be "all", for all possible stations
        or a list with the Station IDs.
        The default is "all".
    **kwargs : dict, optional
        This is a list of parameters, that is supported by the StationBase.count_holes method.

        Furthermore the kwargs are passed to the get_stations method.

        possible values are:

        - weeks : list, optional
            A list of hole length to count.
            Every hole longer than the duration of weeks specified is counted.
            The default is [2, 4, 8, 12, 16, 20, 24]
        - kind : str
            The kind of the timeserie to analyze.
            Should be one of ['raw', 'qc', 'filled'].
            For N also "corr" is possible.
            Normally only "raw" and "qc" make sense, because the other timeseries should not have holes.
        - period : TimestampPeriod or (tuple or list of datetime.datetime or None), optional
            The minimum and maximum Timestamp for which to analyze the timeseries.
            If None is given, the maximum and minimal possible Timestamp is taken.
            The default is (None, None).
        - between_meta_period : bool, optional
            Only check between the respective period that is defined in the meta table.
            If "qc" is chosen as kind, then the "raw" meta period is taken.
            The default is True.
        - crop_period : bool, optional
            should the period get cropped to the maximum filled period.
            This will result in holes being ignored when they are at the end or at the beginning of the timeserie.
            If period = (None, None) is given, then this parameter is set to True.
            The default is False.

    Returns
    -------
    pandas.DataFrame
        A Pandas Dataframe, with station_id as index and one column per week.
        The numbers in the table are the amount of NA-periods longer than the respective amount of weeks.
        If no station was found, an empty DataFrame is returned.

    Raises
    ------
    ValueError
        If the input parameters were not correct.
    """
    # check input parameters
    stations = self.get_stations(stids=stids, only_real=True, **kwargs)

    # collect the counts of every station
    counts = [
        station.count_holes(**kwargs)
        for station in pb.progressbar(stations, line_breaks=False)]

    # without stations there is nothing to concatenate
    if not counts:
        return pd.DataFrame()
    if len(counts) == 1:
        return counts[0]

    # one concat over all stations instead of growing the table per station
    return pd.concat(counts, axis=0)
542
+
543
@staticmethod
def _get_progressbar(max_value, name):
    """Create the progressbar for a loop over stations and show it at 0.

    Parameters
    ----------
    max_value : int
        The value at which the progressbar is complete.
    name : str
        The text that is shown in front of the percentage.

    Returns
    -------
    progressbar.ProgressBar
        The progressbar with an additional "last_station" variable, that is initialized with "None".
    """
    counter = pb.widgets.SimpleProgress(
        format=("('%(value_s)s/%(max_value_s)s')"))
    last_station = pb.widgets.DynamicMessage(
        "last_station",
        format=", last id: {formatted_value}",
        precision=4)
    widgets = [
        pb.widgets.RotatingMarker(),
        " " + name ,
        pb.widgets.Percentage(), ' ',
        counter, ' ',
        pb.widgets.Bar(min_width=80), ' ',
        pb.widgets.Timer(format='%(elapsed)s'), ' | ',
        pb.widgets.ETA(),
        last_station,
        ]

    pbar = pb.ProgressBar(
        widgets=widgets,
        max_value=max_value,
        variables={"last_station": "None"},
        term_width=100,
        is_terminal=True
        )

    # draw the bar once before the first station is done
    pbar.update(0)

    return pbar
568
+
569
def _run_method(self, stations, method, name, kwds=dict(),
        do_mp=True, processes=mp.cpu_count()-1, **kwargs):
    """Run methods of the given stations objects in multiprocessing/threading mode.

    Parameters
    ----------
    stations : list of station objects
        A list of station objects. Those must be children of the StationBase class.
    method : str
        The name of the method to call.
    name : str
        A descriptive name of the method to show in the progressbar.
    kwds : dict
        The keyword arguments to give to the methods
    do_mp : bool, optional
        Should the method be done in multiprocessing mode?
        If False the methods will be called in threading mode.
        Multiprocessing needs more memory and a bit more initiating time. Therefor it is only usefull for methods with a lot of computation effort in the python code.
        If the most computation of a method is done in the postgresql database, then threading is enough to speed the process up.
        The default is True.
    processes : int, optional
        The number of processes that should get started simultaneously.
        If 1 or less, then the process is computed as a simple loop, so there is no multiprocessing or threading done.
        The default is the cpu count -1.
    **kwargs : dict, optional
        The additional keyword arguments aren't used in this method.
    """
    log.info(
        f"{self._para_long} Stations async loop over method '{method}' started." +
        "\n" +"-"*80
        )

    if processes<=1:
        log.info(f"As the number of processes is 1 or lower, the method '{method}' is started as a simple loop.")
        self._run_simple_loop(
            stations=stations, method=method, name=name, kwds=kwds)
        return

    # progressbar
    num_stations = len(stations)
    pbar = self._get_progressbar(max_value=num_stations, name=name)

    # create pool
    if do_mp:
        try:
            pool = mp.Pool(processes=processes)
            log.debug("the multiprocessing Pool is started")
        except AssertionError:
            log.debug('daemonic processes are not allowed to have children, therefor threads are used')
            pool = ThreadPool(processes=processes)
    else:
        log.debug("the threading Pool is started")
        pool = ThreadPool(processes=processes)

    try:
        # start processes
        results = [
            pool.apply_async(getattr(stat, method), kwds=kwds)
            for stat in stations]
        pool.close()

        # check results until all finished
        finished = [False] * num_stations
        while not all(finished):
            for index, result in enumerate(results):
                if finished[index] or not result.ready():
                    continue

                finished[index] = True
                station_id = stations[index].id
                pbar.variables["last_station"] = station_id

                # get stdout and log
                header = f"""The {name} of the {self._para_long} Station with ID {station_id} finished with """
                try:
                    stdout = result.get(10)
                    if stdout is not None:
                        log.debug(f"{header}stdout:\n{stdout}")
                except Exception:
                    # the exception of the worker is re-raised by get()
                    log.error(f"{header}stderr:\n{traceback.format_exc()}")

            pbar.update(sum(finished))

            # only wait if there is still something to wait for
            if not all(finished):
                time.sleep(2)

        pbar.update(sum(finished))
        pool.join()
    finally:
        # also stop the workers if the loop got interrupted by an exception
        pool.terminate()
655
+
656
def _run_simple_loop(self, stations, method, name, kwds=dict()):
    """Run a method of the given station objects one after the other.

    Parameters
    ----------
    stations : list of station objects
        The station objects on which the method gets called.
    method : str
        The name of the method to call.
    name : str
        A descriptive name of the method to show in the progressbar.
    kwds : dict, optional
        The keyword arguments to give to the method.
        The default is an empty dictionary.
    """
    separator = "-"*79
    log.info(
        separator +
        f"\n{self._para_long} Stations simple loop over method '{method}' started.")

    # progressbar
    pbar = self._get_progressbar(max_value=len(stations), name=name)

    # call the method station by station
    for stat in stations:
        station_method = getattr(stat, method)
        station_method(**kwds)
        pbar.variables["last_station"] = stat.id
        pbar.update(pbar.value + 1)
672
+
673
@db_engine.deco_update_privilege
def update_raw(self, only_new=True, only_real=True, stids="all",
        remove_nas=True, do_mp=True, **kwargs):
    """Download all stations data from CDC and upload to database.

    If all the real stations got updated, the start time of this import and
    the maximum "raw_until" value of the meta table are saved in the parameter_variables table.

    Parameters
    ----------
    only_new : bool, optional
        Get only the files that are not yet in the database?
        If False all the available files are loaded again.
        The default is True
    only_real: bool, optional
        Whether only real stations are tried to download.
        True: only stations with a date in raw_from in meta are downloaded.
        The default is True.
    stids: string or list of int, optional
        The Stations to return.
        Can either be "all", for all possible stations
        or a list with the Station IDs.
        The default is "all".
    remove_nas : bool, optional
        Remove the NAs from the downloaded data before updating it to the database.
        This has computational advantages.
        The default is True.
    do_mp : bool, optional
        Should the method be done in multiprocessing mode?
        If False the methods will be called in threading mode.
        Multiprocessing needs more memory and a bit more initiating time. Therefor it is only usefull for methods with a lot of computation effort in the python code.
        If the most computation of a method is done in the postgresql database, then threading is enough to speed the process up.
        The default is True.
    **kwargs : dict, optional
        The additional keyword arguments for the _run_method and get_stations method

    Raises
    ------
    ValueError
        If the given stids (Station_IDs) are not all valid.
    """
    start_tstp = datetime.datetime.now()

    # get FTP file list
    ftp_file_list = get_cdc_file_list(
        ftp_folders=self._ftp_folders)

    # run the tasks in multiprocessing mode
    stations = self.get_stations(only_real=only_real, stids=stids, **kwargs)
    station_kwds = dict(
        only_new=only_new,
        ftp_file_list=ftp_file_list,
        remove_nas=remove_nas)
    self._run_method(
        stations=stations,
        method="update_raw",
        name="download raw {para} data".format(para=self._para.upper()),
        kwds=station_kwds,
        do_mp=do_mp, **kwargs)

    # the import time is only saved if every real station was part of this run
    covers_all = isinstance(stids, str) and (stids == "all")
    if not covers_all and isinstance(stids, list):
        all_stids = self.get_meta(["station_id"], stids="all", only_real=True).index
        covers_all = all(stid in stids for stid in all_stids)
    if not covers_all:
        return

    # save start time as variable to db
    sql_save_tstp = """
        INSERT INTO parameter_variables (parameter, start_tstp_last_imp, max_tstp_last_imp)
        VALUES ('{para}',
                '{start_tstp}'::timestamp,
                (SELECT max(raw_until) FROM meta_{para}))
        ON CONFLICT (parameter) DO UPDATE SET
            start_tstp_last_imp=EXCLUDED.start_tstp_last_imp,
            max_tstp_last_imp=EXCLUDED.max_tstp_last_imp;
        """.format(
            para=self._para,
            start_tstp=start_tstp.strftime("%Y%m%d %H:%M"))
    with db_engine.connect() as con:
        con.execute(sqltxt(sql_save_tstp))
        con.commit()
748
+
749
@db_engine.deco_update_privilege
def last_imp_quality_check(self, stids="all", do_mp=False, **kwargs):
    """Do the quality check of the last import.

    The last_imp_quality_check method of every selected real station is run through _run_method.

    Parameters
    ----------
    stids: string or list of int, optional
        The Stations for which to compute.
        Can either be "all", for all possible stations
        or a list with the Station IDs.
        The default is "all".
    do_mp : bool, optional
        Should the method be done in multiprocessing mode?
        If False the methods will be called in threading mode.
        Multiprocessing needs more memory and a bit more initiating time. Therefor it is only usefull for methods with a lot of computation effort in the python code.
        If the most computation of a method is done in the postgresql database, then threading is enough to speed the process up.
        The default is False.
    **kwargs : dict, optional
        The additional keyword arguments for the _run_method and get_stations method
    """
    stations = self.get_stations(only_real=True, stids=stids, **kwargs)
    progress_name = "quality check {para} data".format(para=self._para.upper())
    self._run_method(
        stations=stations,
        method="last_imp_quality_check",
        name=progress_name,
        do_mp=do_mp, **kwargs)
774
+
775
@db_engine.deco_update_privilege
def last_imp_fillup(self, stids="all", do_mp=False, **kwargs):
    """Do the gap filling of the last import.

    The period of the last import is requested once from the first station
    and then given to the last_imp_fillup method of every station.
    If no station is found, nothing is done.

    Parameters
    ----------
    stids: string or list of int, optional
        The Stations for which to compute.
        Can either be "all", for all possible stations
        or a list with the Station IDs.
        The default is "all".
    do_mp : bool, optional
        Should the method be done in multiprocessing mode?
        If False the methods will be called in threading mode.
        Multiprocessing needs more memory and a bit more initiating time. Therefor it is only usefull for methods with a lot of computation effort in the python code.
        If the most computation of a method is done in the postgresql database, then threading is enough to speed the process up.
        The default is False.
    **kwargs : dict, optional
        The additional keyword arguments for the _run_method and get_stations method
    """
    stations = self.get_stations(only_real=False, stids=stids, **kwargs)

    # without stations there is no period to request and nothing to fill up
    if not stations:
        log.info("No {para_long} Stations were found, so the fillup of the last import is skipped.".format(
            para_long=self._para_long))
        return

    period = stations[0].get_last_imp_period(all=True)
    period_log = period.strftime("%Y-%m-%d %H:%M")
    log.info("The {para_long} Stations fillup of the last import is started for the period {min_tstp} - {max_tstp}".format(
        para_long=self._para_long,
        min_tstp=period_log[0],
        max_tstp=period_log[1]))
    self._run_method(
        stations=stations,
        method="last_imp_fillup",
        name="fillup {para} data".format(para=self._para.upper()),
        kwds=dict(_last_imp_period=period),
        do_mp=do_mp,
        **kwargs)
809
+
810
@db_engine.deco_update_privilege
def quality_check(self, period=(None, None), only_real=True, stids="all",
                  do_mp=False, **kwargs):
    """Quality check the raw data for a given period.

    Parameters
    ----------
    period : tuple or list of datetime.datetime or None, optional
        The minimum and maximum timestamp of the period to quality check.
        It is handed unchanged to the quality_check method of every station.
        If None is given, the maximum or minimal possible timestamp is taken.
        The default is (None, None).
    only_real : bool, optional
        Whether only real stations are computed or also virtual ones.
        True: only stations with own data are selected.
        The default is True.
    stids : str or list of int, optional
        The stations to process.
        Either "all" for every possible station or a list of station IDs.
        The default is "all".
    do_mp : bool, optional
        Whether to run in multiprocessing mode.
        If False, the station methods are called in threading mode.
        Multiprocessing needs more memory and a bit more start-up time,
        so it only pays off for methods that do a lot of computation in
        the Python code.
        If most of the computation happens in the PostgreSQL database,
        threading is enough to speed the process up.
        The default is False.
    **kwargs : dict, optional
        Additional keyword arguments, handed to both the get_stations and
        the _run_method method.
    """
    selected = self.get_stations(only_real=only_real, stids=stids, **kwargs)
    task_name = "quality check {para} data".format(para=self._para.upper())
    self._run_method(
        stations=selected,
        method="quality_check",
        name=task_name,
        kwds={"period": period},
        do_mp=do_mp,
        **kwargs)
842
+
843
@db_engine.deco_update_privilege
def update_ma_raster(self, stids="all", do_mp=False, **kwargs):
    """Update the multi annual raster values of the stations.

    For every station a multi annual value is taken from the corresponding
    raster and saved to the multi annual table in the database.

    Parameters
    ----------
    stids : str or list of int, optional
        The stations to process.
        Either "all" for every possible station or a list of station IDs.
        The default is "all".
    do_mp : bool, optional
        Whether to run in multiprocessing mode.
        If False, the station methods are called in threading mode.
        Multiprocessing needs more memory and a bit more start-up time,
        so it only pays off for methods that do a lot of computation in
        the Python code.
        If most of the computation happens in the PostgreSQL database,
        threading is enough to speed the process up.
        The default is False.
    **kwargs : dict, optional
        Additional keyword arguments, handed to both the get_stations and
        the _run_method method.

    Raises
    ------
    ValueError
        If the given stids (Station_IDs) are not all valid.
    """
    # stations are requested with only_real=False
    all_stations = self.get_stations(only_real=False, stids=stids, **kwargs)
    task_name = "update ma-raster-values for {para}".format(
        para=self._para.upper())
    self._run_method(
        stations=all_stations,
        method="update_ma_raster",
        name=task_name,
        do_mp=do_mp,
        **kwargs)
876
+
877
@db_engine.deco_update_privilege
def update_ma_timeseries(self, kind, stids="all", do_mp=False, **kwargs):
    """Update the multi annual timeseries values of the stations.

    For every station a multi annual value is computed from the
    corresponding timeseries and saved to the database.

    Parameters
    ----------
    kind : str or list of str
        The timeseries data kind(s) whose multi annual value is updated.
        Must be a column in the timeseries DB.
        Must be one of "raw", "qc", "filled".
        For the precipitation also "corr" is valid.
    stids : str or list of int, optional
        The stations to process.
        Either "all" for every possible station or a list of station IDs.
        The default is "all".
    do_mp : bool, optional
        Whether to run in multiprocessing mode.
        If False, the station methods are called in threading mode.
        Multiprocessing needs more memory and a bit more start-up time,
        so it only pays off for methods that do a lot of computation in
        the Python code.
        If most of the computation happens in the PostgreSQL database,
        threading is enough to speed the process up.
        The default is False.
    **kwargs : dict, optional
        Additional keyword arguments, handed to both the get_stations and
        the _run_method method.

    Raises
    ------
    ValueError
        If the given stids (Station_IDs) are not all valid.
    """
    # stations are requested with only_real=False
    all_stations = self.get_stations(only_real=False, stids=stids, **kwargs)
    task_name = "update ma-ts-values for {para}".format(
        para=self._para.upper())
    self._run_method(
        stations=all_stations,
        method="update_ma_timeseries",
        name=task_name,
        kwds={"kind": kind},
        do_mp=do_mp,
        **kwargs)
916
+
917
@db_engine.deco_update_privilege
def fillup(self, only_real=False, stids="all", do_mp=False, **kwargs):
    """Fill the gaps of the quality checked data to get complete timeseries.

    The quality checked data is filled up with data from nearby stations.

    Parameters
    ----------
    only_real : bool, optional
        Whether only real stations are computed or also virtual ones.
        True: only stations with own data are selected.
        The default is False.
    stids : str or list of int, optional
        The stations to process.
        Either "all" for every possible station or a list of station IDs.
        The default is "all".
    do_mp : bool, optional
        Whether to run in multiprocessing mode.
        If False, the station methods are called in threading mode.
        Multiprocessing needs more memory and a bit more start-up time,
        so it only pays off for methods that do a lot of computation in
        the Python code.
        If most of the computation happens in the PostgreSQL database,
        threading is enough to speed the process up.
        The default is False.
    **kwargs : dict, optional
        Additional keyword arguments, handed to both the get_stations and
        the _run_method method.

    Raises
    ------
    ValueError
        If the given stids (Station_IDs) are not all valid.
    """
    selected = self.get_stations(only_real=only_real, stids=stids, **kwargs)
    task_name = "fillup {para} data".format(para=self._para.upper())
    self._run_method(
        stations=selected,
        method="fillup",
        name=task_name,
        do_mp=do_mp,
        **kwargs)
952
+
953
@db_engine.deco_update_privilege
def update(self, only_new=True, **kwargs):
    """Make a complete update of the stations.

    Runs update_raw first and afterwards the quality check and the fillup
    of the stations.

    Parameters
    ----------
    only_new : bool, optional
        Should only new values be computed?
        If True, the stations are only updated for new values: the
        quality check and fillup are restricted to the last import.
        If False, the stations are updated for the whole possible period.
        The default is True.
    **kwargs : dict, optional
        Additional keyword arguments, handed unchanged to update_raw and to
        the quality check and fillup methods that are called.
    """
    self.update_raw(only_new=only_new, **kwargs)

    if not only_new:
        # complete rerun over the whole period
        self.quality_check(**kwargs)
        self.fillup(**kwargs)
        return

    # only treat the data of the last import
    self.last_imp_quality_check(**kwargs)
    self.last_imp_fillup(**kwargs)
974
+
975
def get_df(self, stids="all", **kwargs):
    """Get a DataFrame with the timeseries of the selected stations.

    Parameters
    ----------
    stids : str or list of int, optional
        The stations for which to get the data.
        Either "all" for every possible station or a list of station IDs.
        The default is "all".
    **kwargs : dict, optional
        Those keyword arguments are passed to the get_df method of every
        station and to the get_stations method.
        Possible parameters are period, agg_to, kinds.
        Instead of kinds a single kind can be given as "kind".
        If only_real is not given, it is set to False.

    Returns
    -------
    pd.DataFrame or None
        A DataFrame with the timeseries for the selected stations, kind(s)
        and the given period.
        If a station returns a single column, this column is named after
        the station ID. Otherwise the columns of the station get a
        MultiIndex with the station ID as first level and the kind as
        second level.
        None is returned if no station delivered any data.

    Raises
    ------
    ValueError
        If both kind and kinds are given.
    KeyError
        If neither kind nor kinds is given.
    """
    if "kinds" in kwargs and "kind" in kwargs:
        raise ValueError("Either enter kind or kinds, not both.")
    if "kind" in kwargs:
        kinds = [kwargs.pop("kind")]
    elif "kinds" in kwargs:
        kinds = kwargs.pop("kinds")
    else:
        # stays a KeyError, as a missing "kinds" always raised one,
        # but now with a message that tells what is missing
        raise KeyError("Either kind or kinds must be given.")
    kwargs.setdefault("only_real", False)

    stats = self.get_stations(stids=stids, **kwargs)

    # Collect the station frames and concatenate them once at the end;
    # concatenating inside the loop would copy the growing result frame
    # again for every station.
    frames = []
    for stat in pb.progressbar(stats, line_breaks=False):
        df = stat.get_df(kinds=kinds, **kwargs)
        if df is None:
            warnings.warn(
                f"There was no data for {stat._para_long} station {stat.id}!")
            continue
        if len(df.columns) == 1:
            df.rename(
                dict(zip(df.columns, [stat.id])),
                axis=1, inplace=True)
        else:
            df.columns = pd.MultiIndex.from_product(
                [[stat.id], df.columns],
                names=["Station ID", "kind"])
        frames.append(df)

    if not frames:
        return None
    return pd.concat(frames, axis=1)