pthelma 1.1.0__cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,193 @@
1
+ # cython: language_level=3, linetrace=True
2
+ # distutils: define_macros=CYTHON_TRACE=1
3
+
4
+ import datetime as dt
5
+ cimport numpy as np
6
+ import numpy as np
7
+ import pandas as pd
8
+ from libc.math cimport isnan
9
+ from pandas.tseries.frequencies import to_offset
10
+
11
+ from htimeseries import HTimeseries
12
+
13
+ from .haggregate import RegularizationMode as RM
14
+
15
+
16
class RegularizeError(Exception):
    """Raised when a time series cannot be regularized (e.g. missing or
    unsupported time step)."""

    pass
18
+
19
+
20
def regularize(ts, new_date_flag="DATEINSERT", mode=RM.INTERVAL):
    """Return a new HTimeseries with records at regular intervals.

    ``ts`` is an HTimeseries whose ``time_step`` attribute must be set to a
    pandas-compatible offset specified in minutes, hours or days.  The result
    has one record per time step between the first and last source timestamps
    (each rounded to the step); each result record is taken from the source
    record at, or nearest to, the regular timestamp (see _get_record).
    Records created from a nearby (not exact) source timestamp get
    ``new_date_flag`` appended to their flags.  ``mode`` is a
    RegularizationMode enum member controlling how NaN source values and
    multiple nearby records are treated.

    Raises RegularizeError if the time step is missing or unsupported.
    """
    # Sanity checks
    if not hasattr(ts, "time_step"):
        raise RegularizeError("The source time series does not specify a time step")
    try:
        # Steps such as months can't be converted to a fixed Timedelta, so
        # this raises ValueError for them.
        pd.to_timedelta(to_offset(ts.time_step))
    except ValueError:
        raise RegularizeError(
            "The time step is malformed or is specified in months. Only time steps "
            "specified in minutes, hours or days are supported."
        )

    # Set metadata of result (copy attributes verbatim; missing ones become
    # None).
    result = HTimeseries()
    attrs = (
        "unit",
        "timezone",
        "time_step",
        "interval_type",
        "variable",
        "precision",
        "location",
    )
    for attr in attrs:
        setattr(result, attr, getattr(ts, attr, None))
    if hasattr(ts, "title"):
        result.title = "Regularized " + ts.title
    if hasattr(ts, "comment"):
        result.comment = (
            "Created by regularizing step of timeseries that had this comment:\n\n"
            + ts.comment
        )

    # Return immediately if empty
    if len(ts.data) == 0:
        return result

    # Determine first and last timestamps of the result by rounding the
    # source's first/last timestamps to the step.
    step = pd.Timedelta(ts.time_step)
    first_timestamp_of_result = ts.data.index[0].round(step)
    last_timestamp_of_result = ts.data.index[-1].round(step)

    # Transform all pandas information to plain numpy, which is way faster and is also
    # supported by numba and Cython
    # Widest flags string we can produce: longest existing flags, plus a
    # space, plus the new_date_flag we may append.
    max_flags_length = max(ts.data["flags"].str.len()) + 1 + len(new_date_flag)
    flags_dtype = "U" + str(max_flags_length)
    # Index values become C longs (nanoseconds since epoch for a
    # datetime64[ns] index).
    ts_index = ts.data.index.values.astype(long)
    ts_values = ts.data["value"].values
    ts_flags = ts.data["flags"].values.astype(flags_dtype)
    # np.timedelta64 of a datetime.timedelta appears to be in microseconds;
    # the * 1000 converts to nanoseconds to match ts_index — TODO confirm.
    result_step = np.timedelta64(step).astype(int) * 1000
    result_index = pd.date_range(
        first_timestamp_of_result, last_timestamp_of_result, freq=ts.time_step
    ).values
    result_values = np.full(len(result_index), np.nan, dtype=object)
    result_flags = np.full(len(result_index), "", dtype=flags_dtype)

    # Do the job
    _perform_regularization(
        result_index,
        result_values,
        result_flags,
        ts_index,
        ts_values,
        ts_flags,
        result_step,
        new_date_flag,
        mode.value,
    )

    # Rebuild a DataFrame; the date_range values are naive UTC, so localize
    # to UTC and convert back to the source index's time zone.
    result.data = pd.DataFrame(
        index=result_index,
        columns=["value", "flags"],
        data=np.vstack((result_values, result_flags)).transpose(),
    ).tz_localize(dt.timezone.utc).tz_convert(first_timestamp_of_result.tz)
    return result
95
+
96
+
97
def _perform_regularization(
    np.ndarray result_index,
    np.ndarray result_values,
    np.ndarray result_flags,
    np.ndarray ts_index,
    np.ndarray ts_values,
    np.ndarray ts_flags,
    long result_step,
    str new_date_flag,
    int mode,
):
    """Fill result_values/result_flags in place, one record per timestamp of
    result_index, by looking up the matching or nearest source record.

    All index/step arguments are in the same integer time unit (nanoseconds
    as prepared by regularize()).  ``previous_pos`` is threaded through
    _get_record so each lookup resumes the scan where the previous one
    stopped, keeping the overall pass linear in len(ts_index).
    """
    cdef int i, previous_pos
    cdef long t

    previous_pos = 0
    for i in range(result_index.size):
        t = result_index[i]
        result_values[i], result_flags[i], previous_pos = _get_record(
            ts_index,
            ts_values,
            ts_flags,
            t,
            result_step,
            new_date_flag,
            previous_pos,
            mode,
        )
124
+
125
+
126
def _get_record(
    np.ndarray ts_index,
    np.ndarray ts_values,
    np.ndarray ts_flags,
    long t,
    long result_step,
    str new_date_flag,
    int previous_pos,
    int mode,
):
    """Return (value, flags, next_pos) for regular timestamp ``t``.

    Scans ts_index starting at ``previous_pos`` (the arrays are assumed
    sorted ascending — the early ``ts_index[i] > t`` break relies on this).
    If a source record exists exactly at ``t`` it is returned as-is.
    Otherwise the record nearest to ``t`` within the half-open window
    [t - step/2, t + step/2) is returned with ``new_date_flag`` appended to
    its flags.  If no suitable record exists — or more than one exists in
    INTERVAL mode — (nan, "", pos) is returned.  In INSTANTANEOUS mode,
    NaN-valued source records are ignored.  ``next_pos`` is where the next
    call should resume scanning.
    """
    cdef int i, found, count
    cdef int nearest_i = -1
    cdef int INTERVAL = RM.INTERVAL.value
    cdef int INSTANTANEOUS = RM.INSTANTANEOUS.value

    # Return the source record if it already exists
    found = False
    for i in range(previous_pos, ts_index.size):
        if ts_index[i] == t and (mode == INTERVAL or not isnan(ts_values[i])):
            found = True
            break
        if ts_index[i] > t:
            break
    if found:
        return ts_values[i], ts_flags[i], i

    # Otherwise get the nearby record, if it exists
    # NOTE(review): under language_level=3 the "/" here may perform true
    # division, making start/end doubles; at nanosecond epoch magnitudes
    # (>2**53) that loses sub-microsecond precision. Probably harmless for
    # real-world timestamps, but "// 2" would keep it integral — confirm
    # Cython's division semantics for C longs before changing.
    start = t - result_step / 2
    end = t + result_step / 2
    count = 0
    for i in range(previous_pos, ts_index.size):
        ti = ts_index[i]
        if ti >= start and ti < end and (mode == INTERVAL or not isnan(ts_values[i])):
            count += 1
            nearest_i = _get_nearest(nearest_i, i, ts_index, ts_values, t, mode)
        if ts_index[i] >= end:
            # Step back so the returned position points at the last record
            # inside (or before) the window.
            i -= 1
            break
    if count < 1 or (count > 1 and mode == INTERVAL):
        # Nothing usable in the window (or ambiguous in INTERVAL mode).
        return np.nan, "", i
    value = ts_values[nearest_i]
    flags = ts_flags[nearest_i]
    if flags:
        flags += " "
    flags += new_date_flag
    return value, flags, i + 1
172
+
173
+
174
def _get_nearest(
    int previous_nearest_i,
    int current_i,
    np.ndarray ts_index,
    np.ndarray ts_values,
    long t,
    int mode,
):
    """Return whichever of previous_nearest_i/current_i indexes the record
    nearer to timestamp ``t``.

    ``previous_nearest_i`` is -1 when no candidate has been seen yet.  Ties
    keep the earlier candidate (strict "<" comparison below).
    """
    if mode == RM.INTERVAL.value:
        # In that case it doesn't really matter which is the nearest, so long as it's
        # only one (which is checked elsewhere), so we return immediately.
        return current_i
    if previous_nearest_i < 0:
        return current_i
    current_distance = abs(t - ts_index[current_i])
    previous_distance = abs(t - ts_index[previous_nearest_i])
    if current_distance < previous_distance:
        return current_i
    else:
        return previous_nearest_i
hspatial/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .hspatial import * # NOQA
2
+
3
+ __author__ = """Antonis Christofides"""
4
+ __email__ = "antonis@antonischristofides.com"
hspatial/cli.py ADDED
@@ -0,0 +1,310 @@
1
+ import configparser
2
+ import datetime as dt
3
+ import logging
4
+ import os
5
+ import sys
6
+ import traceback
7
+ from glob import glob
8
+ from io import StringIO
9
+
10
+ import click
11
+ import iso8601
12
+ from osgeo import gdal, ogr, osr
13
+ from simpletail import ropen
14
+
15
+ from hspatial import create_ogr_layer_from_timeseries, h_integrate, idw
16
+ from htimeseries import HTimeseries, TzinfoFromString
17
+ from pthelma._version import __version__
18
+
19
+ gdal.UseExceptions()
20
+
21
+
22
class WrongValueError(configparser.Error):
    """Raised when a configuration option has an invalid value."""

    pass
24
+
25
+
26
class App:
    """The ``spatialize`` command-line application.

    Reads a configuration file, then for each of the latest dates common to
    the configured station time series performs spatial integration
    (currently IDW only) over a raster mask, writing one GeoTIFF per date
    and pruning older output files.
    """

    def __init__(self, configfilename):
        self.configfilename = configfilename

    def run(self):
        """Entry point: parse configuration, set up logging, execute."""
        self.config = AppConfig(self.configfilename)
        self.config.read()
        self._setup_logger()
        self._execute_with_error_handling()

    def _execute_with_error_handling(self):
        # Log start/finish/failure timestamps; convert any failure into a
        # ClickException so click prints it and exits nonzero.
        self.logger.info("Starting spatialize, " + dt.datetime.today().isoformat())
        try:
            self._execute()
        except Exception as e:
            self.logger.error(str(e))
            self.logger.debug(traceback.format_exc())
            self.logger.info(
                "spatialize terminated with error, " + dt.datetime.today().isoformat()
            )
            raise click.ClickException(str(e))
        else:
            self.logger.info("Finished spatialize, " + dt.datetime.today().isoformat())

    def _setup_logger(self):
        self.logger = logging.getLogger("spatialize")
        self._set_logger_handler()
        self.logger.setLevel(self.config.loglevel.upper())

    def _set_logger_handler(self):
        # Log to the configured file if given, otherwise to stderr.
        if getattr(self.config, "logfile", None):
            self.logger.addHandler(logging.FileHandler(self.config.logfile))
        else:
            self.logger.addHandler(logging.StreamHandler())

    def _get_last_dates(self, filename, n):
        """
        Assuming specified file contains a time series, scan it from the bottom
        and return the list of the n last dates (may be less than n if the time
        series is too small). 'filename' is used in error messages.
        """
        # Get the time zone
        # NOTE(review): if the file is empty, "line" is never bound and the
        # startswith() below raises NameError instead of the intended
        # ClickException — confirm whether empty files can reach here.
        with open(filename) as fp:
            for line in fp:
                if line.startswith("Timezone") or (line and line[0] in "0123456789"):
                    break
            if not line.startswith("Timezone"):
                raise click.ClickException("{} does not contain Timezone".format(filename))
            zonestr = line.partition("=")[2].strip()
            timezone = TzinfoFromString(zonestr)

        result = []
        previous_line_was_empty = False
        # ropen iterates the file backwards, so i counts lines from the end.
        with ropen(filename) as fp:
            for i, line in enumerate(fp):
                if i >= n:
                    break
                line = line.strip()

                # Ignore empty lines
                if not line:
                    previous_line_was_empty = True
                    continue

                # Is the line in the form of an ini file configuration line?
                items = line.split("=")
                if len(items) and ("," not in items[0]) and previous_line_was_empty:
                    break  # Yes; we reached the start of the file

                previous_line_was_empty = False

                # Data lines are "date,value,flags"; insert at the front so
                # the result ends up in chronological order.
                datestring = line.split(",")[0]
                try:
                    result.insert(
                        0, iso8601.parse_date(datestring, default_timezone=timezone)
                    )
                except iso8601.ParseError as e:
                    raise iso8601.ParseError(
                        str(e)
                        + " (file {}, {} lines from the end)".format(filename, i + 1)
                    )
        return result

    @property
    def _dates_to_calculate(self):
        """
        Generator that yields the dates for which h_integrate should be run;
        this is the latest list of dates such that:
        * At least one of the time series has data
        * The length of the list is the 'number_of_output_files' configuration
          option (maybe less if the time series don't have enough data yet).
        """
        n = self.config.number_of_output_files
        dates = set()
        for filename in self.config.files:
            dates |= set(self._get_last_dates(filename, n))
        dates = list(dates)
        dates.sort()
        dates = dates[-n:]
        for d in dates:
            yield d

    @property
    def _time_step(self):
        """
        Return time step of all time series. If time step is not the same
        for all time series, raises exception.
        """
        time_step = None
        for filename in self.config.files:
            with open(filename, newline="\n") as f:
                # start_date far in the past so only headers are parsed cheaply
                # — presumably; verify against HTimeseries semantics.
                t = HTimeseries(f, start_date="0001-01-01 00:00")
            item_time_step = t.time_step
            if time_step and (item_time_step != time_step):
                raise click.ClickException("Not all time series have the same step")
            time_step = item_time_step
        return time_step

    @property
    def _date_fmt(self):
        """
        Determine date_fmt based on time series time step.
        """
        # The format decides how output filenames/dates are rendered: minute
        # or hour steps keep time-of-day, coarser steps drop it.
        if self._time_step.endswith("min") or self._time_step.endswith("H"):
            return "%Y-%m-%d %H:%M%z"
        elif self._time_step.endswith("D"):
            return "%Y-%m-%d"
        elif self._time_step.endswith("M"):
            return "%Y-%m"
        elif self._time_step.endswith("Y"):
            return "%Y"
        raise click.ClickException("Can't use time step " + str(self._time_step))

    def _delete_obsolete_files(self):
        """
        Delete all tif files produced in the past except the last N,
        where N is the 'number_of_output_files' configuration option.
        """
        pattern = os.path.join(
            self.config.output_dir, "{}-*.tif".format(self.config.filename_prefix)
        )
        files = glob(pattern)
        # Lexicographic sort; works because the date suffix produced by
        # _date_fmt sorts chronologically.
        files.sort()
        for filename in files[: -self.config.number_of_output_files]:
            os.remove(filename)

    def _execute(self):
        """Perform the spatial integration for each date to calculate."""
        # Create stations layer (in-memory OGR data source)
        stations = ogr.GetDriverByName("memory").CreateDataSource("stations")
        stations_layer = create_ogr_layer_from_timeseries(
            self.config.files, self.config.epsg, stations
        )

        # Get mask
        mask = gdal.Open(self.config.mask)

        # Setup integration method
        if self.config.method == "idw":
            funct = idw
            kwargs = {"alpha": self.config.alpha}
        else:
            # AppConfig._check_method already guarantees method == "idw".
            assert False

        for date in self._dates_to_calculate:
            self.logger.info("Processing date " + date.isoformat())
            h_integrate(
                mask,
                stations_layer,
                date,
                os.path.join(self.config.output_dir, self.config.filename_prefix),
                self._date_fmt,
                funct,
                kwargs,
            )
        self._delete_obsolete_files()
201
+
202
+
203
class AppConfig:
    """Reads and validates the spatialize configuration file.

    The file is standard INI syntax; a missing section header is tolerated
    by implicitly wrapping the contents in a [General] section.  After
    read(), each option is available as an attribute (parsed to its proper
    type where applicable).
    """

    # Option name -> kwargs passed to ConfigParser.get(); options without a
    # "fallback" are mandatory and make get() raise if absent.
    config_file_options = {
        "logfile": {"fallback": ""},
        "loglevel": {"fallback": "warning"},
        "mask": {},
        "epsg": {},
        "output_dir": {},
        "filename_prefix": {},
        "number_of_output_files": {},
        "method": {},
        "alpha": {"fallback": "1"},
        "files": {},
    }

    def __init__(self, configfilename):
        self.configfilename = configfilename

    def read(self):
        """Parse the configuration file; raise ClickException on any error."""
        try:
            self._parse_config()
        except (OSError, configparser.Error) as e:
            # NOTE(review): the message is also printed by click when the
            # ClickException propagates, so this stderr write may duplicate
            # it — confirm whether that's intentional.
            sys.stderr.write(str(e))
            raise click.ClickException(str(e))

    def _parse_config(self):
        self._read_config_file()
        self._get_config_options()
        self._parse_config_options()

    def _read_config_file(self):
        # interpolation=None so "%" in values (e.g. date formats) is literal.
        self.config = configparser.ConfigParser(interpolation=None)
        try:
            self._read_config_file_assuming_it_has_section_headers()
        except configparser.MissingSectionHeaderError:
            self._read_config_file_without_sections()

    def _read_config_file_assuming_it_has_section_headers(self):
        with open(self.configfilename) as f:
            self.config.read_file(f)

    def _read_config_file_without_sections(self):
        # Re-read, prefixing an implicit [General] header.
        with open(self.configfilename) as f:
            configuration = "[General]\n" + f.read()
        self.config.read_file(StringIO(configuration))

    def _get_config_options(self):
        # Fetch every known option (raises for missing mandatory ones) and
        # expose each as an attribute on self.
        self.options = {
            opt: self.config.get("General", opt, **kwargs)
            for opt, kwargs in self.config_file_options.items()
        }
        for key, value in self.options.items():
            setattr(self, key, value)

    def _parse_config_options(self):
        self._parse_log_level()
        self._parse_files()
        self._check_method()
        self._parse_epsg()
        self._parse_number_of_output_files()

    def _parse_log_level(self):
        log_levels = ("ERROR", "WARNING", "INFO", "DEBUG")
        self.loglevel = self.loglevel.upper()
        if self.loglevel not in log_levels:
            raise WrongValueError("loglevel must be one of " + ", ".join(log_levels))

    def _parse_files(self):
        # "files" is a newline-separated list of time series filenames.
        # NOTE(review): blank lines would yield empty entries — confirm the
        # option's format never contains them.
        self.files = self.files.split("\n")

    def _check_method(self):
        # Check method
        if self.method != "idw":
            raise WrongValueError('Option "method" can currently only be idw')
        # Check alpha
        try:
            self.alpha = float(self.alpha)
        except ValueError:
            raise WrongValueError('Option "alpha" must be a number')

    def _parse_epsg(self):
        try:
            self.epsg = int(self.epsg)
        except ValueError:
            raise WrongValueError('Option "epsg" must be an integer')
        # ImportFromEPSG returns a nonzero error code on failure.
        srs = osr.SpatialReference()
        result = srs.ImportFromEPSG(self.epsg)
        if result:
            raise WrongValueError(
                "An error occurred when trying to use epsg={}".format(self.epsg)
            )

    def _parse_number_of_output_files(self):
        try:
            self.number_of_output_files = int(self.number_of_output_files)
        except ValueError:
            raise WrongValueError('Option "number_of_output_files" must be an integer')
299
+
300
+
301
# Command-line entry point. The docstring below doubles as the click help
# text, so it is left exactly as-is.
@click.command()
@click.argument("configfile")
@click.version_option(
    version=__version__, message="%(prog)s from pthelma v.%(version)s"
)
def main(configfile):
    """Spatial integration"""

    app = App(configfile)
    app.run()