meteostat 1.7.6__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff compares the contents of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- meteostat/__init__.py +32 -19
- meteostat/api/daily.py +76 -0
- meteostat/api/hourly.py +80 -0
- meteostat/api/interpolate.py +240 -0
- meteostat/api/inventory.py +59 -0
- meteostat/api/merge.py +103 -0
- meteostat/api/monthly.py +73 -0
- meteostat/api/normals.py +144 -0
- meteostat/api/point.py +30 -0
- meteostat/api/stations.py +234 -0
- meteostat/api/timeseries.py +334 -0
- meteostat/core/cache.py +212 -59
- meteostat/core/config.py +158 -0
- meteostat/core/data.py +199 -0
- meteostat/core/logger.py +9 -0
- meteostat/core/network.py +82 -0
- meteostat/core/parameters.py +112 -0
- meteostat/core/providers.py +184 -0
- meteostat/core/schema.py +170 -0
- meteostat/core/validator.py +38 -0
- meteostat/enumerations.py +149 -0
- meteostat/interpolation/idw.py +120 -0
- meteostat/interpolation/lapserate.py +91 -0
- meteostat/interpolation/nearest.py +31 -0
- meteostat/parameters.py +354 -0
- meteostat/providers/dwd/climat.py +166 -0
- meteostat/providers/dwd/daily.py +144 -0
- meteostat/providers/dwd/hourly.py +218 -0
- meteostat/providers/dwd/monthly.py +138 -0
- meteostat/providers/dwd/mosmix.py +351 -0
- meteostat/providers/dwd/poi.py +117 -0
- meteostat/providers/dwd/shared.py +155 -0
- meteostat/providers/eccc/daily.py +87 -0
- meteostat/providers/eccc/hourly.py +104 -0
- meteostat/providers/eccc/monthly.py +66 -0
- meteostat/providers/eccc/shared.py +45 -0
- meteostat/providers/index.py +496 -0
- meteostat/providers/meteostat/daily.py +65 -0
- meteostat/providers/meteostat/daily_derived.py +110 -0
- meteostat/providers/meteostat/hourly.py +66 -0
- meteostat/providers/meteostat/monthly.py +45 -0
- meteostat/providers/meteostat/monthly_derived.py +106 -0
- meteostat/providers/meteostat/shared.py +93 -0
- meteostat/providers/metno/forecast.py +186 -0
- meteostat/providers/noaa/ghcnd.py +228 -0
- meteostat/providers/noaa/isd_lite.py +142 -0
- meteostat/providers/noaa/metar.py +163 -0
- meteostat/typing.py +113 -0
- meteostat/utils/conversions.py +231 -0
- meteostat/utils/data.py +194 -0
- meteostat/utils/geo.py +28 -0
- meteostat/utils/parsers.py +168 -0
- meteostat/utils/types.py +113 -0
- meteostat/utils/validators.py +31 -0
- meteostat-2.0.0.dist-info/METADATA +134 -0
- meteostat-2.0.0.dist-info/RECORD +63 -0
- {meteostat-1.7.6.dist-info → meteostat-2.0.0.dist-info}/WHEEL +1 -2
- meteostat/core/loader.py +0 -103
- meteostat/core/warn.py +0 -34
- meteostat/enumerations/granularity.py +0 -22
- meteostat/interface/base.py +0 -39
- meteostat/interface/daily.py +0 -118
- meteostat/interface/hourly.py +0 -154
- meteostat/interface/meteodata.py +0 -210
- meteostat/interface/monthly.py +0 -109
- meteostat/interface/normals.py +0 -245
- meteostat/interface/point.py +0 -143
- meteostat/interface/stations.py +0 -252
- meteostat/interface/timeseries.py +0 -237
- meteostat/series/aggregate.py +0 -48
- meteostat/series/convert.py +0 -28
- meteostat/series/count.py +0 -17
- meteostat/series/coverage.py +0 -20
- meteostat/series/fetch.py +0 -28
- meteostat/series/interpolate.py +0 -47
- meteostat/series/normalize.py +0 -76
- meteostat/series/stations.py +0 -22
- meteostat/units.py +0 -149
- meteostat/utilities/__init__.py +0 -0
- meteostat/utilities/aggregations.py +0 -37
- meteostat/utilities/endpoint.py +0 -33
- meteostat/utilities/helpers.py +0 -70
- meteostat/utilities/mutations.py +0 -89
- meteostat/utilities/validations.py +0 -30
- meteostat-1.7.6.dist-info/METADATA +0 -112
- meteostat-1.7.6.dist-info/RECORD +0 -39
- meteostat-1.7.6.dist-info/top_level.txt +0 -1
- /meteostat/{core → api}/__init__.py +0 -0
- /meteostat/{enumerations → interpolation}/__init__.py +0 -0
- /meteostat/{interface → providers}/__init__.py +0 -0
- /meteostat/{interface/interpolate.py → py.typed} +0 -0
- /meteostat/{series → utils}/__init__.py +0 -0
- {meteostat-1.7.6.dist-info → meteostat-2.0.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,334 @@
+"""
+TimeSeries Class
+
+A class to handle meteorological time series data.
+"""
+
+from copy import copy
+from datetime import datetime
+from itertools import chain
+from math import floor
+from statistics import mean
+from typing import List, Optional
+
+import pandas as pd
+
+from meteostat.core.parameters import parameter_service
+from meteostat.core.validator import Validator
+from meteostat.core.providers import provider_service
+from meteostat.core.schema import schema_service
+from meteostat.enumerations import Parameter, Granularity, Provider, UnitSystem
+from meteostat.typing import License
+from meteostat.utils.data import fill_df, localize, squash_df
+
+
+class TimeSeries:
+    """
+    TimeSeries class which provides features which are
+    used across all granularities
+    """
+
+    granularity: Granularity
+    stations: pd.DataFrame
+    start: Optional[datetime] = None
+    end: Optional[datetime] = None
+    timezone: Optional[str] = None
+
+    _df: Optional[pd.DataFrame] = None
+    _multi_station: bool = False
+
+    def __init__(
+        self,
+        granularity: Granularity,
+        stations: pd.DataFrame,
+        df: Optional[pd.DataFrame],
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
+        timezone: Optional[str] = None,
+        multi_station: bool = False,
+    ) -> None:
+        self.granularity = granularity
+        self.stations = stations
+        self.timezone = timezone
+        self._multi_station = multi_station
+        if df is not None and not df.empty:
+            self._df = df
+            self.start = start if start else df.index.get_level_values("time").min()
+            self.end = end if end else df.index.get_level_values("time").max()
+
+    def __len__(self) -> int:
+        """
+        Return number of rows in DataFrame
+        """
+        return len(self._df) if self._df is not None else 0
+
+    def __str__(self) -> str:
+        """
+        Return a stringified version of the DataFrame
+        """
+        return self._df.__str__() if self._df is not None else "Empty time series"
+
+    @property
+    def _target_length(self) -> int:
+        """
+        Expected number of non-NaN values
+        """
+        if not self.start or not self.end:
+            return 0
+
+        diff = self.end - self.start
+
+        return (
+            diff.days + 1
+            if self.granularity is Granularity.DAILY
+            else floor(diff.total_seconds() / 3600) + 1
+        ) * len(self.stations)
+
+    @property
+    def parameters(self) -> List[Parameter]:
+        """
+        Get parameters
+        """
+        return self._df.columns.to_list() if self._df is not None else []
+
+    @property
+    def freq(self) -> Optional[str]:
+        """
+        The time series frequency.
+
+        Returns a Pandas offset alias string (e.g. "1h", "1D", "1MS") or
+        `None` for granularities where a regular frequency does not apply
+        (e.g. normals).
+        """
+        if self.granularity is Granularity.HOURLY:
+            return "1h"
+
+        if self.granularity is Granularity.DAILY:
+            return "1D"
+
+        if self.granularity is Granularity.MONTHLY:
+            return "1MS"
+
+        # Normals (climatological normals) do not have a regular time
+        # frequency in the same sense as timeseries data.
+        if self.granularity is Granularity.NORMALS:
+            return None
+
+        return None
+
+    @property
+    def empty(self) -> bool:
+        """
+        Is the time series empty?
+        """
+        return True if self._df is None else self._df.empty
+
+    @property
+    def providers(self) -> List[Provider]:
+        """
+        Get included providers
+        """
+        if self._df is None:
+            return []
+        providers: List[str] = (
+            self._df.index.get_level_values("source").unique().to_list()
+        )
+        return list(
+            set(chain.from_iterable([provider.split(" ") for provider in providers]))
+        )
+
+    @property
+    def licenses(self) -> List[License]:
+        """
+        Get licenses
+        """
+        providers = [
+            provider
+            for provider_id in self.providers
+            if (provider := provider_service.get_provider(provider_id)) is not None
+        ]
+
+        return [
+            provider.license for provider in providers if provider.license is not None
+        ]
+
+    @property
+    def attribution(self) -> str:
+        """
+        Attribution string
+        """
+        attributions = [
+            "Meteostat",
+            *set(
+                [
+                    license.attribution
+                    for license in self.licenses
+                    if license.attribution
+                ]
+            ),
+        ]
+
+        return ", ".join(attributions)
+
+    @property
+    def commercial(self) -> bool:
+        """
+        Is commercial use allowed?
+        """
+        return all(license.commercial for license in self.licenses)
+
+    def fetch(
+        self,
+        squash=True,
+        fill=False,
+        sources=False,
+        location=False,
+        clean=True,
+        humanize=False,
+        units: UnitSystem = UnitSystem.METRIC,
+    ) -> Optional[pd.DataFrame]:
+        """
+        Fetch the time series data as a DataFrame.
+
+        Parameters
+        ----------
+        squash : bool, optional
+            Whether to squash the DataFrame by source. Defaults to True.
+        fill : bool, optional
+            Whether to fill missing rows in the DataFrame. Defaults to False.
+        sources : bool, optional
+            Whether to include source information in the DataFrame. Defaults to False.
+        location : bool, optional
+            Whether to include location information (latitude, longitude, elevation)
+            in the DataFrame. Defaults to False.
+        clean : bool, optional
+            Whether to clean the DataFrame according to the schema. Defaults to True.
+        humanize : bool, optional
+            Whether to convert wind direction and condition codes to human-readable values. Defaults to False.
+        units : UnitSystem, optional
+            The unit system to use for the DataFrame. Defaults to metric units.
+
+        Returns
+        -------
+        pd.DataFrame or None
+            The time series data as a DataFrame, or None if no data is available.
+        """
+        df = copy(self._df)
+
+        if df is None:
+            return None
+
+        if squash:
+            df = squash_df(df, sources=sources)
+
+        if clean:
+            df = schema_service.clean(df, self.granularity)
+
+        if (
+            fill
+            and self.start is not None
+            and self.end is not None
+            and self.freq is not None
+        ):
+            df = fill_df(df, self.start, self.end, self.freq)
+
+        if self.timezone:
+            df = localize(df, self.timezone)
+
+        if location:
+            df = df.join(
+                self.stations[["latitude", "longitude", "elevation"]], on="station"
+            )
+
+        if humanize:
+            df = schema_service.humanize(df)
+
+        if units != UnitSystem.METRIC:
+            df = schema_service.convert(df, self.granularity, units)
+
+        # Remove station index level if not a multi-station query
+        if not self._multi_station and "station" in df.index.names:
+            df = df.droplevel("station")
+
+        return df.sort_index()
+
+    def count(self, parameter: Optional[Parameter | str] = None) -> int:
+        """
+        Get number of non-NaN values for a specific parameter.
+        If no parameter is specified, it returns the count for the entire DataFrame.
+
+        Parameters
+        ----------
+        parameter : Parameter or str, optional
+            The parameter to count non-NaN values for. If None, counts for the entire DataFrame.
+
+        Returns
+        -------
+        int
+            The count of non-NaN values for the specified parameter or the entire DataFrame.
+        """
+        if self._df is None:
+            return 0
+
+        if parameter is None:
+            return self._df.count().max()
+
+        return self._df[
+            parameter if isinstance(parameter, Parameter) else parameter
+        ].count()
+
+    def completeness(self, parameter: Optional[Parameter | str] = None) -> float:
+        """
+        Get completeness for a specific parameter or the entire DataFrame.
+
+        Parameters
+        ----------
+        parameter : Parameter or str, optional
+            The parameter to calculate completeness for.
+            If None, calculates for the entire DataFrame.
+
+        Returns
+        -------
+        float
+            The completeness ratio for the specified parameter or the entire DataFrame.
+            Returns 0 if no data is available, 1 if complete, or a value between 0 and 1 otherwise.
+        """
+        df = self.fetch()
+
+        if df is None:
+            return 0
+
+        if parameter:
+            return round(
+                self.count(parameter) / self._target_length,
+                2,
+            )
+
+        return round(mean([self.completeness(p) for p in df.columns]), 2)
+
+    def validate(self) -> bool:
+        """
+        Does the time series pass all validations?
+        """
+        df = self.fetch(fill=True, clean=False)
+
+        if df is None:
+            return True
+
+        for col in df:
+            parameter = parameter_service.get_parameter(col, self.granularity)
+            if parameter is None or not hasattr(parameter, "validators"):
+                continue
+            for validator in parameter.validators:
+                if isinstance(validator, Validator):
+                    test = validator.test(df[col], df, col)
+                else:
+                    test = validator(df[col], df, col)
+
+                if isinstance(test, bool):
+                    if not test:
+                        return False
+                elif not test.all():
+                    return False
+
+        return True
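For orientation, a minimal usage sketch of the new class (illustrative only, not part of the diff; the station ID, column name and values are synthetic, and only the index level names read by the code above are assumed):

from datetime import datetime

import pandas as pd

from meteostat.api.timeseries import TimeSeries
from meteostat.enumerations import Granularity

# Synthetic input: one station and 24 hourly temperature values, indexed
# by the ("station", "time", "source") levels the class expects
stations = pd.DataFrame(
    {"latitude": [50.05], "longitude": [8.6], "elevation": [111.0]},
    index=pd.Index(["10637"], name="station"),
)
index = pd.MultiIndex.from_tuples(
    [("10637", datetime(2020, 1, 1, h), "synthetic") for h in range(24)],
    names=["station", "time", "source"],
)
df = pd.DataFrame({"temp": [10.0] * 24}, index=index)

ts = TimeSeries(Granularity.HOURLY, stations, df)
print(len(ts))           # 24
print(ts.parameters)     # ['temp']
print(ts.count("temp"))  # 24

Note that fetch() additionally runs the frame through squashing, schema cleaning, optional gap filling, localization and unit conversion, so its output depends on those service implementations rather than on the raw frame alone.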
meteostat/core/cache.py
CHANGED
@@ -1,71 +1,224 @@
 """
-
+Cache Service
 
-
-under the terms of the Creative Commons Attribution-NonCommercial
-4.0 International Public License.
-
-The code is licensed under the MIT license.
+The Cache Service provides utilities for caching data on the local file system.
 """
 
+from functools import wraps
+from hashlib import md5
+import json
 import os
-import time
-import hashlib
-
-
-def get_local_file_path(cache_dir: str, cache_subdir: str, path: str) -> str:
-    """
-    Get the local file path
-    """
-
-    # Get file ID
-    file = hashlib.md5(path.encode("utf-8")).hexdigest()
-
-    return f"{cache_dir}/{cache_subdir}/{file}"
-
+from os.path import exists
+from time import time
+from typing import Any, Callable, Optional
 
-
-    """
-    Check if a file exists in the local cache
-    """
-
-    # Get directory
-    directory = os.path.dirname(path)
-
-    # Make sure the cache directory exists
-    if not os.path.exists(directory):
-        try:
-            os.makedirs(directory)
-        except FileExistsError:
-            pass
+import pandas as pd
 
-
-
-    return True
+from meteostat.core.config import config
+from meteostat.core.logger import logger
 
-    return False
 
-
-@classmethod
-def clear_cache(cls, max_age: int = None) -> None:
+class CacheService:
     """
-
+    Cache Service
     """
 
-    if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    _purged = False  # Flag to indicate if cache has been purged automatically
+
+    @staticmethod
+    def _write_pickle(path: str, df: Optional[pd.DataFrame]) -> None:
+        """
+        Persist a DataFrame in Pickle format
+        """
+        if df is None:
+            pd.DataFrame().to_pickle(path)
+        else:
+            df.to_pickle(path)
+
+    @staticmethod
+    def _read_pickle(path) -> Optional[pd.DataFrame]:
+        """
+        Read a pickle file into a DataFrame
+        """
+        df: pd.DataFrame = pd.read_pickle(path)
+        return None if df.empty else df
+
+    @staticmethod
+    def _write_json(path: str, data: dict | list) -> None:
+        """
+        Persist data in JSON format
+        """
+        with open(path, "w", encoding="utf-8") as file:
+            json.dump(data, file)
+
+    @staticmethod
+    def _read_json(path) -> dict | list:
+        """
+        Read JSON data into memory
+        """
+        with open(path, "r", encoding="utf-8") as file:
+            raw = file.read()
+            return json.loads(raw)
+
+    @staticmethod
+    def _func_to_uid(func, args: tuple, kwargs: dict[str, Any]) -> str:
+        """
+        Get a unique ID from a function call based on its module, name and arguments
+        """
+        return md5(
+            ";".join(
+                (
+                    func.__module__,
+                    func.__name__,
+                    *map(str, args),
+                    *[f"{key}:{str(value)}" for key, value in kwargs.items()],
+                )
+            ).encode("utf-8")
+        ).hexdigest()
+
+    @staticmethod
+    def create_cache_dir() -> None:
+        """
+        Create the cache directory if it doesn't exist
+        """
+        cache_dir = config.cache_directory
+
+        if not os.path.exists(cache_dir):
+            os.makedirs(cache_dir)
+
+    @staticmethod
+    def get_cache_path(uid: str, filetype: str):
+        """
+        Get path of a cached file based on its uid and file type
+        """
+        return config.cache_directory + os.sep + f"{uid}.{filetype}"
+
+    @staticmethod
+    def is_stale(path: str, ttl: int) -> bool:
+        """
+        Check if a cached file is stale based on its age and TTL
+        """
+        return time() - os.path.getmtime(path) > max([ttl, config.cache_ttl])
+
+    @staticmethod
+    def purge(ttl: Optional[int] = None) -> None:
+        """
+        Remove stale files from disk cache
+        """
+        if ttl is None:
+            ttl = config.cache_ttl
+
+        logger.debug("Removing cached files older than %s seconds", ttl)
+
+        cache_dir = config.cache_directory
+
+        if os.path.exists(cache_dir):
+            # Get current time
+            now = time()
+            # Go through all files
+            for file in os.listdir(cache_dir):
+                # Get full path
+                path = os.path.join(cache_dir, file)
+                # Check if file is older than TTL
+                if now - os.path.getmtime(path) > ttl and os.path.isfile(path):
+                    # Delete file
+                    os.remove(path)
+
+    def persist(
+        self, path: str, data: pd.DataFrame | dict | list, data_type: str
+    ) -> None:
+        """
+        Persist any given data under a specific path
+        """
+        # Create cache directory if it doesn't exist
+        self.create_cache_dir()
+        # Save data locally
+        if data_type == "json" and isinstance(data, (dict, list)):
+            self._write_json(path, data)
+        elif data_type == "pickle" and (isinstance(data, pd.DataFrame) or data is None):
+            self._write_pickle(path, data)
+
+    def fetch(self, path, data_type: str) -> pd.DataFrame | dict | list | None:
+        """
+        Fetch data from a given path
+        """
+        if data_type == "json":
+            return self._read_json(path)
+        return self._read_pickle(path)
+
+    def from_func(
+        self, func, args, kwargs, ttl: int, data_format: str
+    ) -> pd.DataFrame | dict | list:
+        """
+        Cache a function's return value
+        """
+        uid = self._func_to_uid(func, args, kwargs)  # Get UID for function call
+        path = self.get_cache_path(uid, data_format)  # Get the local cache path
+        result = (
+            self.fetch(path, data_format)
+            if ttl > 0 and exists(path) and not self.is_stale(path, ttl)
+            else False
+        )
+
+        cache_status = "is" if isinstance(result, pd.DataFrame) or result else "is not"
+        logger.debug(
+            "%s from module %s with args=%s and kwargs=%s returns %s and %s served from cache",
+            func.__name__,
+            func.__module__,
+            args,
+            kwargs,
+            data_format,
+            cache_status,
+        )
+
+        if isinstance(result, pd.DataFrame) or result:
+            return result
+
+        result = func(*args, **kwargs)
+        if ttl > 0:
+            self.persist(path, result, data_format)
+
+        return result
+
+    def cache(
+        self,
+        ttl: int | Callable[..., int] = 60 * 60 * 24,
+        data_format: str = "json",
+    ):
+        """
+        A simple decorator which caches a function's return value
+        based on its payload.
+
+        All data is persisted in either JSON or Pickle format.
+        """
+
+        def decorator(func):
+            @wraps(func)
+            def wrapper(*args, **kwargs):
+                if not config.cache_enable:
+                    logger.debug(
+                        "Omitting cache for %s from module %s with args=%s and kwargs=%s",
+                        func.__name__,
+                        func.__module__,
+                        args,
+                        kwargs,
+                    )
+                    return func(*args, **kwargs)
+                if config.cache_autoclean and not self._purged:
+                    self.purge()
+                    self._purged = True
+                return self.from_func(
+                    func,
+                    args,
+                    kwargs,
+                    ttl if isinstance(ttl, int) else ttl(*args, **kwargs),
+                    data_format,
+                )
+
+            return wrapper
+
+        return decorator
+
+
+cache_service = CacheService()
+purge = cache_service.purge
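And a short sketch of the rewritten cache layer in use (the decorated function and its payload are hypothetical, not part of the package):

from meteostat.core.cache import cache_service

@cache_service.cache(ttl=3600, data_format="json")
def stations_nearby(lat: float, lon: float) -> list:
    # Hypothetical stand-in for an expensive lookup (e.g. a network request)
    return [{"id": "10637", "distance": 4.2}]

stations_nearby(50.05, 8.6)  # executes and persists the result as <md5-uid>.json
stations_nearby(50.05, 8.6)  # served from the cache while the file is fresh

Two behaviors follow from the code above: caching is bypassed entirely when config.cache_enable is off, and a falsy cached value (such as an empty list) fails the `isinstance(result, pd.DataFrame) or result` check in from_func, so the wrapped function runs again. The ttl argument may also be a callable that derives the TTL from the call's arguments.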