tonik 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tonik-0.1.2 → tonik-0.1.4}/PKG-INFO +1 -1
- {tonik-0.1.2 → tonik-0.1.4}/pyproject.toml +1 -1
- {tonik-0.1.2 → tonik-0.1.4}/src/tonik/storage.py +6 -50
- {tonik-0.1.2 → tonik-0.1.4}/src/tonik/xarray2netcdf.py +9 -7
- {tonik-0.1.2 → tonik-0.1.4}/src/tonik/xarray2zarr.py +15 -8
- {tonik-0.1.2 → tonik-0.1.4}/tests/backend_speed_test.py +4 -1
- {tonik-0.1.2 → tonik-0.1.4}/tests/test_group.py +36 -23
- {tonik-0.1.2 → tonik-0.1.4}/tests/test_save.py +2 -0
- {tonik-0.1.2 → tonik-0.1.4}/.devcontainer/devcontainer.json +0 -0
- {tonik-0.1.2 → tonik-0.1.4}/.gitignore +0 -0
- {tonik-0.1.2 → tonik-0.1.4}/HOW_TO_RELEASE.md +0 -0
- {tonik-0.1.2 → tonik-0.1.4}/LICENSE +0 -0
- {tonik-0.1.2 → tonik-0.1.4}/README.md +0 -0
- {tonik-0.1.2 → tonik-0.1.4}/docs/index.md +0 -0
- {tonik-0.1.2 → tonik-0.1.4}/docs/tonik_example.ipynb +0 -0
- {tonik-0.1.2 → tonik-0.1.4}/mkdocs.yml +0 -0
- {tonik-0.1.2 → tonik-0.1.4}/src/tonik/__init__.py +0 -0
- {tonik-0.1.2 → tonik-0.1.4}/src/tonik/api.py +0 -0
- {tonik-0.1.2 → tonik-0.1.4}/src/tonik/package_data/index.html +0 -0
- {tonik-0.1.2 → tonik-0.1.4}/src/tonik/utils.py +0 -0
- {tonik-0.1.2 → tonik-0.1.4}/tests/conftest.py +0 -0
- {tonik-0.1.2 → tonik-0.1.4}/tests/test_api.py +0 -0
{tonik-0.1.2 → tonik-0.1.4}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: tonik
-Version: 0.1.2
+Version: 0.1.4
 Summary: Store time series data as HDF5 files and access them through an API.
 Project-URL: Homepage, https://tsc-tools.github.io/tonik
 Project-URL: Issues, https://github.com/tsc-tools/tonik/issues
```
{tonik-0.1.2 → tonik-0.1.4}/src/tonik/storage.py

```diff
@@ -1,9 +1,7 @@
 import logging
 import logging.config
 import os
-import re
 
-import pandas as pd
 import xarray as xr
 
 from .xarray2netcdf import xarray2netcdf
```
```diff
@@ -113,70 +111,28 @@ class Path(object):
         self.children[feature] = Path(feature + file_ending, self.path)
         return _feature_path
 
-    def __call__(self, feature,
+    def __call__(self, feature, group='original'):
         """
         Request a particular feature
 
         :param feature: Feature name
         :type feature: str
-        :param stack_length: length of moving average in time
-        :type stack_length: str
 
         """
-        if self.endtime
+        if self.endtime < self.starttime:
             raise ValueError('Startime has to be smaller than endtime.')
 
         filename = self.feature_path(feature)
 
         logger.debug(
             f"Reading feature {feature} between {self.starttime} and {self.endtime}")
-        num_periods = None
-        if stack_length is not None:
-            valid_stack_units = ['W', 'D', 'h', 'T', 'min', 'S']
-            if re.match(r'\d*\s*(\w*)', stack_length).group(1) not in valid_stack_units:
-                raise ValueError(
-                    'Stack length should be one of: {}'.
-                    format(', '.join(valid_stack_units))
-                )
-
-            if pd.to_timedelta(stack_length) < pd.to_timedelta(interval):
-                raise ValueError('Stack length {} is less than interval {}'.
-                                 format(stack_length, interval))
-
-            # Rewind starttime to account for stack length
-            self.starttime -= pd.to_timedelta(stack_length)
-
-            num_periods = (pd.to_timedelta(stack_length) /
-                           pd.to_timedelta(interval))
-            if not num_periods.is_integer():
-                raise ValueError(
-                    'Stack length {} / interval {} = {}, but it needs'
-                    ' to be a whole number'.
-                    format(stack_length, interval, num_periods))
 
         xd_index = dict(datetime=slice(self.starttime, self.endtime))
-        with xr.open_dataset(filename, group=
-            rq = ds.loc[xd_index].load()
-
-        # Stack features
-        if stack_length is not None:
-            logger.debug("Stacking feature...")
-            try:
-                xdf = rq[feature].rolling(datetime=int(num_periods),
-                                          center=False,
-                                          min_periods=1).mean()
-                # Return requested timeframe to that defined in initialisation
-                self.starttime += pd.to_timedelta(stack_length)
-                xdf_new = xdf.loc[self.starttime:self.endtime]
-                xdf_new = xdf_new.rename(feature)
-            except ValueError as e:
-                logger.error(e)
-                logger.error('Stack length {} is not valid for feature {}'.
-                             format(stack_length, feature))
-            else:
-                return xdf_new
+        with xr.open_dataset(filename, group=group, engine=self.engine) as ds:
+            rq = ds[feature].loc[xd_index].load()
+            rq.attrs = ds.attrs
 
-        return rq
+        return rq
 
     def load(self, *args, **kwargs):
         """
```
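The upshot of this hunk: the rolling-average ("stacking") path is removed along with the `re` and `pandas` imports above, and a feature request now simply opens the requested group and passes it straight to `xr.open_dataset`. A minimal sketch of the new read path, reusing the `Storage`, `'volcanoes'`, and `'rsam'` names from tests/test_group.py further down; the import path and rootdir are assumptions for illustration:

```python
from datetime import datetime

from tonik import Storage  # assumed import path, mirroring the tests

store = Storage('volcanoes', rootdir='/tmp/data',
                starttime=datetime(2016, 1, 1),
                endtime=datetime(2016, 1, 2, 12))
rsam = store('rsam')                        # reads the default 'original' group
rsam_mod = store('rsam', group='modified')  # reads an alternative group
```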
{tonik-0.1.2 → tonik-0.1.4}/src/tonik/xarray2netcdf.py

```diff
@@ -11,7 +11,7 @@ from cftime import date2num, num2date
 from .utils import merge_arrays
 
 
-def xarray2netcdf(xArray, fdir, rootGroupName="original", timedim="datetime",
+def xarray2netcdf(xArray, fdir, group="original", timedim="datetime",
                   archive_starttime=datetime(2000, 1, 1), resolution=None,
                   mode='a'):
     """
```
```diff
@@ -23,7 +23,7 @@ def xarray2netcdf(xArray, fdir, rootGroupName="original", timedim="datetime",
         Data to store.
     fdir : str
         Directory to store data under.
-    rootGroupName : str
+    group : str
         Hdf5 group name.
     timedim : str
         Name of time dimension.
```
```diff
@@ -49,22 +49,22 @@ def xarray2netcdf(xArray, fdir, rootGroupName="original", timedim="datetime",
         if os.path.isfile(h5file) and mode == 'a':
             if archive_starttime > data_starttime:
                 xds_existing = xr.open_dataset(
-                    h5file, group=rootGroupName, engine='h5netcdf')
+                    h5file, group=group, engine='h5netcdf')
                 xda_new = merge_arrays(
                     xds_existing[featureName], xArray[featureName],
                     resolution=resolution)
                 xds_existing.close()
-                xda_new.to_netcdf(h5file, group=rootGroupName,
+                xda_new.to_netcdf(h5file, group=group,
                                   mode='w', engine='h5netcdf')
                 continue
         _mode = 'a'
 
         with h5netcdf.File(h5file, _mode) as h5f:
             try:
-                rootGrp = _create_h5_Structure(rootGroupName, featureName,
+                rootGrp = _create_h5_Structure(group, featureName,
                                                h5f, xArray, starttime, timedim)
             except ValueError:  # group already exists, append
-                rootGrp = h5f[rootGroupName]
+                rootGrp = h5f[group]
 
             # determine indices
             new_time = date2num(xArray[timedim].values.astype('datetime64[us]').astype(datetime),
```
```diff
@@ -88,6 +88,8 @@ def xarray2netcdf(xArray, fdir, rootGroupName="original", timedim="datetime",
             data[indices] = xArray[featureName].values
             rootGrp.attrs['endtime'] = str(num2date(times[-1], units=rootGrp[timedim].attrs['units'],
                                                     calendar=rootGrp[timedim].attrs['calendar']))
+            rootGrp.attrs['resolution'] = resolution
+            rootGrp.attrs['resolution_units'] = 'h'
             try:
                 _setMetaInfo(featureName, h5f, xArray)
             except KeyError as e:
```
```diff
@@ -102,7 +104,7 @@ def _create_h5_Structure(defaultGroupName, featureName, h5f, xArray, starttime,
     coordinates.attrs['units'] = 'hours since 1970-01-01 00:00:00.0'
     coordinates.attrs['calendar'] = 'gregorian'
     rootGrp.attrs['starttime'] = str(starttime)
-    for label, size in xArray.dims.items():
+    for label, size in xArray.sizes.items():
         if not np.issubdtype(xArray[label].dtype, np.datetime64):
             rootGrp.dimensions[label] = size
             coordinates = rootGrp.create_variable(label, (label,), float)
```
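Taken together, the xarray2netcdf changes rename `rootGroupName` to `group`, record the write resolution (in hours) on the group's attributes, and iterate dimension sizes via `xArray.sizes`. A hedged sketch of a call against the new signature; the dataset construction and paths are illustrative, with the 0.1-hour (6-minute) resolution taken from tests/test_save.py below:

```python
import numpy as np
import pandas as pd
import xarray as xr

from tonik.xarray2netcdf import xarray2netcdf

# Illustrative 6-minute dataset; variable and dimension names mirror the tests.
times = pd.date_range('2022-07-18', periods=24, freq='6min')
xds = xr.Dataset({'rsam': ('datetime', np.random.rand(times.size))},
                 coords={'datetime': times})
# 'group' replaces the old 'rootGroupName'; the group now also carries
# attrs['resolution'] = 0.1 and attrs['resolution_units'] = 'h'.
xarray2netcdf(xds, '/tmp/archive', group='original', resolution=0.1)
```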
{tonik-0.1.2 → tonik-0.1.4}/src/tonik/xarray2zarr.py

```diff
@@ -2,13 +2,14 @@ import logging
 import os
 
 import xarray as xr
+from zarr.errors import PathNotFoundError
 
 from .utils import merge_arrays
 
 logger = logging.getLogger(__name__)
 
 
-def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a'):
+def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a', group='original'):
     """
     Write xarray dataset to zarr files.
 
```
```diff
@@ -20,6 +21,8 @@ def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a'):
         Path to write the dataset.
     mode : str, optional
         Write mode, by default 'a'.
+    group : str, optional
+        Group name, by default 'original'
 
     Returns
     -------
```
```diff
@@ -29,28 +32,32 @@ def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a'):
         fout = os.path.join(path, feature + '.zarr')
         if not os.path.exists(fout) or mode == 'w':
             xds[feature].to_zarr(
-                fout, group='original', mode='w')
+                fout, group=group, mode='w')
         else:
-            xds_existing = xr.open_zarr(fout, group='original')
+            try:
+                xds_existing = xr.open_zarr(fout, group=group)
+            except PathNotFoundError:
+                xds[feature].to_zarr(fout, group=group, mode='a')
+                continue
             if xds_existing.datetime[0] > xds.datetime[0] or xds_existing.datetime[-1] > xds.datetime[-1]:
                 xda_new = merge_arrays(xds_existing[feature], xds[feature])
-                xda_new.to_zarr(fout, group='original', mode='w')
+                xda_new.to_zarr(fout, group=group, mode='w')
             else:
                 try:
                     overlap = xds_existing.datetime.where(
                         xds_existing.datetime == xds.datetime)
                     if overlap.size > 0:
                         xds[feature].loc[dict(datetime=overlap)].to_zarr(
-                            fout, group='original', mode='r+', region='auto')
+                            fout, group=group, mode='r+', region='auto')
                         xds[feature].drop_sel(datetime=overlap).to_zarr(
-                            fout, group='original', mode='a', append_dim="datetime")
+                            fout, group=group, mode='a', append_dim="datetime")
                     else:
                         xds[feature].to_zarr(
-                            fout, group='original', append_dim='datetime')
+                            fout, group=group, append_dim='datetime')
                 except Exception as e:
                     msg = f"Appending {feature} to {fout} failed: {e}\n"
                     msg += "Attempting to merge the two datasets."
                     logger.error(msg)
                     # remove duplicate datetime entries
                     xda_new = merge_arrays(xds_existing[feature], xds[feature])
-                    xda_new.to_zarr(fout, group='original', mode='w')
+                    xda_new.to_zarr(fout, group=group, mode='w')
```
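The zarr writer gains the same `group` parameter, plus a fallback: writing a feature into an existing `.zarr` store under a group that does not yet exist previously failed in `xr.open_zarr`; the new `except PathNotFoundError` branch now appends the missing group with `mode='a'` instead. A rough sketch under assumed illustrative paths and data:

```python
import numpy as np
import pandas as pd
import xarray as xr

from tonik.xarray2zarr import xarray2zarr

times = pd.date_range('2016-01-01', periods=48, freq='h')
xds = xr.Dataset({'rsam': ('datetime', np.random.rand(times.size))},
                 coords={'datetime': times})
xarray2zarr(xds, '/tmp/store')  # default group 'original'
# Same rsam.zarr store, new group: open_zarr raises PathNotFoundError for
# the missing group, and the new branch appends it with mode='a'.
xarray2zarr(xds + 100., '/tmp/store', group='modified')
```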
{tonik-0.1.2 → tonik-0.1.4}/tests/backend_speed_test.py

```diff
@@ -18,7 +18,10 @@ def write_read(backend):
     test_dir = tempfile.mkdtemp()
     sg = Storage('speed_test', test_dir, starttime=tstart, endtime=tend,
                  backend=backend)
-    sg.save(spec)
+    kwargs = {}
+    if backend == 'netcdf':
+        kwargs['archive_starttime'] = tstart
+    sg.save(spec, **kwargs)
     spec_test = sg('ssam')
 
 
```
{tonik-0.1.2 → tonik-0.1.4}/tests/test_group.py

```diff
@@ -27,6 +27,35 @@ def test_group(tmp_path_factory):
     c = g.get_substore('MDR1', '00', 'HHZ')
 
 
+def test_subgroup(tmp_path_factory):
+    """
+    Test storing data in different subgroups in netcdf and zarr.
+    """
+
+    startdate = datetime(2016, 1, 1)
+    enddate = datetime(2016, 1, 2, 12)
+    rootdir = tmp_path_factory.mktemp('data')
+    g = Storage('volcanoes', rootdir=rootdir,
+                starttime=startdate, endtime=enddate)
+    xdf = generate_test_data(dim=1, ndays=20, tstart=startdate)
+    g.save(xdf)
+    xdf.rsam.values += 100.
+    g.save(xdf, group='modified')
+    rsam_original = g('rsam')
+    rsam_modified = g('rsam', group='modified')
+    assert int(rsam_modified.mean()) == (int(rsam_original.mean()) + 100)
+    g = Storage('volcanoes', rootdir=rootdir,
+                starttime=startdate, endtime=enddate,
+                backend='zarr')
+    xdf = generate_test_data(dim=1, ndays=20, tstart=startdate)
+    g.save(xdf)
+    xdf.rsam.values += 100.
+    g.save(xdf, group='modified')
+    rsam_original = g('rsam')
+    rsam_modified = g('rsam', group='modified')
+    assert int(rsam_modified.mean()) == (int(rsam_original.mean()) + 100)
+
+
 def test_non_existant_feature(tmp_path_factory):
     rootdir = tmp_path_factory.mktemp('data')
     g = Storage('test_experiment', rootdir)
```
```diff
@@ -108,34 +137,18 @@ def test_call_single_day(tmp_path_factory):
     assert pd.to_datetime(enddate) == last_time
 
 
-def
+def test_call_single_datapoint(tmp_path_factory):
     rootdir = tmp_path_factory.mktemp('data')
-    startdate = datetime(2016, 1, 1)
-    enddate = datetime(2016, 1, 2, 12)
-    xdf = generate_test_data(dim=1, ndays=20, tstart=startdate)
     g = Storage('volcanoes', rootdir=rootdir)
-
-
-    stack_len_seconds = 3600
-    stack_len_string = '1h'
-
-    num_windows = int(stack_len_seconds / pd.Timedelta(xdf.interval).seconds)
+    startdate = datetime(2016, 1, 2, 1)
+    enddate = startdate
     g.starttime = startdate
     g.endtime = enddate
+    xdf = generate_test_data(dim=1, tstart=startdate)
+    g.save(xdf)
     rsam = g('rsam')
-
-
-    # Check correct datetime array
-    np.testing.assert_array_equal(rsam.datetime.values,
-                                  rsam_rolling.datetime.values)
-    # Check correct values
-    rolling_mean = [
-        np.nanmean(rsam.data[(ind-num_windows+1):ind+1])
-        for ind in np.arange(num_windows, len(rsam_rolling.data))
-    ]
-    np.testing.assert_array_almost_equal(
-        np.array(rolling_mean), rsam_rolling.values[num_windows:], 6
-    )
+    assert float(
+        xdf.rsam.loc[dict(datetime='2016-01-02T01:00:00')]) == float(rsam)
 
 
 def test_shape(tmp_path_factory):
```
{tonik-0.1.2 → tonik-0.1.4}/tests/test_save.py

```diff
@@ -85,6 +85,8 @@ def test_xarray2netcdf_resolution(tmp_path_factory):
     xdf_test = c('rsam')
     assert xdf_test.loc['2022-07-18T00:12:00'] == xdf['rsam'].loc['2022-07-18T00:10:00']
     assert np.isnan(xdf_test.loc['2022-07-18T00:06:00'].data)
+    assert xdf_test.attrs['resolution'] == 0.1
+    assert xdf_test.attrs['resolution_units'] == 'h'
 
 
 def test_xarray2netcdf_with_gaps(tmp_path_factory):
```