tonik 0.0.12__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: tonik
-Version: 0.0.12
+Version: 0.1.1
 Summary: Store time series data as HDF5 files and access them through an API.
 Project-URL: Homepage, https://tsc-tools.github.io/tonik
 Project-URL: Issues, https://github.com/tsc-tools/tonik/issues
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "tonik"
-version = "0.0.12"
+version = "0.1.1"
 authors = [
   { name="Yannik Behr", email="y.behr@gns.cri.nz" },
   { name="Christof Mueller", email="c.mueller@gns.cri.nz" }
@@ -6,7 +6,7 @@ import re
 import pandas as pd
 import xarray as xr

-from .xarray2hdf5 import xarray2hdf5
+from .xarray2netcdf import xarray2netcdf
 from .xarray2zarr import xarray2zarr

 LOGGING_CONFIG = {
@@ -76,6 +76,7 @@ class Path(object):
         self.name = name
         self.create = create
         self.backend = backend
+        self.engine = 'h5netcdf' if self.backend == 'netcdf' else self.backend
         self.path = os.path.join(parentdir, name)
         if create:
             try:
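With 0.1.1 the user-facing backend name is decoupled from the xarray engine that reads the files: the backend is now called 'netcdf', and the h5netcdf engine is derived from it internally. A standalone sketch of that mapping (select_engine is a hypothetical helper for illustration only, not part of the tonik API):

    def select_engine(backend: str) -> str:
        # 'netcdf' files are read and written through the h5netcdf engine;
        # any other backend name (e.g. 'zarr') doubles as the engine name.
        return 'h5netcdf' if backend == 'netcdf' else backend

    assert select_engine('netcdf') == 'h5netcdf'
    assert select_engine('zarr') == 'zarr'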
@@ -102,7 +103,7 @@ class Path(object):

     def feature_path(self, feature):

-        if self.backend == 'h5netcdf':
+        if self.backend == 'netcdf':
             file_ending = '.nc'
         elif self.backend == 'zarr':
             file_ending = '.zarr'
@@ -154,7 +155,7 @@ class Path(object):
                    format(stack_length, interval, num_periods))

         xd_index = dict(datetime=slice(self.starttime, self.endtime))
-        with xr.open_dataset(filename, group='original', engine=self.backend) as ds:
+        with xr.open_dataset(filename, group='original', engine=self.engine) as ds:
             rq = ds.loc[xd_index].load()

         # Stack features
@@ -187,11 +188,19 @@ class Path(object):
         """
         Save a feature to disk
         """
-        if self.backend == 'h5netcdf':
-            xarray2hdf5(data, self.path, **kwargs)
+        if self.backend == 'netcdf':
+            xarray2netcdf(data, self.path, **kwargs)
         elif self.backend == 'zarr':
             xarray2zarr(data, self.path, **kwargs)

+    def shape(self, feature):
+        """
+        Get shape of a feature on disk
+        """
+        filename = self.feature_path(feature)
+        with xr.open_dataset(filename, group='original', engine=self.engine) as ds:
+            return ds.sizes
+

 class Storage(Path):
     """
@@ -214,7 +223,7 @@ class Storage(Path):
     >>> rsam = c("rsam")
     """

-    def __init__(self, name, rootdir, starttime=None, endtime=None, create=True, backend='zarr'):
+    def __init__(self, name, rootdir, starttime=None, endtime=None, create=True, backend='netcdf'):
         self.stores = set()
         self.starttime = starttime
         self.endtime = endtime
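Storage now defaults to the 'netcdf' backend instead of 'zarr'. Code that relied on the old default must request zarr explicitly, for example (path is illustrative):

    from tonik import Storage

    # 0.1.1 default: NetCDF files read through the h5netcdf engine
    g = Storage('demo', rootdir='/tmp/tonik_demo')

    # pre-0.1.1 behaviour: keep writing zarr stores
    g_zarr = Storage('demo', rootdir='/tmp/tonik_demo', backend='zarr')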
@@ -56,3 +56,36 @@ def generate_test_data(dim=1, ndays=30, nfreqs=10,
     xds.attrs['station'] = 'MDR'
     xds.attrs['interval'] = '10min'
     return xds
+
+
+def merge_arrays(xds_old: xr.DataArray, xds_new: xr.DataArray,
+                 resolution: float = None) -> xr.DataArray:
+    """
+    Merge two xarray datasets with the same datetime index.
+
+    Parameters
+    ----------
+    xds_old : xr.DataArray
+        Old array.
+    xds_new : xr.DataArray
+        New array.
+    resolution : float
+        Time resolution in hours.
+
+    Returns
+    -------
+    xr.DataArray
+        Merged array.
+    """
+    xda_old = xds_old.drop_duplicates(
+        'datetime', keep='last')
+    xda_new = xds_new.drop_duplicates(
+        'datetime', keep='last')
+    xda_new = xda_new.combine_first(xda_old)
+    if resolution is not None:
+        new_dates = pd.date_range(
+            xda_new.datetime.values[0],
+            xda_new.datetime.values[-1],
+            freq=f'{resolution}h')
+        xda_new = xda_new.reindex(datetime=new_dates)
+    return xda_new
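merge_arrays, now in tonik.utils, gains an optional resolution argument: when set, the merged series is reindexed onto a regular time grid of resolution hours, so gaps become explicit NaN entries. A small self-contained sketch with synthetic arrays (values from the new array win on overlapping timestamps, since combine_first is applied to it first):

    import numpy as np
    import pandas as pd
    import xarray as xr

    from tonik.utils import merge_arrays

    # Two overlapping hourly series on the same 'datetime' dimension
    t_old = pd.date_range('2024-01-01T00:00', periods=6, freq='1h')
    t_new = pd.date_range('2024-01-01T04:00', periods=6, freq='1h')
    old = xr.DataArray(np.zeros(6), coords={'datetime': t_old}, dims='datetime')
    new = xr.DataArray(np.ones(6), coords={'datetime': t_new}, dims='datetime')

    merged = merge_arrays(old, new, resolution=1.0)
    print(merged.sizes)  # 10 hourly samples, 00:00 through 09:00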
@@ -1,15 +1,19 @@
-from datetime import datetime
 import logging
 import os
+from datetime import datetime
 from warnings import filterwarnings

-from cftime import num2date, date2num
 import h5netcdf
 import numpy as np
+import xarray as xr
+from cftime import date2num, num2date
+
+from .utils import merge_arrays


-def xarray2hdf5(xArray, fdir, rootGroupName="original", timedim="datetime",
-                archive_starttime=datetime(2000, 1, 1), resolution=None):
+def xarray2netcdf(xArray, fdir, rootGroupName="original", timedim="datetime",
+                  archive_starttime=datetime(2000, 1, 1), resolution=None,
+                  mode='a'):
     """
     Store an xarray dataset as an HDF5 file.

@@ -31,23 +35,35 @@ def xarray2hdf5(xArray, fdir, rootGroupName="original", timedim="datetime",
         determined from the data.
     """
     filterwarnings(action='ignore', category=DeprecationWarning,
-        message='`np.bool` is a deprecated alias')
+                   message='`np.bool` is a deprecated alias')

-    starttime = xArray[timedim].values[0].astype('datetime64[us]').astype(datetime)
-    starttime = min(starttime, archive_starttime)
+    data_starttime = xArray[timedim].values[0].astype(
+        'datetime64[us]').astype(datetime)
+    starttime = min(data_starttime, archive_starttime)
     if resolution is None:
         resolution = (np.diff(xArray[timedim])/np.timedelta64(1, 'h'))[0]

     for featureName in list(xArray.data_vars.keys()):
-        h5file = os.path.join(fdir, featureName +'.nc')
+        h5file = os.path.join(fdir, featureName + '.nc')
+        _mode = 'w'
+        if os.path.isfile(h5file) and mode == 'a':
+            if archive_starttime > data_starttime:
+                xds_existing = xr.open_dataset(
+                    h5file, group='original', engine='h5netcdf')
+                xda_new = merge_arrays(
+                    xds_existing[featureName], xArray[featureName],
+                    resolution=resolution)
+                xds_existing.close()
+                xda_new.to_netcdf(h5file, group='original',
+                                  mode='w', engine='h5netcdf')
+                continue
+            _mode = 'a'

-        mode = 'a' if os.path.isfile(h5file) else 'w'
-
-        with h5netcdf.File(h5file, mode) as h5f:
+        with h5netcdf.File(h5file, _mode) as h5f:
             try:
                 rootGrp = _create_h5_Structure(rootGroupName, featureName,
                                                h5f, xArray, starttime, timedim)
-            except ValueError: # group already exists, append
+            except ValueError:  # group already exists, append
                 rootGrp = h5f[rootGroupName]

             # determine indices
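With mode='a' (the default) and an existing file, xarray2netcdf now takes one of two paths: if archive_starttime lies after the first timestamp of the incoming data, the existing file is read back, merged with the new array via merge_arrays, and rewritten in full; otherwise the file is opened for in-place append. A hedged sketch of triggering the merge path (directory and dates are illustrative; generate_test_data is the package's test-data helper):

    import tempfile
    from datetime import datetime

    from tonik import generate_test_data
    from tonik.xarray2netcdf import xarray2netcdf

    outdir = tempfile.mkdtemp()
    xdf = generate_test_data(dim=1, ndays=1, tstart=datetime(2022, 7, 18))

    # First call creates rsam.nc; the second takes the merge path because
    # archive_starttime lies after the start of the data being written.
    xarray2netcdf(xdf, outdir)
    xarray2netcdf(xdf, outdir, archive_starttime=datetime(2022, 8, 1))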
@@ -75,7 +91,8 @@ def xarray2hdf5(xArray, fdir, rootGroupName="original", timedim="datetime",
         try:
             _setMetaInfo(featureName, h5f, xArray)
         except KeyError as e:
-            logging.warning(f"Could not set all meta info for {featureName}: {e}")
+            logging.warning(
+                f"Could not set all meta info for {featureName}: {e}")


 def _create_h5_Structure(defaultGroupName, featureName, h5f, xArray, starttime, timedim):
@@ -85,15 +102,16 @@ def _create_h5_Structure(defaultGroupName, featureName, h5f, xArray, starttime,
     coordinates.attrs['units'] = 'hours since 1970-01-01 00:00:00.0'
     coordinates.attrs['calendar'] = 'gregorian'
     rootGrp.attrs['starttime'] = str(starttime)
-    for label, size in xArray.dims.items():
+    for label, size in xArray.dims.items():
         if not np.issubdtype(xArray[label].dtype, np.datetime64):
-            rootGrp.dimensions[label] = size
+            rootGrp.dimensions[label] = size
             coordinates = rootGrp.create_variable(label, (label,), float)
             coordinates[:] = xArray[label].values
     # Note: xArray.dims returns a dictionary of dimensions that are not necesarily
     # in the right order; xArray[featureName].dims returns a tuple with dimension
     # names in the correct order
-    rootGrp.create_variable(featureName, tuple(xArray[featureName].dims), dtype=float, fillvalue=0.)
+    rootGrp.create_variable(featureName, tuple(
+        xArray[featureName].dims), dtype=float, fillvalue=0.)
     return rootGrp

@@ -102,4 +120,3 @@ def _setMetaInfo(featureName, h5f, xArray):
     h5f.attrs['latitude'] = -42
     h5f.attrs['longitude'] = 168
     h5f.attrs['datatype'] = featureName
-
@@ -3,31 +3,9 @@ import os

 import xarray as xr

-logger = logging.getLogger(__name__)
-
-
-def merge_arrays(xds_old: xr.DataArray, xds_new: xr.DataArray) -> xr.DataArray:
-    """
-    Merge two xarray datasets with the same datetime index.
+from .utils import merge_arrays

-    Parameters
-    ----------
-    xds_old : xr.DataArray
-        Old array.
-    xds_new : xr.DataArray
-        New array.
-
-    Returns
-    -------
-    xr.DataArray
-        Merged array.
-    """
-    xda_old = xds_old.drop_duplicates(
-        'datetime', keep='last')
-    xda_new = xds_new.drop_duplicates(
-        'datetime', keep='last')
-    xda_new = xda_new.combine_first(xda_old)
-    return xda_new
+logger = logging.getLogger(__name__)


 def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a'):
@@ -29,7 +29,7 @@ def test_backend_speed():
     execution_time_zarr = timeit.timeit(lambda: write_read('zarr'), number=5)
     logger.info('Write and read with zarr took {} seconds.'.format(
         execution_time_zarr/5))
-    execution_time_h5 = timeit.timeit(lambda: write_read('h5netcdf'), number=5)
+    execution_time_h5 = timeit.timeit(lambda: write_read('netcdf'), number=5)
     logger.info('Write and read with h5 took {} seconds.'.format(
         execution_time_h5/5))

@@ -1,7 +1,5 @@
-from datetime import datetime
-import json
 import os
-import tempfile
+from datetime import datetime

 import numpy as np
 import pandas as pd
@@ -13,7 +11,7 @@ from tonik.utils import generate_test_data

 def test_group(tmp_path_factory):
     rootdir = tmp_path_factory.mktemp('data')
-    g = Storage('test_experiment', rootdir, backend='h5netcdf')
+    g = Storage('test_experiment', rootdir, backend='netcdf')
     c = g.get_substore('site1', 'sensor1', 'channel1')
     assert c.path == os.path.join(
         rootdir, 'test_experiment/site1/sensor1/channel1')
@@ -41,7 +39,7 @@ def test_non_existant_feature(tmp_path_factory):

 def test_from_directory(tmp_path_factory):
     rootdir = tmp_path_factory.mktemp('data')
-    g = Storage('test_experiment', rootdir, backend='h5netcdf')
+    g = Storage('test_experiment', rootdir, backend='netcdf')
     c = g.get_substore('site1', 'sensor1', 'channel1')
     assert c.path == os.path.join(rootdir, 'test_experiment', 'site1',
                                   'sensor1', 'channel1')
@@ -138,3 +136,21 @@ def test_rolling_window(tmp_path_factory):
     np.testing.assert_array_almost_equal(
         np.array(rolling_mean), rsam_rolling.values[num_windows:], 6
     )
+
+
+def test_shape(tmp_path_factory):
+    rootdir = tmp_path_factory.mktemp('data')
+    g = Storage('volcanoes', rootdir=rootdir)
+    tstart = datetime(2016, 1, 1)
+    xdf = generate_test_data(dim=2, intervals=20, tstart=tstart)
+    g.save(xdf, mode='w', archive_starttime=tstart)
+    rsam_shape = g.shape('ssam')
+    assert rsam_shape['datetime'] == 20
+    assert rsam_shape['frequency'] == 10
+    assert rsam_shape['fbfrequency'] == 10
+
+    g1 = Storage('volcanoes', rootdir=rootdir, backend='zarr')
+    g1.save(xdf, mode='w')
+    rsam_shape = g1.shape('ssam')
+    assert rsam_shape['datetime'] == 20
+    assert rsam_shape['frequency'] == 10
@@ -6,20 +6,20 @@ import pytest
 import xarray as xr

 from tonik import Storage, generate_test_data
-from tonik.xarray2hdf5 import xarray2hdf5
+from tonik.xarray2netcdf import xarray2netcdf


-def test_xarray2hdf5(tmp_path_factory):
+def test_xarray2netcdf(tmp_path_factory):
     """
     Test writing xarray data to hdf5.
     """
     xdf = generate_test_data(
         dim=2, ndays=3, tstart=datetime(2022, 7, 18, 0, 0, 0))
-    temp_dir = tmp_path_factory.mktemp('test_xarray2hdf5')
+    temp_dir = tmp_path_factory.mktemp('test_xarray2netcdf')
     g = Storage('test_experiment', rootdir=temp_dir,
                 starttime=datetime.fromisoformat(xdf.attrs['starttime']),
                 endtime=datetime.fromisoformat(xdf.attrs['endtime']),
-                backend='h5netcdf')
+                backend='netcdf')
     c = g.get_substore('MDR', '00', 'HHZ')
     c.save(xdf)

@@ -34,14 +34,14 @@ def test_xarray2hdf5(tmp_path_factory):
     assert dt < np.timedelta64(1, 'us')


-def test_xarray2hdf5_archive_starttime(tmp_path_factory):
+def test_xarray2netcdf_archive_starttime(tmp_path_factory):
     xdf = generate_test_data(
         dim=1, ndays=3, tstart=datetime(2022, 7, 18, 0, 0, 0))
-    temp_dir = tmp_path_factory.mktemp('test_xarray2hdf5')
+    temp_dir = tmp_path_factory.mktemp('test_xarray2netcdf')
     g = Storage('test_experiment', rootdir=temp_dir,
                 starttime=datetime(2000, 1, 1),
                 endtime=datetime.fromisoformat(xdf.attrs['endtime']),
-                backend='h5netcdf')
+                backend='netcdf')
     c = g.get_substore('MDR', '00', 'HHZ')
     c.save(xdf, archive_starttime=datetime(2022, 1, 1))

@@ -53,14 +53,32 @@ def test_xarray2hdf5_archive_starttime(tmp_path_factory):
     assert xdf_test.loc['2000-01-01':'2022-07-17T23:50:00'].shape[0] == nitems


-def test_xarray2hdf5_resolution(tmp_path_factory):
+def test_xarray2netcdf_merge_arrays(tmp_path_factory):
+    temp_dir = tmp_path_factory.mktemp('test_xarray2netcdf')
+    start = datetime(2022, 7, 18, 8, 0, 0)
+    end = datetime(2022, 7, 19, 12, 0, 0)
+    xdf1 = generate_test_data(dim=1, ndays=1, tstart=start, add_nans=False)
+    xdf2 = generate_test_data(dim=1, ndays=1, tstart=end, add_nans=False)
+    g = Storage('test_experiment', rootdir=temp_dir,
+                starttime=start, endtime=end + timedelta(days=1),
+                backend='netcdf')
+    c = g.get_substore('MDR', '00', 'HHZ')
+    c.save(xdf2, archive_starttime=datetime(2022, 8, 1))
+    c.save(xdf1, archive_starttime=datetime(2022, 8, 1))
+    xdf_test = c('rsam')
+    assert xdf_test.isnull().sum() == 24
+    assert xdf_test.loc['2022-07-18T08:00:00'] == xdf1['rsam'].loc['2022-07-18T08:00:00']
+    assert xdf_test.loc['2022-07-20T11:50:00'] == xdf2['rsam'].loc['2022-07-20T11:50:00']
+
+
+def test_xarray2netcdf_resolution(tmp_path_factory):
     xdf = generate_test_data(dim=1, ndays=1, tstart=datetime(2022, 7, 18, 0, 0, 0),
                              add_nans=False)
-    temp_dir = tmp_path_factory.mktemp('test_xarray2hdf5')
+    temp_dir = tmp_path_factory.mktemp('test_xarray2netcdf')
     g = Storage('test_experiment', rootdir=temp_dir,
                 starttime=datetime(2000, 1, 1),
                 endtime=datetime.fromisoformat(xdf.attrs['endtime']),
-                backend='h5netcdf')
+                backend='netcdf')
     c = g.get_substore('MDR', '00', 'HHZ')
     c.save(xdf, resolution=0.1, archive_starttime=datetime(2022, 7, 18))

@@ -69,18 +87,18 @@ def test_xarray2hdf5_resolution(tmp_path_factory):
     assert np.isnan(xdf_test.loc['2022-07-18T00:06:00'].data)


-def test_xarray2hdf5_with_gaps(tmp_path_factory):
+def test_xarray2netcdf_with_gaps(tmp_path_factory):
     """
     Test writing xarray data to hdf5 with gaps.
     """
-    temp_dir = tmp_path_factory.mktemp('test_xarray2hdf5')
+    temp_dir = tmp_path_factory.mktemp('test_xarray2netcdf')
     start = datetime(2022, 7, 18, 8, 0, 0)
     end = datetime(2022, 7, 19, 12, 0, 0)
     xdf1 = generate_test_data(dim=1, ndays=1, tstart=start, add_nans=False)
     xdf2 = generate_test_data(dim=1, ndays=1, tstart=end, add_nans=False)
     g = Storage('test_experiment', rootdir=temp_dir,
                 starttime=start, endtime=end + timedelta(days=1),
-                backend='h5netcdf')
+                backend='netcdf')
     c = g.get_substore('MDR', '00', 'HHZ')
     c.save(xdf1)
     c.save(xdf2)
@@ -89,23 +107,23 @@ def test_xarray2hdf5_with_gaps(tmp_path_factory):


 @pytest.mark.xfail(raises=OSError)
-def test_xarray2hdf5_multi_access(tmp_path_factory):
+def test_xarray2netcdf_multi_access(tmp_path_factory):
     """
     Test writing xarray data to hdf5 while the file is open. This is currently
     not working with NetCDF4. See the following discussions for reference:
     https://github.com/pydata/xarray/issues/2887
     https://stackoverflow.com/questions/49701623/is-there-a-way-to-release-the-file-lock-for-a-xarray-dataset
     """
-    temp_dir = tmp_path_factory.mktemp('test_xarray2hdf5')
+    temp_dir = tmp_path_factory.mktemp('test_xarray2netcdf')
     xdf1 = generate_test_data(
         dim=1, ndays=1, tstart=datetime(2022, 7, 18, 8, 0, 0))
     xdf2 = generate_test_data(
         dim=1, ndays=1, tstart=datetime(2022, 7, 19, 12, 0, 0))

-    xarray2hdf5(xdf1, temp_dir)
+    xarray2netcdf(xdf1, temp_dir)
     xdf_dummy = xr.open_dataset(os.path.join(temp_dir, 'rsam.nc'),
                                 group='original', engine='h5netcdf')
-    xarray2hdf5(xdf2, temp_dir)
+    xarray2netcdf(xdf2, temp_dir)


 def test_xarray2zarr(tmp_path_factory):
11 files without changes