tonik 0.0.11__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: tonik
3
- Version: 0.0.11
3
+ Version: 0.1.0
4
4
  Summary: Store time series data as HDF5 files and access them through an API.
5
5
  Project-URL: Homepage, https://tsc-tools.github.io/tonik
6
6
  Project-URL: Issues, https://github.com/tsc-tools/tonik/issues
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "tonik"
7
- version = "0.0.11"
7
+ version = "0.1.0"
8
8
  authors = [
9
9
  { name="Yannik Behr", email="y.behr@gns.cri.nz" },
10
10
  { name="Christof Mueller", email="c.mueller@gns.cri.nz" }
@@ -6,7 +6,7 @@ import re
6
6
  import pandas as pd
7
7
  import xarray as xr
8
8
 
9
- from .xarray2hdf5 import xarray2hdf5
9
+ from .xarray2netcdf import xarray2netcdf
10
10
  from .xarray2zarr import xarray2zarr
11
11
 
12
12
  LOGGING_CONFIG = {
@@ -102,7 +102,7 @@ class Path(object):
102
102
 
103
103
  def feature_path(self, feature):
104
104
 
105
- if self.backend == 'h5netcdf':
105
+ if self.backend == 'netcdf':
106
106
  file_ending = '.nc'
107
107
  elif self.backend == 'zarr':
108
108
  file_ending = '.zarr'
@@ -154,7 +154,8 @@ class Path(object):
154
154
  format(stack_length, interval, num_periods))
155
155
 
156
156
  xd_index = dict(datetime=slice(self.starttime, self.endtime))
157
- with xr.open_dataset(filename, group='original', engine=self.backend) as ds:
157
+ engine = 'h5netcdf' if self.backend == 'netcdf' else self.backend
158
+ with xr.open_dataset(filename, group='original', engine=engine) as ds:
158
159
  rq = ds.loc[xd_index].load()
159
160
 
160
161
  # Stack features
@@ -187,8 +188,8 @@ class Path(object):
187
188
  """
188
189
  Save a feature to disk
189
190
  """
190
- if self.backend == 'h5netcdf':
191
- xarray2hdf5(data, self.path, **kwargs)
191
+ if self.backend == 'netcdf':
192
+ xarray2netcdf(data, self.path, **kwargs)
192
193
  elif self.backend == 'zarr':
193
194
  xarray2zarr(data, self.path, **kwargs)
194
195
 
@@ -214,7 +215,7 @@ class Storage(Path):
214
215
  >>> rsam = c("rsam")
215
216
  """
216
217
 
217
- def __init__(self, name, rootdir, starttime=None, endtime=None, create=True, backend='zarr'):
218
+ def __init__(self, name, rootdir, starttime=None, endtime=None, create=True, backend='netcdf'):
218
219
  self.stores = set()
219
220
  self.starttime = starttime
220
221
  self.endtime = endtime
@@ -56,3 +56,36 @@ def generate_test_data(dim=1, ndays=30, nfreqs=10,
56
56
  xds.attrs['station'] = 'MDR'
57
57
  xds.attrs['interval'] = '10min'
58
58
  return xds
59
+
60
+
61
+ def merge_arrays(xds_old: xr.DataArray, xds_new: xr.DataArray,
62
+ resolution: float = None) -> xr.DataArray:
63
+ """
64
+ Merge two xarray data arrays along their 'datetime' dimension; values from the new array take precedence.
65
+
66
+ Parameters
67
+ ----------
68
+ xds_old : xr.DataArray
69
+ Old array.
70
+ xds_new : xr.DataArray
71
+ New array.
72
+ resolution : float
73
+ Time resolution in hours. If given, the merged array is reindexed onto a regular time grid with this spacing.
74
+
75
+ Returns
76
+ -------
77
+ xr.DataArray
78
+ Merged array.
79
+ """
80
+ xda_old = xds_old.drop_duplicates(
81
+ 'datetime', keep='last')
82
+ xda_new = xds_new.drop_duplicates(
83
+ 'datetime', keep='last')
84
+ xda_new = xda_new.combine_first(xda_old)
85
+ if resolution is not None:
86
+ new_dates = pd.date_range(
87
+ xda_new.datetime.values[0],
88
+ xda_new.datetime.values[-1],
89
+ freq=f'{resolution}h')
90
+ xda_new = xda_new.reindex(datetime=new_dates)
91
+ return xda_new
@@ -1,15 +1,18 @@
1
- from datetime import datetime
2
1
  import logging
3
2
  import os
3
+ from datetime import datetime
4
4
  from warnings import filterwarnings
5
5
 
6
- from cftime import num2date, date2num
7
6
  import h5netcdf
8
7
  import numpy as np
8
+ import xarray as xr
9
+ from cftime import date2num, num2date
10
+
11
+ from .utils import merge_arrays
9
12
 
10
13
 
11
- def xarray2hdf5(xArray, fdir, rootGroupName="original", timedim="datetime",
12
- archive_starttime=datetime(2000, 1, 1), resolution=None):
14
+ def xarray2netcdf(xArray, fdir, rootGroupName="original", timedim="datetime",
15
+ archive_starttime=datetime(2000, 1, 1), resolution=None):
13
16
  """
14
17
  Store an xarray dataset as NetCDF4 (HDF5-based) files, one file per data variable.
15
18
 
@@ -31,23 +34,35 @@ def xarray2hdf5(xArray, fdir, rootGroupName="original", timedim="datetime",
31
34
  determined from the data.
32
35
  """
33
36
  filterwarnings(action='ignore', category=DeprecationWarning,
34
- message='`np.bool` is a deprecated alias')
37
+ message='`np.bool` is a deprecated alias')
35
38
 
36
- starttime = xArray[timedim].values[0].astype('datetime64[us]').astype(datetime)
37
- starttime = min(starttime, archive_starttime)
39
+ data_starttime = xArray[timedim].values[0].astype(
40
+ 'datetime64[us]').astype(datetime)
41
+ starttime = min(data_starttime, archive_starttime)
38
42
  if resolution is None:
39
43
  resolution = (np.diff(xArray[timedim])/np.timedelta64(1, 'h'))[0]
40
44
 
41
45
  for featureName in list(xArray.data_vars.keys()):
42
- h5file = os.path.join(fdir, featureName +'.nc')
46
+ h5file = os.path.join(fdir, featureName + '.nc')
47
+ mode = 'w'
48
+ if os.path.isfile(h5file):
49
+ if archive_starttime > data_starttime:
50
+ xds_existing = xr.open_dataset(
51
+ h5file, group='original', engine='h5netcdf')
52
+ xda_new = merge_arrays(
53
+ xds_existing[featureName], xArray[featureName],
54
+ resolution=resolution)
55
+ xds_existing.close()
56
+ xda_new.to_netcdf(h5file, group='original',
57
+ mode='w', engine='h5netcdf')
58
+ continue
59
+ mode = 'a'
43
60
 
44
- mode = 'a' if os.path.isfile(h5file) else 'w'
45
-
46
61
  with h5netcdf.File(h5file, mode) as h5f:
47
62
  try:
48
63
  rootGrp = _create_h5_Structure(rootGroupName, featureName,
49
64
  h5f, xArray, starttime, timedim)
50
- except ValueError: # group already exists, append
65
+ except ValueError: # group already exists, append
51
66
  rootGrp = h5f[rootGroupName]
52
67
 
53
68
  # determine indices
@@ -75,7 +90,8 @@ def xarray2hdf5(xArray, fdir, rootGroupName="original", timedim="datetime",
75
90
  try:
76
91
  _setMetaInfo(featureName, h5f, xArray)
77
92
  except KeyError as e:
78
- logging.warning(f"Could not set all meta info for {featureName}: {e}")
93
+ logging.warning(
94
+ f"Could not set all meta info for {featureName}: {e}")
79
95
 
80
96
 
81
97
  def _create_h5_Structure(defaultGroupName, featureName, h5f, xArray, starttime, timedim):
@@ -85,15 +101,16 @@ def _create_h5_Structure(defaultGroupName, featureName, h5f, xArray, starttime,
85
101
  coordinates.attrs['units'] = 'hours since 1970-01-01 00:00:00.0'
86
102
  coordinates.attrs['calendar'] = 'gregorian'
87
103
  rootGrp.attrs['starttime'] = str(starttime)
88
- for label, size in xArray.dims.items():
104
+ for label, size in xArray.dims.items():
89
105
  if not np.issubdtype(xArray[label].dtype, np.datetime64):
90
- rootGrp.dimensions[label] = size
106
+ rootGrp.dimensions[label] = size
91
107
  coordinates = rootGrp.create_variable(label, (label,), float)
92
108
  coordinates[:] = xArray[label].values
93
109
  # Note: xArray.dims returns a dictionary of dimensions that are not necessarily
94
110
  # in the right order; xArray[featureName].dims returns a tuple with dimension
95
111
  # names in the correct order
96
- rootGrp.create_variable(featureName, tuple(xArray[featureName].dims), dtype=float, fillvalue=0.)
112
+ rootGrp.create_variable(featureName, tuple(
113
+ xArray[featureName].dims), dtype=float, fillvalue=0.)
97
114
  return rootGrp
98
115
 
99
116
 
@@ -102,4 +119,3 @@ def _setMetaInfo(featureName, h5f, xArray):
102
119
  h5f.attrs['latitude'] = -42
103
120
  h5f.attrs['longitude'] = 168
104
121
  h5f.attrs['datatype'] = featureName
105
-
@@ -3,10 +3,28 @@ import os
3
3
 
4
4
  import xarray as xr
5
5
 
6
+ from .utils import merge_arrays
7
+
6
8
  logger = logging.getLogger(__name__)
7
9
 
8
10
 
9
- def xarray2zarr(xds, path, mode='a'):
11
+ def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a'):
12
+ """
13
+ Write xarray dataset to zarr files.
14
+
15
+ Parameters
16
+ ----------
17
+ xds : xr.Dataset
18
+ Dataset to write.
19
+ path : str
20
+ Path to write the dataset.
21
+ mode : str, optional
22
+ Write mode, by default 'a'.
23
+
24
+ Returns
25
+ -------
26
+ None
27
+ """
10
28
  for feature in xds.data_vars.keys():
11
29
  fout = os.path.join(path, feature + '.zarr')
12
30
  if not os.path.exists(fout) or mode == 'w':
@@ -15,8 +33,8 @@ def xarray2zarr(xds, path, mode='a'):
15
33
  else:
16
34
  xds_existing = xr.open_zarr(fout, group='original')
17
35
  if xds_existing.datetime[0] > xds.datetime[0] or xds_existing.datetime[-1] > xds.datetime[-1]:
18
- xds_new = xr.merge([xds_existing[feature], xds[feature]])
19
- xds_new.to_zarr(fout, group='original', mode='w')
36
+ xda_new = merge_arrays(xds_existing[feature], xds[feature])
37
+ xda_new.to_zarr(fout, group='original', mode='w')
20
38
  else:
21
39
  try:
22
40
  overlap = xds_existing.datetime.where(
@@ -34,9 +52,5 @@ def xarray2zarr(xds, path, mode='a'):
34
52
  msg += "Attempting to merge the two datasets."
35
53
  logger.error(msg)
36
54
  # remove duplicate datetime entries
37
- xda_existing = xds_existing[feature].drop_duplicates(
38
- 'datetime', keep='last')
39
- xda_new = xds[feature].drop_duplicates(
40
- 'datetime', keep='last')
41
- xda_new = xda_new.combine_first(xda_existing)
55
+ xda_new = merge_arrays(xds_existing[feature], xds[feature])
42
56
  xda_new.to_zarr(fout, group='original', mode='w')
@@ -29,7 +29,7 @@ def test_backend_speed():
29
29
  execution_time_zarr = timeit.timeit(lambda: write_read('zarr'), number=5)
30
30
  logger.info('Write and read with zarr took {} seconds.'.format(
31
31
  execution_time_zarr/5))
32
- execution_time_h5 = timeit.timeit(lambda: write_read('h5netcdf'), number=5)
32
+ execution_time_h5 = timeit.timeit(lambda: write_read('netcdf'), number=5)
33
33
  logger.info('Write and read with h5 took {} seconds.'.format(
34
34
  execution_time_h5/5))
35
35
 
@@ -1,7 +1,5 @@
1
- from datetime import datetime
2
- import json
3
1
  import os
4
- import tempfile
2
+ from datetime import datetime
5
3
 
6
4
  import numpy as np
7
5
  import pandas as pd
@@ -13,7 +11,7 @@ from tonik.utils import generate_test_data
13
11
 
14
12
  def test_group(tmp_path_factory):
15
13
  rootdir = tmp_path_factory.mktemp('data')
16
- g = Storage('test_experiment', rootdir, backend='h5netcdf')
14
+ g = Storage('test_experiment', rootdir, backend='netcdf')
17
15
  c = g.get_substore('site1', 'sensor1', 'channel1')
18
16
  assert c.path == os.path.join(
19
17
  rootdir, 'test_experiment/site1/sensor1/channel1')
@@ -41,7 +39,7 @@ def test_non_existant_feature(tmp_path_factory):
41
39
 
42
40
  def test_from_directory(tmp_path_factory):
43
41
  rootdir = tmp_path_factory.mktemp('data')
44
- g = Storage('test_experiment', rootdir, backend='h5netcdf')
42
+ g = Storage('test_experiment', rootdir, backend='netcdf')
45
43
  c = g.get_substore('site1', 'sensor1', 'channel1')
46
44
  assert c.path == os.path.join(rootdir, 'test_experiment', 'site1',
47
45
  'sensor1', 'channel1')
@@ -6,20 +6,20 @@ import pytest
6
6
  import xarray as xr
7
7
 
8
8
  from tonik import Storage, generate_test_data
9
- from tonik.xarray2hdf5 import xarray2hdf5
9
+ from tonik.xarray2netcdf import xarray2netcdf
10
10
 
11
11
 
12
- def test_xarray2hdf5(tmp_path_factory):
12
+ def test_xarray2netcdf(tmp_path_factory):
13
13
  """
14
14
  Test writing xarray data to hdf5.
15
15
  """
16
16
  xdf = generate_test_data(
17
17
  dim=2, ndays=3, tstart=datetime(2022, 7, 18, 0, 0, 0))
18
- temp_dir = tmp_path_factory.mktemp('test_xarray2hdf5')
18
+ temp_dir = tmp_path_factory.mktemp('test_xarray2netcdf')
19
19
  g = Storage('test_experiment', rootdir=temp_dir,
20
20
  starttime=datetime.fromisoformat(xdf.attrs['starttime']),
21
21
  endtime=datetime.fromisoformat(xdf.attrs['endtime']),
22
- backend='h5netcdf')
22
+ backend='netcdf')
23
23
  c = g.get_substore('MDR', '00', 'HHZ')
24
24
  c.save(xdf)
25
25
 
@@ -34,14 +34,14 @@ def test_xarray2hdf5(tmp_path_factory):
34
34
  assert dt < np.timedelta64(1, 'us')
35
35
 
36
36
 
37
- def test_xarray2hdf5_archive_starttime(tmp_path_factory):
37
+ def test_xarray2netcdf_archive_starttime(tmp_path_factory):
38
38
  xdf = generate_test_data(
39
39
  dim=1, ndays=3, tstart=datetime(2022, 7, 18, 0, 0, 0))
40
- temp_dir = tmp_path_factory.mktemp('test_xarray2hdf5')
40
+ temp_dir = tmp_path_factory.mktemp('test_xarray2netcdf')
41
41
  g = Storage('test_experiment', rootdir=temp_dir,
42
42
  starttime=datetime(2000, 1, 1),
43
43
  endtime=datetime.fromisoformat(xdf.attrs['endtime']),
44
- backend='h5netcdf')
44
+ backend='netcdf')
45
45
  c = g.get_substore('MDR', '00', 'HHZ')
46
46
  c.save(xdf, archive_starttime=datetime(2022, 1, 1))
47
47
 
@@ -53,14 +53,32 @@ def test_xarray2hdf5_archive_starttime(tmp_path_factory):
53
53
  assert xdf_test.loc['2000-01-01':'2022-07-17T23:50:00'].shape[0] == nitems
54
54
 
55
55
 
56
- def test_xarray2hdf5_resolution(tmp_path_factory):
56
+ def test_xarray2netcdf_merge_arrays(tmp_path_factory):
57
+ temp_dir = tmp_path_factory.mktemp('test_xarray2netcdf')
58
+ start = datetime(2022, 7, 18, 8, 0, 0)
59
+ end = datetime(2022, 7, 19, 12, 0, 0)
60
+ xdf1 = generate_test_data(dim=1, ndays=1, tstart=start, add_nans=False)
61
+ xdf2 = generate_test_data(dim=1, ndays=1, tstart=end, add_nans=False)
62
+ g = Storage('test_experiment', rootdir=temp_dir,
63
+ starttime=start, endtime=end + timedelta(days=1),
64
+ backend='netcdf')
65
+ c = g.get_substore('MDR', '00', 'HHZ')
66
+ c.save(xdf2, archive_starttime=datetime(2022, 8, 1))
67
+ c.save(xdf1, archive_starttime=datetime(2022, 8, 1))
68
+ xdf_test = c('rsam')
69
+ assert xdf_test.isnull().sum() == 24
70
+ assert xdf_test.loc['2022-07-18T08:00:00'] == xdf1['rsam'].loc['2022-07-18T08:00:00']
71
+ assert xdf_test.loc['2022-07-20T11:50:00'] == xdf2['rsam'].loc['2022-07-20T11:50:00']
72
+
73
+
74
+ def test_xarray2netcdf_resolution(tmp_path_factory):
57
75
  xdf = generate_test_data(dim=1, ndays=1, tstart=datetime(2022, 7, 18, 0, 0, 0),
58
76
  add_nans=False)
59
- temp_dir = tmp_path_factory.mktemp('test_xarray2hdf5')
77
+ temp_dir = tmp_path_factory.mktemp('test_xarray2netcdf')
60
78
  g = Storage('test_experiment', rootdir=temp_dir,
61
79
  starttime=datetime(2000, 1, 1),
62
80
  endtime=datetime.fromisoformat(xdf.attrs['endtime']),
63
- backend='h5netcdf')
81
+ backend='netcdf')
64
82
  c = g.get_substore('MDR', '00', 'HHZ')
65
83
  c.save(xdf, resolution=0.1, archive_starttime=datetime(2022, 7, 18))
66
84
 
@@ -69,18 +87,18 @@ def test_xarray2hdf5_resolution(tmp_path_factory):
69
87
  assert np.isnan(xdf_test.loc['2022-07-18T00:06:00'].data)
70
88
 
71
89
 
72
- def test_xarray2hdf5_with_gaps(tmp_path_factory):
90
+ def test_xarray2netcdf_with_gaps(tmp_path_factory):
73
91
  """
74
92
  Test writing xarray data to hdf5 with gaps.
75
93
  """
76
- temp_dir = tmp_path_factory.mktemp('test_xarray2hdf5')
94
+ temp_dir = tmp_path_factory.mktemp('test_xarray2netcdf')
77
95
  start = datetime(2022, 7, 18, 8, 0, 0)
78
96
  end = datetime(2022, 7, 19, 12, 0, 0)
79
97
  xdf1 = generate_test_data(dim=1, ndays=1, tstart=start, add_nans=False)
80
98
  xdf2 = generate_test_data(dim=1, ndays=1, tstart=end, add_nans=False)
81
99
  g = Storage('test_experiment', rootdir=temp_dir,
82
100
  starttime=start, endtime=end + timedelta(days=1),
83
- backend='h5netcdf')
101
+ backend='netcdf')
84
102
  c = g.get_substore('MDR', '00', 'HHZ')
85
103
  c.save(xdf1)
86
104
  c.save(xdf2)
@@ -89,23 +107,23 @@ def test_xarray2hdf5_with_gaps(tmp_path_factory):
89
107
 
90
108
 
91
109
  @pytest.mark.xfail(raises=OSError)
92
- def test_xarray2hdf5_multi_access(tmp_path_factory):
110
+ def test_xarray2netcdf_multi_access(tmp_path_factory):
93
111
  """
94
112
  Test writing xarray data to hdf5 while the file is open. This is currently
95
113
  not working with NetCDF4. See the following discussions for reference:
96
114
  https://github.com/pydata/xarray/issues/2887
97
115
  https://stackoverflow.com/questions/49701623/is-there-a-way-to-release-the-file-lock-for-a-xarray-dataset
98
116
  """
99
- temp_dir = tmp_path_factory.mktemp('test_xarray2hdf5')
117
+ temp_dir = tmp_path_factory.mktemp('test_xarray2netcdf')
100
118
  xdf1 = generate_test_data(
101
119
  dim=1, ndays=1, tstart=datetime(2022, 7, 18, 8, 0, 0))
102
120
  xdf2 = generate_test_data(
103
121
  dim=1, ndays=1, tstart=datetime(2022, 7, 19, 12, 0, 0))
104
122
 
105
- xarray2hdf5(xdf1, temp_dir)
123
+ xarray2netcdf(xdf1, temp_dir)
106
124
  xdf_dummy = xr.open_dataset(os.path.join(temp_dir, 'rsam.nc'),
107
125
  group='original', engine='h5netcdf')
108
- xarray2hdf5(xdf2, temp_dir)
126
+ xarray2netcdf(xdf2, temp_dir)
109
127
 
110
128
 
111
129
  def test_xarray2zarr(tmp_path_factory):
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes