tonik 0.1.21__py3-none-any.whl → 0.1.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tonik/ingest.py ADDED
@@ -0,0 +1,166 @@
+ # src/tonik/ingest.py
+ import json
+ import logging
+ import os
+ import pickle
+ import threading
+ import uuid
+ from datetime import datetime, timezone
+ from typing import Optional
+
+ import xarray as xr
+
+ from .xarray2netcdf import xarray2netcdf
+ from .xarray2zarr import xarray2zarr
+
+ logger = logging.getLogger(__name__)
+
+ __all__ = ["enqueue_dataset", "IngestWorker"]
+
+
+ def _norm_timeseries(xds: xr.Dataset, timedim: str) -> xr.Dataset:
+     xds = xds.sortby(timedim)
+     xds = xds.drop_duplicates(timedim, keep='last')
+     xds[timedim] = xds[timedim].astype('datetime64[ns]')
+     return xds
+
+
+ def enqueue_dataset(data: xr.Dataset, target_path: str, *, backend: str,
+                     ingest_config: dict, save_kwargs: Optional[dict] = None) -> dict:
+     """
+     Enqueue a dataset for ingestion.
+     Parameters
+     ----------
+     data : xr.Dataset
+         The dataset to enqueue.
+     target_path : str
+         The target path where the dataset should be saved.
+     backend : str
+         The backend to use for saving the dataset ('zarr' or 'netcdf').
+     ingest_config : dict
+         Configuration for the ingest queue, must include 'queue_path'.
+     save_kwargs : Optional[dict], optional
+         Additional keyword arguments to pass to the save function, by default None.
+     Returns
+     -------
+     dict
+         A message dictionary representing the enqueued dataset.
+     """
+
+     queue_path = ingest_config.get("queue_path")
+     if not queue_path:
+         raise ValueError("ingest_config must provide a 'queue_path'.")
+     queue_path = os.path.abspath(queue_path)
+     payload_dir = os.path.join(queue_path, "payloads")
+     message_dir = os.path.join(queue_path, "messages")
+     os.makedirs(payload_dir, exist_ok=True)
+     os.makedirs(message_dir, exist_ok=True)
+     timedim = save_kwargs.get(
+         "timedim", "datetime") if save_kwargs else "datetime"
+
+     if isinstance(data, xr.DataArray):
+         name = data.name or "data"
+         data = data.to_dataset(name=name)
+
+     dataset = _norm_timeseries(data, timedim=timedim)
+     entry_id = uuid.uuid4().hex
+     payload_path = os.path.join(payload_dir, f"{entry_id}.nc")
+     kwargs_path = os.path.join(payload_dir, f"{entry_id}.pkl")
+
+     dataset.to_netcdf(payload_path, engine="h5netcdf")
+     with open(kwargs_path, "wb") as handle:
+         pickle.dump(save_kwargs or {}, handle)
+
+     message = {
+         "id": entry_id,
+         "target_path": os.path.abspath(target_path),
+         "backend": backend,
+         "payload_path": payload_path,
+         "kwargs_path": kwargs_path,
+         "created_at": datetime.now(tz=timezone.utc).isoformat(),
+     }
+     tmp_path = os.path.join(message_dir, f"{entry_id}.json.tmp")
+     final_path = os.path.join(message_dir, f"{entry_id}.json")
+     with open(tmp_path, "w", encoding="utf-8") as handle:
+         json.dump(message, handle)
+     os.replace(tmp_path, final_path)
+     logger.debug("Queued dataset %s for %s backend at %s",
+                  entry_id, backend, target_path)
+     return message
+
+
+ class IngestWorker:
+     def __init__(self, queue_path: str, poll_interval: float = 10.0,
+                  target_prefix: Optional[str] = None):
+         self.queue_path = os.path.abspath(queue_path)
+         self.messages_dir = os.path.join(self.queue_path, "messages")
+         self.payloads_dir = os.path.join(self.queue_path, "payloads")
+         os.makedirs(self.messages_dir, exist_ok=True)
+         os.makedirs(self.payloads_dir, exist_ok=True)
+         self.poll_interval = poll_interval
+         self.target_prefix = os.path.abspath(
+             target_prefix) if target_prefix else None
+
+     def _iter_messages(self):
+         for name in sorted(os.listdir(self.messages_dir)):
+             if not name.endswith(".json"):
+                 continue
+             msg_path = os.path.join(self.messages_dir, name)
+             with open(msg_path, "r", encoding="utf-8") as handle:
+                 message = json.load(handle)
+             target = os.path.abspath(message.get("target_path", ""))
+             if self.target_prefix and not target.startswith(self.target_prefix):
+                 continue
+             yield msg_path, message
+
+     def run_once(self) -> int:
+         processed = 0
+         for msg_path, message in self._iter_messages():
+             payload_path = message.get("payload_path")
+             kwargs_path = message.get("kwargs_path")
+             if not payload_path or not os.path.exists(payload_path):
+                 logger.warning(
+                     "Missing payload for %s, dropping message", msg_path)
+                 os.remove(msg_path)
+                 if kwargs_path and os.path.exists(kwargs_path):
+                     os.remove(kwargs_path)
+                 continue
+
+             dataset = None
+             try:
+                 with xr.open_dataset(payload_path, engine='h5netcdf') as ds_on_disk:
+                     dataset = ds_on_disk.load()
+
+                 kwargs = {}
+                 if kwargs_path and os.path.exists(kwargs_path):
+                     with open(kwargs_path, "rb") as handle:
+                         kwargs = pickle.load(handle)
+
+                 backend = message.get("backend", "zarr")
+                 if backend == "zarr":
+                     xarray2zarr(dataset, message["target_path"], **kwargs)
+                 elif backend == "netcdf":
+                     xarray2netcdf(dataset, message["target_path"], **kwargs)
+                 else:
+                     raise ValueError(f"Unsupported backend '{backend}'")
+             except Exception as exc:
+                 logger.error("Failed to ingest %s: %s",
+                              msg_path, exc, exc_info=True)
+                 continue
+             finally:
+                 if dataset is not None:
+                     dataset.close()
+
+             os.remove(payload_path)
+             if kwargs_path and os.path.exists(kwargs_path):
+                 os.remove(kwargs_path)
+             os.remove(msg_path)
+             processed += 1
+         return processed
+
+     def run_forever(self, stop_event: Optional[threading.Event] = None) -> None:
+         stop_event = stop_event or threading.Event()
+         while not stop_event.is_set():
+             processed = self.run_once()
+             if processed == 0:
+                 stop_event.wait(self.poll_interval)
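
The new module implements a simple file-based hand-off: enqueue_dataset() serialises the dataset to a NetCDF payload plus a pickled kwargs file and then atomically publishes a JSON message into the queue directory, while IngestWorker polls that directory and replays each message against the zarr or netcdf writer. A minimal usage sketch, assuming a local queue directory and a small synthetic dataset; the paths, timestamps and the 'rsam' variable are illustrative, not part of the package:

    import numpy as np
    import pandas as pd
    import xarray as xr

    from tonik.ingest import IngestWorker, enqueue_dataset

    # Illustrative 10-minute feature series using the default 'datetime' dimension.
    times = pd.date_range("2024-01-01", periods=24, freq="10min")
    ds = xr.Dataset({"rsam": ("datetime", np.random.rand(len(times)))},
                    coords={"datetime": times})

    # Producer side: write the payload files and atomically drop a JSON message.
    msg = enqueue_dataset(ds, "/tmp/tonik/archive", backend="zarr",
                          ingest_config={"queue_path": "/tmp/tonik/queue"})

    # Consumer side: drain the queue once; returns the number of messages processed.
    worker = IngestWorker("/tmp/tonik/queue", poll_interval=5.0)
    processed = worker.run_once()
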
tonik/storage.py CHANGED
@@ -1,80 +1,27 @@
+ from datetime import datetime
  import json
  import logging
- import logging.config
  import os
+ import threading
+ from typing import Optional

  import xarray as xr

+ from .ingest import IngestWorker, enqueue_dataset
  from .xarray2netcdf import xarray2netcdf
  from .xarray2zarr import xarray2zarr

- LOGGING_CONFIG = {
-     "version": 1,
-     "disable_existing_loggers": False,
-     "formatters": {
-         "default": { # The formatter name, it can be anything that I wish
-             # What to add in the message
-             "format": "%(asctime)s:%(name)s:%(process)d:%(lineno)d " "%(levelname)s %(message)s",
-             "datefmt": "%Y-%m-%d %H:%M:%S", # How to display dates
-         },
-         "json": { # The formatter name
-             "()": "pythonjsonlogger.json.JsonFormatter", # The class to instantiate!
-             # Json is more complex, but easier to read, display all attributes!
-             "format": """
-                 asctime: %(asctime)s
-                 created: %(created)f
-                 filename: %(filename)s
-                 funcName: %(funcName)s
-                 levelname: %(levelname)s
-                 levelno: %(levelno)s
-                 lineno: %(lineno)d
-                 message: %(message)s
-                 module: %(module)s
-                 msec: %(msecs)d
-                 name: %(name)s
-                 pathname: %(pathname)s
-                 process: %(process)d
-                 processName: %(processName)s
-                 relativeCreated: %(relativeCreated)d
-                 thread: %(thread)d
-                 threadName: %(threadName)s
-                 exc_info: %(exc_info)s
-             """,
-             "datefmt": "%Y-%m-%d %H:%M:%S", # How to display dates
-         },
-     },
-     "handlers": {
-         "simple": { # The handler name
-             "formatter": "default", # Refer to the formatter defined above
-             "class": "logging.StreamHandler", # OUTPUT: Same as above, stream to console
-             "stream": "ext://sys.stdout",
-         },
-     },
-     "loggers": {
-         "storage": { # The name of the logger, this SHOULD match your module!
-             "level": "DEBUG", # FILTER: only INFO logs onwards from "tryceratops" logger
-             "handlers": [
-                 "simple", # Refer the handler defined above
-             ],
-         },
-     },
-     "root": {
-         "level": "INFO", # FILTER: only INFO logs onwards
-         "handlers": [
-             "simple", # Refer the handler defined above
-         ]
-     },
- }
-
- logging.config.dictConfig(LOGGING_CONFIG)
- logger = logging.getLogger("__name__")
+
+ logger = logging.getLogger(__name__)


  class Path(object):
-     def __init__(self, name, parentdir, create=True, backend='zarr'):
+     def __init__(self, name, parentdir, create=True, backend='zarr',
+                  archive_starttime=datetime(2000, 1, 1), ingest_config=None):
          self.name = name
          self.create = create
          self.backend = backend
+         self.archive_starttime = archive_starttime
          self.engine = 'h5netcdf' if self.backend == 'netcdf' else self.backend
          self.path = os.path.join(parentdir, name)
          if create:
@@ -86,6 +33,7 @@ class Path(object):
          if not os.path.exists(self.path):
              raise FileNotFoundError(f"Path {self.path} not found")
          self.children = {}
+         self.ingest_config = ingest_config.copy() if ingest_config else None

      def __str__(self):
          return self.path
@@ -97,7 +45,8 @@ class Path(object):
              return self.children[key]
          except KeyError:
              self.children[key] = Path(
-                 key, self.path, self.create, self.backend)
+                 key, self.path, self.create, self.backend, self.archive_starttime,
+                 ingest_config=self.ingest_config)
          return self.children[key]

      def feature_path(self, feature):
@@ -149,10 +98,24 @@ class Path(object):
          """
          Save a feature to disk
          """
+         if self.ingest_config and self.ingest_config.get('queue_path'):
+             enqueue_dataset(
+                 data,
+                 target_path=self.path,
+                 backend=self.backend,
+                 ingest_config=self.ingest_config,
+                 save_kwargs=kwargs,
+             )
+             logger.debug("Queued data for %s backend at %s",
+                          self.backend, self.path)
+             return
+
          if self.backend == 'netcdf':
-             xarray2netcdf(data, self.path, **kwargs)
+             xarray2netcdf(data, self.path,
+                           archive_starttime=self.archive_starttime, **kwargs)
          elif self.backend == 'zarr':
-             xarray2zarr(data, self.path, **kwargs)
+             xarray2zarr(data, self.path,
+                         archive_starttime=self.archive_starttime, **kwargs)

      def shape(self, feature):
          """
@@ -208,11 +171,17 @@ class Storage(Path):
      >>> rsam = c("rsam")
      """

-     def __init__(self, name, rootdir, starttime=None, endtime=None, create=True, backend='netcdf'):
+     def __init__(self, name, rootdir, starttime=None, endtime=None, create=True, backend='netcdf',
+                  ingest_config=None, archive_starttime=datetime(2000, 1, 1)):
          self.stores = set()
          self.starttime = starttime
          self.endtime = endtime
-         super().__init__(name, rootdir, create, backend)
+         self.archive_starttime = archive_starttime
+         self._ingest_worker: Optional[IngestWorker] = None
+         self._ingest_thread: Optional[threading.Thread] = None
+         self._ingest_stop_event: Optional[threading.Event] = None
+         super().__init__(name, rootdir, create, backend, archive_starttime,
+                          ingest_config=ingest_config)

      def print_tree(self, site, indent=0, output=''):
          output += ' ' * indent + site.path + '\n'
@@ -317,3 +286,50 @@ class Storage(Path):

      starttime = property(get_starttime, set_starttime)
      endtime = property(get_endtime, set_endtime)
+
+     def _ensure_ingest_worker(self, poll_interval=None) -> IngestWorker:
+         if not (self.ingest_config and self.ingest_config.get('queue_path')):
+             raise RuntimeError(
+                 "Ingestion queue is not configured for this Storage instance.")
+
+         if self._ingest_worker is None:
+             queue_path = self.ingest_config['queue_path']
+             poll = poll_interval or self.ingest_config.get(
+                 'poll_interval', 10.0)
+             self._ingest_worker = IngestWorker(
+                 queue_path=queue_path,
+                 poll_interval=poll
+             )
+         elif poll_interval:
+             self._ingest_worker.poll_interval = poll_interval
+         return self._ingest_worker
+
+     def run_ingest_once(self, poll_interval=None) -> int:
+         worker = self._ensure_ingest_worker(poll_interval)
+         return worker.run_once()
+
+     def start_ingest_worker(self, *, background=True, poll_interval=None):
+         worker = self._ensure_ingest_worker(poll_interval)
+         if not background:
+             return worker.run_once()
+         if self._ingest_thread and self._ingest_thread.is_alive():
+             return self._ingest_thread
+         stop_event = threading.Event()
+         thread = threading.Thread(
+             target=worker.run_forever,
+             kwargs={'stop_event': stop_event},
+             daemon=True,
+             name=f"tonik-ingest-{self.name}",
+         )
+         thread.start()
+         self._ingest_thread = thread
+         self._ingest_stop_event = stop_event
+         return thread
+
+     def stop_ingest_worker(self, timeout=None):
+         if self._ingest_thread and self._ingest_thread.is_alive():
+             if self._ingest_stop_event:
+                 self._ingest_stop_event.set()
+             self._ingest_thread.join(timeout=timeout)
+         self._ingest_thread = None
+         self._ingest_stop_event = None
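
Taken together, the Storage changes mean save() either writes through xarray2zarr/xarray2netcdf directly (now passing archive_starttime) or, when ingest_config carries a queue_path, defers to the queue and lets a worker drain it. A sketch of the queued path based on the signatures visible in this diff; the directories and the feature dataset are placeholders:

    import numpy as np
    import pandas as pd
    import xarray as xr

    from tonik.storage import Storage

    # Illustrative feature dataset on the default 'datetime' dimension.
    times = pd.date_range("2024-01-01", periods=144, freq="10min")
    feature = xr.Dataset({"rsam": ("datetime", np.random.rand(len(times)))},
                         coords={"datetime": times})

    # With a queue_path in ingest_config, save() enqueues instead of writing.
    sg = Storage("volcano", rootdir="/tmp/tonik/archive", backend="zarr",
                 ingest_config={"queue_path": "/tmp/tonik/queue",
                                "poll_interval": 5.0})
    sg.save(feature)                   # handed to enqueue_dataset(), not written yet

    sg.run_ingest_once()               # drain the queue in the calling thread, or
    sg.start_ingest_worker()           # run it on a daemon thread via run_forever()
    sg.stop_ingest_worker(timeout=30)
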
tonik/xarray2zarr.py CHANGED
@@ -16,29 +16,75 @@ from .utils import merge_arrays, fill_time_gaps, get_dt
  logger = logging.getLogger(__name__)


- def get_chunks(xda: xr.DataArray, chunks: int = 1,
-                timedim: str = 'datetime') -> dict:
+ def _init_timeseries_store(path: str, start: np.datetime64, stop: np.datetime64, interval: pd.Timedelta,
+                            data_vars: dict, group: str = "original", chunk_size: int = 10,
+                            timedim: str = "datetime") -> xr.DataArray:
      """
-     Determine the chunk size for the datetime dimension. Other dimensions are assumed to be
-     small enough to not require chunking.
+     Initialize an empty zarr store for time series data. This facilitates writing data out
+     of sequence and avoid prepending which is costly and difficult to get right.

      Parameters
      ----------
-     coords : xr.core.coordinates.DatasetCoordinates
-         Coordinates of the dataset.
-     chunks : int, optional
-         Number of chunks in days to divide the datetime dimension into, by default 1.
-     """
-     if timedim not in xda.coords:
-         raise ValueError(
-             f"Datetime coordinate {timedim} not found in dataset coordinates.")
-     dt = get_dt(xda.coords[timedim])
-     chunklength = int(pd.Timedelta('%dD' % chunks) / dt)
-     return chunklength
-
+     path : str
+         Path to the zarr store.
+     start : np.datetime64
+         Start time of the zarr store.
+     stop : np.datetime64
+         End time of the zarr store.
+     interval : pd.Timedelta
+         Sampling interval string (e.g. '1H', '15T') for the time dimension
+     data_vars : dict
+         Dictionary defining the data variables to create. Keys are variable names,
+         values are tuples of (dims, shape, dtype) where dims is a tuple of dimension
+         names (excluding the time dimension), shape is a tuple of dimension sizes
+         (excluding the time dimension), and dtype is the numpy data type.
+     group : str, optional
+         Group name in the zarr store, by default "original"
+     chunk_size : int, optional
+         Chunk size in number of time steps, by default 10
+     timedim : str, optional
+         Name of the time dimension, by default "datetime"

- def fill_time_gaps_between_datasets(xds_existing: xr.DataArray, xds_new: xr.DataArray, mode: str,
-                                     timedim: str = 'datetime') -> xr.DataArray:
+     """
+     # Make the zarr store a multiple of chunk_size
+     stop_ts = pd.Timestamp(stop)
+     start_ts = pd.Timestamp(start)
+     chunk_length = int(chunk_size)
+     if chunk_length <= 0:
+         raise ValueError("chunk_size must be a positive integer")
+     total_steps = int((stop_ts - start_ts) // interval) + 1
+     if total_steps < 1:
+         total_steps = chunk_length
+     if total_steps % chunk_length:
+         required_steps = ((total_steps + chunk_length - 1) //
+                           chunk_length) * chunk_length
+         start_ts = stop_ts - interval * (required_steps - 1)
+     time_index = pd.date_range(start=start_ts, end=stop_ts, freq=interval)
+     ds = xr.Dataset()
+     name, value = list(data_vars.items())[0]
+     dims, coords, shape, dtype = value
+     dims = dims + (timedim,)
+     shape = tuple(shape) + (len(time_index),)
+     # Create coordinates for gap dataset
+     new_coords = {timedim: time_index}
+     for coord_name, coord in coords.items():
+         if coord_name != timedim:
+             new_coords[coord_name] = coord
+
+     xda = xr.DataArray(
+         np.full(shape, np.nan, dtype=dtype),
+         coords=new_coords,
+         dims=dims,
+         name=name
+     )
+     xda = xda.chunk(
+         {timedim: chunk_size, **{d: -1 for d in dims[:-1]}})
+     xda.to_zarr(path, group=group, mode="w")
+     return xda
+
+
+ def _fill_time_gaps_between_datasets(xds_existing: xr.DataArray, xds_new: xr.DataArray, interval: pd.Timedelta,
+                                      timedim: str = 'datetime', chunk_size: int = 10) -> xr.DataArray:
      """
      Fill gaps between existing and new datasets.

@@ -56,29 +102,23 @@ def fill_time_gaps_between_datasets(xds_existing: xr.DataArray, xds_new: xr.Data
      xr.Dataset
          Combined dataset with gaps filled
      """
-     if mode not in ['a', 'p']:
-         raise ValueError(
-             'Mode has to be either "a" for append or "p" for prepend')
-
-     # get the sample interval
-     dt = get_dt(xds_new.coords[timedim])

      existing_endpoint = xds_existing[timedim].values
      # Get time ranges
-     if mode == 'a':
-         gap_start = existing_endpoint + dt
-         gap_end = xds_new[timedim].values[0] - dt
-     elif mode == 'p':
-         gap_end = existing_endpoint - dt
-         gap_start = xds_new[timedim].values[-1] + dt
+     gap_start = existing_endpoint + interval
+     gap_end = xds_new[timedim].values[0] - interval
+
+     # Prepare shape for gap filling
+     shape_list = list(xds_new.shape)
+     dims_list = list(xds_new.dims)
+     shape_list.pop(dims_list.index(timedim))

      if gap_start <= gap_end:
-         gap_times = pd.date_range(start=gap_start, end=gap_end, freq=dt)
+         gap_times = pd.date_range(start=gap_start, end=gap_end, freq=interval)

          # Create NaN array with same shape as variable but for gap times
-         gap_shape = (len(gap_times),) + \
-             xds_new.shape[1:] # Skip time dimension
-         gap_values = np.full(gap_shape, np.nan)
+         gap_shape = tuple(shape_list) + (len(gap_times),)
+         gap_values = np.full(gap_shape, np.nan, dtype=xds_new.dtype)

          # Create coordinates for gap dataset
          gap_coords = {timedim: gap_times}
@@ -94,70 +134,28 @@ def fill_time_gaps_between_datasets(xds_existing: xr.DataArray, xds_new: xr.Data
          )

          # Combine: existing + gap + new
-         if mode == 'a':
-             combined = xr.concat([gap_data, xds_new], dim=timedim)
-         elif mode == 'p':
-             combined = xr.concat([xds_new, gap_data], dim=timedim)
-         return combined
+         combined = xr.concat([gap_data, xds_new], dim=timedim)
      else:
-         return xds_new
-
-
- def _build_append_payload_full_chunks(payload: xr.DataArray, mode: str,
-                                       chunklen: int, timedim: str = "datetime") -> xr.DataArray:
-     """
-     Construct the sequence to append so that the final total length is a multiple of `chunklen`
-
-     Parameters
-     ----------
-     payload : xr.DataArray
-         DataArray to append
-     mode : str
-         'a' for append, 'p' for prepend
-     chunklen : int
-         Chunk length in number of time steps
-     timedim : str
-         Name of the time dimension
-
-     Returns
-     -------
-     xr.DataArray
-         Padded DataArray with length a multiple of chunklen
-     """
-     if mode not in ['a', 'p']:
-         raise ValueError(
-             'Mode has to be either "a" for append or "p" for prepend')
-
-     # pad the tail so that payload_len % chunklen == 0
-     pay_len = payload.sizes[timedim]
-     need = -pay_len % chunklen # 0..chunklen-1
+     combined = xds_new

+     # ensure new array aligns with chunk size
+     arr_len = combined.sizes[timedim]
+     need = -arr_len % chunk_size # 0..chunklen-1
      if need > 0:
-         dt = get_dt(payload.coords[timedim])
-         if mode == 'a':
-             start = payload[timedim].values[-1] + dt
-         elif mode == 'p':
-             start = payload[timedim].values[0] - (need+1)*dt
-         pad_times = pd.date_range(start=start, periods=need, freq=dt)
-         pad_shape = []
-         for i, d in enumerate(payload.dims):
-             if d == timedim:
-                 pad_shape.append(need)
-             else:
-                 pad_shape.append(payload.shape[i])
-         pad_vals = np.full(pad_shape, np.nan)
+         start = combined[timedim].values[-1] + interval
+         pad_times = pd.date_range(start=start, periods=need, freq=interval)
+         pad_shape = tuple(shape_list) + (len(pad_times),)
+         pad_vals = np.full(pad_shape, np.nan, dtype=xds_new.dtype)
          pad_coords = {timedim: pad_times}
-         for c in payload.coords:
-             if c != timedim:
-                 pad_coords[c] = payload.coords[c]
+         for coord_name, coord in xds_new.coords.items():
+             if coord_name != timedim:
+                 pad_coords[coord_name] = coord
          pad_da = xr.DataArray(pad_vals, coords=pad_coords,
-                               dims=payload.dims, name=payload.name, attrs=payload.attrs)
-         if mode == 'a':
-             payload = xr.concat([payload, pad_da], dim=timedim)
-         elif mode == 'p':
-             payload = xr.concat([pad_da, payload], dim=timedim)
-         payload = payload.chunk({timedim: chunklen})
-     return payload
+                               dims=xds_new.dims,
+                               name=xds_new.name)
+         combined = xr.concat([combined, pad_da], dim=timedim)
+
+     return combined


  def _update_meta_data(fout: str,
@@ -179,7 +177,7 @@ def _update_meta_data(fout: str,
          Group name for metadata.
      """

-     now = np.datetime64(datetime.now(tz=timezone.utc), 'ns')
+     now = np.datetime64(datetime.now(tz=timezone.utc), 's')
      new_update = xr.DataArray([now],
                                coords={'update': [now]},
                                dims=['update'],
@@ -212,8 +210,9 @@ def _update_meta_data(fout: str,
      xr.Dataset(vars).to_zarr(fout, group=meta_group, mode='w')


- def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a', group='original',
-                 chunks: int = 10, timedim: str = 'datetime') -> None:
+ def xarray2zarr(xds: xr.Dataset, path: str, group='original',
+                 chunk_size: int = 1000, timedim: str = 'datetime', interval: str = None,
+                 archive_starttime: datetime = datetime(2000, 1, 1)) -> None:
      """
      Write xarray dataset to zarr files.

@@ -227,7 +226,7 @@ def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a', group='original',
          Write mode, by default 'a'.
      group : str, optional
          Group name, by default 'original'
-     chunks : int, optional
+     chunk_size : int, optional
          Chunk size as the number of days.
      timedim : str
          Name of the time dimension, by default 'datetime'
@@ -245,14 +244,16 @@ def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a', group='original',
      # Fill gaps
      xds = xds.drop_duplicates(timedim, keep='last')
      xds = fill_time_gaps(xds, timedim=timedim)
+     if interval is None:
+         interval = get_dt(xds[timedim])
+     else:
+         interval = pd.to_timedelta(interval)

      for feature in xds.data_vars.keys():
          fout = os.path.join(path, feature + '.zarr')
-         # nchunks = get_chunks(xds[feature], chunks)
-         nchunks = chunks
          last_dp = xds[feature][timedim].values[-1]
          _update_meta_data(fout, last_dp, resolution=float(
-             get_dt(xds[timedim]) / pd.Timedelta(1, 'h')))
+             interval / pd.Timedelta(1, 'h')))
          try:
              xds_existing = xr.open_zarr(fout, group=group)
              has_store = True
@@ -260,57 +261,61 @@ def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a', group='original',
              has_store = False

          if not has_store:
-             xda_new = _build_append_payload_full_chunks(
-                 xds[feature], 'a', nchunks)
-             xda_new.to_zarr(fout, group=group, mode='w',
-                             write_empty_chunks=True)
-             continue
-
-         if xds_existing[timedim][0] > xds[timedim][-1]:
-             logger.debug("Prepending data to existing zarr store.")
-             xda_new = fill_time_gaps_between_datasets(xds_existing[feature].isel({timedim: 0}),
-                                                       xds[feature], mode='p')
-             xda_new = _build_append_payload_full_chunks(
-                 xda_new, 'p', nchunks)
-             combined = xda_new.combine_first(xds_existing[feature]).compute()
-             combined.chunk({timedim: nchunks}).to_zarr(fout, group=group, mode='w',
-                                                        write_empty_chunks=True)
+             logger.debug("Creating new zarr store.")
+             shape_list = list(xds[feature].shape)
+             dims_list = list(xds[feature].dims)
+             shape_list.pop(dims_list.index(timedim))
+             dims_list.pop(dims_list.index(timedim))
+             xds_existing = _init_timeseries_store(
+                 fout,
+                 start=np.datetime64(archive_starttime),
+                 stop=xds[feature][timedim].values[-1],
+                 interval=interval,
+                 data_vars={
+                     feature: (tuple(dims_list), xds[feature].coords,
+                               tuple(shape_list), xds[feature].dtype)},
+                 group=group,
+                 chunk_size=chunk_size,
+                 timedim=timedim
+             )
+
+         if xds_existing[timedim][0] > xds[timedim][0]:
+             raise ValueError("New data ends before existing data starts. "
+                              "Prepending to existing data is currently not supported.")

          elif xds_existing[timedim][-1] < xds[timedim][0]:
              logger.debug("Appending data to existing zarr store.")
-             xda_new = fill_time_gaps_between_datasets(xds_existing[feature].isel({timedim: -1}),
-                                                       xds[feature], mode='a')
-             xda_new = _build_append_payload_full_chunks(
-                 xda_new, 'a', nchunks)
+             xda_new = _fill_time_gaps_between_datasets(xds_existing[feature].isel({timedim: -1}),
+                                                        xds[feature], interval, chunk_size=chunk_size)
              xda_new.to_zarr(fout, group=group, mode='a',
                              append_dim=timedim)
-
-         elif xds_existing[timedim][0] > xds[timedim][0] and xds_existing[timedim][-1] < xds[timedim][-1]:
-             logger.debug(
-                 "Data in zarr store contained in new data. Rewriting zarr store.")
-             xda_new = _build_append_payload_full_chunks(
-                 xds[feature], 'a', nchunks)
-             xda_new.to_zarr(fout, group=group, mode='w',
-                             write_empty_chunks=True)
-
          else:
              logger.debug("Data in zarr store overlaps with new data.")
              logger.debug(
                  f"Endtime of existing data: {xds_existing[timedim][-1].values}")
             logger.debug(f"Starttime of new data: {xds[timedim][0].values}")
-             xds_existing = xds_existing.drop_duplicates(timedim, keep='last')
-             overlap = xds_existing[timedim].where(
-                 xds_existing[timedim] == xds[timedim])
-             xds[feature].loc[{timedim: overlap}].to_zarr(
-                 fout, group=group, mode='r+', region='auto')
-             remainder = xds[feature].drop_sel({timedim: overlap})
+             existing_times = xds_existing[timedim].values
+             new_times = xds[timedim].values
+
+             overlap_times, idx_existing, idx_new = np.intersect1d(
+                 existing_times,
+                 new_times,
+                 assume_unique=True,
+                 return_indices=True,
+             )
+             region = {}
+             for dim in xds[feature].dims:
+                 if dim == timedim:
+                     start = int(idx_existing.min())
+                     stop = start + len(idx_existing)
+                     region[dim] = slice(start, stop)
+                 else:
+                     region[dim] = 'auto'
+             xds[feature].isel({timedim: idx_new}).to_zarr(
+                 fout, group=group, mode='r+', region=region)
+             remainder = xds[feature].drop_sel({timedim: new_times[idx_new]})
              if remainder.sizes[timedim] > 0:
-                 mode = 'a'
-                 if remainder[timedim][-1] < xds_existing[timedim][0]:
-                     mode = 'p'
-                 xda_new = fill_time_gaps_between_datasets(xds_existing[feature].isel({timedim: 0}),
-                                                           xds[feature], mode=mode)
-                 xda_new = _build_append_payload_full_chunks(
-                     xda_new, mode, nchunks)
+                 xda_new = _fill_time_gaps_between_datasets(xds_existing[feature].isel({timedim: -1}),
+                                                            remainder, interval, chunk_size=chunk_size)
                  xda_new.to_zarr(fout, group=group, mode='a',
                                  append_dim=timedim)
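
The rewritten writer no longer takes a mode or a chunks-in-days argument: on first write it pre-allocates an empty, NaN-filled store whose time axis runs from roughly archive_starttime to the end of the new data, rounded up to a multiple of chunk_size, so later calls only ever append whole chunks or overwrite existing time steps with region writes. A short sketch of a direct call, assuming a 10-minute series; the path and the 'rsam' variable are illustrative:

    from datetime import datetime

    import numpy as np
    import pandas as pd
    import xarray as xr

    from tonik.xarray2zarr import xarray2zarr

    times = pd.date_range("2024-01-01", periods=1440, freq="10min")
    xds = xr.Dataset({"rsam": ("datetime", np.random.rand(len(times)))},
                     coords={"datetime": times})

    # First call creates /tmp/tonik/archive/rsam.zarr, pre-allocated back towards
    # archive_starttime; a later call with newer data appends or region-writes in place.
    xarray2zarr(xds, "/tmp/tonik/archive", chunk_size=1000,
                archive_starttime=datetime(2024, 1, 1))
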
tonik-0.1.21.dist-info/METADATA → tonik-0.1.22.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: tonik
- Version: 0.1.21
+ Version: 0.1.22
  Summary: Store time series data as HDF5 files and access them through an API.
  Project-URL: Homepage, https://tsc-tools.github.io/tonik
  Project-URL: Issues, https://github.com/tsc-tools/tonik/issues
@@ -9,29 +9,29 @@ License-File: LICENSE
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
- Requires-Python: >=3.9
- Requires-Dist: datashader>=0.14
- Requires-Dist: fastapi>=0.112
- Requires-Dist: h5netcdf>=1.1
- Requires-Dist: h5py>=3.8
- Requires-Dist: matplotlib
- Requires-Dist: netcdf4>=1.6
- Requires-Dist: pandas>=2.0
- Requires-Dist: python-json-logger>=2.0
- Requires-Dist: s3fs
- Requires-Dist: uvicorn[standard]>=0.22
- Requires-Dist: xarray[accel,io,parallel]
- Requires-Dist: zarr
+ Requires-Python: >=3.10
+ Requires-Dist: datashader<0.19,>=0.18.2
+ Requires-Dist: fastapi<0.129,>=0.128.0
+ Requires-Dist: h5netcdf<2,>=1.7.3
+ Requires-Dist: h5py<4,>=3.15.1
+ Requires-Dist: matplotlib<4,>=3.10.8
+ Requires-Dist: pandas<3,>=2.3.3
+ Requires-Dist: s3fs<2026,>=2025.12.0
+ Requires-Dist: uvicorn[standard]<0.41,>=0.40.0
+ Requires-Dist: xarray[accel,io,parallel]<2026,>=2025.6.1
+ Requires-Dist: zarr<4,>=3.1.5
  Provides-Extra: dev
- Requires-Dist: build; extra == 'dev'
- Requires-Dist: httpx; extra == 'dev'
- Requires-Dist: ipykernel; extra == 'dev'
- Requires-Dist: mkdocs; extra == 'dev'
- Requires-Dist: mkdocs-jupyter; extra == 'dev'
- Requires-Dist: mkdocstrings[python]; extra == 'dev'
+ Requires-Dist: build<2,>=1.4.0; extra == 'dev'
+ Requires-Dist: hatch<2,>=1.16.2; extra == 'dev'
+ Requires-Dist: httpx<0.29,>=0.28.1; extra == 'dev'
+ Requires-Dist: ipykernel<7,>=6.31.0; extra == 'dev'
+ Requires-Dist: mkdocs-jupyter<0.26,>=0.25.1; extra == 'dev'
+ Requires-Dist: mkdocs<2,>=1.6.1; extra == 'dev'
+ Requires-Dist: mkdocstrings[python]<2,>=1.0.0; extra == 'dev'
+ Requires-Dist: moto[s3]<6,>=5.1.19; extra == 'dev'
  Requires-Dist: pytest; extra == 'dev'
- Requires-Dist: twine; extra == 'dev'
- Requires-Dist: zarr[remote-tests]; extra == 'dev'
+ Requires-Dist: twine<7,>=6.2.0; extra == 'dev'
+ Requires-Dist: zarr[remote-tests]<4,>=3.1.5; extra == 'dev'
  Description-Content-Type: text/markdown

  # Tonik
tonik-0.1.21.dist-info/RECORD → tonik-0.1.22.dist-info/RECORD RENAMED
@@ -1,14 +1,15 @@
  tonik/__init__.py,sha256=dov-nMeGFBzLspmj4rWKjC4r736vmaPDgMEkHSUfP98,523
  tonik/api.py,sha256=vW0ykOo5iGAV0_WuOepdrnUyFp83F7KyJTd43ksLmUk,7985
  tonik/grafana_annotations.py,sha256=ZU9Cy-HT4vvMfYIQzD9WboaDVOCBDv__NmXbk1qKWJo,5838
- tonik/storage.py,sha256=bYBl3JPpH8D3iIFOj5AZQXc4M8txAbwFKt4xTfdotgg,10583
+ tonik/ingest.py,sha256=RWJLasAVM8iaoCK5HCXEXybXARupw58Im0Ic7KrAThk,6228
+ tonik/storage.py,sha256=zHXrIjbSPC3Sni1_KOn_OqCk0HtWaOyXgAhMTTdO18w,11500
  tonik/utils.py,sha256=GwAXfGFQWhlsLThQvSux1SooRkW-iIkJP99JMH72t5Y,11791
  tonik/xarray2netcdf.py,sha256=nq6RHk5ciaAg1bxNDiyHPRdAts1C7fj7jtDbaLaSTWM,6497
- tonik/xarray2zarr.py,sha256=kRhgDdo8CDT1ceszwQEQNfXdgbnmL5nNejUzaMnyFXM,11707
+ tonik/xarray2zarr.py,sha256=HeqKBArNcYUzd_azgCK0iptq1qAA6h2j4brfIpkV_gs,12156
  tonik/package_data/index.html,sha256=ZCZ-BtGRERsL-6c_dfY43qd2WAaggH7xereennGL6ww,4372
  tonik/package_data/whakaari_labels.json,sha256=96UZSq41yXgAJxuKivLBKlRTw-33jkjh7AGKTsDQ9Yg,3993
- tonik-0.1.21.dist-info/METADATA,sha256=rEp5KTN5xDizNLajJkP3i6lvGI02hrR-sqqfIRfn4M0,2207
- tonik-0.1.21.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- tonik-0.1.21.dist-info/entry_points.txt,sha256=y82XyTeQddM87gCTzgSQaTlKF3VFicO4hhClHUv6j1A,127
- tonik-0.1.21.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
- tonik-0.1.21.dist-info/RECORD,,
+ tonik-0.1.22.dist-info/METADATA,sha256=sjriW0whFAAo3VFxdu-1xlND0sw7RqcbULJz5HRUT-Q,2424
+ tonik-0.1.22.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ tonik-0.1.22.dist-info/entry_points.txt,sha256=y82XyTeQddM87gCTzgSQaTlKF3VFicO4hhClHUv6j1A,127
+ tonik-0.1.22.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+ tonik-0.1.22.dist-info/RECORD,,
File without changes