tonik 0.1.21__py3-none-any.whl → 0.1.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tonik/ingest.py +166 -0
- tonik/storage.py +83 -67
- tonik/xarray2zarr.py +146 -141
- {tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/METADATA +22 -22
- {tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/RECORD +8 -7
- {tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/WHEEL +0 -0
- {tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/entry_points.txt +0 -0
- {tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/licenses/LICENSE +0 -0
tonik/ingest.py
ADDED
@@ -0,0 +1,166 @@
+# src/tonik/ingest.py
+import json
+import logging
+import os
+import pickle
+import threading
+import uuid
+from datetime import datetime, timezone
+from typing import Optional
+
+import xarray as xr
+
+from .xarray2netcdf import xarray2netcdf
+from .xarray2zarr import xarray2zarr
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["enqueue_dataset", "IngestWorker"]
+
+
+def _norm_timeseries(xds: xr.Dataset, timedim: str) -> xr.Dataset:
+    xds = xds.sortby(timedim)
+    xds = xds.drop_duplicates(timedim, keep='last')
+    xds[timedim] = xds[timedim].astype('datetime64[ns]')
+    return xds
+
+
+def enqueue_dataset(data: xr.Dataset, target_path: str, *, backend: str,
+                    ingest_config: dict, save_kwargs: Optional[dict] = None) -> dict:
+    """
+    Enqueue a dataset for ingestion.
+    Parameters
+    ----------
+    data : xr.Dataset
+        The dataset to enqueue.
+    target_path : str
+        The target path where the dataset should be saved.
+    backend : str
+        The backend to use for saving the dataset ('zarr' or 'netcdf').
+    ingest_config : dict
+        Configuration for the ingest queue, must include 'queue_path'.
+    save_kwargs : Optional[dict], optional
+        Additional keyword arguments to pass to the save function, by default None.
+    Returns
+    -------
+    dict
+        A message dictionary representing the enqueued dataset.
+    """
+
+    queue_path = ingest_config.get("queue_path")
+    if not queue_path:
+        raise ValueError("ingest_config must provide a 'queue_path'.")
+    queue_path = os.path.abspath(queue_path)
+    payload_dir = os.path.join(queue_path, "payloads")
+    message_dir = os.path.join(queue_path, "messages")
+    os.makedirs(payload_dir, exist_ok=True)
+    os.makedirs(message_dir, exist_ok=True)
+    timedim = save_kwargs.get(
+        "timedim", "datetime") if save_kwargs else "datetime"
+
+    if isinstance(data, xr.DataArray):
+        name = data.name or "data"
+        data = data.to_dataset(name=name)
+
+    dataset = _norm_timeseries(data, timedim=timedim)
+    entry_id = uuid.uuid4().hex
+    payload_path = os.path.join(payload_dir, f"{entry_id}.nc")
+    kwargs_path = os.path.join(payload_dir, f"{entry_id}.pkl")
+
+    dataset.to_netcdf(payload_path, engine="h5netcdf")
+    with open(kwargs_path, "wb") as handle:
+        pickle.dump(save_kwargs or {}, handle)
+
+    message = {
+        "id": entry_id,
+        "target_path": os.path.abspath(target_path),
+        "backend": backend,
+        "payload_path": payload_path,
+        "kwargs_path": kwargs_path,
+        "created_at": datetime.now(tz=timezone.utc).isoformat(),
+    }
+    tmp_path = os.path.join(message_dir, f"{entry_id}.json.tmp")
+    final_path = os.path.join(message_dir, f"{entry_id}.json")
+    with open(tmp_path, "w", encoding="utf-8") as handle:
+        json.dump(message, handle)
+    os.replace(tmp_path, final_path)
+    logger.debug("Queued dataset %s for %s backend at %s",
+                 entry_id, backend, target_path)
+    return message
+
+
+class IngestWorker:
+    def __init__(self, queue_path: str, poll_interval: float = 10.0,
+                 target_prefix: Optional[str] = None):
+        self.queue_path = os.path.abspath(queue_path)
+        self.messages_dir = os.path.join(self.queue_path, "messages")
+        self.payloads_dir = os.path.join(self.queue_path, "payloads")
+        os.makedirs(self.messages_dir, exist_ok=True)
+        os.makedirs(self.payloads_dir, exist_ok=True)
+        self.poll_interval = poll_interval
+        self.target_prefix = os.path.abspath(
+            target_prefix) if target_prefix else None
+
+    def _iter_messages(self):
+        for name in sorted(os.listdir(self.messages_dir)):
+            if not name.endswith(".json"):
+                continue
+            msg_path = os.path.join(self.messages_dir, name)
+            with open(msg_path, "r", encoding="utf-8") as handle:
+                message = json.load(handle)
+            target = os.path.abspath(message.get("target_path", ""))
+            if self.target_prefix and not target.startswith(self.target_prefix):
+                continue
+            yield msg_path, message
+
+    def run_once(self) -> int:
+        processed = 0
+        for msg_path, message in self._iter_messages():
+            payload_path = message.get("payload_path")
+            kwargs_path = message.get("kwargs_path")
+            if not payload_path or not os.path.exists(payload_path):
+                logger.warning(
+                    "Missing payload for %s, dropping message", msg_path)
+                os.remove(msg_path)
+                if kwargs_path and os.path.exists(kwargs_path):
+                    os.remove(kwargs_path)
+                continue
+
+            dataset = None
+            try:
+                with xr.open_dataset(payload_path, engine='h5netcdf') as ds_on_disk:
+                    dataset = ds_on_disk.load()
+
+                kwargs = {}
+                if kwargs_path and os.path.exists(kwargs_path):
+                    with open(kwargs_path, "rb") as handle:
+                        kwargs = pickle.load(handle)
+
+                backend = message.get("backend", "zarr")
+                if backend == "zarr":
+                    xarray2zarr(dataset, message["target_path"], **kwargs)
+                elif backend == "netcdf":
+                    xarray2netcdf(dataset, message["target_path"], **kwargs)
+                else:
+                    raise ValueError(f"Unsupported backend '{backend}'")
+            except Exception as exc:
+                logger.error("Failed to ingest %s: %s",
+                             msg_path, exc, exc_info=True)
+                continue
+            finally:
+                if dataset is not None:
+                    dataset.close()
+
+            os.remove(payload_path)
+            if kwargs_path and os.path.exists(kwargs_path):
+                os.remove(kwargs_path)
+            os.remove(msg_path)
+            processed += 1
+        return processed
+
+    def run_forever(self, stop_event: Optional[threading.Event] = None) -> None:
+        stop_event = stop_event or threading.Event()
+        while not stop_event.is_set():
+            processed = self.run_once()
+            if processed == 0:
+                stop_event.wait(self.poll_interval)
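Taken together, the new module implements a simple file-based queue: a producer serialises the dataset and its save kwargs into a payload directory and drops a JSON message, and a worker replays those messages through xarray2zarr or xarray2netcdf. A minimal usage sketch follows; the directory paths, feature name, and data are invented for illustration and are not part of the package.

import os

import numpy as np
import pandas as pd
import xarray as xr

from tonik.ingest import IngestWorker, enqueue_dataset

# Hypothetical feature data: one day of hourly RSAM values.
times = pd.date_range("2024-01-01", periods=24, freq="1h")
ds = xr.Dataset({"rsam": ("datetime", np.random.rand(24))},
                coords={"datetime": times})

# Hypothetical target and queue directories.
os.makedirs("/tmp/tonik_store/site1", exist_ok=True)

# Producer side: writes <id>.nc and <id>.pkl payloads plus an <id>.json message.
enqueue_dataset(ds, "/tmp/tonik_store/site1", backend="zarr",
                ingest_config={"queue_path": "/tmp/tonik_queue"})

# Consumer side: drain the queue once; run_forever() would keep polling instead.
worker = IngestWorker("/tmp/tonik_queue", poll_interval=5.0)
n_processed = worker.run_once()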
tonik/storage.py
CHANGED
@@ -1,80 +1,27 @@
+from datetime import datetime
 import json
 import logging
-import logging.config
 import os
+import threading
+from typing import Optional
 
 import xarray as xr
 
+from .ingest import IngestWorker, enqueue_dataset
 from .xarray2netcdf import xarray2netcdf
 from .xarray2zarr import xarray2zarr
 
-
-
-    "disable_existing_loggers": False,
-    "formatters": {
-        "default": {  # The formatter name, it can be anything that I wish
-            # What to add in the message
-            "format": "%(asctime)s:%(name)s:%(process)d:%(lineno)d " "%(levelname)s %(message)s",
-            "datefmt": "%Y-%m-%d %H:%M:%S",  # How to display dates
-        },
-        "json": {  # The formatter name
-            "()": "pythonjsonlogger.json.JsonFormatter",  # The class to instantiate!
-            # Json is more complex, but easier to read, display all attributes!
-            "format": """
-                asctime: %(asctime)s
-                created: %(created)f
-                filename: %(filename)s
-                funcName: %(funcName)s
-                levelname: %(levelname)s
-                levelno: %(levelno)s
-                lineno: %(lineno)d
-                message: %(message)s
-                module: %(module)s
-                msec: %(msecs)d
-                name: %(name)s
-                pathname: %(pathname)s
-                process: %(process)d
-                processName: %(processName)s
-                relativeCreated: %(relativeCreated)d
-                thread: %(thread)d
-                threadName: %(threadName)s
-                exc_info: %(exc_info)s
-            """,
-            "datefmt": "%Y-%m-%d %H:%M:%S",  # How to display dates
-        },
-    },
-    "handlers": {
-        "simple": {  # The handler name
-            "formatter": "default",  # Refer to the formatter defined above
-            "class": "logging.StreamHandler",  # OUTPUT: Same as above, stream to console
-            "stream": "ext://sys.stdout",
-        },
-    },
-    "loggers": {
-        "storage": {  # The name of the logger, this SHOULD match your module!
-            "level": "DEBUG",  # FILTER: only INFO logs onwards from "tryceratops" logger
-            "handlers": [
-                "simple",  # Refer the handler defined above
-            ],
-        },
-    },
-    "root": {
-        "level": "INFO",  # FILTER: only INFO logs onwards
-        "handlers": [
-            "simple",  # Refer the handler defined above
-        ]
-    },
-}
-
-logging.config.dictConfig(LOGGING_CONFIG)
-logger = logging.getLogger("__name__")
+
+logger = logging.getLogger(__name__)
 
 
 class Path(object):
-    def __init__(self, name, parentdir, create=True, backend='zarr'
+    def __init__(self, name, parentdir, create=True, backend='zarr',
+                 archive_starttime=datetime(2000, 1, 1), ingest_config=None):
         self.name = name
         self.create = create
         self.backend = backend
+        self.archive_starttime = archive_starttime
         self.engine = 'h5netcdf' if self.backend == 'netcdf' else self.backend
         self.path = os.path.join(parentdir, name)
         if create:
@@ -86,6 +33,7 @@ class Path(object):
         if not os.path.exists(self.path):
             raise FileNotFoundError(f"Path {self.path} not found")
         self.children = {}
+        self.ingest_config = ingest_config.copy() if ingest_config else None
 
     def __str__(self):
         return self.path
@@ -97,7 +45,8 @@ class Path(object):
             return self.children[key]
         except KeyError:
             self.children[key] = Path(
-                key, self.path, self.create, self.backend
+                key, self.path, self.create, self.backend, self.archive_starttime,
+                ingest_config=self.ingest_config)
             return self.children[key]
 
     def feature_path(self, feature):
@@ -149,10 +98,24 @@ class Path(object):
         """
         Save a feature to disk
        """
+        if self.ingest_config and self.ingest_config.get('queue_path'):
+            enqueue_dataset(
+                data,
+                target_path=self.path,
+                backend=self.backend,
+                ingest_config=self.ingest_config,
+                save_kwargs=kwargs,
+            )
+            logger.debug("Queued data for %s backend at %s",
+                         self.backend, self.path)
+            return
+
         if self.backend == 'netcdf':
-            xarray2netcdf(data, self.path,
+            xarray2netcdf(data, self.path,
+                          archive_starttime=self.archive_starttime, **kwargs)
         elif self.backend == 'zarr':
-            xarray2zarr(data, self.path,
+            xarray2zarr(data, self.path,
+                        archive_starttime=self.archive_starttime, **kwargs)
 
     def shape(self, feature):
         """
@@ -208,11 +171,17 @@ class Storage(Path):
    >>> rsam = c("rsam")
    """
 
-    def __init__(self, name, rootdir, starttime=None, endtime=None, create=True, backend='netcdf'
+    def __init__(self, name, rootdir, starttime=None, endtime=None, create=True, backend='netcdf',
+                 ingest_config=None, archive_starttime=datetime(2000, 1, 1)):
         self.stores = set()
         self.starttime = starttime
         self.endtime = endtime
-
+        self.archive_starttime = archive_starttime
+        self._ingest_worker: Optional[IngestWorker] = None
+        self._ingest_thread: Optional[threading.Thread] = None
+        self._ingest_stop_event: Optional[threading.Event] = None
+        super().__init__(name, rootdir, create, backend, archive_starttime,
+                         ingest_config=ingest_config)
 
     def print_tree(self, site, indent=0, output=''):
         output += ' ' * indent + site.path + '\n'
@@ -317,3 +286,50 @@ class Storage(Path):
 
    starttime = property(get_starttime, set_starttime)
    endtime = property(get_endtime, set_endtime)
+
+    def _ensure_ingest_worker(self, poll_interval=None) -> IngestWorker:
+        if not (self.ingest_config and self.ingest_config.get('queue_path')):
+            raise RuntimeError(
+                "Ingestion queue is not configured for this Storage instance.")
+
+        if self._ingest_worker is None:
+            queue_path = self.ingest_config['queue_path']
+            poll = poll_interval or self.ingest_config.get(
+                'poll_interval', 10.0)
+            self._ingest_worker = IngestWorker(
+                queue_path=queue_path,
+                poll_interval=poll
+            )
+        elif poll_interval:
+            self._ingest_worker.poll_interval = poll_interval
+        return self._ingest_worker
+
+    def run_ingest_once(self, poll_interval=None) -> int:
+        worker = self._ensure_ingest_worker(poll_interval)
+        return worker.run_once()
+
+    def start_ingest_worker(self, *, background=True, poll_interval=None):
+        worker = self._ensure_ingest_worker(poll_interval)
+        if not background:
+            return worker.run_once()
+        if self._ingest_thread and self._ingest_thread.is_alive():
+            return self._ingest_thread
+        stop_event = threading.Event()
+        thread = threading.Thread(
+            target=worker.run_forever,
+            kwargs={'stop_event': stop_event},
+            daemon=True,
+            name=f"tonik-ingest-{self.name}",
+        )
+        thread.start()
+        self._ingest_thread = thread
+        self._ingest_stop_event = stop_event
+        return thread
+
+    def stop_ingest_worker(self, timeout=None):
+        if self._ingest_thread and self._ingest_thread.is_alive():
+            if self._ingest_stop_event:
+                self._ingest_stop_event.set()
+            self._ingest_thread.join(timeout=timeout)
+        self._ingest_thread = None
+        self._ingest_stop_event = None
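The Storage class now threads the ingest configuration down to every child Path, diverts saving into the queue whenever a queue_path is configured, and can drive an IngestWorker either synchronously or from a daemon thread. A rough usage sketch follows; the directories and data are invented, the package-level Storage import and the save() method name are assumptions based on the docstring context in the hunk above, not confirmed by this diff.

from datetime import datetime

import numpy as np
import pandas as pd
import xarray as xr
from tonik import Storage  # assuming Storage is re-exported at package level

# Hypothetical feature dataset.
feature = xr.Dataset(
    {"rsam": ("datetime", np.random.rand(24))},
    coords={"datetime": pd.date_range("2024-01-01", periods=24, freq="1h")})

# With ingest_config set, writes are enqueued instead of hitting disk directly.
store = Storage("volcano", "/tmp/tonik_data", backend="zarr",
                archive_starttime=datetime(2023, 1, 1),
                ingest_config={"queue_path": "/tmp/tonik_queue",
                               "poll_interval": 5.0})
store.save(feature)  # assumed write entry point ("Save a feature to disk")

# Drain the queue synchronously, or poll it from a background daemon thread.
store.run_ingest_once()
thread = store.start_ingest_worker(background=True)
store.stop_ingest_worker(timeout=10)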
tonik/xarray2zarr.py
CHANGED
@@ -16,29 +16,75 @@ from .utils import merge_arrays, fill_time_gaps, get_dt
 logger = logging.getLogger(__name__)
 
 
-def
-
+def _init_timeseries_store(path: str, start: np.datetime64, stop: np.datetime64, interval: pd.Timedelta,
+                           data_vars: dict, group: str = "original", chunk_size: int = 10,
+                           timedim: str = "datetime") -> xr.DataArray:
     """
-
-
+    Initialize an empty zarr store for time series data. This facilitates writing data out
+    of sequence and avoid prepending which is costly and difficult to get right.
 
     Parameters
     ----------
-
-
-
-
-
-
-
-
-
-
-
-
+    path : str
+        Path to the zarr store.
+    start : np.datetime64
+        Start time of the zarr store.
+    stop : np.datetime64
+        End time of the zarr store.
+    interval : pd.Timedelta
+        Sampling interval string (e.g. '1H', '15T') for the time dimension
+    data_vars : dict
+        Dictionary defining the data variables to create. Keys are variable names,
+        values are tuples of (dims, shape, dtype) where dims is a tuple of dimension
+        names (excluding the time dimension), shape is a tuple of dimension sizes
+        (excluding the time dimension), and dtype is the numpy data type.
+    group : str, optional
+        Group name in the zarr store, by default "original"
+    chunk_size : int, optional
+        Chunk size in number of time steps, by default 10
+    timedim : str, optional
+        Name of the time dimension, by default "datetime"
 
-
-
+    """
+    # Make the zarr store a multiple of chunk_size
+    stop_ts = pd.Timestamp(stop)
+    start_ts = pd.Timestamp(start)
+    chunk_length = int(chunk_size)
+    if chunk_length <= 0:
+        raise ValueError("chunk_size must be a positive integer")
+    total_steps = int((stop_ts - start_ts) // interval) + 1
+    if total_steps < 1:
+        total_steps = chunk_length
+    if total_steps % chunk_length:
+        required_steps = ((total_steps + chunk_length - 1) //
+                          chunk_length) * chunk_length
+        start_ts = stop_ts - interval * (required_steps - 1)
+    time_index = pd.date_range(start=start_ts, end=stop_ts, freq=interval)
+    ds = xr.Dataset()
+    name, value = list(data_vars.items())[0]
+    dims, coords, shape, dtype = value
+    dims = dims + (timedim,)
+    shape = tuple(shape) + (len(time_index),)
+    # Create coordinates for gap dataset
+    new_coords = {timedim: time_index}
+    for coord_name, coord in coords.items():
+        if coord_name != timedim:
+            new_coords[coord_name] = coord
+
+    xda = xr.DataArray(
+        np.full(shape, np.nan, dtype=dtype),
+        coords=new_coords,
+        dims=dims,
+        name=name
+    )
+    xda = xda.chunk(
+        {timedim: chunk_size, **{d: -1 for d in dims[:-1]}})
+    xda.to_zarr(path, group=group, mode="w")
+    return xda
+
+
+def _fill_time_gaps_between_datasets(xds_existing: xr.DataArray, xds_new: xr.DataArray, interval: pd.Timedelta,
+                                     timedim: str = 'datetime', chunk_size: int = 10) -> xr.DataArray:
     """
     Fill gaps between existing and new datasets.
 
@@ -56,29 +102,23 @@ def fill_time_gaps_between_datasets(xds_existing: xr.DataArray, xds_new: xr.DataArray,
     xr.Dataset
         Combined dataset with gaps filled
     """
-    if mode not in ['a', 'p']:
-        raise ValueError(
-            'Mode has to be either "a" for append or "p" for prepend')
-
-    # get the sample interval
-    dt = get_dt(xds_new.coords[timedim])
 
     existing_endpoint = xds_existing[timedim].values
     # Get time ranges
-
-
-
-
-
-
+    gap_start = existing_endpoint + interval
+    gap_end = xds_new[timedim].values[0] - interval
+
+    # Prepare shape for gap filling
+    shape_list = list(xds_new.shape)
+    dims_list = list(xds_new.dims)
+    shape_list.pop(dims_list.index(timedim))
 
     if gap_start <= gap_end:
-        gap_times = pd.date_range(start=gap_start, end=gap_end, freq=
+        gap_times = pd.date_range(start=gap_start, end=gap_end, freq=interval)
 
         # Create NaN array with same shape as variable but for gap times
-        gap_shape = (len(gap_times),)
-
-        gap_values = np.full(gap_shape, np.nan)
+        gap_shape = tuple(shape_list) + (len(gap_times),)
+        gap_values = np.full(gap_shape, np.nan, dtype=xds_new.dtype)
 
         # Create coordinates for gap dataset
         gap_coords = {timedim: gap_times}
@@ -94,70 +134,28 @@ def fill_time_gaps_between_datasets(xds_existing: xr.DataArray, xds_new: xr.DataArray,
         )
 
         # Combine: existing + gap + new
-
-            combined = xr.concat([gap_data, xds_new], dim=timedim)
-        elif mode == 'p':
-            combined = xr.concat([xds_new, gap_data], dim=timedim)
-        return combined
+        combined = xr.concat([gap_data, xds_new], dim=timedim)
     else:
-
-
-
-def _build_append_payload_full_chunks(payload: xr.DataArray, mode: str,
-                                      chunklen: int, timedim: str = "datetime") -> xr.DataArray:
-    """
-    Construct the sequence to append so that the final total length is a multiple of `chunklen`
-
-    Parameters
-    ----------
-    payload : xr.DataArray
-        DataArray to append
-    mode : str
-        'a' for append, 'p' for prepend
-    chunklen : int
-        Chunk length in number of time steps
-    timedim : str
-        Name of the time dimension
-
-    Returns
-    -------
-    xr.DataArray
-        Padded DataArray with length a multiple of chunklen
-    """
-    if mode not in ['a', 'p']:
-        raise ValueError(
-            'Mode has to be either "a" for append or "p" for prepend')
-
-    # pad the tail so that payload_len % chunklen == 0
-    pay_len = payload.sizes[timedim]
-    need = -pay_len % chunklen  # 0..chunklen-1
+        combined = xds_new
 
+    # ensure new array aligns with chunk size
+    arr_len = combined.sizes[timedim]
+    need = -arr_len % chunk_size  # 0..chunklen-1
     if need > 0:
-
-
-
-
-        start = payload[timedim].values[0] - (need+1)*dt
-        pad_times = pd.date_range(start=start, periods=need, freq=dt)
-        pad_shape = []
-        for i, d in enumerate(payload.dims):
-            if d == timedim:
-                pad_shape.append(need)
-            else:
-                pad_shape.append(payload.shape[i])
-        pad_vals = np.full(pad_shape, np.nan)
+        start = combined[timedim].values[-1] + interval
+        pad_times = pd.date_range(start=start, periods=need, freq=interval)
+        pad_shape = tuple(shape_list) + (len(pad_times),)
+        pad_vals = np.full(pad_shape, np.nan, dtype=xds_new.dtype)
         pad_coords = {timedim: pad_times}
-        for
-        if
-            pad_coords[
+        for coord_name, coord in xds_new.coords.items():
+            if coord_name != timedim:
+                pad_coords[coord_name] = coord
         pad_da = xr.DataArray(pad_vals, coords=pad_coords,
-                              dims=
-
-
-
-
-    payload = payload.chunk({timedim: chunklen})
-    return payload
+                              dims=xds_new.dims,
+                              name=xds_new.name)
+        combined = xr.concat([combined, pad_da], dim=timedim)
+
+    return combined
 
 
 def _update_meta_data(fout: str,
@@ -179,7 +177,7 @@ def _update_meta_data(fout: str,
         Group name for metadata.
     """
 
-    now = np.datetime64(datetime.now(tz=timezone.utc), '
+    now = np.datetime64(datetime.now(tz=timezone.utc), 's')
     new_update = xr.DataArray([now],
                               coords={'update': [now]},
                               dims=['update'],
@@ -212,8 +210,9 @@ def _update_meta_data(fout: str,
     xr.Dataset(vars).to_zarr(fout, group=meta_group, mode='w')
 
 
-def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a', group='original',
-
+def xarray2zarr(xds: xr.Dataset, path: str, group='original',
+                chunk_size: int = 1000, timedim: str = 'datetime', interval: str = None,
+                archive_starttime: datetime = datetime(2000, 1, 1)) -> None:
     """
     Write xarray dataset to zarr files.
 
@@ -227,7 +226,7 @@ def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a', group='original',
         Write mode, by default 'a'.
     group : str, optional
         Group name, by default 'original'
-
+    chunk_size : int, optional
         Chunk size as the number of days.
     timedim : str
         Name of the time dimension, by default 'datetime'
@@ -245,14 +244,16 @@ def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a', group='original',
     # Fill gaps
     xds = xds.drop_duplicates(timedim, keep='last')
     xds = fill_time_gaps(xds, timedim=timedim)
+    if interval is None:
+        interval = get_dt(xds[timedim])
+    else:
+        interval = pd.to_timedelta(interval)
 
     for feature in xds.data_vars.keys():
        fout = os.path.join(path, feature + '.zarr')
-        # nchunks = get_chunks(xds[feature], chunks)
-        nchunks = chunks
        last_dp = xds[feature][timedim].values[-1]
        _update_meta_data(fout, last_dp, resolution=float(
-
+            interval / pd.Timedelta(1, 'h')))
        try:
            xds_existing = xr.open_zarr(fout, group=group)
            has_store = True
@@ -260,57 +261,61 @@ def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a', group='original',
             has_store = False
 
         if not has_store:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            logger.debug("Creating new zarr store.")
+            shape_list = list(xds[feature].shape)
+            dims_list = list(xds[feature].dims)
+            shape_list.pop(dims_list.index(timedim))
+            dims_list.pop(dims_list.index(timedim))
+            xds_existing = _init_timeseries_store(
+                fout,
+                start=np.datetime64(archive_starttime),
+                stop=xds[feature][timedim].values[-1],
+                interval=interval,
+                data_vars={
+                    feature: (tuple(dims_list), xds[feature].coords,
+                              tuple(shape_list), xds[feature].dtype)},
+                group=group,
+                chunk_size=chunk_size,
+                timedim=timedim
+            )
+
+        if xds_existing[timedim][0] > xds[timedim][0]:
+            raise ValueError("New data ends before existing data starts. "
+                             "Prepending to existing data is currently not supported.")
 
         elif xds_existing[timedim][-1] < xds[timedim][0]:
             logger.debug("Appending data to existing zarr store.")
-            xda_new =
-
-            xda_new = _build_append_payload_full_chunks(
-                xda_new, 'a', nchunks)
+            xda_new = _fill_time_gaps_between_datasets(xds_existing[feature].isel({timedim: -1}),
+                                                       xds[feature], interval, chunk_size=chunk_size)
             xda_new.to_zarr(fout, group=group, mode='a',
                             append_dim=timedim)
-
-        elif xds_existing[timedim][0] > xds[timedim][0] and xds_existing[timedim][-1] < xds[timedim][-1]:
-            logger.debug(
-                "Data in zarr store contained in new data. Rewriting zarr store.")
-            xda_new = _build_append_payload_full_chunks(
-                xds[feature], 'a', nchunks)
-            xda_new.to_zarr(fout, group=group, mode='w',
-                            write_empty_chunks=True)
-
         else:
             logger.debug("Data in zarr store overlaps with new data.")
             logger.debug(
                 f"Endtime of existing data: {xds_existing[timedim][-1].values}")
             logger.debug(f"Starttime of new data: {xds[timedim][0].values}")
-
-
-
-
-
-
+            existing_times = xds_existing[timedim].values
+            new_times = xds[timedim].values
+
+            overlap_times, idx_existing, idx_new = np.intersect1d(
+                existing_times,
+                new_times,
+                assume_unique=True,
+                return_indices=True,
+            )
+            region = {}
+            for dim in xds[feature].dims:
+                if dim == timedim:
+                    start = int(idx_existing.min())
+                    stop = start + len(idx_existing)
+                    region[dim] = slice(start, stop)
+                else:
+                    region[dim] = 'auto'
+            xds[feature].isel({timedim: idx_new}).to_zarr(
+                fout, group=group, mode='r+', region=region)
+            remainder = xds[feature].drop_sel({timedim: new_times[idx_new]})
             if remainder.sizes[timedim] > 0:
-
-
-                mode = 'p'
-                xda_new = fill_time_gaps_between_datasets(xds_existing[feature].isel({timedim: 0}),
-                                                          xds[feature], mode=mode)
-                xda_new = _build_append_payload_full_chunks(
-                    xda_new, mode, nchunks)
+                xda_new = _fill_time_gaps_between_datasets(xds_existing[feature].isel({timedim: -1}),
+                                                           remainder, interval, chunk_size=chunk_size)
                 xda_new.to_zarr(fout, group=group, mode='a',
                                 append_dim=timedim)
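The rewrite drops the old append/prepend padding helpers in favour of a pre-allocated store: on the first write the whole time axis from archive_starttime up to the newest sample is created as NaN-filled, chunk-aligned arrays, and later calls either append past the end or overwrite the matching region in place. A rough sketch of calling the writer directly follows; the path, sampling interval, and data are invented for illustration.

import os
from datetime import datetime

import numpy as np
import pandas as pd
import xarray as xr

from tonik.xarray2zarr import xarray2zarr

# Hypothetical feature: one day of 10-minute RSAM values.
times = pd.date_range("2024-06-01", periods=144, freq="10min")
xds = xr.Dataset({"rsam": ("datetime", np.random.rand(144))},
                 coords={"datetime": times})

# Hypothetical output directory for the per-feature zarr stores.
os.makedirs("/tmp/tonik_data/site1", exist_ok=True)

# First call pre-allocates <path>/rsam.zarr from archive_starttime onwards;
# later calls with newer data append, overlapping samples are written by region.
xarray2zarr(xds, "/tmp/tonik_data/site1", group="original",
            chunk_size=1000, interval="10min",
            archive_starttime=datetime(2024, 1, 1))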
{tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tonik
-Version: 0.1.21
+Version: 0.1.22
 Summary: Store time series data as HDF5 files and access them through an API.
 Project-URL: Homepage, https://tsc-tools.github.io/tonik
 Project-URL: Issues, https://github.com/tsc-tools/tonik/issues
@@ -9,29 +9,29 @@ License-File: LICENSE
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.
-Requires-Dist: datashader
-Requires-Dist: fastapi
-Requires-Dist: h5netcdf
-Requires-Dist: h5py
-Requires-Dist: matplotlib
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: xarray[accel,io,parallel]
-Requires-Dist: zarr
+Requires-Python: >=3.10
+Requires-Dist: datashader<0.19,>=0.18.2
+Requires-Dist: fastapi<0.129,>=0.128.0
+Requires-Dist: h5netcdf<2,>=1.7.3
+Requires-Dist: h5py<4,>=3.15.1
+Requires-Dist: matplotlib<4,>=3.10.8
+Requires-Dist: pandas<3,>=2.3.3
+Requires-Dist: s3fs<2026,>=2025.12.0
+Requires-Dist: uvicorn[standard]<0.41,>=0.40.0
+Requires-Dist: xarray[accel,io,parallel]<2026,>=2025.6.1
+Requires-Dist: zarr<4,>=3.1.5
 Provides-Extra: dev
-Requires-Dist: build; extra == 'dev'
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: mkdocs-jupyter; extra == 'dev'
-Requires-Dist:
+Requires-Dist: build<2,>=1.4.0; extra == 'dev'
+Requires-Dist: hatch<2,>=1.16.2; extra == 'dev'
+Requires-Dist: httpx<0.29,>=0.28.1; extra == 'dev'
+Requires-Dist: ipykernel<7,>=6.31.0; extra == 'dev'
+Requires-Dist: mkdocs-jupyter<0.26,>=0.25.1; extra == 'dev'
+Requires-Dist: mkdocs<2,>=1.6.1; extra == 'dev'
+Requires-Dist: mkdocstrings[python]<2,>=1.0.0; extra == 'dev'
+Requires-Dist: moto[s3]<6,>=5.1.19; extra == 'dev'
 Requires-Dist: pytest; extra == 'dev'
-Requires-Dist: twine; extra == 'dev'
-Requires-Dist: zarr[remote-tests]; extra == 'dev'
+Requires-Dist: twine<7,>=6.2.0; extra == 'dev'
+Requires-Dist: zarr[remote-tests]<4,>=3.1.5; extra == 'dev'
 Description-Content-Type: text/markdown
 
 # Tonik
{tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/RECORD
CHANGED
@@ -1,14 +1,15 @@
 tonik/__init__.py,sha256=dov-nMeGFBzLspmj4rWKjC4r736vmaPDgMEkHSUfP98,523
 tonik/api.py,sha256=vW0ykOo5iGAV0_WuOepdrnUyFp83F7KyJTd43ksLmUk,7985
 tonik/grafana_annotations.py,sha256=ZU9Cy-HT4vvMfYIQzD9WboaDVOCBDv__NmXbk1qKWJo,5838
-tonik/
+tonik/ingest.py,sha256=RWJLasAVM8iaoCK5HCXEXybXARupw58Im0Ic7KrAThk,6228
+tonik/storage.py,sha256=zHXrIjbSPC3Sni1_KOn_OqCk0HtWaOyXgAhMTTdO18w,11500
 tonik/utils.py,sha256=GwAXfGFQWhlsLThQvSux1SooRkW-iIkJP99JMH72t5Y,11791
 tonik/xarray2netcdf.py,sha256=nq6RHk5ciaAg1bxNDiyHPRdAts1C7fj7jtDbaLaSTWM,6497
-tonik/xarray2zarr.py,sha256=
+tonik/xarray2zarr.py,sha256=HeqKBArNcYUzd_azgCK0iptq1qAA6h2j4brfIpkV_gs,12156
 tonik/package_data/index.html,sha256=ZCZ-BtGRERsL-6c_dfY43qd2WAaggH7xereennGL6ww,4372
 tonik/package_data/whakaari_labels.json,sha256=96UZSq41yXgAJxuKivLBKlRTw-33jkjh7AGKTsDQ9Yg,3993
-tonik-0.1.
-tonik-0.1.
-tonik-0.1.
-tonik-0.1.
-tonik-0.1.
+tonik-0.1.22.dist-info/METADATA,sha256=sjriW0whFAAo3VFxdu-1xlND0sw7RqcbULJz5HRUT-Q,2424
+tonik-0.1.22.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+tonik-0.1.22.dist-info/entry_points.txt,sha256=y82XyTeQddM87gCTzgSQaTlKF3VFicO4hhClHUv6j1A,127
+tonik-0.1.22.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+tonik-0.1.22.dist-info/RECORD,,
{tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/WHEEL
File without changes
{tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/entry_points.txt
File without changes
{tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/licenses/LICENSE
File without changes