tonik 0.1.20__tar.gz → 0.1.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tonik-0.1.20 → tonik-0.1.22}/.devcontainer/devcontainer.json +1 -1
- {tonik-0.1.20 → tonik-0.1.22}/PKG-INFO +22 -22
- tonik-0.1.20/pyproject.toml~ → tonik-0.1.22/pyproject.toml +30 -26
- tonik-0.1.22/src/tonik/ingest.py +166 -0
- {tonik-0.1.20 → tonik-0.1.22}/src/tonik/storage.py +83 -67
- tonik-0.1.22/src/tonik/xarray2zarr.py +321 -0
- {tonik-0.1.20 → tonik-0.1.22}/tests/conftest.py +11 -8
- tonik-0.1.22/tests/test_netcdf.py +191 -0
- {tonik-0.1.20 → tonik-0.1.22}/tests/test_storage.py +95 -7
- tonik-0.1.20/tests/test_save.py → tonik-0.1.22/tests/test_zarr.py +99 -185
- tonik-0.1.20/.gitattributes +0 -2
- tonik-0.1.20/pixi.lock +0 -4050
- tonik-0.1.20/pyproject.toml +0 -95
- tonik-0.1.20/src/tonik/xarray2zarr.py +0 -310
- {tonik-0.1.20 → tonik-0.1.22}/.gitignore +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/HOW_TO_RELEASE.md +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/LICENSE +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/README.md +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/grafana_example/Dockerfile_api +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/grafana_example/Dockerfile_grafana +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/grafana_example/dashboards/demo_dashboard.json +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/grafana_example/docker-compose.yml +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/grafana_example/grafana.ini +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/grafana_example/provisioning/dashboards/default.yaml +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/grafana_example/provisioning/datasources/default.yaml +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/mkdocs.yml +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/src/tonik/__init__.py +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/src/tonik/api.py +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/src/tonik/grafana_annotations.py +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/src/tonik/package_data/index.html +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/src/tonik/package_data/whakaari_labels.json +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/src/tonik/utils.py +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/src/tonik/xarray2netcdf.py +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/tests/backend_speed_test.py +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/tests/test_api.py +0 -0
- {tonik-0.1.20 → tonik-0.1.22}/tests/test_utils.py +0 -0

{tonik-0.1.20 → tonik-0.1.22}/.devcontainer/devcontainer.json

@@ -14,7 +14,7 @@
 // "appPort": ["8003:8003"],
 
 // Use 'postCreateCommand' to run commands after the container is created.
-"postCreateCommand": "pip3 install -e .
+"postCreateCommand": "pip3 install -e '.[dev]'",
 
 // Configure tool-specific properties.
 "customizations": {

{tonik-0.1.20 → tonik-0.1.22}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tonik
-Version: 0.1.20
+Version: 0.1.22
 Summary: Store time series data as HDF5 files and access them through an API.
 Project-URL: Homepage, https://tsc-tools.github.io/tonik
 Project-URL: Issues, https://github.com/tsc-tools/tonik/issues
@@ -9,29 +9,29 @@ License-File: LICENSE
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.
-Requires-Dist: datashader
-Requires-Dist: fastapi
-Requires-Dist: h5netcdf
-Requires-Dist: h5py
-Requires-Dist: matplotlib
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: xarray[accel,io,parallel]
-Requires-Dist: zarr
+Requires-Python: >=3.10
+Requires-Dist: datashader<0.19,>=0.18.2
+Requires-Dist: fastapi<0.129,>=0.128.0
+Requires-Dist: h5netcdf<2,>=1.7.3
+Requires-Dist: h5py<4,>=3.15.1
+Requires-Dist: matplotlib<4,>=3.10.8
+Requires-Dist: pandas<3,>=2.3.3
+Requires-Dist: s3fs<2026,>=2025.12.0
+Requires-Dist: uvicorn[standard]<0.41,>=0.40.0
+Requires-Dist: xarray[accel,io,parallel]<2026,>=2025.6.1
+Requires-Dist: zarr<4,>=3.1.5
 Provides-Extra: dev
-Requires-Dist: build; extra == 'dev'
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: mkdocs-jupyter; extra == 'dev'
-Requires-Dist:
+Requires-Dist: build<2,>=1.4.0; extra == 'dev'
+Requires-Dist: hatch<2,>=1.16.2; extra == 'dev'
+Requires-Dist: httpx<0.29,>=0.28.1; extra == 'dev'
+Requires-Dist: ipykernel<7,>=6.31.0; extra == 'dev'
+Requires-Dist: mkdocs-jupyter<0.26,>=0.25.1; extra == 'dev'
+Requires-Dist: mkdocs<2,>=1.6.1; extra == 'dev'
+Requires-Dist: mkdocstrings[python]<2,>=1.0.0; extra == 'dev'
+Requires-Dist: moto[s3]<6,>=5.1.19; extra == 'dev'
 Requires-Dist: pytest; extra == 'dev'
-Requires-Dist: twine; extra == 'dev'
-Requires-Dist: zarr[remote-tests]; extra == 'dev'
+Requires-Dist: twine<7,>=6.2.0; extra == 'dev'
+Requires-Dist: zarr[remote-tests]<4,>=3.1.5; extra == 'dev'
 Description-Content-Type: text/markdown
 
 # Tonik

tonik-0.1.20/pyproject.toml~ → tonik-0.1.22/pyproject.toml

@@ -12,7 +12,7 @@ exclude = [
 
 [project]
 name = "tonik"
-version = "0.1.20"
+version = "0.1.22"
 authors = [
   { name="Yannik Behr", email="y.behr@gns.cri.nz" },
   { name="Christof Mueller", email="c.mueller@gns.cri.nz" }
@@ -20,35 +20,24 @@ authors = [
 
 description = "Store time series data as HDF5 files and access them through an API."
 readme = "README.md"
-requires-python = ">=3.
+requires-python = ">=3.10"
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
     "Operating System :: OS Independent",
 ]
-dependencies = [
-    "h5py>=3.8",
-    "datashader>=0.14",
-    "xarray[io,accel,parallel]",
-    "pandas>=2.0",
-    "netcdf4>=1.6",
-    "h5netcdf>=1.1",
-    "python-json-logger>=2.0",
-    "uvicorn[standard]>=0.22",
-    "fastapi>=0.112",
-    "matplotlib",
-    "zarr[remote_tests]>=3.0.3; python_version >= '3.11'",
-    "zarr[remote_tests]<3; python_version < '3.11'",
-    "s3fs"
-]
 
-[
-
-
-
-
-
-
+dependencies = [
+    "xarray[io,accel,parallel]>=2025.6.1,<2026",
+    "datashader>=0.18.2,<0.19",
+    "h5py>=3.15.1,<4",
+    "pandas>=2.3.3,<3",
+    "h5netcdf>=1.7.3,<2",
+    "uvicorn[standard]>=0.40.0,<0.41",
+    "fastapi>=0.128.0,<0.129",
+    "matplotlib>=3.10.8,<4",
+    "zarr>=3.1.5,<4",
+    "s3fs>=2025.12.0,<2026"]
 
 [project.urls]
 Homepage = "https://tsc-tools.github.io/tonik"
@@ -59,8 +48,23 @@ tonik_api = "tonik.api:main"
 test_data = "tonik.utils:main"
 grafana_annotations = "tonik.grafana_annotations:main"
 
+[project.optional-dependencies]
+dev = ["pytest",
+    "httpx>=0.28.1,<0.29",
+    "ipykernel>=6.31.0,<7",
+    "build>=1.4.0,<2",
+    "twine>=6.2.0,<7",
+    "mkdocs>=1.6.1,<2",
+    "mkdocstrings[python]>=1.0.0,<2",
+    "mkdocs-jupyter>=0.25.1,<0.26",
+    "zarr[remote-tests]>=3.1.5,<4",
+    "moto[s3]>=5.1.19,<6",
+    "hatch>=1.16.2,<2"
+]
+
 [tool.pytest.ini_options]
 log_cli = true
+addopts = "-s"
 
 [tool.hatch.envs.test]
 dependencies = [
@@ -70,7 +74,7 @@ dependencies = [
 ]
 
 [[tool.hatch.envs.test.matrix]]
-python = ["3.11", "3.
+python = ["3.10", "3.11", "3.12", "3.13"]
 
 [tool.hatch.envs.test.scripts]
-run-pytest = "pytest tests"
+run-pytest = "pytest tests"

tonik-0.1.22/src/tonik/ingest.py (new file)

@@ -0,0 +1,166 @@
+# src/tonik/ingest.py
+import json
+import logging
+import os
+import pickle
+import threading
+import uuid
+from datetime import datetime, timezone
+from typing import Optional
+
+import xarray as xr
+
+from .xarray2netcdf import xarray2netcdf
+from .xarray2zarr import xarray2zarr
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["enqueue_dataset", "IngestWorker"]
+
+
+def _norm_timeseries(xds: xr.Dataset, timedim: str) -> xr.Dataset:
+    xds = xds.sortby(timedim)
+    xds = xds.drop_duplicates(timedim, keep='last')
+    xds[timedim] = xds[timedim].astype('datetime64[ns]')
+    return xds
+
+
+def enqueue_dataset(data: xr.Dataset, target_path: str, *, backend: str,
+                    ingest_config: dict, save_kwargs: Optional[dict] = None) -> dict:
+    """
+    Enqueue a dataset for ingestion.
+    Parameters
+    ----------
+    data : xr.Dataset
+        The dataset to enqueue.
+    target_path : str
+        The target path where the dataset should be saved.
+    backend : str
+        The backend to use for saving the dataset ('zarr' or 'netcdf').
+    ingest_config : dict
+        Configuration for the ingest queue, must include 'queue_path'.
+    save_kwargs : Optional[dict], optional
+        Additional keyword arguments to pass to the save function, by default None.
+    Returns
+    -------
+    dict
+        A message dictionary representing the enqueued dataset.
+    """
+
+    queue_path = ingest_config.get("queue_path")
+    if not queue_path:
+        raise ValueError("ingest_config must provide a 'queue_path'.")
+    queue_path = os.path.abspath(queue_path)
+    payload_dir = os.path.join(queue_path, "payloads")
+    message_dir = os.path.join(queue_path, "messages")
+    os.makedirs(payload_dir, exist_ok=True)
+    os.makedirs(message_dir, exist_ok=True)
+    timedim = save_kwargs.get(
+        "timedim", "datetime") if save_kwargs else "datetime"
+
+    if isinstance(data, xr.DataArray):
+        name = data.name or "data"
+        data = data.to_dataset(name=name)
+
+    dataset = _norm_timeseries(data, timedim=timedim)
+    entry_id = uuid.uuid4().hex
+    payload_path = os.path.join(payload_dir, f"{entry_id}.nc")
+    kwargs_path = os.path.join(payload_dir, f"{entry_id}.pkl")
+
+    dataset.to_netcdf(payload_path, engine="h5netcdf")
+    with open(kwargs_path, "wb") as handle:
+        pickle.dump(save_kwargs or {}, handle)
+
+    message = {
+        "id": entry_id,
+        "target_path": os.path.abspath(target_path),
+        "backend": backend,
+        "payload_path": payload_path,
+        "kwargs_path": kwargs_path,
+        "created_at": datetime.now(tz=timezone.utc).isoformat(),
+    }
+    tmp_path = os.path.join(message_dir, f"{entry_id}.json.tmp")
+    final_path = os.path.join(message_dir, f"{entry_id}.json")
+    with open(tmp_path, "w", encoding="utf-8") as handle:
+        json.dump(message, handle)
+    os.replace(tmp_path, final_path)
+    logger.debug("Queued dataset %s for %s backend at %s",
+                 entry_id, backend, target_path)
+    return message
+
+
+class IngestWorker:
+    def __init__(self, queue_path: str, poll_interval: float = 10.0,
+                 target_prefix: Optional[str] = None):
+        self.queue_path = os.path.abspath(queue_path)
+        self.messages_dir = os.path.join(self.queue_path, "messages")
+        self.payloads_dir = os.path.join(self.queue_path, "payloads")
+        os.makedirs(self.messages_dir, exist_ok=True)
+        os.makedirs(self.payloads_dir, exist_ok=True)
+        self.poll_interval = poll_interval
+        self.target_prefix = os.path.abspath(
+            target_prefix) if target_prefix else None
+
+    def _iter_messages(self):
+        for name in sorted(os.listdir(self.messages_dir)):
+            if not name.endswith(".json"):
+                continue
+            msg_path = os.path.join(self.messages_dir, name)
+            with open(msg_path, "r", encoding="utf-8") as handle:
+                message = json.load(handle)
+            target = os.path.abspath(message.get("target_path", ""))
+            if self.target_prefix and not target.startswith(self.target_prefix):
+                continue
+            yield msg_path, message
+
+    def run_once(self) -> int:
+        processed = 0
+        for msg_path, message in self._iter_messages():
+            payload_path = message.get("payload_path")
+            kwargs_path = message.get("kwargs_path")
+            if not payload_path or not os.path.exists(payload_path):
+                logger.warning(
+                    "Missing payload for %s, dropping message", msg_path)
+                os.remove(msg_path)
+                if kwargs_path and os.path.exists(kwargs_path):
+                    os.remove(kwargs_path)
+                continue
+
+            dataset = None
+            try:
+                with xr.open_dataset(payload_path, engine='h5netcdf') as ds_on_disk:
+                    dataset = ds_on_disk.load()
+
+                kwargs = {}
+                if kwargs_path and os.path.exists(kwargs_path):
+                    with open(kwargs_path, "rb") as handle:
+                        kwargs = pickle.load(handle)
+
+                backend = message.get("backend", "zarr")
+                if backend == "zarr":
+                    xarray2zarr(dataset, message["target_path"], **kwargs)
+                elif backend == "netcdf":
+                    xarray2netcdf(dataset, message["target_path"], **kwargs)
+                else:
+                    raise ValueError(f"Unsupported backend '{backend}'")
+            except Exception as exc:
+                logger.error("Failed to ingest %s: %s",
+                             msg_path, exc, exc_info=True)
+                continue
+            finally:
+                if dataset is not None:
+                    dataset.close()
+
+            os.remove(payload_path)
+            if kwargs_path and os.path.exists(kwargs_path):
+                os.remove(kwargs_path)
+            os.remove(msg_path)
+            processed += 1
+        return processed
+
+    def run_forever(self, stop_event: Optional[threading.Event] = None) -> None:
+        stop_event = stop_event or threading.Event()
+        while not stop_event.is_set():
+            processed = self.run_once()
+            if processed == 0:
+                stop_event.wait(self.poll_interval)
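
A minimal usage sketch of the new queue module (not part of the package diff): the dataset, the /tmp/... paths and the variable name are made up for illustration, while enqueue_dataset, IngestWorker and their arguments are taken from src/tonik/ingest.py above.

import numpy as np
import pandas as pd
import xarray as xr

from tonik.ingest import IngestWorker, enqueue_dataset

# Illustrative dataset; 'datetime' matches the default timedim in enqueue_dataset.
ds = xr.Dataset(
    {"rsam": ("datetime", np.random.rand(10))},
    coords={"datetime": pd.date_range("2024-01-01", periods=10, freq="10min")},
)

# Serialise the payload (.nc + .pkl) and an atomically-renamed JSON message
# under the queue directory.
msg = enqueue_dataset(
    ds,
    "/tmp/tonik_archive/site1",          # hypothetical target store
    backend="zarr",
    ingest_config={"queue_path": "/tmp/tonik_queue"},
)

# A worker (possibly in another process) picks the message up and writes the
# data through xarray2zarr / xarray2netcdf.
worker = IngestWorker(queue_path="/tmp/tonik_queue")
print(worker.run_once())  # -> number of messages processed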

{tonik-0.1.20 → tonik-0.1.22}/src/tonik/storage.py

@@ -1,80 +1,27 @@
+from datetime import datetime
 import json
 import logging
-import logging.config
 import os
+import threading
+from typing import Optional
 
 import xarray as xr
 
+from .ingest import IngestWorker, enqueue_dataset
 from .xarray2netcdf import xarray2netcdf
 from .xarray2zarr import xarray2zarr
 
-
-
-    "disable_existing_loggers": False,
-    "formatters": {
-        "default": {  # The formatter name, it can be anything that I wish
-            # What to add in the message
-            "format": "%(asctime)s:%(name)s:%(process)d:%(lineno)d " "%(levelname)s %(message)s",
-            "datefmt": "%Y-%m-%d %H:%M:%S",  # How to display dates
-        },
-        "json": {  # The formatter name
-            "()": "pythonjsonlogger.jsonlogger.JsonFormatter",  # The class to instantiate!
-            # Json is more complex, but easier to read, display all attributes!
-            "format": """
-                asctime: %(asctime)s
-                created: %(created)f
-                filename: %(filename)s
-                funcName: %(funcName)s
-                levelname: %(levelname)s
-                levelno: %(levelno)s
-                lineno: %(lineno)d
-                message: %(message)s
-                module: %(module)s
-                msec: %(msecs)d
-                name: %(name)s
-                pathname: %(pathname)s
-                process: %(process)d
-                processName: %(processName)s
-                relativeCreated: %(relativeCreated)d
-                thread: %(thread)d
-                threadName: %(threadName)s
-                exc_info: %(exc_info)s
-                """,
-            "datefmt": "%Y-%m-%d %H:%M:%S",  # How to display dates
-        },
-    },
-    "handlers": {
-        "simple": {  # The handler name
-            "formatter": "default",  # Refer to the formatter defined above
-            "class": "logging.StreamHandler",  # OUTPUT: Same as above, stream to console
-            "stream": "ext://sys.stdout",
-        },
-    },
-    "loggers": {
-        "storage": {  # The name of the logger, this SHOULD match your module!
-            "level": "DEBUG",  # FILTER: only INFO logs onwards from "tryceratops" logger
-            "handlers": [
-                "simple",  # Refer the handler defined above
-            ],
-        },
-    },
-    "root": {
-        "level": "INFO",  # FILTER: only INFO logs onwards
-        "handlers": [
-            "simple",  # Refer the handler defined above
-        ]
-    },
-}
-
-logging.config.dictConfig(LOGGING_CONFIG)
-logger = logging.getLogger("__name__")
+
+logger = logging.getLogger(__name__)
 
 
 class Path(object):
-    def __init__(self, name, parentdir, create=True, backend='zarr'
+    def __init__(self, name, parentdir, create=True, backend='zarr',
+                 archive_starttime=datetime(2000, 1, 1), ingest_config=None):
         self.name = name
         self.create = create
         self.backend = backend
+        self.archive_starttime = archive_starttime
         self.engine = 'h5netcdf' if self.backend == 'netcdf' else self.backend
         self.path = os.path.join(parentdir, name)
         if create:
@@ -86,6 +33,7 @@ class Path(object):
             if not os.path.exists(self.path):
                 raise FileNotFoundError(f"Path {self.path} not found")
         self.children = {}
+        self.ingest_config = ingest_config.copy() if ingest_config else None
 
     def __str__(self):
         return self.path
@@ -97,7 +45,8 @@ class Path(object):
             return self.children[key]
         except KeyError:
             self.children[key] = Path(
-                key, self.path, self.create, self.backend
+                key, self.path, self.create, self.backend, self.archive_starttime,
+                ingest_config=self.ingest_config)
             return self.children[key]
 
     def feature_path(self, feature):
@@ -149,10 +98,24 @@ class Path(object):
         """
        Save a feature to disk
         """
+        if self.ingest_config and self.ingest_config.get('queue_path'):
+            enqueue_dataset(
+                data,
+                target_path=self.path,
+                backend=self.backend,
+                ingest_config=self.ingest_config,
+                save_kwargs=kwargs,
+            )
+            logger.debug("Queued data for %s backend at %s",
+                         self.backend, self.path)
+            return
+
         if self.backend == 'netcdf':
-            xarray2netcdf(data, self.path,
+            xarray2netcdf(data, self.path,
+                          archive_starttime=self.archive_starttime, **kwargs)
         elif self.backend == 'zarr':
-            xarray2zarr(data, self.path,
+            xarray2zarr(data, self.path,
+                        archive_starttime=self.archive_starttime, **kwargs)
 
     def shape(self, feature):
         """
@@ -208,11 +171,17 @@ class Storage(Path):
     >>> rsam = c("rsam")
     """
 
-    def __init__(self, name, rootdir, starttime=None, endtime=None, create=True, backend='netcdf'
+    def __init__(self, name, rootdir, starttime=None, endtime=None, create=True, backend='netcdf',
+                 ingest_config=None, archive_starttime=datetime(2000, 1, 1)):
         self.stores = set()
         self.starttime = starttime
         self.endtime = endtime
-
+        self.archive_starttime = archive_starttime
+        self._ingest_worker: Optional[IngestWorker] = None
+        self._ingest_thread: Optional[threading.Thread] = None
+        self._ingest_stop_event: Optional[threading.Event] = None
+        super().__init__(name, rootdir, create, backend, archive_starttime,
+                         ingest_config=ingest_config)
 
     def print_tree(self, site, indent=0, output=''):
         output += ' ' * indent + site.path + '\n'
@@ -317,3 +286,50 @@ class Storage(Path):
 
     starttime = property(get_starttime, set_starttime)
     endtime = property(get_endtime, set_endtime)
+
+    def _ensure_ingest_worker(self, poll_interval=None) -> IngestWorker:
+        if not (self.ingest_config and self.ingest_config.get('queue_path')):
+            raise RuntimeError(
+                "Ingestion queue is not configured for this Storage instance.")
+
+        if self._ingest_worker is None:
+            queue_path = self.ingest_config['queue_path']
+            poll = poll_interval or self.ingest_config.get(
+                'poll_interval', 10.0)
+            self._ingest_worker = IngestWorker(
+                queue_path=queue_path,
+                poll_interval=poll
+            )
+        elif poll_interval:
+            self._ingest_worker.poll_interval = poll_interval
+        return self._ingest_worker
+
+    def run_ingest_once(self, poll_interval=None) -> int:
+        worker = self._ensure_ingest_worker(poll_interval)
+        return worker.run_once()
+
+    def start_ingest_worker(self, *, background=True, poll_interval=None):
+        worker = self._ensure_ingest_worker(poll_interval)
+        if not background:
+            return worker.run_once()
+        if self._ingest_thread and self._ingest_thread.is_alive():
+            return self._ingest_thread
+        stop_event = threading.Event()
+        thread = threading.Thread(
+            target=worker.run_forever,
+            kwargs={'stop_event': stop_event},
+            daemon=True,
+            name=f"tonik-ingest-{self.name}",
+        )
+        thread.start()
+        self._ingest_thread = thread
+        self._ingest_stop_event = stop_event
+        return thread
+
+    def stop_ingest_worker(self, timeout=None):
+        if self._ingest_thread and self._ingest_thread.is_alive():
+            if self._ingest_stop_event:
+                self._ingest_stop_event.set()
+            self._ingest_thread.join(timeout=timeout)
+        self._ingest_thread = None
+        self._ingest_stop_event = None
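
The same feature viewed from Storage, again as a sketch rather than part of the diff: the experiment name and directories are hypothetical, while the ingest_config argument and the run_ingest_once / start_ingest_worker / stop_ingest_worker methods are the ones added to storage.py above.

from tonik.storage import Storage

# A Storage created with an ingest_config queues its writes into the on-disk
# queue instead of writing zarr/netcdf synchronously (see the
# "Save a feature to disk" hunk above).
store = Storage(
    "volcano_project",                   # hypothetical experiment name
    "/tmp/tonik_archive",
    backend="zarr",
    ingest_config={"queue_path": "/tmp/tonik_queue", "poll_interval": 5.0},
)

# Drain the queue in a single pass from the same process ...
print(store.run_ingest_once())

# ... or poll it from a background daemon thread and stop it on shutdown.
store.start_ingest_worker(background=True)
store.stop_ingest_worker(timeout=30)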