tonik 0.1.21__py3-none-any.whl → 0.1.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tonik/ingest.py +166 -0
- tonik/storage.py +83 -67
- tonik/xarray2zarr.py +146 -141
- {tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/METADATA +22 -22
- {tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/RECORD +8 -7
- {tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/WHEEL +0 -0
- {tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/entry_points.txt +0 -0
- {tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/licenses/LICENSE +0 -0
tonik/ingest.py
ADDED
@@ -0,0 +1,166 @@
+# src/tonik/ingest.py
+import json
+import logging
+import os
+import pickle
+import threading
+import uuid
+from datetime import datetime, timezone
+from typing import Optional
+
+import xarray as xr
+
+from .xarray2netcdf import xarray2netcdf
+from .xarray2zarr import xarray2zarr
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["enqueue_dataset", "IngestWorker"]
+
+
+def _norm_timeseries(xds: xr.Dataset, timedim: str) -> xr.Dataset:
+    xds = xds.sortby(timedim)
+    xds = xds.drop_duplicates(timedim, keep='last')
+    xds[timedim] = xds[timedim].astype('datetime64[ns]')
+    return xds
+
+
+def enqueue_dataset(data: xr.Dataset, target_path: str, *, backend: str,
+                    ingest_config: dict, save_kwargs: Optional[dict] = None) -> dict:
+    """
+    Enqueue a dataset for ingestion.
+    Parameters
+    ----------
+    data : xr.Dataset
+        The dataset to enqueue.
+    target_path : str
+        The target path where the dataset should be saved.
+    backend : str
+        The backend to use for saving the dataset ('zarr' or 'netcdf').
+    ingest_config : dict
+        Configuration for the ingest queue, must include 'queue_path'.
+    save_kwargs : Optional[dict], optional
+        Additional keyword arguments to pass to the save function, by default None.
+    Returns
+    -------
+    dict
+        A message dictionary representing the enqueued dataset.
+    """
+
+    queue_path = ingest_config.get("queue_path")
+    if not queue_path:
+        raise ValueError("ingest_config must provide a 'queue_path'.")
+    queue_path = os.path.abspath(queue_path)
+    payload_dir = os.path.join(queue_path, "payloads")
+    message_dir = os.path.join(queue_path, "messages")
+    os.makedirs(payload_dir, exist_ok=True)
+    os.makedirs(message_dir, exist_ok=True)
+    timedim = save_kwargs.get(
+        "timedim", "datetime") if save_kwargs else "datetime"
+
+    if isinstance(data, xr.DataArray):
+        name = data.name or "data"
+        data = data.to_dataset(name=name)
+
+    dataset = _norm_timeseries(data, timedim=timedim)
+    entry_id = uuid.uuid4().hex
+    payload_path = os.path.join(payload_dir, f"{entry_id}.nc")
+    kwargs_path = os.path.join(payload_dir, f"{entry_id}.pkl")
+
+    dataset.to_netcdf(payload_path, engine="h5netcdf")
+    with open(kwargs_path, "wb") as handle:
+        pickle.dump(save_kwargs or {}, handle)
+
+    message = {
+        "id": entry_id,
+        "target_path": os.path.abspath(target_path),
+        "backend": backend,
+        "payload_path": payload_path,
+        "kwargs_path": kwargs_path,
+        "created_at": datetime.now(tz=timezone.utc).isoformat(),
+    }
+    tmp_path = os.path.join(message_dir, f"{entry_id}.json.tmp")
+    final_path = os.path.join(message_dir, f"{entry_id}.json")
+    with open(tmp_path, "w", encoding="utf-8") as handle:
+        json.dump(message, handle)
+    os.replace(tmp_path, final_path)
+    logger.debug("Queued dataset %s for %s backend at %s",
+                 entry_id, backend, target_path)
+    return message
+
+
+class IngestWorker:
+    def __init__(self, queue_path: str, poll_interval: float = 10.0,
+                 target_prefix: Optional[str] = None):
+        self.queue_path = os.path.abspath(queue_path)
+        self.messages_dir = os.path.join(self.queue_path, "messages")
+        self.payloads_dir = os.path.join(self.queue_path, "payloads")
+        os.makedirs(self.messages_dir, exist_ok=True)
+        os.makedirs(self.payloads_dir, exist_ok=True)
+        self.poll_interval = poll_interval
+        self.target_prefix = os.path.abspath(
+            target_prefix) if target_prefix else None
+
+    def _iter_messages(self):
+        for name in sorted(os.listdir(self.messages_dir)):
+            if not name.endswith(".json"):
+                continue
+            msg_path = os.path.join(self.messages_dir, name)
+            with open(msg_path, "r", encoding="utf-8") as handle:
+                message = json.load(handle)
+            target = os.path.abspath(message.get("target_path", ""))
+            if self.target_prefix and not target.startswith(self.target_prefix):
+                continue
+            yield msg_path, message
+
+    def run_once(self) -> int:
+        processed = 0
+        for msg_path, message in self._iter_messages():
+            payload_path = message.get("payload_path")
+            kwargs_path = message.get("kwargs_path")
+            if not payload_path or not os.path.exists(payload_path):
+                logger.warning(
+                    "Missing payload for %s, dropping message", msg_path)
+                os.remove(msg_path)
+                if kwargs_path and os.path.exists(kwargs_path):
+                    os.remove(kwargs_path)
+                continue
+
+            dataset = None
+            try:
+                with xr.open_dataset(payload_path, engine='h5netcdf') as ds_on_disk:
+                    dataset = ds_on_disk.load()
+
+                kwargs = {}
+                if kwargs_path and os.path.exists(kwargs_path):
+                    with open(kwargs_path, "rb") as handle:
+                        kwargs = pickle.load(handle)
+
+                backend = message.get("backend", "zarr")
+                if backend == "zarr":
+                    xarray2zarr(dataset, message["target_path"], **kwargs)
+                elif backend == "netcdf":
+                    xarray2netcdf(dataset, message["target_path"], **kwargs)
+                else:
+                    raise ValueError(f"Unsupported backend '{backend}'")
+            except Exception as exc:
+                logger.error("Failed to ingest %s: %s",
+                             msg_path, exc, exc_info=True)
+                continue
+            finally:
+                if dataset is not None:
+                    dataset.close()
+
+            os.remove(payload_path)
+            if kwargs_path and os.path.exists(kwargs_path):
+                os.remove(kwargs_path)
+            os.remove(msg_path)
+            processed += 1
+        return processed
+
+    def run_forever(self, stop_event: Optional[threading.Event] = None) -> None:
+        stop_event = stop_event or threading.Event()
+        while not stop_event.is_set():
+            processed = self.run_once()
+            if processed == 0:
+                stop_event.wait(self.poll_interval)
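Taken together, the new module implements a simple file-based queue: a producer serialises the dataset and its save kwargs into a payload directory and drops a JSON message, and a worker replays those messages through xarray2zarr or xarray2netcdf. A minimal usage sketch follows; the directory paths, feature name, and data are invented for illustration and are not part of the package.

import os

import numpy as np
import pandas as pd
import xarray as xr

from tonik.ingest import IngestWorker, enqueue_dataset

# Hypothetical feature data: one day of hourly RSAM values.
times = pd.date_range("2024-01-01", periods=24, freq="1h")
ds = xr.Dataset({"rsam": ("datetime", np.random.rand(24))},
                coords={"datetime": times})

# Hypothetical target and queue directories.
os.makedirs("/tmp/tonik_store/site1", exist_ok=True)

# Producer side: writes <id>.nc and <id>.pkl payloads plus an <id>.json message.
enqueue_dataset(ds, "/tmp/tonik_store/site1", backend="zarr",
                ingest_config={"queue_path": "/tmp/tonik_queue"})

# Consumer side: drain the queue once; run_forever() would keep polling instead.
worker = IngestWorker("/tmp/tonik_queue", poll_interval=5.0)
n_processed = worker.run_once()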
tonik/storage.py
CHANGED
@@ -1,80 +1,27 @@
+from datetime import datetime
 import json
 import logging
-import logging.config
 import os
+import threading
+from typing import Optional
 
 import xarray as xr
 
+from .ingest import IngestWorker, enqueue_dataset
 from .xarray2netcdf import xarray2netcdf
 from .xarray2zarr import xarray2zarr
 
-
-
-    "disable_existing_loggers": False,
-    "formatters": {
-        "default": {  # The formatter name, it can be anything that I wish
-            # What to add in the message
-            "format": "%(asctime)s:%(name)s:%(process)d:%(lineno)d " "%(levelname)s %(message)s",
-            "datefmt": "%Y-%m-%d %H:%M:%S",  # How to display dates
-        },
-        "json": {  # The formatter name
-            "()": "pythonjsonlogger.json.JsonFormatter",  # The class to instantiate!
-            # Json is more complex, but easier to read, display all attributes!
-            "format": """
-                asctime: %(asctime)s
-                created: %(created)f
-                filename: %(filename)s
-                funcName: %(funcName)s
-                levelname: %(levelname)s
-                levelno: %(levelno)s
-                lineno: %(lineno)d
-                message: %(message)s
-                module: %(module)s
-                msec: %(msecs)d
-                name: %(name)s
-                pathname: %(pathname)s
-                process: %(process)d
-                processName: %(processName)s
-                relativeCreated: %(relativeCreated)d
-                thread: %(thread)d
-                threadName: %(threadName)s
-                exc_info: %(exc_info)s
-            """,
-            "datefmt": "%Y-%m-%d %H:%M:%S",  # How to display dates
-        },
-    },
-    "handlers": {
-        "simple": {  # The handler name
-            "formatter": "default",  # Refer to the formatter defined above
-            "class": "logging.StreamHandler",  # OUTPUT: Same as above, stream to console
-            "stream": "ext://sys.stdout",
-        },
-    },
-    "loggers": {
-        "storage": {  # The name of the logger, this SHOULD match your module!
-            "level": "DEBUG",  # FILTER: only INFO logs onwards from "tryceratops" logger
-            "handlers": [
-                "simple",  # Refer the handler defined above
-            ],
-        },
-    },
-    "root": {
-        "level": "INFO",  # FILTER: only INFO logs onwards
-        "handlers": [
-            "simple",  # Refer the handler defined above
-        ]
-    },
-}
-
-logging.config.dictConfig(LOGGING_CONFIG)
-logger = logging.getLogger("__name__")
+
+logger = logging.getLogger(__name__)
 
 
 class Path(object):
-    def __init__(self, name, parentdir, create=True, backend='zarr'
+    def __init__(self, name, parentdir, create=True, backend='zarr',
+                 archive_starttime=datetime(2000, 1, 1), ingest_config=None):
         self.name = name
         self.create = create
         self.backend = backend
+        self.archive_starttime = archive_starttime
         self.engine = 'h5netcdf' if self.backend == 'netcdf' else self.backend
         self.path = os.path.join(parentdir, name)
         if create:
@@ -86,6 +33,7 @@ class Path(object):
         if not os.path.exists(self.path):
             raise FileNotFoundError(f"Path {self.path} not found")
         self.children = {}
+        self.ingest_config = ingest_config.copy() if ingest_config else None
 
     def __str__(self):
         return self.path
@@ -97,7 +45,8 @@ class Path(object):
             return self.children[key]
         except KeyError:
             self.children[key] = Path(
-                key, self.path, self.create, self.backend
+                key, self.path, self.create, self.backend, self.archive_starttime,
+                ingest_config=self.ingest_config)
             return self.children[key]
 
     def feature_path(self, feature):
@@ -149,10 +98,24 @@ class Path(object):
         """
         Save a feature to disk
        """
+        if self.ingest_config and self.ingest_config.get('queue_path'):
+            enqueue_dataset(
+                data,
+                target_path=self.path,
+                backend=self.backend,
+                ingest_config=self.ingest_config,
+                save_kwargs=kwargs,
+            )
+            logger.debug("Queued data for %s backend at %s",
+                         self.backend, self.path)
+            return
+
         if self.backend == 'netcdf':
-            xarray2netcdf(data, self.path,
+            xarray2netcdf(data, self.path,
+                          archive_starttime=self.archive_starttime, **kwargs)
         elif self.backend == 'zarr':
-            xarray2zarr(data, self.path,
+            xarray2zarr(data, self.path,
+                        archive_starttime=self.archive_starttime, **kwargs)
 
     def shape(self, feature):
         """
@@ -208,11 +171,17 @@ class Storage(Path):
    >>> rsam = c("rsam")
    """
 
-    def __init__(self, name, rootdir, starttime=None, endtime=None, create=True, backend='netcdf'
+    def __init__(self, name, rootdir, starttime=None, endtime=None, create=True, backend='netcdf',
+                 ingest_config=None, archive_starttime=datetime(2000, 1, 1)):
         self.stores = set()
         self.starttime = starttime
         self.endtime = endtime
-
+        self.archive_starttime = archive_starttime
+        self._ingest_worker: Optional[IngestWorker] = None
+        self._ingest_thread: Optional[threading.Thread] = None
+        self._ingest_stop_event: Optional[threading.Event] = None
+        super().__init__(name, rootdir, create, backend, archive_starttime,
+                         ingest_config=ingest_config)
 
     def print_tree(self, site, indent=0, output=''):
         output += ' ' * indent + site.path + '\n'
@@ -317,3 +286,50 @@ class Storage(Path):
 
    starttime = property(get_starttime, set_starttime)
    endtime = property(get_endtime, set_endtime)
+
+    def _ensure_ingest_worker(self, poll_interval=None) -> IngestWorker:
+        if not (self.ingest_config and self.ingest_config.get('queue_path')):
+            raise RuntimeError(
+                "Ingestion queue is not configured for this Storage instance.")
+
+        if self._ingest_worker is None:
+            queue_path = self.ingest_config['queue_path']
+            poll = poll_interval or self.ingest_config.get(
+                'poll_interval', 10.0)
+            self._ingest_worker = IngestWorker(
+                queue_path=queue_path,
+                poll_interval=poll
+            )
+        elif poll_interval:
+            self._ingest_worker.poll_interval = poll_interval
+        return self._ingest_worker
+
+    def run_ingest_once(self, poll_interval=None) -> int:
+        worker = self._ensure_ingest_worker(poll_interval)
+        return worker.run_once()
+
+    def start_ingest_worker(self, *, background=True, poll_interval=None):
+        worker = self._ensure_ingest_worker(poll_interval)
+        if not background:
+            return worker.run_once()
+        if self._ingest_thread and self._ingest_thread.is_alive():
+            return self._ingest_thread
+        stop_event = threading.Event()
+        thread = threading.Thread(
+            target=worker.run_forever,
+            kwargs={'stop_event': stop_event},
+            daemon=True,
+            name=f"tonik-ingest-{self.name}",
+        )
+        thread.start()
+        self._ingest_thread = thread
+        self._ingest_stop_event = stop_event
+        return thread
+
+    def stop_ingest_worker(self, timeout=None):
+        if self._ingest_thread and self._ingest_thread.is_alive():
+            if self._ingest_stop_event:
+                self._ingest_stop_event.set()
+            self._ingest_thread.join(timeout=timeout)
+        self._ingest_thread = None
+        self._ingest_stop_event = None
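The Storage class now threads the ingest configuration down to every child Path, diverts saving into the queue whenever a queue_path is configured, and can drive an IngestWorker either synchronously or from a daemon thread. A rough usage sketch follows; the directories and data are invented, the package-level Storage import and the save() method name are assumptions based on the docstring context in the hunk above, not confirmed by this diff.

from datetime import datetime

import numpy as np
import pandas as pd
import xarray as xr
from tonik import Storage  # assuming Storage is re-exported at package level

# Hypothetical feature dataset.
feature = xr.Dataset(
    {"rsam": ("datetime", np.random.rand(24))},
    coords={"datetime": pd.date_range("2024-01-01", periods=24, freq="1h")})

# With ingest_config set, writes are enqueued instead of hitting disk directly.
store = Storage("volcano", "/tmp/tonik_data", backend="zarr",
                archive_starttime=datetime(2023, 1, 1),
                ingest_config={"queue_path": "/tmp/tonik_queue",
                               "poll_interval": 5.0})
store.save(feature)  # assumed write entry point ("Save a feature to disk")

# Drain the queue synchronously, or poll it from a background daemon thread.
store.run_ingest_once()
thread = store.start_ingest_worker(background=True)
store.stop_ingest_worker(timeout=10)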
tonik/xarray2zarr.py
CHANGED
@@ -16,29 +16,75 @@ from .utils import merge_arrays, fill_time_gaps, get_dt
 logger = logging.getLogger(__name__)
 
 
-def
-
+def _init_timeseries_store(path: str, start: np.datetime64, stop: np.datetime64, interval: pd.Timedelta,
+                           data_vars: dict, group: str = "original", chunk_size: int = 10,
+                           timedim: str = "datetime") -> xr.DataArray:
     """
-
-
+    Initialize an empty zarr store for time series data. This facilitates writing data out
+    of sequence and avoid prepending which is costly and difficult to get right.
 
     Parameters
     ----------
-
-
-
-
-
-
-
-
-
-
-
-
+    path : str
+        Path to the zarr store.
+    start : np.datetime64
+        Start time of the zarr store.
+    stop : np.datetime64
+        End time of the zarr store.
+    interval : pd.Timedelta
+        Sampling interval string (e.g. '1H', '15T') for the time dimension
+    data_vars : dict
+        Dictionary defining the data variables to create. Keys are variable names,
+        values are tuples of (dims, shape, dtype) where dims is a tuple of dimension
+        names (excluding the time dimension), shape is a tuple of dimension sizes
+        (excluding the time dimension), and dtype is the numpy data type.
+    group : str, optional
+        Group name in the zarr store, by default "original"
+    chunk_size : int, optional
+        Chunk size in number of time steps, by default 10
+    timedim : str, optional
+        Name of the time dimension, by default "datetime"
 
-
-
+    """
+    # Make the zarr store a multiple of chunk_size
+    stop_ts = pd.Timestamp(stop)
+    start_ts = pd.Timestamp(start)
+    chunk_length = int(chunk_size)
+    if chunk_length <= 0:
+        raise ValueError("chunk_size must be a positive integer")
+    total_steps = int((stop_ts - start_ts) // interval) + 1
+    if total_steps < 1:
+        total_steps = chunk_length
+    if total_steps % chunk_length:
+        required_steps = ((total_steps + chunk_length - 1) //
+                          chunk_length) * chunk_length
+        start_ts = stop_ts - interval * (required_steps - 1)
+    time_index = pd.date_range(start=start_ts, end=stop_ts, freq=interval)
+    ds = xr.Dataset()
+    name, value = list(data_vars.items())[0]
+    dims, coords, shape, dtype = value
+    dims = dims + (timedim,)
+    shape = tuple(shape) + (len(time_index),)
+    # Create coordinates for gap dataset
+    new_coords = {timedim: time_index}
+    for coord_name, coord in coords.items():
+        if coord_name != timedim:
+            new_coords[coord_name] = coord
+
+    xda = xr.DataArray(
+        np.full(shape, np.nan, dtype=dtype),
+        coords=new_coords,
+        dims=dims,
+        name=name
+    )
+    xda = xda.chunk(
+        {timedim: chunk_size, **{d: -1 for d in dims[:-1]}})
+    xda.to_zarr(path, group=group, mode="w")
+    return xda
+
+
+def _fill_time_gaps_between_datasets(xds_existing: xr.DataArray, xds_new: xr.DataArray, interval: pd.Timedelta,
+                                     timedim: str = 'datetime', chunk_size: int = 10) -> xr.DataArray:
     """
     Fill gaps between existing and new datasets.
 
@@ -56,29 +102,23 @@ def fill_time_gaps_between_datasets(xds_existing: xr.DataArray, xds_new: xr.DataArray,
     xr.Dataset
         Combined dataset with gaps filled
     """
-    if mode not in ['a', 'p']:
-        raise ValueError(
-            'Mode has to be either "a" for append or "p" for prepend')
-
-    # get the sample interval
-    dt = get_dt(xds_new.coords[timedim])
 
     existing_endpoint = xds_existing[timedim].values
     # Get time ranges
-
-
-
-
-
-
+    gap_start = existing_endpoint + interval
+    gap_end = xds_new[timedim].values[0] - interval
+
+    # Prepare shape for gap filling
+    shape_list = list(xds_new.shape)
+    dims_list = list(xds_new.dims)
+    shape_list.pop(dims_list.index(timedim))
 
     if gap_start <= gap_end:
-        gap_times = pd.date_range(start=gap_start, end=gap_end, freq=
+        gap_times = pd.date_range(start=gap_start, end=gap_end, freq=interval)
 
         # Create NaN array with same shape as variable but for gap times
-        gap_shape = (len(gap_times),)
-
-        gap_values = np.full(gap_shape, np.nan)
+        gap_shape = tuple(shape_list) + (len(gap_times),)
+        gap_values = np.full(gap_shape, np.nan, dtype=xds_new.dtype)
 
         # Create coordinates for gap dataset
         gap_coords = {timedim: gap_times}
@@ -94,70 +134,28 @@ def fill_time_gaps_between_datasets(xds_existing: xr.DataArray, xds_new: xr.DataArray,
         )
 
         # Combine: existing + gap + new
-
-            combined = xr.concat([gap_data, xds_new], dim=timedim)
-        elif mode == 'p':
-            combined = xr.concat([xds_new, gap_data], dim=timedim)
-        return combined
+        combined = xr.concat([gap_data, xds_new], dim=timedim)
     else:
-
-
-
-def _build_append_payload_full_chunks(payload: xr.DataArray, mode: str,
-                                      chunklen: int, timedim: str = "datetime") -> xr.DataArray:
-    """
-    Construct the sequence to append so that the final total length is a multiple of `chunklen`
-
-    Parameters
-    ----------
-    payload : xr.DataArray
-        DataArray to append
-    mode : str
-        'a' for append, 'p' for prepend
-    chunklen : int
-        Chunk length in number of time steps
-    timedim : str
-        Name of the time dimension
-
-    Returns
-    -------
-    xr.DataArray
-        Padded DataArray with length a multiple of chunklen
-    """
-    if mode not in ['a', 'p']:
-        raise ValueError(
-            'Mode has to be either "a" for append or "p" for prepend')
-
-    # pad the tail so that payload_len % chunklen == 0
-    pay_len = payload.sizes[timedim]
-    need = -pay_len % chunklen  # 0..chunklen-1
+        combined = xds_new
 
+    # ensure new array aligns with chunk size
+    arr_len = combined.sizes[timedim]
+    need = -arr_len % chunk_size  # 0..chunklen-1
     if need > 0:
-
-
-
-
-        start = payload[timedim].values[0] - (need+1)*dt
-        pad_times = pd.date_range(start=start, periods=need, freq=dt)
-        pad_shape = []
-        for i, d in enumerate(payload.dims):
-            if d == timedim:
-                pad_shape.append(need)
-            else:
-                pad_shape.append(payload.shape[i])
-        pad_vals = np.full(pad_shape, np.nan)
+        start = combined[timedim].values[-1] + interval
+        pad_times = pd.date_range(start=start, periods=need, freq=interval)
+        pad_shape = tuple(shape_list) + (len(pad_times),)
+        pad_vals = np.full(pad_shape, np.nan, dtype=xds_new.dtype)
         pad_coords = {timedim: pad_times}
-        for
-        if
-            pad_coords[
+        for coord_name, coord in xds_new.coords.items():
+            if coord_name != timedim:
+                pad_coords[coord_name] = coord
         pad_da = xr.DataArray(pad_vals, coords=pad_coords,
-                              dims=
-
-
-
-
-    payload = payload.chunk({timedim: chunklen})
-    return payload
+                              dims=xds_new.dims,
+                              name=xds_new.name)
+        combined = xr.concat([combined, pad_da], dim=timedim)
+
+    return combined
 
 
 def _update_meta_data(fout: str,
@@ -179,7 +177,7 @@ def _update_meta_data(fout: str,
         Group name for metadata.
     """
 
-    now = np.datetime64(datetime.now(tz=timezone.utc), '
+    now = np.datetime64(datetime.now(tz=timezone.utc), 's')
     new_update = xr.DataArray([now],
                               coords={'update': [now]},
                               dims=['update'],
@@ -212,8 +210,9 @@ def _update_meta_data(fout: str,
     xr.Dataset(vars).to_zarr(fout, group=meta_group, mode='w')
 
 
-def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a', group='original',
-
+def xarray2zarr(xds: xr.Dataset, path: str, group='original',
+                chunk_size: int = 1000, timedim: str = 'datetime', interval: str = None,
+                archive_starttime: datetime = datetime(2000, 1, 1)) -> None:
     """
     Write xarray dataset to zarr files.
 
@@ -227,7 +226,7 @@ def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a', group='original',
         Write mode, by default 'a'.
     group : str, optional
         Group name, by default 'original'
-
+    chunk_size : int, optional
         Chunk size as the number of days.
     timedim : str
         Name of the time dimension, by default 'datetime'
@@ -245,14 +244,16 @@ def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a', group='original',
     # Fill gaps
     xds = xds.drop_duplicates(timedim, keep='last')
     xds = fill_time_gaps(xds, timedim=timedim)
+    if interval is None:
+        interval = get_dt(xds[timedim])
+    else:
+        interval = pd.to_timedelta(interval)
 
     for feature in xds.data_vars.keys():
        fout = os.path.join(path, feature + '.zarr')
-        # nchunks = get_chunks(xds[feature], chunks)
-        nchunks = chunks
        last_dp = xds[feature][timedim].values[-1]
        _update_meta_data(fout, last_dp, resolution=float(
-
+            interval / pd.Timedelta(1, 'h')))
        try:
            xds_existing = xr.open_zarr(fout, group=group)
            has_store = True
@@ -260,57 +261,61 @@ def xarray2zarr(xds: xr.Dataset, path: str, mode: str = 'a', group='original',
             has_store = False
 
         if not has_store:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            logger.debug("Creating new zarr store.")
+            shape_list = list(xds[feature].shape)
+            dims_list = list(xds[feature].dims)
+            shape_list.pop(dims_list.index(timedim))
+            dims_list.pop(dims_list.index(timedim))
+            xds_existing = _init_timeseries_store(
+                fout,
+                start=np.datetime64(archive_starttime),
+                stop=xds[feature][timedim].values[-1],
+                interval=interval,
+                data_vars={
+                    feature: (tuple(dims_list), xds[feature].coords,
+                              tuple(shape_list), xds[feature].dtype)},
+                group=group,
+                chunk_size=chunk_size,
+                timedim=timedim
+            )
+
+        if xds_existing[timedim][0] > xds[timedim][0]:
+            raise ValueError("New data ends before existing data starts. "
+                             "Prepending to existing data is currently not supported.")
 
         elif xds_existing[timedim][-1] < xds[timedim][0]:
             logger.debug("Appending data to existing zarr store.")
-            xda_new =
-
-            xda_new = _build_append_payload_full_chunks(
-                xda_new, 'a', nchunks)
+            xda_new = _fill_time_gaps_between_datasets(xds_existing[feature].isel({timedim: -1}),
+                                                       xds[feature], interval, chunk_size=chunk_size)
             xda_new.to_zarr(fout, group=group, mode='a',
                             append_dim=timedim)
-
-        elif xds_existing[timedim][0] > xds[timedim][0] and xds_existing[timedim][-1] < xds[timedim][-1]:
-            logger.debug(
-                "Data in zarr store contained in new data. Rewriting zarr store.")
-            xda_new = _build_append_payload_full_chunks(
-                xds[feature], 'a', nchunks)
-            xda_new.to_zarr(fout, group=group, mode='w',
-                            write_empty_chunks=True)
-
         else:
             logger.debug("Data in zarr store overlaps with new data.")
             logger.debug(
                 f"Endtime of existing data: {xds_existing[timedim][-1].values}")
             logger.debug(f"Starttime of new data: {xds[timedim][0].values}")
-
-
-
-
-
-
+            existing_times = xds_existing[timedim].values
+            new_times = xds[timedim].values
+
+            overlap_times, idx_existing, idx_new = np.intersect1d(
+                existing_times,
+                new_times,
+                assume_unique=True,
+                return_indices=True,
+            )
+            region = {}
+            for dim in xds[feature].dims:
+                if dim == timedim:
+                    start = int(idx_existing.min())
+                    stop = start + len(idx_existing)
+                    region[dim] = slice(start, stop)
+                else:
+                    region[dim] = 'auto'
+            xds[feature].isel({timedim: idx_new}).to_zarr(
+                fout, group=group, mode='r+', region=region)
+            remainder = xds[feature].drop_sel({timedim: new_times[idx_new]})
             if remainder.sizes[timedim] > 0:
-
-
-                mode = 'p'
-                xda_new = fill_time_gaps_between_datasets(xds_existing[feature].isel({timedim: 0}),
-                                                          xds[feature], mode=mode)
-                xda_new = _build_append_payload_full_chunks(
-                    xda_new, mode, nchunks)
+                xda_new = _fill_time_gaps_between_datasets(xds_existing[feature].isel({timedim: -1}),
+                                                           remainder, interval, chunk_size=chunk_size)
                 xda_new.to_zarr(fout, group=group, mode='a',
                                 append_dim=timedim)
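The rewrite drops the old append/prepend padding helpers in favour of a pre-allocated store: on the first write the whole time axis from archive_starttime up to the newest sample is created as NaN-filled, chunk-aligned arrays, and later calls either append past the end or overwrite the matching region in place. A rough sketch of calling the writer directly follows; the path, sampling interval, and data are invented for illustration.

import os
from datetime import datetime

import numpy as np
import pandas as pd
import xarray as xr

from tonik.xarray2zarr import xarray2zarr

# Hypothetical feature: one day of 10-minute RSAM values.
times = pd.date_range("2024-06-01", periods=144, freq="10min")
xds = xr.Dataset({"rsam": ("datetime", np.random.rand(144))},
                 coords={"datetime": times})

# Hypothetical output directory for the per-feature zarr stores.
os.makedirs("/tmp/tonik_data/site1", exist_ok=True)

# First call pre-allocates <path>/rsam.zarr from archive_starttime onwards;
# later calls with newer data append, overlapping samples are written by region.
xarray2zarr(xds, "/tmp/tonik_data/site1", group="original",
            chunk_size=1000, interval="10min",
            archive_starttime=datetime(2024, 1, 1))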
{tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tonik
-Version: 0.1.21
+Version: 0.1.22
 Summary: Store time series data as HDF5 files and access them through an API.
 Project-URL: Homepage, https://tsc-tools.github.io/tonik
 Project-URL: Issues, https://github.com/tsc-tools/tonik/issues
@@ -9,29 +9,29 @@ License-File: LICENSE
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.
-Requires-Dist: datashader
-Requires-Dist: fastapi
-Requires-Dist: h5netcdf
-Requires-Dist: h5py
-Requires-Dist: matplotlib
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: xarray[accel,io,parallel]
-Requires-Dist: zarr
+Requires-Python: >=3.10
+Requires-Dist: datashader<0.19,>=0.18.2
+Requires-Dist: fastapi<0.129,>=0.128.0
+Requires-Dist: h5netcdf<2,>=1.7.3
+Requires-Dist: h5py<4,>=3.15.1
+Requires-Dist: matplotlib<4,>=3.10.8
+Requires-Dist: pandas<3,>=2.3.3
+Requires-Dist: s3fs<2026,>=2025.12.0
+Requires-Dist: uvicorn[standard]<0.41,>=0.40.0
+Requires-Dist: xarray[accel,io,parallel]<2026,>=2025.6.1
+Requires-Dist: zarr<4,>=3.1.5
 Provides-Extra: dev
-Requires-Dist: build; extra == 'dev'
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: mkdocs-jupyter; extra == 'dev'
-Requires-Dist:
+Requires-Dist: build<2,>=1.4.0; extra == 'dev'
+Requires-Dist: hatch<2,>=1.16.2; extra == 'dev'
+Requires-Dist: httpx<0.29,>=0.28.1; extra == 'dev'
+Requires-Dist: ipykernel<7,>=6.31.0; extra == 'dev'
+Requires-Dist: mkdocs-jupyter<0.26,>=0.25.1; extra == 'dev'
+Requires-Dist: mkdocs<2,>=1.6.1; extra == 'dev'
+Requires-Dist: mkdocstrings[python]<2,>=1.0.0; extra == 'dev'
+Requires-Dist: moto[s3]<6,>=5.1.19; extra == 'dev'
 Requires-Dist: pytest; extra == 'dev'
-Requires-Dist: twine; extra == 'dev'
-Requires-Dist: zarr[remote-tests]; extra == 'dev'
+Requires-Dist: twine<7,>=6.2.0; extra == 'dev'
+Requires-Dist: zarr[remote-tests]<4,>=3.1.5; extra == 'dev'
 Description-Content-Type: text/markdown
 
 # Tonik
{tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/RECORD
CHANGED
@@ -1,14 +1,15 @@
 tonik/__init__.py,sha256=dov-nMeGFBzLspmj4rWKjC4r736vmaPDgMEkHSUfP98,523
 tonik/api.py,sha256=vW0ykOo5iGAV0_WuOepdrnUyFp83F7KyJTd43ksLmUk,7985
 tonik/grafana_annotations.py,sha256=ZU9Cy-HT4vvMfYIQzD9WboaDVOCBDv__NmXbk1qKWJo,5838
-tonik/
+tonik/ingest.py,sha256=RWJLasAVM8iaoCK5HCXEXybXARupw58Im0Ic7KrAThk,6228
+tonik/storage.py,sha256=zHXrIjbSPC3Sni1_KOn_OqCk0HtWaOyXgAhMTTdO18w,11500
 tonik/utils.py,sha256=GwAXfGFQWhlsLThQvSux1SooRkW-iIkJP99JMH72t5Y,11791
 tonik/xarray2netcdf.py,sha256=nq6RHk5ciaAg1bxNDiyHPRdAts1C7fj7jtDbaLaSTWM,6497
-tonik/xarray2zarr.py,sha256=
+tonik/xarray2zarr.py,sha256=HeqKBArNcYUzd_azgCK0iptq1qAA6h2j4brfIpkV_gs,12156
 tonik/package_data/index.html,sha256=ZCZ-BtGRERsL-6c_dfY43qd2WAaggH7xereennGL6ww,4372
 tonik/package_data/whakaari_labels.json,sha256=96UZSq41yXgAJxuKivLBKlRTw-33jkjh7AGKTsDQ9Yg,3993
-tonik-0.1.
-tonik-0.1.
-tonik-0.1.
-tonik-0.1.
-tonik-0.1.
+tonik-0.1.22.dist-info/METADATA,sha256=sjriW0whFAAo3VFxdu-1xlND0sw7RqcbULJz5HRUT-Q,2424
+tonik-0.1.22.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+tonik-0.1.22.dist-info/entry_points.txt,sha256=y82XyTeQddM87gCTzgSQaTlKF3VFicO4hhClHUv6j1A,127
+tonik-0.1.22.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+tonik-0.1.22.dist-info/RECORD,,
{tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/WHEEL
File without changes
{tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/entry_points.txt
File without changes
{tonik-0.1.21.dist-info → tonik-0.1.22.dist-info}/licenses/LICENSE
File without changes