xarray-dbd 0.2.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xarray_dbd/__init__.py +38 -0
- xarray_dbd/_dbd_cpp.cp313-win_amd64.pyd +0 -0
- xarray_dbd/_dbd_cpp.pyi +33 -0
- xarray_dbd/backend.py +547 -0
- xarray_dbd/cli/__init__.py +0 -0
- xarray_dbd/cli/cache.py +124 -0
- xarray_dbd/cli/csv.py +237 -0
- xarray_dbd/cli/dbd2nc.py +258 -0
- xarray_dbd/cli/logger.py +102 -0
- xarray_dbd/cli/main.py +46 -0
- xarray_dbd/cli/missions.py +79 -0
- xarray_dbd/cli/mkone.py +319 -0
- xarray_dbd/cli/sensors.py +120 -0
- xarray_dbd/compat.py +340 -0
- xarray_dbd/py.typed +0 -0
- xarray_dbd-0.2.0.dist-info/METADATA +406 -0
- xarray_dbd-0.2.0.dist-info/RECORD +20 -0
- xarray_dbd-0.2.0.dist-info/WHEEL +5 -0
- xarray_dbd-0.2.0.dist-info/entry_points.txt +6 -0
- xarray_dbd-0.2.0.dist-info/licenses/License.txt +674 -0
xarray_dbd/__init__.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
xarray-dbd: An efficient xarray backend for Dinkum Binary Data (DBD) files
|
|
3
|
+
|
|
4
|
+
This package provides an xarray backend engine for reading glider DBD files
|
|
5
|
+
directly without conversion to NetCDF, using a C++ parser via pybind11.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from importlib.metadata import version
|
|
11
|
+
|
|
12
|
+
from ._dbd_cpp import (
|
|
13
|
+
read_dbd_file,
|
|
14
|
+
read_dbd_files,
|
|
15
|
+
scan_headers,
|
|
16
|
+
scan_sensors,
|
|
17
|
+
)
|
|
18
|
+
from .backend import (
|
|
19
|
+
DBDBackendEntrypoint,
|
|
20
|
+
open_dbd_dataset,
|
|
21
|
+
open_multi_dbd_dataset,
|
|
22
|
+
write_multi_dbd_netcdf,
|
|
23
|
+
)
|
|
24
|
+
from .compat import DBD, MultiDBD
|
|
25
|
+
|
|
26
|
+
# Version string looked up from the installed "xarray-dbd" distribution metadata.
__version__ = version("xarray-dbd")
# Public API re-exported at package level: the compat classes, the xarray
# backend entrypoint, the low-level extension functions, and the open/write helpers.
__all__ = [
    "DBD",
    "DBDBackendEntrypoint",
    "MultiDBD",
    "read_dbd_file",
    "read_dbd_files",
    "scan_headers",
    "scan_sensors",
    "open_dbd_dataset",
    "open_multi_dbd_dataset",
    "write_multi_dbd_netcdf",
]
|
|
Binary file
|
xarray_dbd/_dbd_cpp.pyi
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Type stubs for the _dbd_cpp C++ extension module."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
# Read a single DBD file through the C++ extension.
# The Python backend (DBDDataStore) requires the returned dict to contain the
# keys "columns", "sensor_names", "sensor_units", "sensor_sizes", "n_records"
# and "header"; it converts a RuntimeError raised here into OSError.
# List defaults are elided (`...`) in this stub; callers in this package pass
# an empty list when no filter is requested.
def read_dbd_file(
    filename: str,
    cache_dir: str = "",
    to_keep: list[str] = ...,
    criteria: list[str] = ...,
    skip_first_record: bool = True,
    repair: bool = False,
) -> dict[str, Any]: ...
|
|
13
|
+
# Read several DBD files through the C++ extension and return one combined result.
# Callers in this package read the keys "columns", "sensor_names",
# "sensor_units", "n_records" and "n_files" from the returned dict.
# skip_missions / keep_missions filter files by mission name; the Python
# wrappers reject passing both at once before calling this function.
def read_dbd_files(
    filenames: list[str],
    cache_dir: str = "",
    to_keep: list[str] = ...,
    criteria: list[str] = ...,
    skip_missions: list[str] = ...,
    keep_missions: list[str] = ...,
    skip_first_record: bool = True,
    repair: bool = False,
) -> dict[str, Any]: ...
|
|
23
|
+
# Scan the given files for their sensor definitions without returning record data.
# write_multi_dbd_netcdf reads the keys "sensor_names", "sensor_units",
# "sensor_sizes" and "valid_files" from the returned dict.
def scan_sensors(
    filenames: list[str],
    cache_dir: str = "",
    skip_missions: list[str] = ...,
    keep_missions: list[str] = ...,
) -> dict[str, Any]: ...
|
|
29
|
+
# Scan the given files and return header information as a dict.
# NOTE(review): the keys of the returned dict are not visible from the Python
# sources that use this stub — confirm against the C++ extension before relying on them.
def scan_headers(
    filenames: list[str],
    skip_missions: list[str] = ...,
    keep_missions: list[str] = ...,
) -> dict[str, Any]: ...
|
xarray_dbd/backend.py
ADDED
|
@@ -0,0 +1,547 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Xarray backend engine for DBD files using C++ parser
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
from collections.abc import Iterable
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import xarray as xr
|
|
14
|
+
from xarray.backends import BackendEntrypoint
|
|
15
|
+
|
|
16
|
+
from ._dbd_cpp import read_dbd_file, read_dbd_files, scan_sensors
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"DBDDataStore",
|
|
22
|
+
"DBDBackendEntrypoint",
|
|
23
|
+
"open_dbd_dataset",
|
|
24
|
+
"open_multi_dbd_dataset",
|
|
25
|
+
"write_multi_dbd_netcdf",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class DBDDataStore:
    """Holds the parsed contents of one DBD file as returned by the C++ backend.

    Parameters
    ----------
    filename : str or Path
        Location of the DBD file to parse.
    skip_first_record : bool
        When True, the first data record is dropped (for deduplication in
        multi-file reads).
    repair : bool
        When True, try to recover data from corrupted records.
    to_keep : list of str or None
        Names of the sensors to load. ``None`` or an empty list loads all of them.
    criteria : list of str or None
        Names of the sensors used as record selection criteria.
    cache_dir : str, Path, or None
        Directory holding sensor cache files. ``None`` selects ``<file_dir>/cache``.

    Raises
    ------
    OSError
        If the C++ reader raises ``RuntimeError`` or its result lacks a
        required key.
    """

    def __init__(
        self,
        filename: str | Path,
        skip_first_record: bool = True,
        repair: bool = False,
        to_keep: list[str] | None = None,
        criteria: list[str] | None = None,
        cache_dir: str | Path | None = None,
    ):
        self.filename = Path(filename)

        # No explicit cache location: use a "cache" folder beside the data file.
        if cache_dir is None:
            cache_dir = self.filename.parent / "cache"
        cache_location = str(cache_dir)

        # Hand the parsing off to the compiled reader.
        try:
            parsed = read_dbd_file(
                str(self.filename),
                cache_dir=cache_location,
                to_keep=to_keep or [],
                criteria=criteria or [],
                skip_first_record=skip_first_record,
                repair=repair,
            )
        except RuntimeError as exc:
            raise OSError(f"Failed to read {self.filename}: {exc}") from exc

        # Every one of these entries is consumed below, so refuse partial results.
        expected = {
            "columns",
            "sensor_names",
            "sensor_units",
            "sensor_sizes",
            "n_records",
            "header",
        }
        absent = expected - parsed.keys()
        if absent:
            raise OSError(
                f"Incomplete result from C++ backend for {self.filename}: missing {absent}"
            )

        self._columns = list(parsed["columns"])
        self._sensor_names = list(parsed["sensor_names"])
        self._sensor_units = list(parsed["sensor_units"])
        self._sensor_sizes = list(parsed["sensor_sizes"])
        self._n_records = int(parsed["n_records"])
        self._header = dict(parsed["header"])

    def get_variables(self) -> dict[str, xr.Variable]:
        """Return one xarray variable per sensor, each along dimension ``i``."""
        record_dims = ("i",)
        # Columns, units and sizes are parallel to the sensor name list.
        return {
            sensor: xr.Variable(
                record_dims,
                self._columns[pos],
                attrs={
                    "units": self._sensor_units[pos],
                    "sensor_size": self._sensor_sizes[pos],
                },
            )
            for pos, sensor in enumerate(self._sensor_names)
        }

    def get_attrs(self) -> dict[str, Any]:
        """Return the global attributes: selected header fields plus the source path."""
        header_fields = (
            "mission_name",
            "fileopen_time",
            "encoding_version",
            "full_filename",
            "sensor_list_crc",
        )
        # Header fields that are absent are reported as empty strings.
        global_attrs: dict[str, Any] = {
            field: self._header.get(field, "") for field in header_fields
        }
        global_attrs["source_file"] = str(self.filename)
        return global_attrs

    def get_dimensions(self) -> dict[str, int]:
        """Return the dimension sizes: ``i`` is the record count, ``j`` is fixed at 1."""
        return dict(i=self._n_records, j=1)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class DBDBackendEntrypoint(BackendEntrypoint):
    """Backend entrypoint that lets xarray read DBD files.

    Registers as the ``"dbd"`` engine for :func:`xarray.open_dataset`.
    Handles the Slocum glider binary formats (``.dbd``, ``.ebd``, ``.sbd``,
    ``.tbd``, ``.mbd``, ``.nbd``) and their compressed variants (``.dcd``, etc.).
    """

    description = "Backend for reading Dinkum Binary Data (DBD) files"
    url = "https://github.com/mousebrains/dbd2netcdf"

    def open_dataset(  # type: ignore[override]
        self,
        filename_or_obj: str | Path,
        *,
        drop_variables: tuple[str, ...] | None = None,
        skip_first_record: bool = True,
        repair: bool = False,
        to_keep: list[str] | None = None,
        criteria: list[str] | None = None,
        cache_dir: str | Path | None = None,
    ) -> xr.Dataset:
        """Read one DBD file and return it as an xarray Dataset.

        Parameters
        ----------
        filename_or_obj : str or Path
            Location of the DBD file.
        drop_variables : tuple of str, optional
            Names of variables to leave out of the result.
        skip_first_record : bool
            Drop the first data record (default True).
        repair : bool
            Try to repair corrupted records (default False).
        to_keep : list of str, optional
            Names of the sensors to load. ``None`` loads all sensors.
        criteria : list of str, optional
            Names of the sensors used as record selection criteria.
        cache_dir : str, Path, or None
            Directory holding sensor cache files.

        Returns
        -------
        xr.Dataset
        """
        store = DBDDataStore(
            Path(filename_or_obj),
            skip_first_record=skip_first_record,
            repair=repair,
            to_keep=to_keep,
            criteria=criteria,
            cache_dir=cache_dir,
        )

        variables = store.get_variables()
        global_attrs = store.get_attrs()

        # Discard the unwanted variables; names that are not present are ignored.
        for unwanted in set(drop_variables or ()):
            variables.pop(unwanted, None)

        return xr.Dataset(variables, attrs=global_attrs)

    def guess_can_open(self, filename_or_obj: str | Path) -> bool:  # type: ignore[override]
        """Report whether the file extension is one this backend handles."""
        known_suffixes = {
            ".dbd",
            ".ebd",
            ".sbd",
            ".tbd",
            ".mbd",
            ".nbd",
            ".dcd",
            ".ecd",
            ".scd",
            ".tcd",
            ".mcd",
            ".ncd",
        }
        # Inputs that cannot be interpreted as a path are simply not ours.
        try:
            suffix = Path(filename_or_obj).suffix.lower()
        except (TypeError, AttributeError):
            return False
        return suffix in known_suffixes
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def open_dbd_dataset(
    filename: str | Path,
    skip_first_record: bool = True,
    repair: bool = False,
    to_keep: list[str] | None = None,
    criteria: list[str] | None = None,
    drop_variables: list[str] | None = None,
    cache_dir: str | Path | None = None,
) -> xr.Dataset:
    """Read one DBD file into an xarray Dataset via the DBD backend.

    Parameters
    ----------
    filename : str or Path
        Location of the DBD file.
    skip_first_record : bool
        Drop the first data record (default True).
    repair : bool
        Try to repair corrupted records (default False).
    to_keep : list of str, optional
        Names of the sensors to load. ``None`` loads all sensors.
    criteria : list of str, optional
        Names of the sensors used as record selection criteria.
    drop_variables : list of str, optional
        Names of variables to leave out of the result.
    cache_dir : str, Path, or None
        Directory holding sensor cache files. ``None`` selects ``<file_dir>/cache``.

    Returns
    -------
    xr.Dataset

    Examples
    --------
    >>> ds = open_dbd_dataset("test.sbd")
    >>> ds = open_dbd_dataset("test.sbd", to_keep=["m_depth", "m_lat"])
    """
    # Everything except the path is forwarded verbatim to the backend entrypoint.
    backend_options = {
        "skip_first_record": skip_first_record,
        "repair": repair,
        "to_keep": to_keep,
        "criteria": criteria,
        "drop_variables": drop_variables,
        "cache_dir": cache_dir,
    }
    return xr.open_dataset(filename, engine=DBDBackendEntrypoint, **backend_options)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def open_multi_dbd_dataset(
    filenames: Iterable[str | Path],
    skip_first_record: bool = True,
    repair: bool = False,
    to_keep: list[str] | None = None,
    criteria: list[str] | None = None,
    skip_missions: list[str] | None = None,
    keep_missions: list[str] | None = None,
    cache_dir: str | Path | None = None,
) -> xr.Dataset:
    """Open multiple DBD files as a single concatenated xarray Dataset.

    Uses the C++ backend's two-pass approach with SensorsMap to merge sensor
    definitions across files, matching dbd2netCDF behavior exactly.

    Parameters
    ----------
    filenames : iterable of str or Path
        Paths to DBD files. The list is handed to the C++ reader in iteration
        order; any sorting happens inside the extension.
        NOTE(review): this wrapper does not sort the paths itself — confirm
        that the extension does.
    skip_first_record : bool
        Skip first record in each file except the first (default True).
    repair : bool
        Attempt to repair corrupted records (default False).
    to_keep : list of str, optional
        Sensor names to keep. If None, all sensors are loaded.
    criteria : list of str, optional
        Sensor names for record selection criteria.
    skip_missions : list of str, optional
        Mission names to exclude.
    keep_missions : list of str, optional
        Mission names to include (excludes all others).
    cache_dir : str, Path, or None
        Directory for sensor cache files. A falsy value is passed to the
        reader as an empty string.

    Returns
    -------
    xr.Dataset
        One variable per sensor along dimension ``i``, with global attributes
        ``n_files`` and ``total_records``. An empty Dataset is returned when
        no filenames are given or the reader returns no columns.

    Raises
    ------
    ValueError
        If both ``skip_missions`` and ``keep_missions`` are given.
    OSError
        If the C++ reader raises ``RuntimeError`` or returns a result that
        lacks a required key.

    Examples
    --------
    >>> files = sorted(Path(".").glob("*.sbd"))
    >>> ds = open_multi_dbd_dataset(files)
    >>> ds = open_multi_dbd_dataset(files, to_keep=["m_depth", "m_present_time"])
    """
    if skip_missions and keep_missions:
        raise ValueError("Cannot specify both skip_missions and keep_missions")

    file_list = [str(Path(f)) for f in filenames]

    if not file_list:
        return xr.Dataset()

    cache_str = str(cache_dir) if cache_dir else ""

    try:
        result = read_dbd_files(
            file_list,
            cache_dir=cache_str,
            to_keep=to_keep or [],
            criteria=criteria or [],
            skip_missions=skip_missions or [],
            keep_missions=keep_missions or [],
            skip_first_record=skip_first_record,
            repair=repair,
        )
    except RuntimeError as e:
        raise OSError(f"Failed to read {len(file_list)} DBD files: {e}") from e

    # Report a malformed backend result as OSError, the same way the
    # single-file store does, instead of leaking a bare KeyError.
    required_keys = {"columns", "sensor_names", "sensor_units", "n_records", "n_files"}
    missing_keys = required_keys - result.keys()
    if missing_keys:
        raise OSError(
            f"Incomplete result from C++ backend for {len(file_list)} DBD files: "
            f"missing {sorted(missing_keys)}"
        )

    columns = list(result["columns"])
    sensor_names = list(result["sensor_names"])
    sensor_units = list(result["sensor_units"])
    n_records = int(result["n_records"])
    n_files = int(result["n_files"])

    # Warn (rather than fail) about requested sensors that no file provided.
    if to_keep:
        missing = set(to_keep) - set(sensor_names)
        if missing:
            logger.warning("Requested sensors not found in any file: %s", sorted(missing))

    if not columns:
        return xr.Dataset()

    # Columns and units are parallel to the sensor name list.
    dims = ("i",)
    data_vars = {
        name: xr.Variable(dims, columns[idx], attrs={"units": sensor_units[idx]})
        for idx, name in enumerate(sensor_names)
    }

    return xr.Dataset(
        data_vars,
        attrs={
            "n_files": n_files,
            "total_records": n_records,
        },
    )
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
# NetCDF dtype and fill value for each sensor byte-size
|
|
365
|
+
_NC_TYPE_INFO = {
|
|
366
|
+
1: ("i1", np.int8(-127)),
|
|
367
|
+
2: ("i2", np.int16(-32768)),
|
|
368
|
+
4: ("f4", np.float32("nan")),
|
|
369
|
+
8: ("f8", np.float64("nan")),
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def write_multi_dbd_netcdf(
|
|
374
|
+
filenames: Iterable[str | Path],
|
|
375
|
+
output: str | Path,
|
|
376
|
+
*,
|
|
377
|
+
skip_first_record: bool = True,
|
|
378
|
+
repair: bool = False,
|
|
379
|
+
to_keep: list[str] | None = None,
|
|
380
|
+
criteria: list[str] | None = None,
|
|
381
|
+
skip_missions: list[str] | None = None,
|
|
382
|
+
keep_missions: list[str] | None = None,
|
|
383
|
+
cache_dir: str | Path | None = None,
|
|
384
|
+
compression: int = 5,
|
|
385
|
+
) -> tuple[int, int]:
|
|
386
|
+
"""Stream multiple DBD files directly to a NetCDF file.
|
|
387
|
+
|
|
388
|
+
Unlike :func:`open_multi_dbd_dataset` which loads all data into memory,
|
|
389
|
+
this function reads one file at a time and writes its records to the
|
|
390
|
+
output NetCDF immediately, keeping peak memory proportional to a single
|
|
391
|
+
file's data.
|
|
392
|
+
|
|
393
|
+
Parameters
|
|
394
|
+
----------
|
|
395
|
+
filenames : iterable of str or Path
|
|
396
|
+
Paths to DBD files. Files are sorted internally.
|
|
397
|
+
output : str or Path
|
|
398
|
+
Path for the output NetCDF file.
|
|
399
|
+
skip_first_record : bool
|
|
400
|
+
Skip first record in each file except the first (default True).
|
|
401
|
+
repair : bool
|
|
402
|
+
Attempt to repair corrupted records (default False).
|
|
403
|
+
to_keep : list of str, optional
|
|
404
|
+
Sensor names to keep. If None, all sensors are written.
|
|
405
|
+
criteria : list of str, optional
|
|
406
|
+
Sensor names for record selection criteria.
|
|
407
|
+
skip_missions : list of str, optional
|
|
408
|
+
Mission names to exclude.
|
|
409
|
+
keep_missions : list of str, optional
|
|
410
|
+
Mission names to include (excludes all others).
|
|
411
|
+
cache_dir : str, Path, or None
|
|
412
|
+
Directory for sensor cache files.
|
|
413
|
+
compression : int
|
|
414
|
+
Zlib compression level 0-9 (default 5, 0 disables compression).
|
|
415
|
+
|
|
416
|
+
Returns
|
|
417
|
+
-------
|
|
418
|
+
tuple of (n_records, n_files)
|
|
419
|
+
"""
|
|
420
|
+
import netCDF4
|
|
421
|
+
|
|
422
|
+
if skip_missions and keep_missions:
|
|
423
|
+
raise ValueError("Cannot specify both skip_missions and keep_missions")
|
|
424
|
+
|
|
425
|
+
file_list = sorted(str(Path(f)) for f in filenames)
|
|
426
|
+
if not file_list:
|
|
427
|
+
return 0, 0
|
|
428
|
+
|
|
429
|
+
cache_str = str(cache_dir) if cache_dir else ""
|
|
430
|
+
|
|
431
|
+
# Pass 1: scan sensor union and valid files in one pass
|
|
432
|
+
sensor_result = scan_sensors(
|
|
433
|
+
file_list,
|
|
434
|
+
cache_dir=cache_str,
|
|
435
|
+
skip_missions=skip_missions or [],
|
|
436
|
+
keep_missions=keep_missions or [],
|
|
437
|
+
)
|
|
438
|
+
sensor_names = list(sensor_result["sensor_names"])
|
|
439
|
+
sensor_units = list(sensor_result["sensor_units"])
|
|
440
|
+
sensor_sizes = list(sensor_result["sensor_sizes"])
|
|
441
|
+
valid_files = list(sensor_result["valid_files"])
|
|
442
|
+
|
|
443
|
+
if not valid_files or not sensor_names:
|
|
444
|
+
return 0, 0
|
|
445
|
+
|
|
446
|
+
# Apply to_keep filter to the union sensor list
|
|
447
|
+
if to_keep:
|
|
448
|
+
keep_set = set(to_keep)
|
|
449
|
+
indices = [i for i, n in enumerate(sensor_names) if n in keep_set]
|
|
450
|
+
sensor_names = [sensor_names[i] for i in indices]
|
|
451
|
+
sensor_units = [sensor_units[i] for i in indices]
|
|
452
|
+
sensor_sizes = [sensor_sizes[i] for i in indices]
|
|
453
|
+
|
|
454
|
+
if not sensor_names:
|
|
455
|
+
return 0, 0
|
|
456
|
+
|
|
457
|
+
# Build fill value lookup for sensors missing from a batch
|
|
458
|
+
fill_vals = {}
|
|
459
|
+
for name, size in zip(sensor_names, sensor_sizes, strict=True):
|
|
460
|
+
dtype, fill = _NC_TYPE_INFO.get(size, ("f8", np.float64("nan")))
|
|
461
|
+
fill_vals[name] = (dtype, fill)
|
|
462
|
+
|
|
463
|
+
# Create NetCDF file with variables
|
|
464
|
+
chunk = 5000
|
|
465
|
+
nc = netCDF4.Dataset(str(output), "w", format="NETCDF4")
|
|
466
|
+
try:
|
|
467
|
+
nc.createDimension("i", None)
|
|
468
|
+
for name, units in zip(sensor_names, sensor_units, strict=True):
|
|
469
|
+
dtype, _ = fill_vals[name]
|
|
470
|
+
if compression > 0:
|
|
471
|
+
v = nc.createVariable( # type: ignore[call-overload]
|
|
472
|
+
name,
|
|
473
|
+
dtype,
|
|
474
|
+
("i",),
|
|
475
|
+
fill_value=False,
|
|
476
|
+
zlib=True,
|
|
477
|
+
complevel=compression,
|
|
478
|
+
chunksizes=(chunk,),
|
|
479
|
+
)
|
|
480
|
+
else:
|
|
481
|
+
v = nc.createVariable(name, dtype, ("i",), fill_value=False)
|
|
482
|
+
v.units = units
|
|
483
|
+
finally:
|
|
484
|
+
nc.close()
|
|
485
|
+
|
|
486
|
+
# Pass 2: read files in batches, append to NetCDF
|
|
487
|
+
batch_size = 100
|
|
488
|
+
offset = 0
|
|
489
|
+
total_files = 0
|
|
490
|
+
|
|
491
|
+
for batch_idx in range(0, len(valid_files), batch_size):
|
|
492
|
+
batch_files = valid_files[batch_idx : batch_idx + batch_size]
|
|
493
|
+
|
|
494
|
+
try:
|
|
495
|
+
result = read_dbd_files(
|
|
496
|
+
batch_files,
|
|
497
|
+
cache_dir=cache_str,
|
|
498
|
+
to_keep=to_keep or [],
|
|
499
|
+
criteria=criteria or [],
|
|
500
|
+
skip_missions=skip_missions or [],
|
|
501
|
+
keep_missions=keep_missions or [],
|
|
502
|
+
skip_first_record=skip_first_record,
|
|
503
|
+
repair=repair,
|
|
504
|
+
)
|
|
505
|
+
except (OSError, RuntimeError, ValueError) as e:
|
|
506
|
+
logger.warning("Error reading batch starting at index %d: %s", batch_idx, e)
|
|
507
|
+
continue
|
|
508
|
+
|
|
509
|
+
n = int(result["n_records"])
|
|
510
|
+
batch_files_read = int(result["n_files"])
|
|
511
|
+
|
|
512
|
+
# For batches after the first, the first file's first record overlaps
|
|
513
|
+
# with the previous batch's last file — skip it
|
|
514
|
+
start = 1 if (batch_idx > 0 and skip_first_record and n > 0) else 0
|
|
515
|
+
n_write = n - start
|
|
516
|
+
|
|
517
|
+
total_files += batch_files_read
|
|
518
|
+
|
|
519
|
+
if n_write <= 0:
|
|
520
|
+
continue
|
|
521
|
+
|
|
522
|
+
# Build column map from this batch's result
|
|
523
|
+
result_names = list(result["sensor_names"])
|
|
524
|
+
result_cols = list(result["columns"])
|
|
525
|
+
col_map = dict(zip(result_names, result_cols, strict=True))
|
|
526
|
+
|
|
527
|
+
# Append to NetCDF
|
|
528
|
+
nc = netCDF4.Dataset(str(output), "a")
|
|
529
|
+
try:
|
|
530
|
+
for name in sensor_names:
|
|
531
|
+
col = col_map.get(name)
|
|
532
|
+
if col is not None:
|
|
533
|
+
nc.variables[name][offset : offset + n_write] = col[start : start + n_write]
|
|
534
|
+
else:
|
|
535
|
+
_, fill = fill_vals[name]
|
|
536
|
+
nc.variables[name][offset : offset + n_write] = np.full(n_write, fill)
|
|
537
|
+
|
|
538
|
+
offset += n_write
|
|
539
|
+
nc.setncattr("n_files", total_files)
|
|
540
|
+
nc.setncattr("total_records", offset)
|
|
541
|
+
finally:
|
|
542
|
+
nc.close()
|
|
543
|
+
|
|
544
|
+
# result goes out of scope — batch memory freed
|
|
545
|
+
del result, result_cols, col_map
|
|
546
|
+
|
|
547
|
+
return offset, total_files
|
|
File without changes
|