xarray-dbd 0.2.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xarray_dbd/__init__.py ADDED
@@ -0,0 +1,38 @@
1
+ """
2
+ xarray-dbd: An efficient xarray backend for Dinkum Binary Data (DBD) files
3
+
4
+ This package provides an xarray backend engine for reading glider DBD files
5
+ directly without conversion to NetCDF, using a C++ parser via pybind11.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from importlib.metadata import version
11
+
12
+ from ._dbd_cpp import (
13
+ read_dbd_file,
14
+ read_dbd_files,
15
+ scan_headers,
16
+ scan_sensors,
17
+ )
18
+ from .backend import (
19
+ DBDBackendEntrypoint,
20
+ open_dbd_dataset,
21
+ open_multi_dbd_dataset,
22
+ write_multi_dbd_netcdf,
23
+ )
24
+ from .compat import DBD, MultiDBD
25
+
26
# Resolve the version from the installed distribution metadata.  When the
# metadata is not available (for example when the package is imported from a
# source checkout that was never installed) fall back to a placeholder instead
# of making the whole package un-importable.
from importlib.metadata import PackageNotFoundError

try:
    __version__ = version("xarray-dbd")
except PackageNotFoundError:
    __version__ = "0.0.0+unknown"
27
# Names re-exported at package level: ``DBD`` / ``MultiDBD`` come from
# ``.compat``, the ``read_*`` / ``scan_*`` functions from the ``._dbd_cpp``
# extension module, and the remaining names from ``.backend``.
__all__ = [
    "DBD",
    "DBDBackendEntrypoint",
    "MultiDBD",
    "read_dbd_file",
    "read_dbd_files",
    "scan_headers",
    "scan_sensors",
    "open_dbd_dataset",
    "open_multi_dbd_dataset",
    "write_multi_dbd_netcdf",
]
Binary file
@@ -0,0 +1,33 @@
1
+ """Type stubs for the _dbd_cpp C++ extension module."""
2
+
3
+ from typing import Any
4
+
5
# Read a single DBD file through the C++ extension.
#
# Defaults written as ``...`` are supplied by the extension module itself.
# The Python backend in this package reads the keys "columns", "sensor_names",
# "sensor_units", "sensor_sizes", "n_records" and "header" from the returned
# dict and treats a ``RuntimeError`` from this call as a read failure.
# NOTE(review): the element types of the returned values are not visible from
# the stub -- confirm against the C++ bindings before relying on them.
def read_dbd_file(
    filename: str,
    cache_dir: str = "",
    to_keep: list[str] = ...,
    criteria: list[str] = ...,
    skip_first_record: bool = True,
    repair: bool = False,
) -> dict[str, Any]: ...
13
# Read several DBD files through the C++ extension and return one merged result.
#
# Defaults written as ``...`` are supplied by the extension module itself.
# The Python backend in this package reads the keys "columns", "sensor_names",
# "sensor_units", "n_records" and "n_files" from the returned dict and treats
# a ``RuntimeError`` from this call as a read failure.
# NOTE(review): whether the extension reorders ``filenames`` is not visible
# from the stub -- confirm against the C++ bindings.
def read_dbd_files(
    filenames: list[str],
    cache_dir: str = "",
    to_keep: list[str] = ...,
    criteria: list[str] = ...,
    skip_missions: list[str] = ...,
    keep_missions: list[str] = ...,
    skip_first_record: bool = True,
    repair: bool = False,
) -> dict[str, Any]: ...
23
# Scan the given files for their sensor definitions without loading the data.
#
# Defaults written as ``...`` are supplied by the extension module itself.
# The Python backend in this package reads the keys "sensor_names",
# "sensor_units", "sensor_sizes" and "valid_files" from the returned dict.
def scan_sensors(
    filenames: list[str],
    cache_dir: str = "",
    skip_missions: list[str] = ...,
    keep_missions: list[str] = ...,
) -> dict[str, Any]: ...
29
# Scan the headers of the given files, optionally filtered by mission name.
#
# Defaults written as ``...`` are supplied by the extension module itself.
# NOTE(review): the keys of the returned dict are not documented here and this
# function is not called from the Python backend -- confirm the result layout
# against the C++ bindings.
def scan_headers(
    filenames: list[str],
    skip_missions: list[str] = ...,
    keep_missions: list[str] = ...,
) -> dict[str, Any]: ...
xarray_dbd/backend.py ADDED
@@ -0,0 +1,547 @@
1
+ """
2
+ Xarray backend engine for DBD files using C++ parser
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ from collections.abc import Iterable
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import numpy as np
13
+ import xarray as xr
14
+ from xarray.backends import BackendEntrypoint
15
+
16
+ from ._dbd_cpp import read_dbd_file, read_dbd_files, scan_sensors
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ __all__ = [
21
+ "DBDDataStore",
22
+ "DBDBackendEntrypoint",
23
+ "open_dbd_dataset",
24
+ "open_multi_dbd_dataset",
25
+ "write_multi_dbd_netcdf",
26
+ ]
27
+
28
+
29
class DBDDataStore:
    """In-memory holder for the contents of one DBD file parsed by the C++ backend.

    The file is read eagerly in the constructor; the accessor methods only
    repackage the parsed result.

    Parameters
    ----------
    filename : str or Path
        Path to the DBD file.
    skip_first_record : bool
        If True, skip the first data record (for deduplication in multi-file reads).
    repair : bool
        If True, attempt to recover data from corrupted records.
    to_keep : list of str or None
        Sensor names to keep. If None or empty, all sensors are loaded.
    criteria : list of str or None
        Sensor names used for record selection criteria.
    cache_dir : str, Path, or None
        Directory for sensor cache files. Defaults to ``<file_dir>/cache``.

    Raises
    ------
    OSError
        If the C++ reader raises ``RuntimeError`` or its result lacks an
        expected key.
    """

    def __init__(
        self,
        filename: str | Path,
        skip_first_record: bool = True,
        repair: bool = False,
        to_keep: list[str] | None = None,
        criteria: list[str] | None = None,
        cache_dir: str | Path | None = None,
    ):
        self.filename = Path(filename)

        # Without an explicit cache directory, use a "cache" folder beside the file.
        if cache_dir is None:
            cache_location = str(self.filename.parent / "cache")
        else:
            cache_location = str(cache_dir)

        # Delegate the actual parsing to the extension module; surface parser
        # failures as OSError so callers see an ordinary I/O error.
        try:
            parsed = read_dbd_file(
                str(self.filename),
                cache_dir=cache_location,
                to_keep=to_keep or [],
                criteria=criteria or [],
                skip_first_record=skip_first_record,
                repair=repair,
            )
        except RuntimeError as err:
            raise OSError(f"Failed to read {self.filename}: {err}") from err

        # Refuse to continue with a partial result.
        expected = {
            "columns",
            "sensor_names",
            "sensor_units",
            "sensor_sizes",
            "n_records",
            "header",
        }
        absent = expected - parsed.keys()
        if absent:
            raise OSError(
                f"Incomplete result from C++ backend for {self.filename}: missing {absent}"
            )

        self._columns = list(parsed["columns"])
        self._sensor_names = list(parsed["sensor_names"])
        self._sensor_units = list(parsed["sensor_units"])
        self._sensor_sizes = list(parsed["sensor_sizes"])
        self._n_records = int(parsed["n_records"])
        self._header = dict(parsed["header"])

    def get_variables(self) -> dict[str, xr.Variable]:
        """Return one ``xr.Variable`` per sensor, keyed by sensor name, on dimension ``i``."""
        return {
            sensor: xr.Variable(
                ("i",),
                self._columns[pos],
                attrs={
                    "units": self._sensor_units[pos],
                    "sensor_size": self._sensor_sizes[pos],
                },
            )
            for pos, sensor in enumerate(self._sensor_names)
        }

    def get_attrs(self) -> dict[str, Any]:
        """Return the global attributes: selected header fields plus the source path."""
        header_fields = (
            "mission_name",
            "fileopen_time",
            "encoding_version",
            "full_filename",
            "sensor_list_crc",
        )
        # Header fields that are absent are reported as empty strings.
        attributes: dict[str, Any] = {
            field: self._header.get(field, "") for field in header_fields
        }
        attributes["source_file"] = str(self.filename)
        return attributes

    def get_dimensions(self) -> dict[str, int]:
        """Return the dimension sizes: ``i`` is the record count, ``j`` is fixed at 1."""
        return dict(i=self._n_records, j=1)
125
+
126
+
127
class DBDBackendEntrypoint(BackendEntrypoint):
    """Xarray backend entrypoint for DBD files.

    Registers as the ``"dbd"`` engine for :func:`xarray.open_dataset`.
    Supports all Slocum glider binary formats (``.dbd``, ``.ebd``, ``.sbd``,
    ``.tbd``, ``.mbd``, ``.nbd``) and their compressed variants (``.dcd``, etc.).
    """

    description = "Backend for reading Dinkum Binary Data (DBD) files"
    url = "https://github.com/mousebrains/dbd2netcdf"

    def open_dataset(  # type: ignore[override]
        self,
        filename_or_obj: str | Path,
        *,
        drop_variables: tuple[str, ...] | None = None,
        skip_first_record: bool = True,
        repair: bool = False,
        to_keep: list[str] | None = None,
        criteria: list[str] | None = None,
        cache_dir: str | Path | None = None,
    ) -> xr.Dataset:
        """Read one DBD file and return it as an xarray Dataset.

        Parameters
        ----------
        filename_or_obj : str or Path
            Path to the DBD file.
        drop_variables : tuple of str, optional
            Variable names to exclude from the returned Dataset.
        skip_first_record : bool
            Skip the first data record (default True).
        repair : bool
            Attempt to repair corrupted records (default False).
        to_keep : list of str, optional
            Sensor names to keep. If None, all sensors are loaded.
        criteria : list of str, optional
            Sensor names for record selection criteria.
        cache_dir : str, Path, or None
            Directory for sensor cache files.

        Returns
        -------
        xr.Dataset
        """
        store = DBDDataStore(
            Path(filename_or_obj),
            skip_first_record=skip_first_record,
            repair=repair,
            to_keep=to_keep,
            criteria=criteria,
            cache_dir=cache_dir,
        )

        variables = store.get_variables()
        global_attrs = store.get_attrs()

        # Remove the variables the caller asked to drop; names that do not
        # exist in the file are simply ignored.
        if drop_variables:
            excluded = set(drop_variables)
            for unwanted in excluded & variables.keys():
                del variables[unwanted]

        return xr.Dataset(variables, attrs=global_attrs)

    def guess_can_open(self, filename_or_obj: str | Path) -> bool:  # type: ignore[override]
        """Return True when the file extension is one of the known DBD suffixes."""
        recognised = {
            ".dbd",
            ".ebd",
            ".sbd",
            ".tbd",
            ".mbd",
            ".nbd",
            ".dcd",
            ".ecd",
            ".scd",
            ".tcd",
            ".mcd",
            ".ncd",
        }
        # Anything that cannot be interpreted as a path is not ours to open.
        try:
            suffix = Path(filename_or_obj).suffix.lower()
        except (TypeError, AttributeError):
            return False
        return suffix in recognised
213
+
214
+
215
def open_dbd_dataset(
    filename: str | Path,
    skip_first_record: bool = True,
    repair: bool = False,
    to_keep: list[str] | None = None,
    criteria: list[str] | None = None,
    drop_variables: list[str] | None = None,
    cache_dir: str | Path | None = None,
) -> xr.Dataset:
    """Convenience wrapper: open one DBD file via ``xr.open_dataset`` with the DBD engine.

    Parameters
    ----------
    filename : str or Path
        Path to the DBD file.
    skip_first_record : bool
        Skip the first data record (default True).
    repair : bool
        Attempt to repair corrupted records (default False).
    to_keep : list of str, optional
        Sensor names to keep. If None, all sensors are loaded.
    criteria : list of str, optional
        Sensor names for record selection criteria.
    drop_variables : list of str, optional
        Variable names to exclude from the returned Dataset.
    cache_dir : str, Path, or None
        Directory for sensor cache files. Defaults to ``<file_dir>/cache``.

    Returns
    -------
    xr.Dataset

    Examples
    --------
    >>> ds = open_dbd_dataset("test.sbd")
    >>> ds = open_dbd_dataset("test.sbd", to_keep=["m_depth", "m_lat"])
    """
    # Every option is forwarded unchanged; the engine class is passed directly
    # rather than by its registered name.
    reader_options: dict[str, Any] = {
        "skip_first_record": skip_first_record,
        "repair": repair,
        "to_keep": to_keep,
        "criteria": criteria,
        "drop_variables": drop_variables,
        "cache_dir": cache_dir,
    }
    return xr.open_dataset(filename, engine=DBDBackendEntrypoint, **reader_options)
262
+
263
+
264
def open_multi_dbd_dataset(
    filenames: Iterable[str | Path],
    skip_first_record: bool = True,
    repair: bool = False,
    to_keep: list[str] | None = None,
    criteria: list[str] | None = None,
    skip_missions: list[str] | None = None,
    keep_missions: list[str] | None = None,
    cache_dir: str | Path | None = None,
) -> xr.Dataset:
    """Open multiple DBD files as a single concatenated xarray Dataset.

    Uses the C++ backend's two-pass approach with SensorsMap to merge sensor
    definitions across files, matching dbd2netCDF behavior exactly.

    Parameters
    ----------
    filenames : iterable of str or Path
        Paths to DBD files. The paths are sorted (as strings) before being
        handed to the reader, so the caller's ordering does not matter.
    skip_first_record : bool
        Skip first record in each file except the first (default True).
    repair : bool
        Attempt to repair corrupted records (default False).
    to_keep : list of str, optional
        Sensor names to keep. If None, all sensors are loaded.
    criteria : list of str, optional
        Sensor names for record selection criteria.
    skip_missions : list of str, optional
        Mission names to exclude.
    keep_missions : list of str, optional
        Mission names to include (excludes all others).
    cache_dir : str, Path, or None
        Directory for sensor cache files.

    Returns
    -------
    xr.Dataset
        One variable per sensor on dimension ``i``, with ``n_files`` and
        ``total_records`` as global attributes. An empty Dataset is returned
        when no files are given or the reader yields no columns.

    Raises
    ------
    ValueError
        If both ``skip_missions`` and ``keep_missions`` are given.
    OSError
        If the C++ reader raises ``RuntimeError``.

    Examples
    --------
    >>> files = sorted(Path(".").glob("*.sbd"))
    >>> ds = open_multi_dbd_dataset(files)
    >>> ds = open_multi_dbd_dataset(files, to_keep=["m_depth", "m_present_time"])
    """
    if skip_missions and keep_missions:
        raise ValueError("Cannot specify both skip_missions and keep_missions")

    # Sort the paths so the documented "sorted internally" contract holds and
    # the ordering matches write_multi_dbd_netcdf, which sorts the same way.
    file_list = sorted(str(Path(f)) for f in filenames)

    if not file_list:
        return xr.Dataset()

    # An empty string is passed to the reader when no cache directory is given.
    cache_str = str(cache_dir) if cache_dir else ""

    try:
        result = read_dbd_files(
            file_list,
            cache_dir=cache_str,
            to_keep=to_keep or [],
            criteria=criteria or [],
            skip_missions=skip_missions or [],
            keep_missions=keep_missions or [],
            skip_first_record=skip_first_record,
            repair=repair,
        )
    except RuntimeError as e:
        raise OSError(f"Failed to read {len(file_list)} DBD files: {e}") from e

    columns = list(result["columns"])
    sensor_names = list(result["sensor_names"])
    sensor_units = list(result["sensor_units"])
    n_records = int(result["n_records"])
    n_files = int(result["n_files"])

    # Requested sensors that no file provides are reported, not treated as errors.
    if to_keep:
        missing = set(to_keep) - set(sensor_names)
        if missing:
            logger.warning("Requested sensors not found in any file: %s", sorted(missing))

    if not columns:
        return xr.Dataset()

    # Every sensor becomes a 1-D variable along the shared record dimension.
    dims = ("i",)
    data_vars = {
        name: xr.Variable(dims, columns[idx], attrs={"units": sensor_units[idx]})
        for idx, name in enumerate(sensor_names)
    }

    return xr.Dataset(
        data_vars,
        attrs={
            "n_files": n_files,
            "total_records": n_records,
        },
    )
362
+
363
+
364
# NetCDF dtype and fill value for each sensor byte-size.
# Keys are sensor sizes in bytes; values are ``(dtype string, fill value)``
# pairs.  write_multi_dbd_netcdf uses the dtype to create each variable and
# the fill value to pad sensors that are absent from a batch; sizes not listed
# here fall back to ``("f8", NaN)`` at the lookup site.
_NC_TYPE_INFO = {
    1: ("i1", np.int8(-127)),
    2: ("i2", np.int16(-32768)),
    4: ("f4", np.float32("nan")),
    8: ("f8", np.float64("nan")),
}
371
+
372
+
373
def write_multi_dbd_netcdf(
    filenames: Iterable[str | Path],
    output: str | Path,
    *,
    skip_first_record: bool = True,
    repair: bool = False,
    to_keep: list[str] | None = None,
    criteria: list[str] | None = None,
    skip_missions: list[str] | None = None,
    keep_missions: list[str] | None = None,
    cache_dir: str | Path | None = None,
    compression: int = 5,
) -> tuple[int, int]:
    """Stream multiple DBD files directly to a NetCDF file.

    Unlike :func:`open_multi_dbd_dataset` which loads all data into memory,
    this function reads the input in batches of up to 100 files and appends
    each batch to the output NetCDF before reading the next one, keeping peak
    memory proportional to a single batch rather than the whole data set.

    Parameters
    ----------
    filenames : iterable of str or Path
        Paths to DBD files. Files are sorted internally.
    output : str or Path
        Path for the output NetCDF file.
    skip_first_record : bool
        Skip first record in each file except the first (default True).
    repair : bool
        Attempt to repair corrupted records (default False).
    to_keep : list of str, optional
        Sensor names to keep. If None, all sensors are written.
    criteria : list of str, optional
        Sensor names for record selection criteria.
    skip_missions : list of str, optional
        Mission names to exclude.
    keep_missions : list of str, optional
        Mission names to include (excludes all others).
    cache_dir : str, Path, or None
        Directory for sensor cache files.
    compression : int
        Zlib compression level 0-9 (default 5, 0 disables compression).

    Returns
    -------
    tuple of (n_records, n_files)
        ``(0, 0)`` is returned, and no output file is created, when there are
        no input files, no valid files, or no sensors left to write.

    Raises
    ------
    ValueError
        If both ``skip_missions`` and ``keep_missions`` are given.

    Notes
    -----
    A batch whose read fails with ``OSError``, ``RuntimeError`` or
    ``ValueError`` is logged and skipped. The ``n_files`` and
    ``total_records`` global attributes of the output always match the
    returned tuple once the function completes.
    """
    if skip_missions and keep_missions:
        raise ValueError("Cannot specify both skip_missions and keep_missions")

    file_list = sorted(str(Path(f)) for f in filenames)
    if not file_list:
        return 0, 0

    # Imported only once there is real work to do, so argument errors and the
    # empty-input case do not depend on the optional netCDF4 package.
    import netCDF4

    cache_str = str(cache_dir) if cache_dir else ""

    # Pass 1: scan sensor union and valid files in one pass
    sensor_result = scan_sensors(
        file_list,
        cache_dir=cache_str,
        skip_missions=skip_missions or [],
        keep_missions=keep_missions or [],
    )
    sensor_names = list(sensor_result["sensor_names"])
    sensor_units = list(sensor_result["sensor_units"])
    sensor_sizes = list(sensor_result["sensor_sizes"])
    valid_files = list(sensor_result["valid_files"])

    if not valid_files or not sensor_names:
        return 0, 0

    # Apply to_keep filter to the union sensor list
    if to_keep:
        keep_set = set(to_keep)
        indices = [i for i, n in enumerate(sensor_names) if n in keep_set]
        sensor_names = [sensor_names[i] for i in indices]
        sensor_units = [sensor_units[i] for i in indices]
        sensor_sizes = [sensor_sizes[i] for i in indices]

    if not sensor_names:
        return 0, 0

    # Per-sensor (dtype, fill value); the fill value pads sensors that are
    # missing from a batch.  Unknown byte sizes are stored as float64/NaN.
    fill_vals = {
        name: _NC_TYPE_INFO.get(size, ("f8", np.float64("nan")))
        for name, size in zip(sensor_names, sensor_sizes, strict=True)
    }

    # Create the NetCDF file with one unlimited-length variable per sensor.
    chunk = 5000
    with netCDF4.Dataset(str(output), "w", format="NETCDF4") as nc:
        nc.createDimension("i", None)
        for name, units in zip(sensor_names, sensor_units, strict=True):
            dtype, _ = fill_vals[name]
            if compression > 0:
                v = nc.createVariable(  # type: ignore[call-overload]
                    name,
                    dtype,
                    ("i",),
                    fill_value=False,
                    zlib=True,
                    complevel=compression,
                    chunksizes=(chunk,),
                )
            else:
                v = nc.createVariable(name, dtype, ("i",), fill_value=False)
            v.units = units

    # Pass 2: read files in batches, append to NetCDF
    batch_size = 100
    offset = 0
    total_files = 0
    # True only while the attributes stored in the file equal the counters above.
    attrs_synced = False

    for batch_idx in range(0, len(valid_files), batch_size):
        batch_files = valid_files[batch_idx : batch_idx + batch_size]

        try:
            result = read_dbd_files(
                batch_files,
                cache_dir=cache_str,
                to_keep=to_keep or [],
                criteria=criteria or [],
                skip_missions=skip_missions or [],
                keep_missions=keep_missions or [],
                skip_first_record=skip_first_record,
                repair=repair,
            )
        except (OSError, RuntimeError, ValueError) as e:
            logger.warning("Error reading batch starting at index %d: %s", batch_idx, e)
            continue

        n = int(result["n_records"])
        batch_files_read = int(result["n_files"])

        # For batches after the first, the first file's first record overlaps
        # with the previous batch's last file — skip it
        start = 1 if (batch_idx > 0 and skip_first_record and n > 0) else 0
        n_write = n - start

        total_files += batch_files_read
        attrs_synced = False

        if n_write <= 0:
            continue

        # Build column map from this batch's result
        result_names = list(result["sensor_names"])
        result_cols = list(result["columns"])
        col_map = dict(zip(result_names, result_cols, strict=True))

        # Append this batch's rows at the current end of every variable.
        with netCDF4.Dataset(str(output), "a") as nc:
            for name in sensor_names:
                col = col_map.get(name)
                if col is not None:
                    nc.variables[name][offset : offset + n_write] = col[start : start + n_write]
                else:
                    _, fill = fill_vals[name]
                    nc.variables[name][offset : offset + n_write] = np.full(n_write, fill)

            offset += n_write
            nc.setncattr("n_files", total_files)
            nc.setncattr("total_records", offset)
            attrs_synced = True

        # Drop the references to this batch's arrays before reading the next one.
        del result, result_cols, col_map

    # Batches that contributed files but no rows (or a run where nothing was
    # written at all) leave the stored attributes out of date; write the final
    # counts so the file always agrees with the returned tuple.
    if not attrs_synced:
        with netCDF4.Dataset(str(output), "a") as nc:
            nc.setncattr("n_files", total_files)
            nc.setncattr("total_records", offset)

    return offset, total_files
File without changes