xarray-dbd 0.2.3__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. xarray_dbd-0.2.6/CHANGELOG.md +126 -0
  2. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/PKG-INFO +136 -7
  3. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/README.md +135 -6
  4. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/ColumnData.C +6 -2
  5. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/Decompress.C +21 -6
  6. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/Decompress.H +6 -1
  7. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/Header.C +25 -1
  8. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/Header.H +3 -0
  9. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/KnownBytes.C +28 -21
  10. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/MyException.H +2 -2
  11. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/Sensor.C +11 -5
  12. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/Sensors.C +9 -6
  13. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/SensorsMap.C +13 -1
  14. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/SensorsMap.H +1 -1
  15. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/config.h +1 -1
  16. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/dbd_python.cpp +17 -8
  17. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/pyproject.toml +2 -2
  18. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/tests/test_backend.py +93 -0
  19. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/tests/test_cli.py +101 -0
  20. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/tests/test_cpp_backend.py +73 -0
  21. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/tests/test_dbdreader2.py +98 -0
  22. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/_dbd_cpp.pyi +1 -0
  23. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/backend.py +154 -60
  24. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/cli/csv.py +15 -2
  25. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/cli/dbd2nc.py +39 -3
  26. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/cli/mkone.py +44 -9
  27. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/cli/sensors.py +3 -3
  28. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/dbdreader2/_core.py +67 -18
  29. xarray_dbd-0.2.3/CHANGELOG.md +0 -47
  30. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/.clang-tidy +0 -0
  31. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/.gitignore +0 -0
  32. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/.pre-commit-config.yaml +0 -0
  33. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/CMakeLists.txt +0 -0
  34. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/CONTRIBUTING.md +0 -0
  35. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/License.txt +0 -0
  36. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/benchmark_performance.py +0 -0
  37. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/conda/recipe.yaml +0 -0
  38. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/ColumnData.H +0 -0
  39. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/Data.C +0 -0
  40. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/Data.H +0 -0
  41. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/FileInfo.H +0 -0
  42. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/KnownBytes.H +0 -0
  43. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/Logger.H +0 -0
  44. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/Sensor.H +0 -0
  45. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/Sensors.H +0 -0
  46. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/lz4.c +0 -0
  47. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/csrc/lz4.h +0 -0
  48. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/examples/README.md +0 -0
  49. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/scripts/README.md +0 -0
  50. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/tests/conftest.py +0 -0
  51. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/__init__.py +0 -0
  52. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/cli/__init__.py +0 -0
  53. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/cli/cache.py +0 -0
  54. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/cli/logger.py +0 -0
  55. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/cli/main.py +0 -0
  56. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/cli/missions.py +0 -0
  57. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/dbdreader2/__init__.py +0 -0
  58. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/dbdreader2/_cache.py +0 -0
  59. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/dbdreader2/_errors.py +0 -0
  60. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/dbdreader2/_list.py +0 -0
  61. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/dbdreader2/_util.py +0 -0
  62. {xarray_dbd-0.2.3 → xarray_dbd-0.2.6}/xarray_dbd/py.typed +0 -0
@@ -0,0 +1,126 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.2.6] - 2026-03-30
9
+
10
+ ### Added
11
+
12
+ - `--list-sensors` flag for `dbd2nc` CLI to print available sensors without conversion
13
+ - `batch_size` parameter for `write_multi_dbd_netcdf()` (was hardcoded at 100)
14
+ - Signal handling in `mkone` — Ctrl+C now terminates child processes cleanly
15
+ - "Working with Glider Data" section in README (sensor discovery, time conversion, fill values)
16
+ - Tests for `get_CTD_sync`, `determine_ctd_type`, `get_global_time_range`, file ordering, batch boundaries
17
+
18
+ ### Changed
19
+
20
+ - `get_sync()` logs interpolation failures at WARNING level instead of INFO
21
+ - Streaming writer logs summary when batches are skipped due to errors
22
+ - `set_time_limits()` accepts numeric epoch seconds in addition to date strings
23
+ - C++ `SensorsMap::setUpForData()` validates sensor byte sizes across files
24
+
25
+ ### Fixed
26
+
27
+ - **Data loss in streaming writer**: removed Python-side double-skip at batch boundaries (C++ already handles `skip_first_record`)
28
+ - **dbdreader2 file ordering**: pass `presorted=True` to `read_dbd_files` so C++ respects chronological order from `DBDList.sort()`
29
+ - **mkone worker error propagation**: workers now exit non-zero on failure so parent detects errors
30
+ - **`_get_with_source` time ordering**: results now sorted by time for consistency with normal `get()` path
31
+ - **`sci_extensions` missing `.sbd`**: file pairing now recognizes `.sbd` as a science file type
32
+ - **`set_time_limits` falsy check**: epoch time 0 no longer causes spurious ValueError
33
+ - **inf-to-NaN for repeated values**: code=1 (repeat) now converts infinity consistently with code=2 (new value)
34
+ - Removed unused `"j"` dimension from `DBDDataStore.get_dimensions()`
35
+ - Fixed `--skip-first` help text (was stale after skip semantics change)
36
+ - Fixed README: CLI command names, removed false wildcard `to_keep` claim
37
+
38
+ ## [0.2.5] - 2026-03-30
39
+
40
+ ### Added
41
+
42
+ - `sort` parameter for `open_multi_dbd_dataset()` and `write_multi_dbd_netcdf()` with three modes: `"header_time"` (default, sort by `fileopen_time` from each file's DBD header), `"lexicographic"`, and `"none"` (preserve caller's order)
43
+ - `--sort` CLI flag for `dbd2nc`, `mkone`, and `2csv` commands
44
+ - `presorted` parameter for `read_dbd_files()` C++ binding to skip internal lexicographic sort when files are pre-sorted by Python
45
+ - `sensor_size` attribute on variables from `open_multi_dbd_dataset()`, matching single-file behavior
46
+ - `--skip-first` flag for `mkone` as consistent alias for the inverse `--keep-first`
47
+ - Duplicate file detection and deduplication with warning in multi-file functions
48
+ - Output directory auto-creation in `write_multi_dbd_netcdf()`
49
+ - "Choosing an API" and "Slocum File Types" sections in README
50
+ - Fill value and CF-compliance guidance in README Known Limitations
51
+
52
+ ### Changed
53
+
54
+ - `skip_first_record` in `read_dbd_files()` now skips the first record of **all** files (including the first), matching Lucas Merckelbach's dbdreader behavior
55
+ - Streaming NetCDF writer keeps a single file handle open instead of reopening per batch
56
+
57
+ ### Fixed
58
+
59
+ - File ordering for TWR-style filenames (e.g. `ce_1137-2026-085-1-10.dbd` incorrectly sorting before `-2.dbd` under lexicographic sort)
60
+ - `_parse_fileopen_time()` now logs a warning instead of silently sorting unparseable files to end
61
+ - `DBD.get_fileopen_time()` no longer raises on unparseable header values
62
+ - Thread-safe random number generator in C++ cache file creation
63
+ - Integer overflow guard in C++ column capacity doubling
64
+
65
+ ## [0.2.3] - 2026-02-23
66
+
67
+ ### Added
68
+
69
+ - `include_source` support in `MultiDBD.get()` — returns per-record source DBD references, matching dbdreader's API
70
+ - `continue_on_reading_error` parameter for `MultiDBD.get()` — skip corrupted files instead of raising, matching dbdreader v0.5.9
71
+ - `DBD_ERROR_READ_ERROR` error code (14) for compatibility with dbdreader
72
+ - Python 3.14 pre-built wheels for all platforms (Linux, macOS, Windows)
73
+ - Attribution to Lucas Merckelbach's [dbdreader](https://github.com/smerckel/dbdreader) in README
74
+
75
+ ## [0.2.2] - 2026-02-23
76
+
77
+ ### Added
78
+
79
+ - `preload` parameter for `DBD` and `MultiDBD` constructors
80
+ - Changelog configuration and tag/version validation in publish workflow
81
+
82
+ ### Fixed
83
+
84
+ - mypy errors: `datetime.UTC`, tuple assignments, type annotations
85
+ - ruff formatting compliance
86
+
87
+ ## [0.2.1] - 2026-02-22
88
+
89
+ ### Added
90
+
91
+ - Streaming NetCDF writer (`write_multi_dbd_netcdf`) for low-memory batch conversion
92
+ - dbdreader-compatible API layer (`DBD` and `MultiDBD` classes in `xarray_dbd.dbdreader2`)
93
+ - Unified CLI under `xdbd` command with subcommands (`2nc`, `mkone`, `2csv`, `missions`, `cache`)
94
+ - Monotonicity check in `get_sync()` to prevent silent wrong results from `np.interp`
95
+
96
+ ### Changed
97
+
98
+ - CLI restructured: standalone `dbd2nc` and `mkone` commands replaced by `xdbd 2nc` and `xdbd mkone`
99
+ - Streaming mode is now the default for non-append `2nc` and `mkone` (requires netCDF4)
100
+ - Fill values corrected: -127 for int8, -32768 for int16 (matching C++ dbd2netCDF standalone)
101
+ - Multi-file reader uses read-copy-discard strategy to reduce peak memory ~53%
102
+ - Replaced inf with NaN in float reads to match C++ dbd2netCDF behavior
103
+
104
+ ### Fixed
105
+
106
+ - Multi-file parse dropping records from unfactored DBD files
107
+ - Corrupted file recovery: discard partial record on I/O error
108
+
109
+ ## [0.1.0] - 2026-02-20
110
+
111
+ ### Added
112
+
113
+ - C++ backend via pybind11 wrapping [dbd2netCDF](https://github.com/mousebrains/dbd2netcdf) parser
114
+ - Native xarray engine integration (`xr.open_dataset(f, engine="dbd")`)
115
+ - Multi-file reading with `open_multi_dbd_dataset()` using C++ SensorsMap two-pass approach
116
+ - CLI tools: `dbd2nc` for single/multi-file conversion, `mkone` for batch directory processing
117
+ - Native dtype support: int8, int16, float32, float64 columns (no double-conversion overhead)
118
+ - LZ4 decompression for compressed `.?cd` files
119
+ - Sensor filtering (`to_keep`), mission filtering (`skip_missions`/`keep_missions`)
120
+ - Corrupted file recovery with `repair=True`
121
+ - Python 3.10+ and free-threaded Python (PEP 703) support
122
+
123
+ ### Changed
124
+
125
+ - Replaced pure-Python parser with C++ pybind11 extension for ~5x performance improvement
126
+ - Fill values: NaN for float32/float64, -127 for int8, -32768 for int16 (matching C++ dbd2netCDF)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xarray-dbd
3
- Version: 0.2.3
3
+ Version: 0.2.6
4
4
  Summary: Efficient xarray backend for reading glider DBD files
5
5
  Keywords: glider,oceanography,dbd,slocum,xarray,netcdf
6
6
  Author-Email: Pat Welch <pat@mousebrains.com>
@@ -41,7 +41,7 @@ Description-Content-Type: text/markdown
41
41
  [![License](https://img.shields.io/pypi/l/xarray-dbd)](License.txt)
42
42
  [![CI](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/ci.yml/badge.svg)](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/ci.yml)
43
43
  [![CodeQL](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/codeql.yml/badge.svg)](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/codeql.yml)
44
- [![Codecov](https://codecov.io/gh/mousebrains/dbd2netcdf-python/branch/main/graph/badge.svg)](https://codecov.io/gh/mousebrains/dbd2netcdf-python)
44
+ [![codecov](https://codecov.io/gh/mousebrains/dbd2netcdf-python/graph/badge.svg?token=EJQEIVEB0U)](https://codecov.io/gh/mousebrains/dbd2netcdf-python)
45
45
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
46
46
 
47
47
  An efficient xarray backend for reading Dinkum Binary Data (DBD) files from
@@ -74,7 +74,7 @@ pip install xarray-dbd
74
74
  For the CLI tools only:
75
75
 
76
76
  ```bash
77
- pipx install xarray-dbd # installs dbd2nc and mkone commands
77
+ pipx install xarray-dbd # installs xdbd command (xdbd 2nc, xdbd mkone, etc.)
78
78
  ```
79
79
 
80
80
  Or install from source (requires a C++ compiler and CMake):
@@ -147,6 +147,30 @@ ds = xdbd.open_multi_dbd_dataset(
147
147
  )
148
148
  ```
149
149
 
150
+ ### File sort order
151
+
152
+ By default, files are sorted by the `fileopen_time` timestamp in each file's
153
+ header, which is correct regardless of filename convention. Alternative sort
154
+ modes are available:
155
+
156
+ ```python
157
+ # Default: sort by header timestamp (universally correct)
158
+ ds = xdbd.open_multi_dbd_dataset(files)
159
+
160
+ # Sort by filename (lexicographic)
161
+ ds = xdbd.open_multi_dbd_dataset(files, sort="lexicographic")
162
+
163
+ # Preserve the caller's order (no sorting)
164
+ ds = xdbd.open_multi_dbd_dataset(files, sort="none")
165
+ ```
166
+
167
+ The `--sort` flag is also available on all CLI commands:
168
+
169
+ ```bash
170
+ xdbd 2nc --sort lexicographic -C cache -o output.nc *.dbd
171
+ xdbd mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
172
+ ```
173
+
150
174
  ### Advanced options
151
175
 
152
176
  ```python
@@ -154,7 +178,7 @@ ds = xdbd.open_dbd_dataset(
154
178
  'test.sbd',
155
179
  skip_first_record=True, # Skip first record (default)
156
180
  repair=True, # Attempt to repair corrupted data
157
- to_keep=['m_*'], # Keep sensors matching pattern (future feature)
181
+ to_keep=['m_depth', 'm_lat'], # Keep only these sensors
158
182
  criteria=['m_present_time'], # Sensors for record selection
159
183
  )
160
184
  ```
@@ -189,6 +213,7 @@ Open a single DBD file as an xarray Dataset.
189
213
  - `to_keep` (list of str): Sensor names to keep (default: all)
190
214
  - `criteria` (list of str): Sensor names for selection criteria
191
215
  - `drop_variables` (list of str): Variables to exclude
216
+ - `cache_dir` (str, Path, or None): Directory for sensor cache files
192
217
 
193
218
  **Returns:** `xarray.Dataset`
194
219
 
@@ -204,9 +229,32 @@ Open multiple DBD files as a single concatenated xarray Dataset.
204
229
  - `criteria` (list of str): Sensor names for selection criteria
205
230
  - `skip_missions` (list of str): Mission names to skip
206
231
  - `keep_missions` (list of str): Mission names to keep
232
+ - `cache_dir` (str, Path, or None): Directory for sensor cache files
233
+ - `sort` (str): File sort order — `"header_time"` (default, sort by `fileopen_time` from each file's header), `"lexicographic"`, or `"none"` (preserve caller's order).
207
234
 
208
235
  **Returns:** `xarray.Dataset`
209
236
 
237
+ ### `write_multi_dbd_netcdf(filenames, output, **kwargs)`
238
+
239
+ Stream multiple DBD files directly to a NetCDF file without loading all data
240
+ into memory. Preferred for large datasets (100+ files).
241
+
242
+ **Parameters:**
243
+ - `filenames` (iterable): Paths to DBD files (duplicates removed automatically)
244
+ - `output` (str or Path): Output NetCDF file path (parent directory created if needed)
245
+ - `skip_first_record` (bool): Skip first record in each file (default: True)
246
+ - `repair` (bool): Attempt to repair corrupted records (default: False)
247
+ - `to_keep` (list of str): Sensor names to keep (default: all)
248
+ - `criteria` (list of str): Sensor names for selection criteria
249
+ - `skip_missions` (list of str): Mission names to skip
250
+ - `keep_missions` (list of str): Mission names to keep
251
+ - `cache_dir` (str, Path, or None): Directory for sensor cache files
252
+ - `compression` (int): Zlib compression level 0-9 (default: 5, 0 disables)
253
+ - `sort` (str): File sort order (default: `"header_time"`)
254
+ - `batch_size` (int): Files per batch (default: 100; smaller reduces peak memory)
255
+
256
+ **Returns:** `tuple[int, int]` — (n_records, n_files)
257
+
210
258
  ## Migration from dbdreader
211
259
 
212
260
  The dbdreader2 API is derived from Lucas Merckelbach's
@@ -353,9 +401,8 @@ mdbd = dbdreader.MultiDBD(
353
401
  to batch additional sensors into the first `get()` call.
354
402
 
355
403
  - **`skip_initial_line` semantics.** When reading multiple files, the
356
- first contributing file keeps all its records; subsequent files skip
357
- their first record. dbdreader skips the first record of every file.
358
- Multi-file record counts may therefore differ by up to N-1.
404
+ first record of every file is skipped (matching dbdreader). Multi-file
405
+ record counts should match dbdreader exactly.
359
406
 
360
407
  - **Float64 output.** `get()` always returns float64 arrays, matching
361
408
  dbdreader's behavior. Integer fill values (-127 for int8, -32768 for
@@ -472,6 +519,7 @@ print(f"Depth units: {ds['m_depth'].attrs['units']}")
472
519
  ### Working with trajectories
473
520
 
474
521
  ```python
522
+ from pathlib import Path
475
523
  import xarray_dbd as xdbd
476
524
  import matplotlib.pyplot as plt
477
525
 
@@ -492,6 +540,7 @@ plt.show()
492
540
  ### Extracting science data
493
541
 
494
542
  ```python
543
+ from pathlib import Path
495
544
  # Read full resolution science data
496
545
  files = sorted(Path('.').glob('*.ebd'))
497
546
  ds = xdbd.open_multi_dbd_dataset(
@@ -504,6 +553,74 @@ df = ds.to_dataframe()
504
553
  print(df.describe())
505
554
  ```
506
555
 
556
+ ## Choosing an API
557
+
558
+ | Scenario | Recommended API |
559
+ |----------|----------------|
560
+ | Single file, quick look | `xr.open_dataset(f, engine="dbd")` |
561
+ | Multiple files, < 1 GB | `xdbd.open_multi_dbd_dataset(files, to_keep=[...])` |
562
+ | Multiple files, large dataset | `xdbd.write_multi_dbd_netcdf(files, "out.nc")` |
563
+ | Interactive / Jupyter | `xdbd.MultiDBD(filenames=files)` with `.get()` (lazy) |
564
+ | Batch processing 1000+ files | `xdbd mkone` CLI (multiprocessing) |
565
+ | Drop-in dbdreader replacement | `import xarray_dbd.dbdreader2 as dbdreader` |
566
+
567
+ ## Slocum File Types
568
+
569
+ | Extension | Name | Contents |
570
+ |-----------|------|----------|
571
+ | `.dbd` / `.dcd` | Flight | Vehicle sensors: depth, attitude, speed, GPS |
572
+ | `.ebd` / `.ecd` | Science | Payload sensors: CTD, optics, oxygen |
573
+ | `.sbd` / `.scd` | Short burst | Surface telemetry summary records |
574
+ | `.tbd` / `.tcd` | Technical | Detailed engineering telemetry |
575
+ | `.mbd` / `.mcd` | Mini | Compact engineering subset |
576
+ | `.nbd` / `.ncd` | Narrow | Compact science subset |
577
+
578
+ Compressed variants (`.?cd`) use LZ4 framing and are handled transparently.
579
+
580
+ ## Working with Glider Data
581
+
582
+ ### Discovering available sensors
583
+
584
+ ```python
585
+ import xarray_dbd as xdbd
586
+
587
+ # xarray API
588
+ ds = xdbd.open_dbd_dataset("file.dbd", cache_dir="cache")
589
+ for var in sorted(ds.data_vars):
590
+ print(f" {var:30s} {ds[var].attrs.get('units', '')}")
591
+
592
+ # dbdreader2 API
593
+ dbd = xdbd.MultiDBD(pattern="*.dbd", cacheDir="cache")
594
+ for name in sorted(dbd.parameterNames["eng"]):
595
+ print(f" {name:30s} {dbd.parameterUnits.get(name, '')}")
596
+ ```
597
+
598
+ Sensor naming conventions are documented in
599
+ [TWR's masterdata files](https://gliderfs2.ceoas.oregonstate.edu/gliderweb/masterdata/).
600
+
601
+ ### Time conversion
602
+
603
+ `m_present_time` contains UTC seconds since 1970-01-01 (Unix epoch, float64):
604
+
605
+ ```python
606
+ import pandas as pd
607
+
608
+ time = pd.to_datetime(ds["m_present_time"].values, unit="s", utc=True)
609
+ ```
610
+
611
+ ### Handling fill values
612
+
613
+ Float sensors use NaN for missing data. Integer sensors use sentinel fill
614
+ values (-127 for int8, -32768 for int16). Filter them out:
615
+
616
+ ```python
617
+ # xarray — replace sentinels with NaN
618
+ ds = ds.where(ds != -32768)
619
+
620
+ # dbdreader2 — automatic filtering (default)
621
+ t, v = dbd.get("m_depth") # return_nans=False by default
622
+ ```
623
+
507
624
  ## Known Limitations
508
625
 
509
626
  - **Python 3.10+ required** — uses `from __future__ import annotations` for modern type-hint syntax.
@@ -514,6 +631,18 @@ print(df.describe())
514
631
  - **No lazy loading for xarray API** — `open_dataset()` reads all sensor data
515
632
  into memory. For very large deployments, use `to_keep` to select only needed
516
633
  sensors. The dbdreader2 API (`DBD`/`MultiDBD`) uses lazy incremental loading.
634
+ - **Fill values in xarray output** — Integer sensors use sentinel fill values
635
+ (-127 for int8, -32768 for int16) rather than NaN. Between dives, science
636
+ sensors may contain these sentinels or NaN. Filter with
637
+ `ds.where(ds != -32768)` or use the dbdreader2 `get(return_nans=False)` API
638
+ which filters automatically.
639
+ - **Not CF-compliant** — NetCDF output preserves sensor `units` but does not
640
+ add CF attributes (`standard_name`, `axis`, `calendar`). Add metadata
641
+ post-hoc for publication, e.g.:
642
+ ```python
643
+ ds["m_present_time"].attrs["axis"] = "T"
644
+ ds["m_present_time"].attrs["units"] = "seconds since 1970-01-01"
645
+ ```
517
646
 
518
647
  ## Troubleshooting
519
648
 
@@ -5,7 +5,7 @@
5
5
  [![License](https://img.shields.io/pypi/l/xarray-dbd)](License.txt)
6
6
  [![CI](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/ci.yml/badge.svg)](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/ci.yml)
7
7
  [![CodeQL](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/codeql.yml/badge.svg)](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/codeql.yml)
8
- [![Codecov](https://codecov.io/gh/mousebrains/dbd2netcdf-python/branch/main/graph/badge.svg)](https://codecov.io/gh/mousebrains/dbd2netcdf-python)
8
+ [![codecov](https://codecov.io/gh/mousebrains/dbd2netcdf-python/graph/badge.svg?token=EJQEIVEB0U)](https://codecov.io/gh/mousebrains/dbd2netcdf-python)
9
9
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
10
10
 
11
11
  An efficient xarray backend for reading Dinkum Binary Data (DBD) files from
@@ -38,7 +38,7 @@ pip install xarray-dbd
38
38
  For the CLI tools only:
39
39
 
40
40
  ```bash
41
- pipx install xarray-dbd # installs dbd2nc and mkone commands
41
+ pipx install xarray-dbd # installs xdbd command (xdbd 2nc, xdbd mkone, etc.)
42
42
  ```
43
43
 
44
44
  Or install from source (requires a C++ compiler and CMake):
@@ -111,6 +111,30 @@ ds = xdbd.open_multi_dbd_dataset(
111
111
  )
112
112
  ```
113
113
 
114
+ ### File sort order
115
+
116
+ By default, files are sorted by the `fileopen_time` timestamp in each file's
117
+ header, which is correct regardless of filename convention. Alternative sort
118
+ modes are available:
119
+
120
+ ```python
121
+ # Default: sort by header timestamp (universally correct)
122
+ ds = xdbd.open_multi_dbd_dataset(files)
123
+
124
+ # Sort by filename (lexicographic)
125
+ ds = xdbd.open_multi_dbd_dataset(files, sort="lexicographic")
126
+
127
+ # Preserve the caller's order (no sorting)
128
+ ds = xdbd.open_multi_dbd_dataset(files, sort="none")
129
+ ```
130
+
131
+ The `--sort` flag is also available on all CLI commands:
132
+
133
+ ```bash
134
+ xdbd 2nc --sort lexicographic -C cache -o output.nc *.dbd
135
+ xdbd mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
136
+ ```
137
+
114
138
  ### Advanced options
115
139
 
116
140
  ```python
@@ -118,7 +142,7 @@ ds = xdbd.open_dbd_dataset(
118
142
  'test.sbd',
119
143
  skip_first_record=True, # Skip first record (default)
120
144
  repair=True, # Attempt to repair corrupted data
121
- to_keep=['m_*'], # Keep sensors matching pattern (future feature)
145
+ to_keep=['m_depth', 'm_lat'], # Keep only these sensors
122
146
  criteria=['m_present_time'], # Sensors for record selection
123
147
  )
124
148
  ```
@@ -153,6 +177,7 @@ Open a single DBD file as an xarray Dataset.
153
177
  - `to_keep` (list of str): Sensor names to keep (default: all)
154
178
  - `criteria` (list of str): Sensor names for selection criteria
155
179
  - `drop_variables` (list of str): Variables to exclude
180
+ - `cache_dir` (str, Path, or None): Directory for sensor cache files
156
181
 
157
182
  **Returns:** `xarray.Dataset`
158
183
 
@@ -168,9 +193,32 @@ Open multiple DBD files as a single concatenated xarray Dataset.
168
193
  - `criteria` (list of str): Sensor names for selection criteria
169
194
  - `skip_missions` (list of str): Mission names to skip
170
195
  - `keep_missions` (list of str): Mission names to keep
196
+ - `cache_dir` (str, Path, or None): Directory for sensor cache files
197
+ - `sort` (str): File sort order — `"header_time"` (default, sort by `fileopen_time` from each file's header), `"lexicographic"`, or `"none"` (preserve caller's order).
171
198
 
172
199
  **Returns:** `xarray.Dataset`
173
200
 
201
+ ### `write_multi_dbd_netcdf(filenames, output, **kwargs)`
202
+
203
+ Stream multiple DBD files directly to a NetCDF file without loading all data
204
+ into memory. Preferred for large datasets (100+ files).
205
+
206
+ **Parameters:**
207
+ - `filenames` (iterable): Paths to DBD files (duplicates removed automatically)
208
+ - `output` (str or Path): Output NetCDF file path (parent directory created if needed)
209
+ - `skip_first_record` (bool): Skip first record in each file (default: True)
210
+ - `repair` (bool): Attempt to repair corrupted records (default: False)
211
+ - `to_keep` (list of str): Sensor names to keep (default: all)
212
+ - `criteria` (list of str): Sensor names for selection criteria
213
+ - `skip_missions` (list of str): Mission names to skip
214
+ - `keep_missions` (list of str): Mission names to keep
215
+ - `cache_dir` (str, Path, or None): Directory for sensor cache files
216
+ - `compression` (int): Zlib compression level 0-9 (default: 5, 0 disables)
217
+ - `sort` (str): File sort order (default: `"header_time"`)
218
+ - `batch_size` (int): Files per batch (default: 100; smaller reduces peak memory)
219
+
220
+ **Returns:** `tuple[int, int]` — (n_records, n_files)
221
+
174
222
  ## Migration from dbdreader
175
223
 
176
224
  The dbdreader2 API is derived from Lucas Merckelbach's
@@ -317,9 +365,8 @@ mdbd = dbdreader.MultiDBD(
317
365
  to batch additional sensors into the first `get()` call.
318
366
 
319
367
  - **`skip_initial_line` semantics.** When reading multiple files, the
320
- first contributing file keeps all its records; subsequent files skip
321
- their first record. dbdreader skips the first record of every file.
322
- Multi-file record counts may therefore differ by up to N-1.
368
+ first record of every file is skipped (matching dbdreader). Multi-file
369
+ record counts should match dbdreader exactly.
323
370
 
324
371
  - **Float64 output.** `get()` always returns float64 arrays, matching
325
372
  dbdreader's behavior. Integer fill values (-127 for int8, -32768 for
@@ -436,6 +483,7 @@ print(f"Depth units: {ds['m_depth'].attrs['units']}")
436
483
  ### Working with trajectories
437
484
 
438
485
  ```python
486
+ from pathlib import Path
439
487
  import xarray_dbd as xdbd
440
488
  import matplotlib.pyplot as plt
441
489
 
@@ -456,6 +504,7 @@ plt.show()
456
504
  ### Extracting science data
457
505
 
458
506
  ```python
507
+ from pathlib import Path
459
508
  # Read full resolution science data
460
509
  files = sorted(Path('.').glob('*.ebd'))
461
510
  ds = xdbd.open_multi_dbd_dataset(
@@ -468,6 +517,74 @@ df = ds.to_dataframe()
468
517
  print(df.describe())
469
518
  ```
470
519
 
520
+ ## Choosing an API
521
+
522
+ | Scenario | Recommended API |
523
+ |----------|----------------|
524
+ | Single file, quick look | `xr.open_dataset(f, engine="dbd")` |
525
+ | Multiple files, < 1 GB | `xdbd.open_multi_dbd_dataset(files, to_keep=[...])` |
526
+ | Multiple files, large dataset | `xdbd.write_multi_dbd_netcdf(files, "out.nc")` |
527
+ | Interactive / Jupyter | `xdbd.MultiDBD(filenames=files)` with `.get()` (lazy) |
528
+ | Batch processing 1000+ files | `xdbd mkone` CLI (multiprocessing) |
529
+ | Drop-in dbdreader replacement | `import xarray_dbd.dbdreader2 as dbdreader` |
530
+
531
+ ## Slocum File Types
532
+
533
+ | Extension | Name | Contents |
534
+ |-----------|------|----------|
535
+ | `.dbd` / `.dcd` | Flight | Vehicle sensors: depth, attitude, speed, GPS |
536
+ | `.ebd` / `.ecd` | Science | Payload sensors: CTD, optics, oxygen |
537
+ | `.sbd` / `.scd` | Short burst | Surface telemetry summary records |
538
+ | `.tbd` / `.tcd` | Technical | Detailed engineering telemetry |
539
+ | `.mbd` / `.mcd` | Mini | Compact engineering subset |
540
+ | `.nbd` / `.ncd` | Narrow | Compact science subset |
541
+
542
+ Compressed variants (`.?cd`) use LZ4 framing and are handled transparently.
543
+
544
+ ## Working with Glider Data
545
+
546
+ ### Discovering available sensors
547
+
548
+ ```python
549
+ import xarray_dbd as xdbd
550
+
551
+ # xarray API
552
+ ds = xdbd.open_dbd_dataset("file.dbd", cache_dir="cache")
553
+ for var in sorted(ds.data_vars):
554
+ print(f" {var:30s} {ds[var].attrs.get('units', '')}")
555
+
556
+ # dbdreader2 API
557
+ dbd = xdbd.MultiDBD(pattern="*.dbd", cacheDir="cache")
558
+ for name in sorted(dbd.parameterNames["eng"]):
559
+ print(f" {name:30s} {dbd.parameterUnits.get(name, '')}")
560
+ ```
561
+
562
+ Sensor naming conventions are documented in
563
+ [TWR's masterdata files](https://gliderfs2.ceoas.oregonstate.edu/gliderweb/masterdata/).
564
+
565
+ ### Time conversion
566
+
567
+ `m_present_time` contains UTC seconds since 1970-01-01 (Unix epoch, float64):
568
+
569
+ ```python
570
+ import pandas as pd
571
+
572
+ time = pd.to_datetime(ds["m_present_time"].values, unit="s", utc=True)
573
+ ```
574
+
575
+ ### Handling fill values
576
+
577
+ Float sensors use NaN for missing data. Integer sensors use sentinel fill
578
+ values (-127 for int8, -32768 for int16). Filter them out:
579
+
580
+ ```python
581
+ # xarray — replace sentinels with NaN
582
+ ds = ds.where(ds != -32768)
583
+
584
+ # dbdreader2 — automatic filtering (default)
585
+ t, v = dbd.get("m_depth") # return_nans=False by default
586
+ ```
587
+
471
588
  ## Known Limitations
472
589
 
473
590
  - **Python 3.10+ required** — uses `from __future__ import annotations` for modern type-hint syntax.
@@ -478,6 +595,18 @@ print(df.describe())
478
595
  - **No lazy loading for xarray API** — `open_dataset()` reads all sensor data
479
596
  into memory. For very large deployments, use `to_keep` to select only needed
480
597
  sensors. The dbdreader2 API (`DBD`/`MultiDBD`) uses lazy incremental loading.
598
+ - **Fill values in xarray output** — Integer sensors use sentinel fill values
599
+ (-127 for int8, -32768 for int16) rather than NaN. Between dives, science
600
+ sensors may contain these sentinels or NaN. Filter with
601
+ `ds.where(ds != -32768)` or use the dbdreader2 `get(return_nans=False)` API
602
+ which filters automatically.
603
+ - **Not CF-compliant** — NetCDF output preserves sensor `units` but does not
604
+ add CF attributes (`standard_name`, `axis`, `calendar`). Add metadata
605
+ post-hoc for publication, e.g.:
606
+ ```python
607
+ ds["m_present_time"].attrs["axis"] = "T"
608
+ ds["m_present_time"].attrs["units"] = "seconds since 1970-01-01"
609
+ ```
481
610
 
482
611
  ## Troubleshooting
483
612
 
@@ -123,7 +123,7 @@ ColumnDataResult read_columns(std::istream& is,
123
123
  qKeep |= sensor.qCriteria();
124
124
  const int oi = outIndex[i];
125
125
  if (oi >= 0) {
126
- // Copy previous value into current row
126
+ // Copy previous value into current row, converting inf to NaN
127
127
  std::visit([nRows, oi](auto& col_vec, const auto& prev_vec) {
128
128
  using T = typename std::decay_t<decltype(col_vec)>::value_type;
129
129
  using PT = typename std::decay_t<decltype(prev_vec)>::value_type;
@@ -136,7 +136,11 @@ ColumnDataResult read_columns(std::istream& is,
136
136
  else
137
137
  col_vec.resize(col_vec.size() * 2, NAN);
138
138
  }
139
- col_vec[nRows] = prev_vec[0];
139
+ T val = prev_vec[0];
140
+ if constexpr (std::is_floating_point_v<T>) {
141
+ if (std::isinf(val)) val = NAN;
142
+ }
143
+ col_vec[nRows] = val;
140
144
  }
141
145
  }, columns[oi], prevValues[oi]);
142
146
  }
@@ -38,17 +38,20 @@ int DecompressTWRBuf::underflow() {
38
38
  if (!this->mIS.read(frame.data(), n)) { // EOF
39
39
  return std::char_traits<char>::eof();
40
40
  }
41
- const int j = LZ4_decompress_safe(frame.data(), this->mBuffer, static_cast<int>(n), sizeof(this->mBuffer));
41
+ const int j(LZ4_decompress_safe(frame.data(), this->mBuffer, static_cast<int>(n), sizeof(this->mBuffer)));
42
42
  if (j < 0) { // LZ4 decompression error
43
+ LOG_ERROR("LZ4 decompression failed (error {}) in {} (block size {})",
44
+ j, this->mFilename, n);
43
45
  return std::char_traits<char>::eof();
44
46
  }
45
- if (static_cast<size_t>(j) > sizeof(this->mBuffer)) { // Probably a corrupted file
46
- return std::char_traits<char>::eof();
47
- }
48
- this->setg(this->mBuffer, this->mBuffer, this->mBuffer + j);
47
+ const size_t decompressedSize(static_cast<size_t>(j));
48
+ this->setg(this->mBuffer, this->mBuffer, this->mBuffer + decompressedSize);
49
+ this->mPos += decompressedSize;
49
50
  } else { // Not compressed
50
51
  if (this->mIS.read(this->mBuffer, sizeof(this->mBuffer)) || this->mIS.gcount()) {
51
- this->setg(this->mBuffer, this->mBuffer, this->mBuffer + this->mIS.gcount());
52
+ const auto n = this->mIS.gcount();
53
+ this->setg(this->mBuffer, this->mBuffer, this->mBuffer + n);
54
+ this->mPos += static_cast<size_t>(n);
52
55
  } else {
53
56
  return std::char_traits<char>::eof();
54
57
  }
@@ -57,6 +60,18 @@ int DecompressTWRBuf::underflow() {
57
60
  return std::char_traits<char>::to_int_type(*this->gptr());
58
61
  }
59
62
 
63
+ DecompressTWRBuf::pos_type
64
+ DecompressTWRBuf::seekoff(off_type off, std::ios_base::seekdir dir,
65
+ std::ios_base::openmode /*which*/) {
66
+ // Only support tellg(): seekoff(0, cur)
67
+ if (dir == std::ios_base::cur && off == 0) {
68
+ // mPos is total bytes loaded; subtract unread bytes remaining in buffer
69
+ const auto remaining = this->egptr() - this->gptr();
70
+ return static_cast<pos_type>(this->mPos - static_cast<size_t>(remaining));
71
+ }
72
+ return pos_type(off_type(-1)); // Seeking not supported
73
+ }
74
+
60
75
  bool qCompressed(const std::string& fn) {
61
76
  const std::string suffix(fs::path(fn).extension().string());
62
77
  const bool q((suffix.size() == 4) && (std::tolower(static_cast<unsigned char>(suffix[2])) == 'c'));
@@ -11,6 +11,7 @@ class DecompressTWRBuf: public std::streambuf {
11
11
  const bool mqCompressed;
12
12
  char mBuffer[65536];
13
13
  const std::string mFilename;
14
+ size_t mPos = 0; // Total decompressed bytes loaded into buffer
14
15
  public:
15
16
  DecompressTWRBuf(const std::string& fn, const bool qCompressed)
16
17
  : mIS(fn.c_str(), std::ios::binary)
@@ -23,7 +24,11 @@ public:
23
24
 
24
25
  void close() {mIS.close();}
25
26
 
26
- int underflow();
27
+ int underflow() override;
28
+
29
+ protected:
30
+ pos_type seekoff(off_type off, std::ios_base::seekdir dir,
31
+ std::ios_base::openmode which = std::ios_base::in) override;
27
32
  };
28
33
 
29
34
  class DecompressTWR: public std::istream {