xarray-dbd 0.2.3__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xarray_dbd-0.2.5/CHANGELOG.md +96 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/PKG-INFO +67 -5
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/README.md +66 -4
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Sensors.C +2 -3
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/config.h +1 -1
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/dbd_python.cpp +17 -8
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/pyproject.toml +1 -1
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/tests/test_cli.py +52 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/tests/test_cpp_backend.py +73 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/_dbd_cpp.pyi +1 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/backend.py +138 -56
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/csv.py +14 -1
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/dbd2nc.py +9 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/mkone.py +19 -3
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/sensors.py +3 -3
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/dbdreader2/_core.py +8 -2
- xarray_dbd-0.2.3/CHANGELOG.md +0 -47
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/.clang-tidy +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/.gitignore +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/.pre-commit-config.yaml +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/CMakeLists.txt +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/CONTRIBUTING.md +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/License.txt +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/benchmark_performance.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/conda/recipe.yaml +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/ColumnData.C +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/ColumnData.H +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Data.C +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Data.H +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Decompress.C +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Decompress.H +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/FileInfo.H +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Header.C +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Header.H +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/KnownBytes.C +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/KnownBytes.H +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Logger.H +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/MyException.H +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Sensor.C +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Sensor.H +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Sensors.H +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/SensorsMap.C +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/SensorsMap.H +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/lz4.c +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/lz4.h +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/examples/README.md +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/scripts/README.md +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/tests/conftest.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/tests/test_backend.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/tests/test_dbdreader2.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/__init__.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/__init__.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/cache.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/logger.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/main.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/missions.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/dbdreader2/__init__.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/dbdreader2/_cache.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/dbdreader2/_errors.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/dbdreader2/_list.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/dbdreader2/_util.py +0 -0
- {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/py.typed +0 -0
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.2.5] - 2026-03-30
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- `sort` parameter for `open_multi_dbd_dataset()` and `write_multi_dbd_netcdf()` with three modes: `"header_time"` (default, sort by `fileopen_time` from each file's DBD header), `"lexicographic"`, and `"none"` (preserve caller's order)
|
|
13
|
+
- `--sort` CLI flag for `dbd2nc`, `mkone`, and `2csv` commands
|
|
14
|
+
- `presorted` parameter for `read_dbd_files()` C++ binding to skip internal lexicographic sort when files are pre-sorted by Python
|
|
15
|
+
- `sensor_size` attribute on variables from `open_multi_dbd_dataset()`, matching single-file behavior
|
|
16
|
+
- `--skip-first` flag for `mkone`, the inverse of the existing `--keep-first`, for consistency with the other commands
|
|
17
|
+
- Duplicate file detection and deduplication with warning in multi-file functions
|
|
18
|
+
- Output directory auto-creation in `write_multi_dbd_netcdf()`
|
|
19
|
+
- "Choosing an API" and "Slocum File Types" sections in README
|
|
20
|
+
- Fill value and CF-compliance guidance in README Known Limitations
|
|
21
|
+
|
|
22
|
+
### Changed
|
|
23
|
+
|
|
24
|
+
- `skip_first_record` in `read_dbd_files()` now skips the first record of **all** files (including the first), matching Lucas Merckelbach's dbdreader behavior
|
|
25
|
+
- Streaming NetCDF writer keeps a single file handle open instead of reopening per batch
|
|
26
|
+
|
|
27
|
+
### Fixed
|
|
28
|
+
|
|
29
|
+
- File ordering for TWR-style filenames (e.g. `ce_1137-2026-085-1-10.dbd` incorrectly sorting before `ce_1137-2026-085-1-2.dbd` under lexicographic sort)
|
|
30
|
+
- `_parse_fileopen_time()` now logs a warning instead of silently sorting unparseable files to the end
|
|
31
|
+
- `DBD.get_fileopen_time()` no longer raises on unparseable header values
|
|
32
|
+
- Thread-safe random number generator in C++ cache file creation
|
|
33
|
+
- Integer overflow guard in C++ column capacity doubling
|
|
34
|
+
|
|
35
|
+
## [0.2.3] - 2026-02-23
|
|
36
|
+
|
|
37
|
+
### Added
|
|
38
|
+
|
|
39
|
+
- `include_source` support in `MultiDBD.get()` — returns per-record source DBD references, matching dbdreader's API
|
|
40
|
+
- `continue_on_reading_error` parameter for `MultiDBD.get()` — skip corrupted files instead of raising, matching dbdreader v0.5.9
|
|
41
|
+
- `DBD_ERROR_READ_ERROR` error code (14) for compatibility with dbdreader
|
|
42
|
+
- Python 3.14 pre-built wheels for all platforms (Linux, macOS, Windows)
|
|
43
|
+
- Attribution to Lucas Merckelbach's [dbdreader](https://github.com/smerckel/dbdreader) in README
|
|
44
|
+
|
|
45
|
+
## [0.2.2] - 2026-02-23
|
|
46
|
+
|
|
47
|
+
### Added
|
|
48
|
+
|
|
49
|
+
- `preload` parameter for `DBD` and `MultiDBD` constructors
|
|
50
|
+
- Changelog configuration and tag/version validation in publish workflow
|
|
51
|
+
|
|
52
|
+
### Fixed
|
|
53
|
+
|
|
54
|
+
- mypy errors: `datetime.UTC`, tuple assignments, type annotations
|
|
55
|
+
- ruff formatting compliance
|
|
56
|
+
|
|
57
|
+
## [0.2.1] - 2026-02-22
|
|
58
|
+
|
|
59
|
+
### Added
|
|
60
|
+
|
|
61
|
+
- Streaming NetCDF writer (`write_multi_dbd_netcdf`) for low-memory batch conversion
|
|
62
|
+
- dbdreader-compatible API layer (`DBD` and `MultiDBD` classes in `xarray_dbd.dbdreader2`)
|
|
63
|
+
- Unified CLI under `xdbd` command with subcommands (`2nc`, `mkone`, `2csv`, `missions`, `cache`)
|
|
64
|
+
- Monotonicity check in `get_sync()` to prevent silent wrong results from `np.interp`
|
|
65
|
+
|
|
66
|
+
### Changed
|
|
67
|
+
|
|
68
|
+
- CLI restructured: standalone `dbd2nc` and `mkone` commands replaced by `xdbd 2nc` and `xdbd mkone`
|
|
69
|
+
- Streaming mode is now the default for non-append `2nc` and `mkone` (requires netCDF4)
|
|
70
|
+
- Fill values corrected: -127 for int8, -32768 for int16 (matching C++ dbd2netCDF standalone)
|
|
71
|
+
- Multi-file reader uses read-copy-discard strategy to reduce peak memory ~53%
|
|
72
|
+
- Replaced inf with NaN in float reads to match C++ dbd2netCDF behavior
|
|
73
|
+
|
|
74
|
+
### Fixed
|
|
75
|
+
|
|
76
|
+
- Multi-file parse dropping records from unfactored DBD files
|
|
77
|
+
- Corrupted file recovery: discard partial record on I/O error
|
|
78
|
+
|
|
79
|
+
## [0.1.0] - 2026-02-20
|
|
80
|
+
|
|
81
|
+
### Added
|
|
82
|
+
|
|
83
|
+
- C++ backend via pybind11 wrapping [dbd2netCDF](https://github.com/mousebrains/dbd2netcdf) parser
|
|
84
|
+
- Native xarray engine integration (`xr.open_dataset(f, engine="dbd")`)
|
|
85
|
+
- Multi-file reading with `open_multi_dbd_dataset()` using C++ SensorsMap two-pass approach
|
|
86
|
+
- CLI tools: `dbd2nc` for single/multi-file conversion, `mkone` for batch directory processing
|
|
87
|
+
- Native dtype support: int8, int16, float32, float64 columns (no double-conversion overhead)
|
|
88
|
+
- LZ4 decompression for compressed `.?cd` files
|
|
89
|
+
- Sensor filtering (`to_keep`), mission filtering (`skip_missions`/`keep_missions`)
|
|
90
|
+
- Corrupted file recovery with `repair=True`
|
|
91
|
+
- Python 3.10+ and free-threaded Python (PEP 703) support
|
|
92
|
+
|
|
93
|
+
### Changed
|
|
94
|
+
|
|
95
|
+
- Replaced pure-Python parser with C++ pybind11 extension for ~5x performance improvement
|
|
96
|
+
- Fill values: NaN for float32/float64, -127 for int8, -32768 for int16 (matching C++ dbd2netCDF)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xarray-dbd
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: Efficient xarray backend for reading glider DBD files
|
|
5
5
|
Keywords: glider,oceanography,dbd,slocum,xarray,netcdf
|
|
6
6
|
Author-Email: Pat Welch <pat@mousebrains.com>
|
|
@@ -41,7 +41,7 @@ Description-Content-Type: text/markdown
|
|
|
41
41
|
[](License.txt)
|
|
42
42
|
[](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/ci.yml)
|
|
43
43
|
[](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/codeql.yml)
|
|
44
|
-
[](https://codecov.io/gh/mousebrains/dbd2netcdf-python)
|
|
45
45
|
[](https://github.com/astral-sh/ruff)
|
|
46
46
|
|
|
47
47
|
An efficient xarray backend for reading Dinkum Binary Data (DBD) files from
|
|
@@ -147,6 +147,30 @@ ds = xdbd.open_multi_dbd_dataset(
|
|
|
147
147
|
)
|
|
148
148
|
```
|
|
149
149
|
|
|
150
|
+
### File sort order
|
|
151
|
+
|
|
152
|
+
By default, files are sorted by the `fileopen_time` timestamp in each file's
|
|
153
|
+
header, which is correct regardless of filename convention. Alternative sort
|
|
154
|
+
modes are available:
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
# Default: sort by header timestamp (universally correct)
|
|
158
|
+
ds = xdbd.open_multi_dbd_dataset(files)
|
|
159
|
+
|
|
160
|
+
# Sort by filename (lexicographic)
|
|
161
|
+
ds = xdbd.open_multi_dbd_dataset(files, sort="lexicographic")
|
|
162
|
+
|
|
163
|
+
# Preserve the caller's order (no sorting)
|
|
164
|
+
ds = xdbd.open_multi_dbd_dataset(files, sort="none")
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
The `--sort` flag is also available on the `dbd2nc`, `mkone`, and `2csv` CLI commands:
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
dbd2nc --sort lexicographic -C cache -o output.nc *.dbd
|
|
171
|
+
mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
|
|
172
|
+
```
|
|
173
|
+
|
|
150
174
|
### Advanced options
|
|
151
175
|
|
|
152
176
|
```python
|
|
@@ -189,6 +213,7 @@ Open a single DBD file as an xarray Dataset.
|
|
|
189
213
|
- `to_keep` (list of str): Sensor names to keep (default: all)
|
|
190
214
|
- `criteria` (list of str): Sensor names for selection criteria
|
|
191
215
|
- `drop_variables` (list of str): Variables to exclude
|
|
216
|
+
- `cache_dir` (str, Path, or None): Directory for sensor cache files
|
|
192
217
|
|
|
193
218
|
**Returns:** `xarray.Dataset`
|
|
194
219
|
|
|
@@ -204,6 +229,8 @@ Open multiple DBD files as a single concatenated xarray Dataset.
|
|
|
204
229
|
- `criteria` (list of str): Sensor names for selection criteria
|
|
205
230
|
- `skip_missions` (list of str): Mission names to skip
|
|
206
231
|
- `keep_missions` (list of str): Mission names to keep
|
|
232
|
+
- `cache_dir` (str, Path, or None): Directory for sensor cache files
|
|
233
|
+
- `sort` (str): File sort order — `"header_time"` (default, sort by `fileopen_time` from each file's header), `"lexicographic"`, or `"none"` (preserve caller's order).
|
|
207
234
|
|
|
208
235
|
**Returns:** `xarray.Dataset`
|
|
209
236
|
|
|
@@ -353,9 +380,8 @@ mdbd = dbdreader.MultiDBD(
|
|
|
353
380
|
to batch additional sensors into the first `get()` call.
|
|
354
381
|
|
|
355
382
|
- **`skip_initial_line` semantics.** When reading multiple files, the
|
|
356
|
-
first
|
|
357
|
-
|
|
358
|
-
Multi-file record counts may therefore differ by up to N-1.
|
|
383
|
+
first record of every file is skipped (matching dbdreader). Multi-file
|
|
384
|
+
record counts should match dbdreader exactly.
|
|
359
385
|
|
|
360
386
|
- **Float64 output.** `get()` always returns float64 arrays, matching
|
|
361
387
|
dbdreader's behavior. Integer fill values (-127 for int8, -32768 for
|
|
@@ -504,6 +530,30 @@ df = ds.to_dataframe()
|
|
|
504
530
|
print(df.describe())
|
|
505
531
|
```
|
|
506
532
|
|
|
533
|
+
## Choosing an API
|
|
534
|
+
|
|
535
|
+
| Scenario | Recommended API |
|
|
536
|
+
|----------|----------------|
|
|
537
|
+
| Single file, quick look | `xr.open_dataset(f, engine="dbd")` |
|
|
538
|
+
| Multiple files, < 1 GB | `xdbd.open_multi_dbd_dataset(files, to_keep=[...])` |
|
|
539
|
+
| Multiple files, large dataset | `xdbd.write_multi_dbd_netcdf(files, "out.nc")` |
|
|
540
|
+
| Interactive / Jupyter | `xdbd.MultiDBD(filenames=files)` with `.get()` (lazy) |
|
|
541
|
+
| Batch processing 1000+ files | `mkone` CLI (multiprocessing) |
|
|
542
|
+
| Drop-in dbdreader replacement | `import xarray_dbd.dbdreader2 as dbdreader` |
|
|
543
|
+
|
|
544
|
+
## Slocum File Types
|
|
545
|
+
|
|
546
|
+
| Extension | Name | Contents |
|
|
547
|
+
|-----------|------|----------|
|
|
548
|
+
| `.dbd` / `.dcd` | Flight | Vehicle sensors: depth, attitude, speed, GPS |
|
|
549
|
+
| `.ebd` / `.ecd` | Science | Payload sensors: CTD, optics, oxygen |
|
|
550
|
+
| `.sbd` / `.scd` | Short burst | Surface telemetry summary records |
|
|
551
|
+
| `.tbd` / `.tcd` | Technical | Detailed engineering telemetry |
|
|
552
|
+
| `.mbd` / `.mcd` | Mini | Compact engineering subset |
|
|
553
|
+
| `.nbd` / `.ncd` | Narrow | Compact science subset |
|
|
554
|
+
|
|
555
|
+
Compressed variants (`.?cd`) use LZ4 framing and are handled transparently.
|
|
556
|
+
|
|
507
557
|
## Known Limitations
|
|
508
558
|
|
|
509
559
|
- **Python 3.10+ required** — uses `from __future__ import annotations` for modern type-hint syntax.
|
|
@@ -514,6 +564,18 @@ print(df.describe())
|
|
|
514
564
|
- **No lazy loading for xarray API** — `open_dataset()` reads all sensor data
|
|
515
565
|
into memory. For very large deployments, use `to_keep` to select only needed
|
|
516
566
|
sensors. The dbdreader2 API (`DBD`/`MultiDBD`) uses lazy incremental loading.
|
|
567
|
+
- **Fill values in xarray output** — Integer sensors use sentinel fill values
|
|
568
|
+
(-127 for int8, -32768 for int16) rather than NaN. Between dives, science
|
|
569
|
+
sensors may contain these sentinels or NaN. Filter with
|
|
570
|
+
`ds.where(ds != -32768)` or use the dbdreader2 `get(return_nans=False)` API
|
|
571
|
+
which filters automatically.
|
|
572
|
+
- **Not CF-compliant** — NetCDF output preserves sensor `units` but does not
|
|
573
|
+
add CF attributes (`standard_name`, `axis`, `calendar`). Add metadata
|
|
574
|
+
post-hoc for publication, e.g.:
|
|
575
|
+
```python
|
|
576
|
+
ds["m_present_time"].attrs["axis"] = "T"
|
|
577
|
+
ds["m_present_time"].attrs["units"] = "seconds since 1970-01-01"
|
|
578
|
+
```
|
|
517
579
|
|
|
518
580
|
## Troubleshooting
|
|
519
581
|
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
[](License.txt)
|
|
6
6
|
[](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/ci.yml)
|
|
7
7
|
[](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/codeql.yml)
|
|
8
|
-
[](https://codecov.io/gh/mousebrains/dbd2netcdf-python)
|
|
9
9
|
[](https://github.com/astral-sh/ruff)
|
|
10
10
|
|
|
11
11
|
An efficient xarray backend for reading Dinkum Binary Data (DBD) files from
|
|
@@ -111,6 +111,30 @@ ds = xdbd.open_multi_dbd_dataset(
|
|
|
111
111
|
)
|
|
112
112
|
```
|
|
113
113
|
|
|
114
|
+
### File sort order
|
|
115
|
+
|
|
116
|
+
By default, files are sorted by the `fileopen_time` timestamp in each file's
|
|
117
|
+
header, which is correct regardless of filename convention. Alternative sort
|
|
118
|
+
modes are available:
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
# Default: sort by header timestamp (universally correct)
|
|
122
|
+
ds = xdbd.open_multi_dbd_dataset(files)
|
|
123
|
+
|
|
124
|
+
# Sort by filename (lexicographic)
|
|
125
|
+
ds = xdbd.open_multi_dbd_dataset(files, sort="lexicographic")
|
|
126
|
+
|
|
127
|
+
# Preserve the caller's order (no sorting)
|
|
128
|
+
ds = xdbd.open_multi_dbd_dataset(files, sort="none")
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
The `--sort` flag is also available on the `dbd2nc`, `mkone`, and `2csv` CLI commands:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
dbd2nc --sort lexicographic -C cache -o output.nc *.dbd
|
|
135
|
+
mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
|
|
136
|
+
```
|
|
137
|
+
|
|
114
138
|
### Advanced options
|
|
115
139
|
|
|
116
140
|
```python
|
|
@@ -153,6 +177,7 @@ Open a single DBD file as an xarray Dataset.
|
|
|
153
177
|
- `to_keep` (list of str): Sensor names to keep (default: all)
|
|
154
178
|
- `criteria` (list of str): Sensor names for selection criteria
|
|
155
179
|
- `drop_variables` (list of str): Variables to exclude
|
|
180
|
+
- `cache_dir` (str, Path, or None): Directory for sensor cache files
|
|
156
181
|
|
|
157
182
|
**Returns:** `xarray.Dataset`
|
|
158
183
|
|
|
@@ -168,6 +193,8 @@ Open multiple DBD files as a single concatenated xarray Dataset.
|
|
|
168
193
|
- `criteria` (list of str): Sensor names for selection criteria
|
|
169
194
|
- `skip_missions` (list of str): Mission names to skip
|
|
170
195
|
- `keep_missions` (list of str): Mission names to keep
|
|
196
|
+
- `cache_dir` (str, Path, or None): Directory for sensor cache files
|
|
197
|
+
- `sort` (str): File sort order — `"header_time"` (default, sort by `fileopen_time` from each file's header), `"lexicographic"`, or `"none"` (preserve caller's order).
|
|
171
198
|
|
|
172
199
|
**Returns:** `xarray.Dataset`
|
|
173
200
|
|
|
@@ -317,9 +344,8 @@ mdbd = dbdreader.MultiDBD(
|
|
|
317
344
|
to batch additional sensors into the first `get()` call.
|
|
318
345
|
|
|
319
346
|
- **`skip_initial_line` semantics.** When reading multiple files, the
|
|
320
|
-
first
|
|
321
|
-
|
|
322
|
-
Multi-file record counts may therefore differ by up to N-1.
|
|
347
|
+
first record of every file is skipped (matching dbdreader). Multi-file
|
|
348
|
+
record counts should match dbdreader exactly.
|
|
323
349
|
|
|
324
350
|
- **Float64 output.** `get()` always returns float64 arrays, matching
|
|
325
351
|
dbdreader's behavior. Integer fill values (-127 for int8, -32768 for
|
|
@@ -468,6 +494,30 @@ df = ds.to_dataframe()
|
|
|
468
494
|
print(df.describe())
|
|
469
495
|
```
|
|
470
496
|
|
|
497
|
+
## Choosing an API
|
|
498
|
+
|
|
499
|
+
| Scenario | Recommended API |
|
|
500
|
+
|----------|----------------|
|
|
501
|
+
| Single file, quick look | `xr.open_dataset(f, engine="dbd")` |
|
|
502
|
+
| Multiple files, < 1 GB | `xdbd.open_multi_dbd_dataset(files, to_keep=[...])` |
|
|
503
|
+
| Multiple files, large dataset | `xdbd.write_multi_dbd_netcdf(files, "out.nc")` |
|
|
504
|
+
| Interactive / Jupyter | `xdbd.MultiDBD(filenames=files)` with `.get()` (lazy) |
|
|
505
|
+
| Batch processing 1000+ files | `mkone` CLI (multiprocessing) |
|
|
506
|
+
| Drop-in dbdreader replacement | `import xarray_dbd.dbdreader2 as dbdreader` |
|
|
507
|
+
|
|
508
|
+
## Slocum File Types
|
|
509
|
+
|
|
510
|
+
| Extension | Name | Contents |
|
|
511
|
+
|-----------|------|----------|
|
|
512
|
+
| `.dbd` / `.dcd` | Flight | Vehicle sensors: depth, attitude, speed, GPS |
|
|
513
|
+
| `.ebd` / `.ecd` | Science | Payload sensors: CTD, optics, oxygen |
|
|
514
|
+
| `.sbd` / `.scd` | Short burst | Surface telemetry summary records |
|
|
515
|
+
| `.tbd` / `.tcd` | Technical | Detailed engineering telemetry |
|
|
516
|
+
| `.mbd` / `.mcd` | Mini | Compact engineering subset |
|
|
517
|
+
| `.nbd` / `.ncd` | Narrow | Compact science subset |
|
|
518
|
+
|
|
519
|
+
Compressed variants (`.?cd`) use LZ4 framing and are handled transparently.
|
|
520
|
+
|
|
471
521
|
## Known Limitations
|
|
472
522
|
|
|
473
523
|
- **Python 3.10+ required** — uses `from __future__ import annotations` for modern type-hint syntax.
|
|
@@ -478,6 +528,18 @@ print(df.describe())
|
|
|
478
528
|
- **No lazy loading for xarray API** — `open_dataset()` reads all sensor data
|
|
479
529
|
into memory. For very large deployments, use `to_keep` to select only needed
|
|
480
530
|
sensors. The dbdreader2 API (`DBD`/`MultiDBD`) uses lazy incremental loading.
|
|
531
|
+
- **Fill values in xarray output** — Integer sensors use sentinel fill values
|
|
532
|
+
(-127 for int8, -32768 for int16) rather than NaN. Between dives, science
|
|
533
|
+
sensors may contain these sentinels or NaN. Filter with
|
|
534
|
+
`ds.where(ds != -32768)` or use the dbdreader2 `get(return_nans=False)` API
|
|
535
|
+
which filters automatically.
|
|
536
|
+
- **Not CF-compliant** — NetCDF output preserves sensor `units` but does not
|
|
537
|
+
add CF attributes (`standard_name`, `axis`, `calendar`). Add metadata
|
|
538
|
+
post-hoc for publication, e.g.:
|
|
539
|
+
```python
|
|
540
|
+
ds["m_present_time"].attrs["axis"] = "T"
|
|
541
|
+
ds["m_present_time"].attrs["units"] = "seconds since 1970-01-01"
|
|
542
|
+
```
|
|
481
543
|
|
|
482
544
|
## Troubleshooting
|
|
483
545
|
|
|
@@ -156,9 +156,8 @@ Sensors::mkFilename(const std::string& dir) const
|
|
|
156
156
|
namespace {
|
|
157
157
|
// Generate a unique temporary filename suffix
|
|
158
158
|
std::string uniqueTempSuffix() {
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
static std::uniform_int_distribution<> dis(100000, 999999);
|
|
159
|
+
thread_local std::mt19937 gen(std::random_device{}());
|
|
160
|
+
thread_local std::uniform_int_distribution<> dis(100000, 999999);
|
|
162
161
|
return std::to_string(dis(gen));
|
|
163
162
|
}
|
|
164
163
|
}
|
|
@@ -178,14 +178,17 @@ MultiFileResult parse_multiple_files(
|
|
|
178
178
|
const std::vector<std::string>& skip_missions,
|
|
179
179
|
const std::vector<std::string>& keep_missions,
|
|
180
180
|
bool skip_first_record,
|
|
181
|
-
bool repair)
|
|
181
|
+
bool repair,
|
|
182
|
+
bool presorted)
|
|
182
183
|
{
|
|
183
184
|
if (filenames.empty()) {
|
|
184
185
|
return {{}, {}, 0, 0};
|
|
185
186
|
}
|
|
186
187
|
|
|
187
188
|
std::vector<std::string> sorted_files(filenames);
|
|
188
|
-
|
|
189
|
+
if (!presorted) {
|
|
190
|
+
std::sort(sorted_files.begin(), sorted_files.end());
|
|
191
|
+
}
|
|
189
192
|
|
|
190
193
|
Header::tMissions skipSet, keepSet;
|
|
191
194
|
for (const auto& m : skip_missions) Header::addMission(m, skipSet);
|
|
@@ -280,7 +283,7 @@ MultiFileResult parse_multiple_files(
|
|
|
280
283
|
|
|
281
284
|
size_t n = result.n_records;
|
|
282
285
|
size_t start = 0;
|
|
283
|
-
if (skip_first_record &&
|
|
286
|
+
if (skip_first_record && n > 0) {
|
|
284
287
|
start = 1;
|
|
285
288
|
n -= 1;
|
|
286
289
|
}
|
|
@@ -288,7 +291,8 @@ MultiFileResult parse_multiple_files(
|
|
|
288
291
|
if (n > 0) {
|
|
289
292
|
// Grow union columns if needed (doubling strategy)
|
|
290
293
|
if (offset + n > capacity) {
|
|
291
|
-
|
|
294
|
+
size_t doubled = (capacity <= SIZE_MAX / 2) ? capacity * 2 : SIZE_MAX;
|
|
295
|
+
capacity = std::max(offset + n, doubled);
|
|
292
296
|
grow_union_columns(unionColumns, unionInfo, capacity);
|
|
293
297
|
}
|
|
294
298
|
|
|
@@ -573,14 +577,15 @@ PYBIND11_MODULE(_dbd_cpp, m, py::mod_gil_not_used()) {
|
|
|
573
577
|
const std::vector<std::string>& skip_missions,
|
|
574
578
|
const std::vector<std::string>& keep_missions,
|
|
575
579
|
bool skip_first_record,
|
|
576
|
-
bool repair) -> py::dict {
|
|
580
|
+
bool repair,
|
|
581
|
+
bool presorted) -> py::dict {
|
|
577
582
|
MultiFileResult result;
|
|
578
583
|
{
|
|
579
584
|
py::gil_scoped_release release;
|
|
580
585
|
result = parse_multiple_files(filenames, cache_dir, to_keep,
|
|
581
586
|
criteria, skip_missions,
|
|
582
587
|
keep_missions, skip_first_record,
|
|
583
|
-
repair);
|
|
588
|
+
repair, presorted);
|
|
584
589
|
}
|
|
585
590
|
return multi_result_to_python(std::move(result));
|
|
586
591
|
},
|
|
@@ -592,10 +597,11 @@ PYBIND11_MODULE(_dbd_cpp, m, py::mod_gil_not_used()) {
|
|
|
592
597
|
py::arg("keep_missions") = std::vector<std::string>(),
|
|
593
598
|
py::arg("skip_first_record") = true,
|
|
594
599
|
py::arg("repair") = false,
|
|
600
|
+
py::arg("presorted") = false,
|
|
595
601
|
"Read multiple DBD files with sensor union and return concatenated data.\n\n"
|
|
596
602
|
"Uses a two-pass approach: pass 1 scans headers and builds a unified\n"
|
|
597
603
|
"sensor list via SensorsMap, pass 2 reads data and merges into union\n"
|
|
598
|
-
"columns. Files are sorted internally.\n\n"
|
|
604
|
+
"columns. Files are sorted internally unless presorted is True.\n\n"
|
|
599
605
|
"Parameters\n"
|
|
600
606
|
"----------\n"
|
|
601
607
|
"filenames : list of str\n"
|
|
@@ -614,7 +620,10 @@ PYBIND11_MODULE(_dbd_cpp, m, py::mod_gil_not_used()) {
|
|
|
614
620
|
" If True (default), drop the first record of each file after\n"
|
|
615
621
|
" the first.\n"
|
|
616
622
|
"repair : bool, optional\n"
|
|
617
|
-
" If True, attempt to recover data from corrupted records.\n
|
|
623
|
+
" If True, attempt to recover data from corrupted records.\n"
|
|
624
|
+
"presorted : bool, optional\n"
|
|
625
|
+
" If True, skip internal lexicographic sort and process files\n"
|
|
626
|
+
" in the order given. Default False.\n\n"
|
|
618
627
|
"Returns\n"
|
|
619
628
|
"-------\n"
|
|
620
629
|
"dict\n"
|
|
@@ -876,6 +876,7 @@ def _base_args(**overrides) -> Namespace:
|
|
|
876
876
|
"mail_from": None,
|
|
877
877
|
"mail_subject": None,
|
|
878
878
|
"smtp_host": "localhost",
|
|
879
|
+
"sort": "header_time",
|
|
879
880
|
}
|
|
880
881
|
defaults.update(overrides)
|
|
881
882
|
return Namespace(**defaults)
|
|
@@ -1056,6 +1057,57 @@ class TestDbd2ncRun:
|
|
|
1056
1057
|
assert len(ds.data_vars) > 0
|
|
1057
1058
|
ds.close()
|
|
1058
1059
|
|
|
1060
|
+
def test_dbd2nc_run_sort_header_time(self, tmp_path):
|
|
1061
|
+
"""Streaming write with --sort header_time produces valid output."""
|
|
1062
|
+
import xarray as xr
|
|
1063
|
+
|
|
1064
|
+
from xarray_dbd.cli.dbd2nc import run
|
|
1065
|
+
|
|
1066
|
+
dcd_files = sorted(DBD_DIR.glob("*.dcd"))[:3]
|
|
1067
|
+
outfile = tmp_path / "out.nc"
|
|
1068
|
+
args = _base_args(
|
|
1069
|
+
files=dcd_files,
|
|
1070
|
+
cache=Path(CACHE_DIR),
|
|
1071
|
+
output=outfile,
|
|
1072
|
+
append=False,
|
|
1073
|
+
sensors=None,
|
|
1074
|
+
sensor_output=None,
|
|
1075
|
+
skip_mission=None,
|
|
1076
|
+
keep_mission=None,
|
|
1077
|
+
skip_first=True,
|
|
1078
|
+
repair=False,
|
|
1079
|
+
compression=5,
|
|
1080
|
+
sort="header_time",
|
|
1081
|
+
)
|
|
1082
|
+
rc = run(args)
|
|
1083
|
+
assert rc == 0
|
|
1084
|
+
ds = xr.open_dataset(str(outfile), decode_timedelta=False)
|
|
1085
|
+
assert len(ds.data_vars) > 0
|
|
1086
|
+
ds.close()
|
|
1087
|
+
|
|
1088
|
+
def test_dbd2nc_run_sort_none(self, tmp_path):
|
|
1089
|
+
"""Streaming write with --sort none produces valid output."""
|
|
1090
|
+
from xarray_dbd.cli.dbd2nc import run
|
|
1091
|
+
|
|
1092
|
+
dcd_files = sorted(DBD_DIR.glob("*.dcd"))[:2]
|
|
1093
|
+
outfile = tmp_path / "out.nc"
|
|
1094
|
+
args = _base_args(
|
|
1095
|
+
files=dcd_files,
|
|
1096
|
+
cache=Path(CACHE_DIR),
|
|
1097
|
+
output=outfile,
|
|
1098
|
+
append=False,
|
|
1099
|
+
sensors=None,
|
|
1100
|
+
sensor_output=None,
|
|
1101
|
+
skip_mission=None,
|
|
1102
|
+
keep_mission=None,
|
|
1103
|
+
skip_first=True,
|
|
1104
|
+
repair=False,
|
|
1105
|
+
compression=5,
|
|
1106
|
+
sort="none",
|
|
1107
|
+
)
|
|
1108
|
+
rc = run(args)
|
|
1109
|
+
assert rc == 0
|
|
1110
|
+
|
|
1059
1111
|
def test_dbd2nc_run_no_compression(self, tmp_path):
|
|
1060
1112
|
from xarray_dbd.cli.dbd2nc import run
|
|
1061
1113
|
|
|
@@ -119,6 +119,79 @@ def test_open_multi_dbd_dataset():
|
|
|
119
119
|
ds.close()
|
|
120
120
|
|
|
121
121
|
|
|
122
|
+
def test_read_dbd_files_presorted():
|
|
123
|
+
"""read_dbd_files with presorted=True preserves caller's file order."""
|
|
124
|
+
files = sorted(str(f) for f in DBD_DIR.glob("*.dcd"))[:5]
|
|
125
|
+
if len(files) < 2:
|
|
126
|
+
pytest.skip("Need at least 2 test files")
|
|
127
|
+
|
|
128
|
+
# Normal (lexicographic) order
|
|
129
|
+
result_lex = read_dbd_files(files, cache_dir=CACHE_DIR, skip_first_record=True)
|
|
130
|
+
|
|
131
|
+
# Reversed order with presorted=True — should produce different data order
|
|
132
|
+
result_rev = read_dbd_files(
|
|
133
|
+
list(reversed(files)),
|
|
134
|
+
cache_dir=CACHE_DIR,
|
|
135
|
+
skip_first_record=True,
|
|
136
|
+
presorted=True,
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
# Both should have the same total records and sensor names
|
|
140
|
+
assert result_lex["n_records"] == result_rev["n_records"]
|
|
141
|
+
assert set(result_lex["sensor_names"]) == set(result_rev["sensor_names"])
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def test_open_multi_dbd_dataset_sort_header_time():
|
|
145
|
+
"""open_multi_dbd_dataset with sort='header_time' produces valid output."""
|
|
146
|
+
files = sorted(DBD_DIR.glob("*.dcd"))[:5]
|
|
147
|
+
if len(files) < 2:
|
|
148
|
+
pytest.skip("Need at least 2 test files")
|
|
149
|
+
|
|
150
|
+
ds = xdbd.open_multi_dbd_dataset(
|
|
151
|
+
files,
|
|
152
|
+
skip_first_record=True,
|
|
153
|
+
cache_dir=CACHE_DIR,
|
|
154
|
+
sort="header_time",
|
|
155
|
+
)
|
|
156
|
+
assert len(ds.data_vars) > 0
|
|
157
|
+
assert len(ds.i) > 0
|
|
158
|
+
|
|
159
|
+
# Compare record count with lexicographic sort — should be the same
|
|
160
|
+
ds_lex = xdbd.open_multi_dbd_dataset(
|
|
161
|
+
files,
|
|
162
|
+
skip_first_record=True,
|
|
163
|
+
cache_dir=CACHE_DIR,
|
|
164
|
+
sort="lexicographic",
|
|
165
|
+
)
|
|
166
|
+
assert len(ds.i) == len(ds_lex.i)
|
|
167
|
+
ds.close()
|
|
168
|
+
ds_lex.close()
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def test_open_multi_dbd_dataset_sort_none():
|
|
172
|
+
"""open_multi_dbd_dataset with sort='none' preserves caller's order."""
|
|
173
|
+
files = sorted(DBD_DIR.glob("*.dcd"))[:3]
|
|
174
|
+
if len(files) < 2:
|
|
175
|
+
pytest.skip("Need at least 2 test files")
|
|
176
|
+
|
|
177
|
+
ds = xdbd.open_multi_dbd_dataset(
|
|
178
|
+
files,
|
|
179
|
+
skip_first_record=True,
|
|
180
|
+
cache_dir=CACHE_DIR,
|
|
181
|
+
sort="none",
|
|
182
|
+
)
|
|
183
|
+
assert len(ds.data_vars) > 0
|
|
184
|
+
assert len(ds.i) > 0
|
|
185
|
+
ds.close()
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def test_open_multi_dbd_dataset_sort_invalid():
|
|
189
|
+
"""open_multi_dbd_dataset rejects invalid sort values."""
|
|
190
|
+
files = sorted(DBD_DIR.glob("*.dcd"))[:1]
|
|
191
|
+
with pytest.raises(ValueError, match="sort must be one of"):
|
|
192
|
+
xdbd.open_multi_dbd_dataset(files, cache_dir=CACHE_DIR, sort="bogus")
|
|
193
|
+
|
|
194
|
+
|
|
122
195
|
def test_nan_fill_for_floats():
|
|
123
196
|
"""Float columns use NaN for absent values, int columns use 0."""
|
|
124
197
|
files = sorted(str(f) for f in DBD_DIR.glob("*.dcd"))[:5]
|