xarray-dbd 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. xarray_dbd-0.2.5/CHANGELOG.md +96 -0
  2. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/PKG-INFO +67 -5
  3. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/README.md +66 -4
  4. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Sensors.C +2 -3
  5. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/config.h +1 -1
  6. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/dbd_python.cpp +17 -8
  7. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/pyproject.toml +1 -1
  8. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/tests/test_cli.py +52 -0
  9. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/tests/test_cpp_backend.py +73 -0
  10. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/_dbd_cpp.pyi +1 -0
  11. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/backend.py +138 -56
  12. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/csv.py +14 -1
  13. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/dbd2nc.py +9 -0
  14. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/mkone.py +19 -3
  15. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/sensors.py +3 -3
  16. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/dbdreader2/_core.py +8 -2
  17. xarray_dbd-0.2.3/CHANGELOG.md +0 -47
  18. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/.clang-tidy +0 -0
  19. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/.gitignore +0 -0
  20. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/.pre-commit-config.yaml +0 -0
  21. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/CMakeLists.txt +0 -0
  22. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/CONTRIBUTING.md +0 -0
  23. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/License.txt +0 -0
  24. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/benchmark_performance.py +0 -0
  25. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/conda/recipe.yaml +0 -0
  26. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/ColumnData.C +0 -0
  27. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/ColumnData.H +0 -0
  28. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Data.C +0 -0
  29. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Data.H +0 -0
  30. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Decompress.C +0 -0
  31. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Decompress.H +0 -0
  32. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/FileInfo.H +0 -0
  33. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Header.C +0 -0
  34. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Header.H +0 -0
  35. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/KnownBytes.C +0 -0
  36. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/KnownBytes.H +0 -0
  37. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Logger.H +0 -0
  38. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/MyException.H +0 -0
  39. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Sensor.C +0 -0
  40. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Sensor.H +0 -0
  41. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/Sensors.H +0 -0
  42. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/SensorsMap.C +0 -0
  43. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/SensorsMap.H +0 -0
  44. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/lz4.c +0 -0
  45. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/csrc/lz4.h +0 -0
  46. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/examples/README.md +0 -0
  47. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/scripts/README.md +0 -0
  48. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/tests/conftest.py +0 -0
  49. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/tests/test_backend.py +0 -0
  50. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/tests/test_dbdreader2.py +0 -0
  51. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/__init__.py +0 -0
  52. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/__init__.py +0 -0
  53. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/cache.py +0 -0
  54. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/logger.py +0 -0
  55. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/main.py +0 -0
  56. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/cli/missions.py +0 -0
  57. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/dbdreader2/__init__.py +0 -0
  58. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/dbdreader2/_cache.py +0 -0
  59. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/dbdreader2/_errors.py +0 -0
  60. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/dbdreader2/_list.py +0 -0
  61. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/dbdreader2/_util.py +0 -0
  62. {xarray_dbd-0.2.3 → xarray_dbd-0.2.5}/xarray_dbd/py.typed +0 -0
@@ -0,0 +1,96 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.2.5] - 2026-03-30
9
+
10
+ ### Added
11
+
12
+ - `sort` parameter for `open_multi_dbd_dataset()` and `write_multi_dbd_netcdf()` with three modes: `"header_time"` (default, sort by `fileopen_time` from each file's DBD header), `"lexicographic"`, and `"none"` (preserve caller's order)
13
+ - `--sort` CLI flag for `dbd2nc`, `mkone`, and `2csv` commands
14
+ - `presorted` parameter for `read_dbd_files()` C++ binding to skip internal lexicographic sort when files are pre-sorted by Python
15
+ - `sensor_size` attribute on variables from `open_multi_dbd_dataset()`, matching single-file behavior
16
+ - `--skip-first` flag for `mkone` as consistent alias for the inverse `--keep-first`
17
+ - Duplicate file detection and deduplication with warning in multi-file functions
18
+ - Output directory auto-creation in `write_multi_dbd_netcdf()`
19
+ - "Choosing an API" and "Slocum File Types" sections in README
20
+ - Fill value and CF-compliance guidance in README Known Limitations
21
+
22
+ ### Changed
23
+
24
+ - `skip_first_record` in `read_dbd_files()` now skips the first record of **all** files (including the first), matching Lucas Merckelbach's dbdreader behavior
25
+ - Streaming NetCDF writer keeps a single file handle open instead of reopening per batch
26
+
27
+ ### Fixed
28
+
29
+ - File ordering for TWR-style filenames (e.g. `ce_1137-2026-085-1-10.dbd` incorrectly sorting before `-2.dbd` under lexicographic sort)
30
+ - `_parse_fileopen_time()` now logs a warning instead of silently sorting unparseable files to end
31
+ - `DBD.get_fileopen_time()` no longer raises on unparseable header values
32
+ - Thread-safe random number generator in C++ cache file creation
33
+ - Integer overflow guard in C++ column capacity doubling
34
+
35
+ ## [0.2.3] - 2026-02-23
36
+
37
+ ### Added
38
+
39
+ - `include_source` support in `MultiDBD.get()` — returns per-record source DBD references, matching dbdreader's API
40
+ - `continue_on_reading_error` parameter for `MultiDBD.get()` — skip corrupted files instead of raising, matching dbdreader v0.5.9
41
+ - `DBD_ERROR_READ_ERROR` error code (14) for compatibility with dbdreader
42
+ - Python 3.14 pre-built wheels for all platforms (Linux, macOS, Windows)
43
+ - Attribution to Lucas Merckelbach's [dbdreader](https://github.com/smerckel/dbdreader) in README
44
+
45
+ ## [0.2.2] - 2026-02-23
46
+
47
+ ### Added
48
+
49
+ - `preload` parameter for `DBD` and `MultiDBD` constructors
50
+ - Changelog configuration and tag/version validation in publish workflow
51
+
52
+ ### Fixed
53
+
54
+ - mypy errors: `datetime.UTC`, tuple assignments, type annotations
55
+ - ruff formatting compliance
56
+
57
+ ## [0.2.1] - 2026-02-22
58
+
59
+ ### Added
60
+
61
+ - Streaming NetCDF writer (`write_multi_dbd_netcdf`) for low-memory batch conversion
62
+ - dbdreader-compatible API layer (`DBD` and `MultiDBD` classes in `xarray_dbd.dbdreader2`)
63
+ - Unified CLI under `xdbd` command with subcommands (`2nc`, `mkone`, `2csv`, `missions`, `cache`)
64
+ - Monotonicity check in `get_sync()` to prevent silent wrong results from `np.interp`
65
+
66
+ ### Changed
67
+
68
+ - CLI restructured: standalone `dbd2nc` and `mkone` commands replaced by `xdbd 2nc` and `xdbd mkone`
69
+ - Streaming mode is now the default for non-append `2nc` and `mkone` (requires netCDF4)
70
+ - Fill values corrected: -127 for int8, -32768 for int16 (matching C++ dbd2netCDF standalone)
71
+ - Multi-file reader uses read-copy-discard strategy to reduce peak memory ~53%
72
+ - Replaced inf with NaN in float reads to match C++ dbd2netCDF behavior
73
+
74
+ ### Fixed
75
+
76
+ - Multi-file parse dropping records from unfactored DBD files
77
+ - Corrupted file recovery: discard partial record on I/O error
78
+
79
+ ## [0.1.0] - 2026-02-20
80
+
81
+ ### Added
82
+
83
+ - C++ backend via pybind11 wrapping [dbd2netCDF](https://github.com/mousebrains/dbd2netcdf) parser
84
+ - Native xarray engine integration (`xr.open_dataset(f, engine="dbd")`)
85
+ - Multi-file reading with `open_multi_dbd_dataset()` using C++ SensorsMap two-pass approach
86
+ - CLI tools: `dbd2nc` for single/multi-file conversion, `mkone` for batch directory processing
87
+ - Native dtype support: int8, int16, float32, float64 columns (no double-conversion overhead)
88
+ - LZ4 decompression for compressed `.?cd` files
89
+ - Sensor filtering (`to_keep`), mission filtering (`skip_missions`/`keep_missions`)
90
+ - Corrupted file recovery with `repair=True`
91
+ - Python 3.10+ and free-threaded Python (PEP 703) support
92
+
93
+ ### Changed
94
+
95
+ - Replaced pure-Python parser with C++ pybind11 extension for ~5x performance improvement
96
+ - Fill values: NaN for float32/float64, -127 for int8, -32768 for int16 (matching C++ dbd2netCDF)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xarray-dbd
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: Efficient xarray backend for reading glider DBD files
5
5
  Keywords: glider,oceanography,dbd,slocum,xarray,netcdf
6
6
  Author-Email: Pat Welch <pat@mousebrains.com>
@@ -41,7 +41,7 @@ Description-Content-Type: text/markdown
41
41
  [![License](https://img.shields.io/pypi/l/xarray-dbd)](License.txt)
42
42
  [![CI](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/ci.yml/badge.svg)](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/ci.yml)
43
43
  [![CodeQL](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/codeql.yml/badge.svg)](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/codeql.yml)
44
- [![Codecov](https://codecov.io/gh/mousebrains/dbd2netcdf-python/branch/main/graph/badge.svg)](https://codecov.io/gh/mousebrains/dbd2netcdf-python)
44
+ [![codecov](https://codecov.io/gh/mousebrains/dbd2netcdf-python/graph/badge.svg?token=EJQEIVEB0U)](https://codecov.io/gh/mousebrains/dbd2netcdf-python)
45
45
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
46
46
 
47
47
  An efficient xarray backend for reading Dinkum Binary Data (DBD) files from
@@ -147,6 +147,30 @@ ds = xdbd.open_multi_dbd_dataset(
147
147
  )
148
148
  ```
149
149
 
150
+ ### File sort order
151
+
152
+ By default, files are sorted by the `fileopen_time` timestamp in each file's
153
+ header, which is correct regardless of filename convention. Alternative sort
154
+ modes are available:
155
+
156
+ ```python
157
+ # Default: sort by header timestamp (universally correct)
158
+ ds = xdbd.open_multi_dbd_dataset(files)
159
+
160
+ # Sort by filename (lexicographic)
161
+ ds = xdbd.open_multi_dbd_dataset(files, sort="lexicographic")
162
+
163
+ # Preserve the caller's order (no sorting)
164
+ ds = xdbd.open_multi_dbd_dataset(files, sort="none")
165
+ ```
166
+
167
+ The `--sort` flag is also available on all CLI commands:
168
+
169
+ ```bash
170
+ dbd2nc --sort lexicographic -C cache -o output.nc *.dbd
171
+ mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
172
+ ```
173
+
150
174
  ### Advanced options
151
175
 
152
176
  ```python
@@ -189,6 +213,7 @@ Open a single DBD file as an xarray Dataset.
189
213
  - `to_keep` (list of str): Sensor names to keep (default: all)
190
214
  - `criteria` (list of str): Sensor names for selection criteria
191
215
  - `drop_variables` (list of str): Variables to exclude
216
+ - `cache_dir` (str, Path, or None): Directory for sensor cache files
192
217
 
193
218
  **Returns:** `xarray.Dataset`
194
219
 
@@ -204,6 +229,8 @@ Open multiple DBD files as a single concatenated xarray Dataset.
204
229
  - `criteria` (list of str): Sensor names for selection criteria
205
230
  - `skip_missions` (list of str): Mission names to skip
206
231
  - `keep_missions` (list of str): Mission names to keep
232
+ - `cache_dir` (str, Path, or None): Directory for sensor cache files
233
+ - `sort` (str): File sort order — `"header_time"` (default, sort by `fileopen_time` from each file's header), `"lexicographic"`, or `"none"` (preserve caller's order).
207
234
 
208
235
  **Returns:** `xarray.Dataset`
209
236
 
@@ -353,9 +380,8 @@ mdbd = dbdreader.MultiDBD(
353
380
  to batch additional sensors into the first `get()` call.
354
381
 
355
382
  - **`skip_initial_line` semantics.** When reading multiple files, the
356
- first contributing file keeps all its records; subsequent files skip
357
- their first record. dbdreader skips the first record of every file.
358
- Multi-file record counts may therefore differ by up to N-1.
383
+ first record of every file is skipped (matching dbdreader). Multi-file
384
+ record counts should match dbdreader exactly.
359
385
 
360
386
  - **Float64 output.** `get()` always returns float64 arrays, matching
361
387
  dbdreader's behavior. Integer fill values (-127 for int8, -32768 for
@@ -504,6 +530,30 @@ df = ds.to_dataframe()
504
530
  print(df.describe())
505
531
  ```
506
532
 
533
+ ## Choosing an API
534
+
535
+ | Scenario | Recommended API |
536
+ |----------|----------------|
537
+ | Single file, quick look | `xr.open_dataset(f, engine="dbd")` |
538
+ | Multiple files, < 1 GB | `xdbd.open_multi_dbd_dataset(files, to_keep=[...])` |
539
+ | Multiple files, large dataset | `xdbd.write_multi_dbd_netcdf(files, "out.nc")` |
540
+ | Interactive / Jupyter | `xdbd.MultiDBD(filenames=files)` with `.get()` (lazy) |
541
+ | Batch processing 1000+ files | `mkone` CLI (multiprocessing) |
542
+ | Drop-in dbdreader replacement | `import xarray_dbd.dbdreader2 as dbdreader` |
543
+
544
+ ## Slocum File Types
545
+
546
+ | Extension | Name | Contents |
547
+ |-----------|------|----------|
548
+ | `.dbd` / `.dcd` | Flight | Vehicle sensors: depth, attitude, speed, GPS |
549
+ | `.ebd` / `.ecd` | Science | Payload sensors: CTD, optics, oxygen |
550
+ | `.sbd` / `.scd` | Short burst | Surface telemetry summary records |
551
+ | `.tbd` / `.tcd` | Technical | Detailed engineering telemetry |
552
+ | `.mbd` / `.mcd` | Mini | Compact engineering subset |
553
+ | `.nbd` / `.ncd` | Narrow | Compact science subset |
554
+
555
+ Compressed variants (`.?cd`) use LZ4 framing and are handled transparently.
556
+
507
557
  ## Known Limitations
508
558
 
509
559
  - **Python 3.10+ required** — uses `from __future__ import annotations` for modern type-hint syntax.
@@ -514,6 +564,18 @@ print(df.describe())
514
564
  - **No lazy loading for xarray API** — `open_dataset()` reads all sensor data
515
565
  into memory. For very large deployments, use `to_keep` to select only needed
516
566
  sensors. The dbdreader2 API (`DBD`/`MultiDBD`) uses lazy incremental loading.
567
+ - **Fill values in xarray output** — Integer sensors use sentinel fill values
568
+ (-127 for int8, -32768 for int16) rather than NaN. Between dives, science
569
+ sensors may contain these sentinels or NaN. Filter with
570
+ `ds.where(ds != -32768)` or use the dbdreader2 `get(return_nans=False)` API
571
+ which filters automatically.
572
+ - **Not CF-compliant** — NetCDF output preserves sensor `units` but does not
573
+ add CF attributes (`standard_name`, `axis`, `calendar`). Add metadata
574
+ post-hoc for publication, e.g.:
575
+ ```python
576
+ ds["m_present_time"].attrs["axis"] = "T"
577
+ ds["m_present_time"].attrs["units"] = "seconds since 1970-01-01"
578
+ ```
517
579
 
518
580
  ## Troubleshooting
519
581
 
@@ -5,7 +5,7 @@
5
5
  [![License](https://img.shields.io/pypi/l/xarray-dbd)](License.txt)
6
6
  [![CI](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/ci.yml/badge.svg)](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/ci.yml)
7
7
  [![CodeQL](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/codeql.yml/badge.svg)](https://github.com/mousebrains/dbd2netcdf-python/actions/workflows/codeql.yml)
8
- [![Codecov](https://codecov.io/gh/mousebrains/dbd2netcdf-python/branch/main/graph/badge.svg)](https://codecov.io/gh/mousebrains/dbd2netcdf-python)
8
+ [![codecov](https://codecov.io/gh/mousebrains/dbd2netcdf-python/graph/badge.svg?token=EJQEIVEB0U)](https://codecov.io/gh/mousebrains/dbd2netcdf-python)
9
9
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
10
10
 
11
11
  An efficient xarray backend for reading Dinkum Binary Data (DBD) files from
@@ -111,6 +111,30 @@ ds = xdbd.open_multi_dbd_dataset(
111
111
  )
112
112
  ```
113
113
 
114
+ ### File sort order
115
+
116
+ By default, files are sorted by the `fileopen_time` timestamp in each file's
117
+ header, which is correct regardless of filename convention. Alternative sort
118
+ modes are available:
119
+
120
+ ```python
121
+ # Default: sort by header timestamp (universally correct)
122
+ ds = xdbd.open_multi_dbd_dataset(files)
123
+
124
+ # Sort by filename (lexicographic)
125
+ ds = xdbd.open_multi_dbd_dataset(files, sort="lexicographic")
126
+
127
+ # Preserve the caller's order (no sorting)
128
+ ds = xdbd.open_multi_dbd_dataset(files, sort="none")
129
+ ```
130
+
131
+ The `--sort` flag is also available on all CLI commands:
132
+
133
+ ```bash
134
+ dbd2nc --sort lexicographic -C cache -o output.nc *.dbd
135
+ mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
136
+ ```
137
+
114
138
  ### Advanced options
115
139
 
116
140
  ```python
@@ -153,6 +177,7 @@ Open a single DBD file as an xarray Dataset.
153
177
  - `to_keep` (list of str): Sensor names to keep (default: all)
154
178
  - `criteria` (list of str): Sensor names for selection criteria
155
179
  - `drop_variables` (list of str): Variables to exclude
180
+ - `cache_dir` (str, Path, or None): Directory for sensor cache files
156
181
 
157
182
  **Returns:** `xarray.Dataset`
158
183
 
@@ -168,6 +193,8 @@ Open multiple DBD files as a single concatenated xarray Dataset.
168
193
  - `criteria` (list of str): Sensor names for selection criteria
169
194
  - `skip_missions` (list of str): Mission names to skip
170
195
  - `keep_missions` (list of str): Mission names to keep
196
+ - `cache_dir` (str, Path, or None): Directory for sensor cache files
197
+ - `sort` (str): File sort order — `"header_time"` (default, sort by `fileopen_time` from each file's header), `"lexicographic"`, or `"none"` (preserve caller's order).
171
198
 
172
199
  **Returns:** `xarray.Dataset`
173
200
 
@@ -317,9 +344,8 @@ mdbd = dbdreader.MultiDBD(
317
344
  to batch additional sensors into the first `get()` call.
318
345
 
319
346
  - **`skip_initial_line` semantics.** When reading multiple files, the
320
- first contributing file keeps all its records; subsequent files skip
321
- their first record. dbdreader skips the first record of every file.
322
- Multi-file record counts may therefore differ by up to N-1.
347
+ first record of every file is skipped (matching dbdreader). Multi-file
348
+ record counts should match dbdreader exactly.
323
349
 
324
350
  - **Float64 output.** `get()` always returns float64 arrays, matching
325
351
  dbdreader's behavior. Integer fill values (-127 for int8, -32768 for
@@ -468,6 +494,30 @@ df = ds.to_dataframe()
468
494
  print(df.describe())
469
495
  ```
470
496
 
497
+ ## Choosing an API
498
+
499
+ | Scenario | Recommended API |
500
+ |----------|----------------|
501
+ | Single file, quick look | `xr.open_dataset(f, engine="dbd")` |
502
+ | Multiple files, < 1 GB | `xdbd.open_multi_dbd_dataset(files, to_keep=[...])` |
503
+ | Multiple files, large dataset | `xdbd.write_multi_dbd_netcdf(files, "out.nc")` |
504
+ | Interactive / Jupyter | `xdbd.MultiDBD(filenames=files)` with `.get()` (lazy) |
505
+ | Batch processing 1000+ files | `mkone` CLI (multiprocessing) |
506
+ | Drop-in dbdreader replacement | `import xarray_dbd.dbdreader2 as dbdreader` |
507
+
508
+ ## Slocum File Types
509
+
510
+ | Extension | Name | Contents |
511
+ |-----------|------|----------|
512
+ | `.dbd` / `.dcd` | Flight | Vehicle sensors: depth, attitude, speed, GPS |
513
+ | `.ebd` / `.ecd` | Science | Payload sensors: CTD, optics, oxygen |
514
+ | `.sbd` / `.scd` | Short burst | Surface telemetry summary records |
515
+ | `.tbd` / `.tcd` | Technical | Detailed engineering telemetry |
516
+ | `.mbd` / `.mcd` | Mini | Compact engineering subset |
517
+ | `.nbd` / `.ncd` | Narrow | Compact science subset |
518
+
519
+ Compressed variants (`.?cd`) use LZ4 framing and are handled transparently.
520
+
471
521
  ## Known Limitations
472
522
 
473
523
  - **Python 3.10+ required** — uses `from __future__ import annotations` for modern type-hint syntax.
@@ -478,6 +528,18 @@ print(df.describe())
478
528
  - **No lazy loading for xarray API** — `open_dataset()` reads all sensor data
479
529
  into memory. For very large deployments, use `to_keep` to select only needed
480
530
  sensors. The dbdreader2 API (`DBD`/`MultiDBD`) uses lazy incremental loading.
531
+ - **Fill values in xarray output** — Integer sensors use sentinel fill values
532
+ (-127 for int8, -32768 for int16) rather than NaN. Between dives, science
533
+ sensors may contain these sentinels or NaN. Filter with
534
+ `ds.where(ds != -32768)` or use the dbdreader2 `get(return_nans=False)` API
535
+ which filters automatically.
536
+ - **Not CF-compliant** — NetCDF output preserves sensor `units` but does not
537
+ add CF attributes (`standard_name`, `axis`, `calendar`). Add metadata
538
+ post-hoc for publication, e.g.:
539
+ ```python
540
+ ds["m_present_time"].attrs["axis"] = "T"
541
+ ds["m_present_time"].attrs["units"] = "seconds since 1970-01-01"
542
+ ```
481
543
 
482
544
  ## Troubleshooting
483
545
 
@@ -156,9 +156,8 @@ Sensors::mkFilename(const std::string& dir) const
156
156
  namespace {
157
157
  // Generate a unique temporary filename suffix
158
158
  std::string uniqueTempSuffix() {
159
- static std::random_device rd;
160
- static std::mt19937 gen(rd());
161
- static std::uniform_int_distribution<> dis(100000, 999999);
159
+ thread_local std::mt19937 gen(std::random_device{}());
160
+ thread_local std::uniform_int_distribution<> dis(100000, 999999);
162
161
  return std::to_string(dis(gen));
163
162
  }
164
163
  }
@@ -3,7 +3,7 @@
3
3
  #ifndef INC_CONFIG_H_
4
4
  #define INC_CONFIG_H_
5
5
 
6
- #define VERSION "1.6.10"
6
+ #define VERSION "1.7.0"
7
7
  #define MAINTAINER "pat@mousebrains.com"
8
8
 
9
9
  #define HAVE_INT8_T
@@ -178,14 +178,17 @@ MultiFileResult parse_multiple_files(
178
178
  const std::vector<std::string>& skip_missions,
179
179
  const std::vector<std::string>& keep_missions,
180
180
  bool skip_first_record,
181
- bool repair)
181
+ bool repair,
182
+ bool presorted)
182
183
  {
183
184
  if (filenames.empty()) {
184
185
  return {{}, {}, 0, 0};
185
186
  }
186
187
 
187
188
  std::vector<std::string> sorted_files(filenames);
188
- std::sort(sorted_files.begin(), sorted_files.end());
189
+ if (!presorted) {
190
+ std::sort(sorted_files.begin(), sorted_files.end());
191
+ }
189
192
 
190
193
  Header::tMissions skipSet, keepSet;
191
194
  for (const auto& m : skip_missions) Header::addMission(m, skipSet);
@@ -280,7 +283,7 @@ MultiFileResult parse_multiple_files(
280
283
 
281
284
  size_t n = result.n_records;
282
285
  size_t start = 0;
283
- if (skip_first_record && fileCount > 0 && n > 0) {
286
+ if (skip_first_record && n > 0) {
284
287
  start = 1;
285
288
  n -= 1;
286
289
  }
@@ -288,7 +291,8 @@ MultiFileResult parse_multiple_files(
288
291
  if (n > 0) {
289
292
  // Grow union columns if needed (doubling strategy)
290
293
  if (offset + n > capacity) {
291
- capacity = std::max(offset + n, capacity * 2);
294
+ size_t doubled = (capacity <= SIZE_MAX / 2) ? capacity * 2 : SIZE_MAX;
295
+ capacity = std::max(offset + n, doubled);
292
296
  grow_union_columns(unionColumns, unionInfo, capacity);
293
297
  }
294
298
 
@@ -573,14 +577,15 @@ PYBIND11_MODULE(_dbd_cpp, m, py::mod_gil_not_used()) {
573
577
  const std::vector<std::string>& skip_missions,
574
578
  const std::vector<std::string>& keep_missions,
575
579
  bool skip_first_record,
576
- bool repair) -> py::dict {
580
+ bool repair,
581
+ bool presorted) -> py::dict {
577
582
  MultiFileResult result;
578
583
  {
579
584
  py::gil_scoped_release release;
580
585
  result = parse_multiple_files(filenames, cache_dir, to_keep,
581
586
  criteria, skip_missions,
582
587
  keep_missions, skip_first_record,
583
- repair);
588
+ repair, presorted);
584
589
  }
585
590
  return multi_result_to_python(std::move(result));
586
591
  },
@@ -592,10 +597,11 @@ PYBIND11_MODULE(_dbd_cpp, m, py::mod_gil_not_used()) {
592
597
  py::arg("keep_missions") = std::vector<std::string>(),
593
598
  py::arg("skip_first_record") = true,
594
599
  py::arg("repair") = false,
600
+ py::arg("presorted") = false,
595
601
  "Read multiple DBD files with sensor union and return concatenated data.\n\n"
596
602
  "Uses a two-pass approach: pass 1 scans headers and builds a unified\n"
597
603
  "sensor list via SensorsMap, pass 2 reads data and merges into union\n"
598
- "columns. Files are sorted internally.\n\n"
604
+ "columns. Files are sorted internally unless presorted is True.\n\n"
599
605
  "Parameters\n"
600
606
  "----------\n"
601
607
  "filenames : list of str\n"
@@ -614,7 +620,10 @@ PYBIND11_MODULE(_dbd_cpp, m, py::mod_gil_not_used()) {
614
620
  "    If True (default), drop the first record of every file\n"
615
621
  "    (including the first), matching dbdreader.\n"
616
622
  "repair : bool, optional\n"
617
- " If True, attempt to recover data from corrupted records.\n\n"
623
+ " If True, attempt to recover data from corrupted records.\n"
624
+ "presorted : bool, optional\n"
625
+ " If True, skip internal lexicographic sort and process files\n"
626
+ " in the order given. Default False.\n\n"
618
627
  "Returns\n"
619
628
  "-------\n"
620
629
  "dict\n"
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"
4
4
 
5
5
  [project]
6
6
  name = "xarray-dbd"
7
- version = "0.2.3"
7
+ version = "0.2.5"
8
8
  description = "Efficient xarray backend for reading glider DBD files"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -876,6 +876,7 @@ def _base_args(**overrides) -> Namespace:
876
876
  "mail_from": None,
877
877
  "mail_subject": None,
878
878
  "smtp_host": "localhost",
879
+ "sort": "header_time",
879
880
  }
880
881
  defaults.update(overrides)
881
882
  return Namespace(**defaults)
@@ -1056,6 +1057,57 @@ class TestDbd2ncRun:
1056
1057
  assert len(ds.data_vars) > 0
1057
1058
  ds.close()
1058
1059
 
1060
+ def test_dbd2nc_run_sort_header_time(self, tmp_path):
1061
+ """Streaming write with --sort header_time produces valid output."""
1062
+ import xarray as xr
1063
+
1064
+ from xarray_dbd.cli.dbd2nc import run
1065
+
1066
+ dcd_files = sorted(DBD_DIR.glob("*.dcd"))[:3]
1067
+ outfile = tmp_path / "out.nc"
1068
+ args = _base_args(
1069
+ files=dcd_files,
1070
+ cache=Path(CACHE_DIR),
1071
+ output=outfile,
1072
+ append=False,
1073
+ sensors=None,
1074
+ sensor_output=None,
1075
+ skip_mission=None,
1076
+ keep_mission=None,
1077
+ skip_first=True,
1078
+ repair=False,
1079
+ compression=5,
1080
+ sort="header_time",
1081
+ )
1082
+ rc = run(args)
1083
+ assert rc == 0
1084
+ ds = xr.open_dataset(str(outfile), decode_timedelta=False)
1085
+ assert len(ds.data_vars) > 0
1086
+ ds.close()
1087
+
1088
+ def test_dbd2nc_run_sort_none(self, tmp_path):
1089
+ """Streaming write with --sort none produces valid output."""
1090
+ from xarray_dbd.cli.dbd2nc import run
1091
+
1092
+ dcd_files = sorted(DBD_DIR.glob("*.dcd"))[:2]
1093
+ outfile = tmp_path / "out.nc"
1094
+ args = _base_args(
1095
+ files=dcd_files,
1096
+ cache=Path(CACHE_DIR),
1097
+ output=outfile,
1098
+ append=False,
1099
+ sensors=None,
1100
+ sensor_output=None,
1101
+ skip_mission=None,
1102
+ keep_mission=None,
1103
+ skip_first=True,
1104
+ repair=False,
1105
+ compression=5,
1106
+ sort="none",
1107
+ )
1108
+ rc = run(args)
1109
+ assert rc == 0
1110
+
1059
1111
  def test_dbd2nc_run_no_compression(self, tmp_path):
1060
1112
  from xarray_dbd.cli.dbd2nc import run
1061
1113
 
@@ -119,6 +119,79 @@ def test_open_multi_dbd_dataset():
119
119
  ds.close()
120
120
 
121
121
 
122
+ def test_read_dbd_files_presorted():
123
+ """read_dbd_files with presorted=True preserves caller's file order."""
124
+ files = sorted(str(f) for f in DBD_DIR.glob("*.dcd"))[:5]
125
+ if len(files) < 2:
126
+ pytest.skip("Need at least 2 test files")
127
+
128
+ # Normal (lexicographic) order
129
+ result_lex = read_dbd_files(files, cache_dir=CACHE_DIR, skip_first_record=True)
130
+
131
+ # Reversed order with presorted=True — should produce different data order
132
+ result_rev = read_dbd_files(
133
+ list(reversed(files)),
134
+ cache_dir=CACHE_DIR,
135
+ skip_first_record=True,
136
+ presorted=True,
137
+ )
138
+
139
+ # Both should have the same total records and sensor names
140
+ assert result_lex["n_records"] == result_rev["n_records"]
141
+ assert set(result_lex["sensor_names"]) == set(result_rev["sensor_names"])
142
+
143
+
144
+ def test_open_multi_dbd_dataset_sort_header_time():
145
+ """open_multi_dbd_dataset with sort='header_time' produces valid output."""
146
+ files = sorted(DBD_DIR.glob("*.dcd"))[:5]
147
+ if len(files) < 2:
148
+ pytest.skip("Need at least 2 test files")
149
+
150
+ ds = xdbd.open_multi_dbd_dataset(
151
+ files,
152
+ skip_first_record=True,
153
+ cache_dir=CACHE_DIR,
154
+ sort="header_time",
155
+ )
156
+ assert len(ds.data_vars) > 0
157
+ assert len(ds.i) > 0
158
+
159
+ # Compare record count with lexicographic sort — should be the same
160
+ ds_lex = xdbd.open_multi_dbd_dataset(
161
+ files,
162
+ skip_first_record=True,
163
+ cache_dir=CACHE_DIR,
164
+ sort="lexicographic",
165
+ )
166
+ assert len(ds.i) == len(ds_lex.i)
167
+ ds.close()
168
+ ds_lex.close()
169
+
170
+
171
+ def test_open_multi_dbd_dataset_sort_none():
172
+ """open_multi_dbd_dataset with sort='none' preserves caller's order."""
173
+ files = sorted(DBD_DIR.glob("*.dcd"))[:3]
174
+ if len(files) < 2:
175
+ pytest.skip("Need at least 2 test files")
176
+
177
+ ds = xdbd.open_multi_dbd_dataset(
178
+ files,
179
+ skip_first_record=True,
180
+ cache_dir=CACHE_DIR,
181
+ sort="none",
182
+ )
183
+ assert len(ds.data_vars) > 0
184
+ assert len(ds.i) > 0
185
+ ds.close()
186
+
187
+
188
+ def test_open_multi_dbd_dataset_sort_invalid():
189
+ """open_multi_dbd_dataset rejects invalid sort values."""
190
+ files = sorted(DBD_DIR.glob("*.dcd"))[:1]
191
+ with pytest.raises(ValueError, match="sort must be one of"):
192
+ xdbd.open_multi_dbd_dataset(files, cache_dir=CACHE_DIR, sort="bogus")
193
+
194
+
122
195
  def test_nan_fill_for_floats():
123
196
  """Float columns use NaN for absent values, int columns use 0."""
124
197
  files = sorted(str(f) for f in DBD_DIR.glob("*.dcd"))[:5]
@@ -49,6 +49,7 @@ def read_dbd_files(
49
49
  keep_missions: list[str] = ...,
50
50
  skip_first_record: bool = True,
51
51
  repair: bool = False,
52
+ presorted: bool = False,
52
53
  ) -> _MultiResult: ...
53
54
  def scan_sensors(
54
55
  filenames: list[str],