xarray-dbd 0.2.5__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/.pre-commit-config.yaml +11 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/CHANGELOG.md +30 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/CMakeLists.txt +0 -1
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/PKG-INFO +77 -6
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/README.md +76 -5
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/benchmark_performance.py +2 -2
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/conda/recipe.yaml +5 -3
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/ColumnData.C +6 -2
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Decompress.C +21 -6
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Decompress.H +7 -5
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Header.C +38 -13
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Header.H +3 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/KnownBytes.C +30 -23
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/KnownBytes.H +1 -1
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/MyException.H +2 -2
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Sensor.C +15 -8
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Sensor.H +2 -1
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Sensors.C +7 -3
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Sensors.H +0 -4
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/SensorsMap.C +13 -1
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/SensorsMap.H +1 -1
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/config.h +0 -2
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/dbd_python.cpp +45 -38
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/pyproject.toml +20 -5
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/tests/test_backend.py +140 -1
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/tests/test_cli.py +395 -1
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/tests/test_cpp_backend.py +8 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/tests/test_dbdreader2.py +149 -29
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/backend.py +58 -16
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/csv.py +44 -25
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/dbd2nc.py +54 -9
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/mkone.py +28 -8
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/dbdreader2/_core.py +68 -21
- xarray_dbd-0.2.5/csrc/Data.C +0 -173
- xarray_dbd-0.2.5/csrc/Data.H +0 -67
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/.clang-tidy +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/.gitignore +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/CONTRIBUTING.md +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/License.txt +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/ColumnData.H +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/FileInfo.H +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Logger.H +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/lz4.c +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/lz4.h +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/examples/README.md +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/scripts/README.md +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/tests/conftest.py +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/__init__.py +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/_dbd_cpp.pyi +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/__init__.py +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/cache.py +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/logger.py +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/main.py +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/missions.py +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/sensors.py +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/dbdreader2/__init__.py +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/dbdreader2/_cache.py +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/dbdreader2/_errors.py +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/dbdreader2/_list.py +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/dbdreader2/_util.py +0 -0
- {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/py.typed +0 -0
|
@@ -13,3 +13,14 @@ repos:
|
|
|
13
13
|
- id: ruff
|
|
14
14
|
args: [--fix, --exit-non-zero-on-fix]
|
|
15
15
|
- id: ruff-format
|
|
16
|
+
|
|
17
|
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
|
18
|
+
rev: v1.13.0
|
|
19
|
+
hooks:
|
|
20
|
+
- id: mypy
|
|
21
|
+
# Match the CI invocation (xarray_dbd/ only; tests are linted, not typed).
|
|
22
|
+
files: ^xarray_dbd/
|
|
23
|
+
additional_dependencies:
|
|
24
|
+
- numpy>=1.23,<3.0
|
|
25
|
+
- xarray>=2023.6.0
|
|
26
|
+
- netCDF4>=1.6
|
|
@@ -5,6 +5,36 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.2.6] - 2026-03-30
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- `--list-sensors` flag for `dbd2nc` CLI to print available sensors without conversion
|
|
13
|
+
- `batch_size` parameter for `write_multi_dbd_netcdf()` (was hardcoded at 100)
|
|
14
|
+
- Signal handling in `mkone` — Ctrl+C now terminates child processes cleanly
|
|
15
|
+
- "Working with Glider Data" section in README (sensor discovery, time conversion, fill values)
|
|
16
|
+
- Tests for `get_CTD_sync`, `determine_ctd_type`, `get_global_time_range`, file ordering, batch boundaries
|
|
17
|
+
|
|
18
|
+
### Changed
|
|
19
|
+
|
|
20
|
+
- `get_sync()` logs interpolation failures at WARNING level instead of INFO
|
|
21
|
+
- Streaming writer logs summary when batches are skipped due to errors
|
|
22
|
+
- `set_time_limits()` accepts numeric epoch seconds in addition to date strings
|
|
23
|
+
- C++ `SensorsMap::setUpForData()` validates sensor byte sizes across files
|
|
24
|
+
|
|
25
|
+
### Fixed
|
|
26
|
+
|
|
27
|
+
- **Data loss in streaming writer**: removed Python-side double-skip at batch boundaries (C++ already handles `skip_first_record`)
|
|
28
|
+
- **dbdreader2 file ordering**: pass `presorted=True` to `read_dbd_files` so C++ respects chronological order from `DBDList.sort()`
|
|
29
|
+
- **mkone worker error propagation**: workers now exit non-zero on failure so parent detects errors
|
|
30
|
+
- **`_get_with_source` time ordering**: results now sorted by time for consistency with normal `get()` path
|
|
31
|
+
- **`sci_extensions` missing `.sbd`**: file pairing now recognizes `.sbd` as a science file type
|
|
32
|
+
- **`set_time_limits` falsy check**: epoch time 0 no longer causes spurious ValueError
|
|
33
|
+
- **inf-to-NaN for repeated values**: code=1 (repeat) now converts infinity consistently with code=2 (new value)
|
|
34
|
+
- Removed unused `"j"` dimension from `DBDDataStore.get_dimensions()`
|
|
35
|
+
- Fixed `--skip-first` help text (was stale after skip semantics change)
|
|
36
|
+
- Fixed README: CLI command names, removed false wildcard `to_keep` claim
|
|
37
|
+
|
|
8
38
|
## [0.2.5] - 2026-03-30
|
|
9
39
|
|
|
10
40
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xarray-dbd
|
|
3
|
-
Version: 0.2.5
|
|
3
|
+
Version: 0.2.7
|
|
4
4
|
Summary: Efficient xarray backend for reading glider DBD files
|
|
5
5
|
Keywords: glider,oceanography,dbd,slocum,xarray,netcdf
|
|
6
6
|
Author-Email: Pat Welch <pat@mousebrains.com>
|
|
@@ -74,7 +74,7 @@ pip install xarray-dbd
|
|
|
74
74
|
For the CLI tools only:
|
|
75
75
|
|
|
76
76
|
```bash
|
|
77
|
-
pipx install xarray-dbd # installs
|
|
77
|
+
pipx install xarray-dbd # installs xdbd command (xdbd 2nc, xdbd mkone, etc.)
|
|
78
78
|
```
|
|
79
79
|
|
|
80
80
|
Or install from source (requires a C++ compiler and CMake):
|
|
@@ -167,8 +167,8 @@ ds = xdbd.open_multi_dbd_dataset(files, sort="none")
|
|
|
167
167
|
The `--sort` flag is also available on all CLI commands:
|
|
168
168
|
|
|
169
169
|
```bash
|
|
170
|
-
|
|
171
|
-
mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
|
|
170
|
+
xdbd 2nc --sort lexicographic -C cache -o output.nc *.dbd
|
|
171
|
+
xdbd mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
|
|
172
172
|
```
|
|
173
173
|
|
|
174
174
|
### Advanced options
|
|
@@ -178,11 +178,15 @@ ds = xdbd.open_dbd_dataset(
|
|
|
178
178
|
'test.sbd',
|
|
179
179
|
skip_first_record=True, # Skip first record (default)
|
|
180
180
|
repair=True, # Attempt to repair corrupted data
|
|
181
|
-
to_keep=['
|
|
181
|
+
to_keep=['m_depth', 'm_lat'], # Keep only these sensors
|
|
182
182
|
criteria=['m_present_time'], # Sensors for record selection
|
|
183
183
|
)
|
|
184
184
|
```
|
|
185
185
|
|
|
186
|
+
The corresponding CLI flag on `xdbd 2nc` and `xdbd 2csv` is `--keep-first`
|
|
187
|
+
(default is to skip the first record of every file, matching `mkone` and
|
|
188
|
+
`dbdreader`). Use `--skip-first` to be explicit or `--keep-first` to invert.
|
|
189
|
+
|
|
186
190
|
## DBD File Format
|
|
187
191
|
|
|
188
192
|
DBD (Dinkum Binary Data) files are the native format used by Slocum ocean gliders. The format consists of:
|
|
@@ -234,6 +238,27 @@ Open multiple DBD files as a single concatenated xarray Dataset.
|
|
|
234
238
|
|
|
235
239
|
**Returns:** `xarray.Dataset`
|
|
236
240
|
|
|
241
|
+
### `write_multi_dbd_netcdf(filenames, output, **kwargs)`
|
|
242
|
+
|
|
243
|
+
Stream multiple DBD files directly to a NetCDF file without loading all data
|
|
244
|
+
into memory. Preferred for large datasets (100+ files).
|
|
245
|
+
|
|
246
|
+
**Parameters:**
|
|
247
|
+
- `filenames` (iterable): Paths to DBD files (duplicates removed automatically)
|
|
248
|
+
- `output` (str or Path): Output NetCDF file path (parent directory created if needed)
|
|
249
|
+
- `skip_first_record` (bool): Skip first record in each file (default: True)
|
|
250
|
+
- `repair` (bool): Attempt to repair corrupted records (default: False)
|
|
251
|
+
- `to_keep` (list of str): Sensor names to keep (default: all)
|
|
252
|
+
- `criteria` (list of str): Sensor names for selection criteria
|
|
253
|
+
- `skip_missions` (list of str): Mission names to skip
|
|
254
|
+
- `keep_missions` (list of str): Mission names to keep
|
|
255
|
+
- `cache_dir` (str, Path, or None): Directory for sensor cache files
|
|
256
|
+
- `compression` (int): Zlib compression level 0-9 (default: 5, 0 disables)
|
|
257
|
+
- `sort` (str): File sort order (default: `"header_time"`)
|
|
258
|
+
- `batch_size` (int): Files per batch (default: 100; smaller reduces peak memory)
|
|
259
|
+
|
|
260
|
+
**Returns:** `tuple[int, int]` — (n_records, n_files)
|
|
261
|
+
|
|
237
262
|
## Migration from dbdreader
|
|
238
263
|
|
|
239
264
|
The dbdreader2 API is derived from Lucas Merckelbach's
|
|
@@ -498,6 +523,7 @@ print(f"Depth units: {ds['m_depth'].attrs['units']}")
|
|
|
498
523
|
### Working with trajectories
|
|
499
524
|
|
|
500
525
|
```python
|
|
526
|
+
from pathlib import Path
|
|
501
527
|
import xarray_dbd as xdbd
|
|
502
528
|
import matplotlib.pyplot as plt
|
|
503
529
|
|
|
@@ -518,6 +544,7 @@ plt.show()
|
|
|
518
544
|
### Extracting science data
|
|
519
545
|
|
|
520
546
|
```python
|
|
547
|
+
from pathlib import Path
|
|
521
548
|
# Read full resolution science data
|
|
522
549
|
files = sorted(Path('.').glob('*.ebd'))
|
|
523
550
|
ds = xdbd.open_multi_dbd_dataset(
|
|
@@ -538,7 +565,7 @@ print(df.describe())
|
|
|
538
565
|
| Multiple files, < 1 GB | `xdbd.open_multi_dbd_dataset(files, to_keep=[...])` |
|
|
539
566
|
| Multiple files, large dataset | `xdbd.write_multi_dbd_netcdf(files, "out.nc")` |
|
|
540
567
|
| Interactive / Jupyter | `xdbd.MultiDBD(filenames=files)` with `.get()` (lazy) |
|
|
541
|
-
| Batch processing 1000+ files | `mkone` CLI (multiprocessing) |
|
|
568
|
+
| Batch processing 1000+ files | `xdbd mkone` CLI (multiprocessing) |
|
|
542
569
|
| Drop-in dbdreader replacement | `import xarray_dbd.dbdreader2 as dbdreader` |
|
|
543
570
|
|
|
544
571
|
## Slocum File Types
|
|
@@ -554,6 +581,50 @@ print(df.describe())
|
|
|
554
581
|
|
|
555
582
|
Compressed variants (`.?cd`) use LZ4 framing and are handled transparently.
|
|
556
583
|
|
|
584
|
+
## Working with Glider Data
|
|
585
|
+
|
|
586
|
+
### Discovering available sensors
|
|
587
|
+
|
|
588
|
+
```python
|
|
589
|
+
import xarray_dbd as xdbd
|
|
590
|
+
|
|
591
|
+
# xarray API
|
|
592
|
+
ds = xdbd.open_dbd_dataset("file.dbd", cache_dir="cache")
|
|
593
|
+
for var in sorted(ds.data_vars):
|
|
594
|
+
print(f" {var:30s} {ds[var].attrs.get('units', '')}")
|
|
595
|
+
|
|
596
|
+
# dbdreader2 API
|
|
597
|
+
dbd = xdbd.MultiDBD(pattern="*.dbd", cacheDir="cache")
|
|
598
|
+
for name in sorted(dbd.parameterNames["eng"]):
|
|
599
|
+
print(f" {name:30s} {dbd.parameterUnits.get(name, '')}")
|
|
600
|
+
```
|
|
601
|
+
|
|
602
|
+
Sensor naming conventions are documented in
|
|
603
|
+
[TWR's masterdata files](https://gliderfs2.ceoas.oregonstate.edu/gliderweb/masterdata/).
|
|
604
|
+
|
|
605
|
+
### Time conversion
|
|
606
|
+
|
|
607
|
+
`m_present_time` contains UTC seconds since 1970-01-01 (Unix epoch, float64):
|
|
608
|
+
|
|
609
|
+
```python
|
|
610
|
+
import pandas as pd
|
|
611
|
+
|
|
612
|
+
time = pd.to_datetime(ds["m_present_time"].values, unit="s", utc=True)
|
|
613
|
+
```
|
|
614
|
+
|
|
615
|
+
### Handling fill values
|
|
616
|
+
|
|
617
|
+
Float sensors use NaN for missing data. Integer sensors use sentinel fill
|
|
618
|
+
values (-127 for int8, -32768 for int16). Filter them out:
|
|
619
|
+
|
|
620
|
+
```python
|
|
621
|
+
# xarray — replace sentinels with NaN
|
|
622
|
+
ds = ds.where(ds != -32768)
|
|
623
|
+
|
|
624
|
+
# dbdreader2 — automatic filtering (default)
|
|
625
|
+
t, v = dbd.get("m_depth") # return_nans=False by default
|
|
626
|
+
```
|
|
627
|
+
|
|
557
628
|
## Known Limitations
|
|
558
629
|
|
|
559
630
|
- **Python 3.10+ required** — uses `from __future__ import annotations` for modern type-hint syntax.
|
|
@@ -38,7 +38,7 @@ pip install xarray-dbd
|
|
|
38
38
|
For the CLI tools only:
|
|
39
39
|
|
|
40
40
|
```bash
|
|
41
|
-
pipx install xarray-dbd # installs
|
|
41
|
+
pipx install xarray-dbd # installs xdbd command (xdbd 2nc, xdbd mkone, etc.)
|
|
42
42
|
```
|
|
43
43
|
|
|
44
44
|
Or install from source (requires a C++ compiler and CMake):
|
|
@@ -131,8 +131,8 @@ ds = xdbd.open_multi_dbd_dataset(files, sort="none")
|
|
|
131
131
|
The `--sort` flag is also available on all CLI commands:
|
|
132
132
|
|
|
133
133
|
```bash
|
|
134
|
-
|
|
135
|
-
mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
|
|
134
|
+
xdbd 2nc --sort lexicographic -C cache -o output.nc *.dbd
|
|
135
|
+
xdbd mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
|
|
136
136
|
```
|
|
137
137
|
|
|
138
138
|
### Advanced options
|
|
@@ -142,11 +142,15 @@ ds = xdbd.open_dbd_dataset(
|
|
|
142
142
|
'test.sbd',
|
|
143
143
|
skip_first_record=True, # Skip first record (default)
|
|
144
144
|
repair=True, # Attempt to repair corrupted data
|
|
145
|
-
to_keep=['
|
|
145
|
+
to_keep=['m_depth', 'm_lat'], # Keep only these sensors
|
|
146
146
|
criteria=['m_present_time'], # Sensors for record selection
|
|
147
147
|
)
|
|
148
148
|
```
|
|
149
149
|
|
|
150
|
+
The corresponding CLI flag on `xdbd 2nc` and `xdbd 2csv` is `--keep-first`
|
|
151
|
+
(default is to skip the first record of every file, matching `mkone` and
|
|
152
|
+
`dbdreader`). Use `--skip-first` to be explicit or `--keep-first` to invert.
|
|
153
|
+
|
|
150
154
|
## DBD File Format
|
|
151
155
|
|
|
152
156
|
DBD (Dinkum Binary Data) files are the native format used by Slocum ocean gliders. The format consists of:
|
|
@@ -198,6 +202,27 @@ Open multiple DBD files as a single concatenated xarray Dataset.
|
|
|
198
202
|
|
|
199
203
|
**Returns:** `xarray.Dataset`
|
|
200
204
|
|
|
205
|
+
### `write_multi_dbd_netcdf(filenames, output, **kwargs)`
|
|
206
|
+
|
|
207
|
+
Stream multiple DBD files directly to a NetCDF file without loading all data
|
|
208
|
+
into memory. Preferred for large datasets (100+ files).
|
|
209
|
+
|
|
210
|
+
**Parameters:**
|
|
211
|
+
- `filenames` (iterable): Paths to DBD files (duplicates removed automatically)
|
|
212
|
+
- `output` (str or Path): Output NetCDF file path (parent directory created if needed)
|
|
213
|
+
- `skip_first_record` (bool): Skip first record in each file (default: True)
|
|
214
|
+
- `repair` (bool): Attempt to repair corrupted records (default: False)
|
|
215
|
+
- `to_keep` (list of str): Sensor names to keep (default: all)
|
|
216
|
+
- `criteria` (list of str): Sensor names for selection criteria
|
|
217
|
+
- `skip_missions` (list of str): Mission names to skip
|
|
218
|
+
- `keep_missions` (list of str): Mission names to keep
|
|
219
|
+
- `cache_dir` (str, Path, or None): Directory for sensor cache files
|
|
220
|
+
- `compression` (int): Zlib compression level 0-9 (default: 5, 0 disables)
|
|
221
|
+
- `sort` (str): File sort order (default: `"header_time"`)
|
|
222
|
+
- `batch_size` (int): Files per batch (default: 100; smaller reduces peak memory)
|
|
223
|
+
|
|
224
|
+
**Returns:** `tuple[int, int]` — (n_records, n_files)
|
|
225
|
+
|
|
201
226
|
## Migration from dbdreader
|
|
202
227
|
|
|
203
228
|
The dbdreader2 API is derived from Lucas Merckelbach's
|
|
@@ -462,6 +487,7 @@ print(f"Depth units: {ds['m_depth'].attrs['units']}")
|
|
|
462
487
|
### Working with trajectories
|
|
463
488
|
|
|
464
489
|
```python
|
|
490
|
+
from pathlib import Path
|
|
465
491
|
import xarray_dbd as xdbd
|
|
466
492
|
import matplotlib.pyplot as plt
|
|
467
493
|
|
|
@@ -482,6 +508,7 @@ plt.show()
|
|
|
482
508
|
### Extracting science data
|
|
483
509
|
|
|
484
510
|
```python
|
|
511
|
+
from pathlib import Path
|
|
485
512
|
# Read full resolution science data
|
|
486
513
|
files = sorted(Path('.').glob('*.ebd'))
|
|
487
514
|
ds = xdbd.open_multi_dbd_dataset(
|
|
@@ -502,7 +529,7 @@ print(df.describe())
|
|
|
502
529
|
| Multiple files, < 1 GB | `xdbd.open_multi_dbd_dataset(files, to_keep=[...])` |
|
|
503
530
|
| Multiple files, large dataset | `xdbd.write_multi_dbd_netcdf(files, "out.nc")` |
|
|
504
531
|
| Interactive / Jupyter | `xdbd.MultiDBD(filenames=files)` with `.get()` (lazy) |
|
|
505
|
-
| Batch processing 1000+ files | `mkone` CLI (multiprocessing) |
|
|
532
|
+
| Batch processing 1000+ files | `xdbd mkone` CLI (multiprocessing) |
|
|
506
533
|
| Drop-in dbdreader replacement | `import xarray_dbd.dbdreader2 as dbdreader` |
|
|
507
534
|
|
|
508
535
|
## Slocum File Types
|
|
@@ -518,6 +545,50 @@ print(df.describe())
|
|
|
518
545
|
|
|
519
546
|
Compressed variants (`.?cd`) use LZ4 framing and are handled transparently.
|
|
520
547
|
|
|
548
|
+
## Working with Glider Data
|
|
549
|
+
|
|
550
|
+
### Discovering available sensors
|
|
551
|
+
|
|
552
|
+
```python
|
|
553
|
+
import xarray_dbd as xdbd
|
|
554
|
+
|
|
555
|
+
# xarray API
|
|
556
|
+
ds = xdbd.open_dbd_dataset("file.dbd", cache_dir="cache")
|
|
557
|
+
for var in sorted(ds.data_vars):
|
|
558
|
+
print(f" {var:30s} {ds[var].attrs.get('units', '')}")
|
|
559
|
+
|
|
560
|
+
# dbdreader2 API
|
|
561
|
+
dbd = xdbd.MultiDBD(pattern="*.dbd", cacheDir="cache")
|
|
562
|
+
for name in sorted(dbd.parameterNames["eng"]):
|
|
563
|
+
print(f" {name:30s} {dbd.parameterUnits.get(name, '')}")
|
|
564
|
+
```
|
|
565
|
+
|
|
566
|
+
Sensor naming conventions are documented in
|
|
567
|
+
[TWR's masterdata files](https://gliderfs2.ceoas.oregonstate.edu/gliderweb/masterdata/).
|
|
568
|
+
|
|
569
|
+
### Time conversion
|
|
570
|
+
|
|
571
|
+
`m_present_time` contains UTC seconds since 1970-01-01 (Unix epoch, float64):
|
|
572
|
+
|
|
573
|
+
```python
|
|
574
|
+
import pandas as pd
|
|
575
|
+
|
|
576
|
+
time = pd.to_datetime(ds["m_present_time"].values, unit="s", utc=True)
|
|
577
|
+
```
|
|
578
|
+
|
|
579
|
+
### Handling fill values
|
|
580
|
+
|
|
581
|
+
Float sensors use NaN for missing data. Integer sensors use sentinel fill
|
|
582
|
+
values (-127 for int8, -32768 for int16). Filter them out:
|
|
583
|
+
|
|
584
|
+
```python
|
|
585
|
+
# xarray — replace sentinels with NaN
|
|
586
|
+
ds = ds.where(ds != -32768)
|
|
587
|
+
|
|
588
|
+
# dbdreader2 — automatic filtering (default)
|
|
589
|
+
t, v = dbd.get("m_depth") # return_nans=False by default
|
|
590
|
+
```
|
|
591
|
+
|
|
521
592
|
## Known Limitations
|
|
522
593
|
|
|
523
594
|
- **Python 3.10+ required** — uses `from __future__ import annotations` for modern type-hint syntax.
|
|
@@ -36,11 +36,11 @@ def measure_command(cmd, desc):
|
|
|
36
36
|
break
|
|
37
37
|
time.sleep(0.01)
|
|
38
38
|
|
|
39
|
-
# Final check
|
|
39
|
+
# Final check — process may already have exited between the loop and here
|
|
40
40
|
try:
|
|
41
41
|
mem_info = process.memory_info()
|
|
42
42
|
peak_memory = max(peak_memory, mem_info.rss)
|
|
43
|
-
except:
|
|
43
|
+
except (psutil.NoSuchProcess, psutil.AccessDenied, OSError):
|
|
44
44
|
pass
|
|
45
45
|
|
|
46
46
|
except KeyboardInterrupt:
|
|
@@ -6,7 +6,7 @@ schema_version: 1
|
|
|
6
6
|
|
|
7
7
|
context:
|
|
8
8
|
name: xarray-dbd
|
|
9
|
-
version: "0.
|
|
9
|
+
version: "0.2.7"
|
|
10
10
|
|
|
11
11
|
package:
|
|
12
12
|
name: ${{ name }}
|
|
@@ -37,6 +37,7 @@ requirements:
|
|
|
37
37
|
- python >=3.10
|
|
38
38
|
- numpy >=1.23,<3.0
|
|
39
39
|
- xarray >=2023.6.0
|
|
40
|
+
- netcdf4 >=1.6
|
|
40
41
|
|
|
41
42
|
tests:
|
|
42
43
|
- python:
|
|
@@ -45,8 +46,9 @@ tests:
|
|
|
45
46
|
- xarray_dbd._dbd_cpp
|
|
46
47
|
pip_check: true
|
|
47
48
|
- script:
|
|
48
|
-
-
|
|
49
|
-
-
|
|
49
|
+
- xdbd --help
|
|
50
|
+
- xdbd 2nc --help
|
|
51
|
+
- xdbd mkone --help
|
|
50
52
|
|
|
51
53
|
about:
|
|
52
54
|
summary: Efficient xarray backend for reading Slocum ocean glider DBD files
|
|
@@ -123,7 +123,7 @@ ColumnDataResult read_columns(std::istream& is,
|
|
|
123
123
|
qKeep |= sensor.qCriteria();
|
|
124
124
|
const int oi = outIndex[i];
|
|
125
125
|
if (oi >= 0) {
|
|
126
|
-
// Copy previous value into current row
|
|
126
|
+
// Copy previous value into current row, converting inf to NaN
|
|
127
127
|
std::visit([nRows, oi](auto& col_vec, const auto& prev_vec) {
|
|
128
128
|
using T = typename std::decay_t<decltype(col_vec)>::value_type;
|
|
129
129
|
using PT = typename std::decay_t<decltype(prev_vec)>::value_type;
|
|
@@ -136,7 +136,11 @@ ColumnDataResult read_columns(std::istream& is,
|
|
|
136
136
|
else
|
|
137
137
|
col_vec.resize(col_vec.size() * 2, NAN);
|
|
138
138
|
}
|
|
139
|
-
|
|
139
|
+
T val = prev_vec[0];
|
|
140
|
+
if constexpr (std::is_floating_point_v<T>) {
|
|
141
|
+
if (std::isinf(val)) val = NAN;
|
|
142
|
+
}
|
|
143
|
+
col_vec[nRows] = val;
|
|
140
144
|
}
|
|
141
145
|
}, columns[oi], prevValues[oi]);
|
|
142
146
|
}
|
|
@@ -38,17 +38,20 @@ int DecompressTWRBuf::underflow() {
|
|
|
38
38
|
if (!this->mIS.read(frame.data(), n)) { // EOF
|
|
39
39
|
return std::char_traits<char>::eof();
|
|
40
40
|
}
|
|
41
|
-
const int j
|
|
41
|
+
const int j(LZ4_decompress_safe(frame.data(), this->mBuffer, static_cast<int>(n), sizeof(this->mBuffer)));
|
|
42
42
|
if (j < 0) { // LZ4 decompression error
|
|
43
|
+
LOG_ERROR("LZ4 decompression failed (error {}) in {} (block size {})",
|
|
44
|
+
j, this->mFilename, n);
|
|
43
45
|
return std::char_traits<char>::eof();
|
|
44
46
|
}
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
this->setg(this->mBuffer, this->mBuffer, this->mBuffer + j);
|
|
47
|
+
const size_t decompressedSize(static_cast<size_t>(j));
|
|
48
|
+
this->setg(this->mBuffer, this->mBuffer, this->mBuffer + decompressedSize);
|
|
49
|
+
this->mPos += decompressedSize;
|
|
49
50
|
} else { // Not compressed
|
|
50
51
|
if (this->mIS.read(this->mBuffer, sizeof(this->mBuffer)) || this->mIS.gcount()) {
|
|
51
|
-
|
|
52
|
+
const auto n = this->mIS.gcount();
|
|
53
|
+
this->setg(this->mBuffer, this->mBuffer, this->mBuffer + n);
|
|
54
|
+
this->mPos += static_cast<size_t>(n);
|
|
52
55
|
} else {
|
|
53
56
|
return std::char_traits<char>::eof();
|
|
54
57
|
}
|
|
@@ -57,6 +60,18 @@ int DecompressTWRBuf::underflow() {
|
|
|
57
60
|
return std::char_traits<char>::to_int_type(*this->gptr());
|
|
58
61
|
}
|
|
59
62
|
|
|
63
|
+
DecompressTWRBuf::pos_type
|
|
64
|
+
DecompressTWRBuf::seekoff(off_type off, std::ios_base::seekdir dir,
|
|
65
|
+
std::ios_base::openmode /*which*/) {
|
|
66
|
+
// Only support tellg(): seekoff(0, cur)
|
|
67
|
+
if (dir == std::ios_base::cur && off == 0) {
|
|
68
|
+
// mPos is total bytes loaded; subtract unread bytes remaining in buffer
|
|
69
|
+
const auto remaining = this->egptr() - this->gptr();
|
|
70
|
+
return static_cast<pos_type>(this->mPos - static_cast<size_t>(remaining));
|
|
71
|
+
}
|
|
72
|
+
return pos_type(off_type(-1)); // Seeking not supported
|
|
73
|
+
}
|
|
74
|
+
|
|
60
75
|
bool qCompressed(const std::string& fn) {
|
|
61
76
|
const std::string suffix(fs::path(fn).extension().string());
|
|
62
77
|
const bool q((suffix.size() == 4) && (std::tolower(static_cast<unsigned char>(suffix[2])) == 'c'));
|
|
@@ -9,21 +9,23 @@
|
|
|
9
9
|
class DecompressTWRBuf: public std::streambuf {
|
|
10
10
|
std::ifstream mIS;
|
|
11
11
|
const bool mqCompressed;
|
|
12
|
-
char mBuffer[65536];
|
|
12
|
+
char mBuffer[65536]{}; // zero-init so cppcheck across versions stays happy
|
|
13
13
|
const std::string mFilename;
|
|
14
|
+
size_t mPos = 0; // Total decompressed bytes loaded into buffer
|
|
14
15
|
public:
|
|
15
16
|
DecompressTWRBuf(const std::string& fn, const bool qCompressed)
|
|
16
17
|
: mIS(fn.c_str(), std::ios::binary)
|
|
17
18
|
, mqCompressed(qCompressed)
|
|
18
19
|
, mFilename(fn)
|
|
19
|
-
|
|
20
20
|
{}
|
|
21
21
|
|
|
22
|
-
~DecompressTWRBuf() {mIS.close();}
|
|
23
|
-
|
|
24
22
|
void close() {mIS.close();}
|
|
25
23
|
|
|
26
|
-
int underflow();
|
|
24
|
+
int underflow() override;
|
|
25
|
+
|
|
26
|
+
protected:
|
|
27
|
+
pos_type seekoff(off_type off, std::ios_base::seekdir dir,
|
|
28
|
+
std::ios_base::openmode which = std::ios_base::in) override;
|
|
27
29
|
};
|
|
28
30
|
|
|
29
31
|
class DecompressTWR: public std::istream {
|
|
@@ -21,6 +21,10 @@
|
|
|
21
21
|
#include "MyException.H"
|
|
22
22
|
#include "Logger.H"
|
|
23
23
|
#include <iostream>
|
|
24
|
+
#include <sstream>
|
|
25
|
+
#include <iomanip>
|
|
26
|
+
#include <algorithm>
|
|
27
|
+
#include <limits>
|
|
24
28
|
#include <cstdlib>
|
|
25
29
|
|
|
26
30
|
namespace {
|
|
@@ -35,7 +39,9 @@ namespace {
|
|
|
35
39
|
Header::Header(std::istream& is, const char *fn)
|
|
36
40
|
{
|
|
37
41
|
size_t cnt = 0;
|
|
38
|
-
|
|
42
|
+
// 14 is the typical ASCII header length in DBD files; num_ascii_tags (if
|
|
43
|
+
// present in the first 14 lines) overrides this with the exact count.
|
|
44
|
+
for (tRecords::size_type nLines(14); mRecords.size() < nLines;) {
|
|
39
45
|
std::string line;
|
|
40
46
|
if (!getline(is, line)) {
|
|
41
47
|
break;
|
|
@@ -52,7 +58,10 @@ Header::Header(std::istream& is, const char *fn)
|
|
|
52
58
|
mRecords.insert(std::make_pair(key, value));
|
|
53
59
|
if (key == "num_ascii_tags") {
|
|
54
60
|
try {
|
|
55
|
-
|
|
61
|
+
const int parsed = std::stoi(value);
|
|
62
|
+
nLines = (parsed > 0 && parsed <= 10000)
|
|
63
|
+
? static_cast<tRecords::size_type>(parsed)
|
|
64
|
+
: 0;
|
|
56
65
|
} catch (const std::exception&) {
|
|
57
66
|
nLines = 0; // Default to 0 on parse error
|
|
58
67
|
}
|
|
@@ -87,19 +96,13 @@ Header::trim(std::string str)
|
|
|
87
96
|
{
|
|
88
97
|
const std::string whitespace(" \t\n");
|
|
89
98
|
|
|
90
|
-
std::string::size_type
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
str = str.substr(index);
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
index = str.find_last_not_of(whitespace);
|
|
97
|
-
|
|
98
|
-
if (index != str.npos) {
|
|
99
|
-
str = str.substr(0, index + 1);
|
|
99
|
+
const std::string::size_type first(str.find_first_not_of(whitespace));
|
|
100
|
+
if (first == str.npos) {
|
|
101
|
+
return std::string();
|
|
100
102
|
}
|
|
101
103
|
|
|
102
|
-
|
|
104
|
+
const std::string::size_type last(str.find_last_not_of(whitespace));
|
|
105
|
+
return str.substr(first, last - first + 1);
|
|
103
106
|
}
|
|
104
107
|
|
|
105
108
|
void
|
|
@@ -126,6 +129,28 @@ Header::qProcessMission(const tMissions& toSkip,
|
|
|
126
129
|
return toKeep.empty() || (toKeep.find(mission) != toKeep.end());
|
|
127
130
|
}
|
|
128
131
|
|
|
132
|
+
// Parse a DBD fileopen_time header value (format: "Day_Mon_DD_HH:MM:SS_YYYY")
|
|
133
|
+
// to UTC epoch seconds. Slocum gliders record fileopen_time in UTC, so we
|
|
134
|
+
// use timegm / _mkgmtime rather than mktime (which would apply local TZ).
|
|
135
|
+
// Returns time_t::max() on empty input or parse failure so the file sorts to
|
|
136
|
+
// the end of a by-time file list.
|
|
137
|
+
time_t
|
|
138
|
+
Header::parseFileOpenTime(const std::string& timeStr)
|
|
139
|
+
{
|
|
140
|
+
if (timeStr.empty()) return std::numeric_limits<time_t>::max();
|
|
141
|
+
std::string s(timeStr);
|
|
142
|
+
std::replace(s.begin(), s.end(), '_', ' ');
|
|
143
|
+
struct tm tm = {};
|
|
144
|
+
std::istringstream iss(s);
|
|
145
|
+
iss >> std::get_time(&tm, "%a %b %d %H:%M:%S %Y");
|
|
146
|
+
if (iss.fail()) return std::numeric_limits<time_t>::max();
|
|
147
|
+
#ifdef _WIN32
|
|
148
|
+
return _mkgmtime(&tm);
|
|
149
|
+
#else
|
|
150
|
+
return timegm(&tm);
|
|
151
|
+
#endif
|
|
152
|
+
}
|
|
153
|
+
|
|
129
154
|
std::ostream&
|
|
130
155
|
operator << (std::ostream& os,
|
|
131
156
|
const Header& hdr)
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
#include <string>
|
|
8
8
|
#include <map>
|
|
9
9
|
#include <set>
|
|
10
|
+
#include <ctime>
|
|
10
11
|
|
|
11
12
|
class Header {
|
|
12
13
|
private:
|
|
@@ -30,6 +31,8 @@ public:
|
|
|
30
31
|
static void addMission(std::string name, tMissions& missionList);
|
|
31
32
|
bool qProcessMission(const tMissions& toSkip, const tMissions& toProcess) const;
|
|
32
33
|
|
|
34
|
+
static time_t parseFileOpenTime(const std::string& timeStr);
|
|
35
|
+
|
|
33
36
|
friend std::ostream& operator << (std::ostream& os, const Header& hdr);
|
|
34
37
|
}; // Header
|
|
35
38
|
|