xarray-dbd 0.2.5__tar.gz → 0.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/.pre-commit-config.yaml +11 -0
  2. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/CHANGELOG.md +30 -0
  3. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/CMakeLists.txt +0 -1
  4. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/PKG-INFO +77 -6
  5. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/README.md +76 -5
  6. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/benchmark_performance.py +2 -2
  7. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/conda/recipe.yaml +5 -3
  8. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/ColumnData.C +6 -2
  9. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Decompress.C +21 -6
  10. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Decompress.H +7 -5
  11. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Header.C +38 -13
  12. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Header.H +3 -0
  13. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/KnownBytes.C +30 -23
  14. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/KnownBytes.H +1 -1
  15. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/MyException.H +2 -2
  16. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Sensor.C +15 -8
  17. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Sensor.H +2 -1
  18. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Sensors.C +7 -3
  19. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Sensors.H +0 -4
  20. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/SensorsMap.C +13 -1
  21. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/SensorsMap.H +1 -1
  22. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/config.h +0 -2
  23. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/dbd_python.cpp +45 -38
  24. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/pyproject.toml +20 -5
  25. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/tests/test_backend.py +140 -1
  26. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/tests/test_cli.py +395 -1
  27. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/tests/test_cpp_backend.py +8 -0
  28. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/tests/test_dbdreader2.py +149 -29
  29. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/backend.py +58 -16
  30. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/csv.py +44 -25
  31. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/dbd2nc.py +54 -9
  32. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/mkone.py +28 -8
  33. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/dbdreader2/_core.py +68 -21
  34. xarray_dbd-0.2.5/csrc/Data.C +0 -173
  35. xarray_dbd-0.2.5/csrc/Data.H +0 -67
  36. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/.clang-tidy +0 -0
  37. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/.gitignore +0 -0
  38. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/CONTRIBUTING.md +0 -0
  39. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/License.txt +0 -0
  40. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/ColumnData.H +0 -0
  41. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/FileInfo.H +0 -0
  42. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/Logger.H +0 -0
  43. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/lz4.c +0 -0
  44. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/csrc/lz4.h +0 -0
  45. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/examples/README.md +0 -0
  46. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/scripts/README.md +0 -0
  47. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/tests/conftest.py +0 -0
  48. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/__init__.py +0 -0
  49. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/_dbd_cpp.pyi +0 -0
  50. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/__init__.py +0 -0
  51. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/cache.py +0 -0
  52. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/logger.py +0 -0
  53. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/main.py +0 -0
  54. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/missions.py +0 -0
  55. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/cli/sensors.py +0 -0
  56. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/dbdreader2/__init__.py +0 -0
  57. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/dbdreader2/_cache.py +0 -0
  58. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/dbdreader2/_errors.py +0 -0
  59. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/dbdreader2/_list.py +0 -0
  60. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/dbdreader2/_util.py +0 -0
  61. {xarray_dbd-0.2.5 → xarray_dbd-0.2.7}/xarray_dbd/py.typed +0 -0
@@ -13,3 +13,14 @@ repos:
13
13
  - id: ruff
14
14
  args: [--fix, --exit-non-zero-on-fix]
15
15
  - id: ruff-format
16
+
17
+ - repo: https://github.com/pre-commit/mirrors-mypy
18
+ rev: v1.13.0
19
+ hooks:
20
+ - id: mypy
21
+ # Match the CI invocation (xarray_dbd/ only; tests are linted but not type-checked).
22
+ files: ^xarray_dbd/
23
+ additional_dependencies:
24
+ - numpy>=1.23,<3.0
25
+ - xarray>=2023.6.0
26
+ - netCDF4>=1.6
@@ -5,6 +5,36 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.2.6] - 2026-03-30
9
+
10
+ ### Added
11
+
12
+ - `--list-sensors` flag for `dbd2nc` CLI to print available sensors without conversion
13
+ - `batch_size` parameter for `write_multi_dbd_netcdf()` (was hardcoded at 100)
14
+ - Signal handling in `mkone` — Ctrl+C now terminates child processes cleanly
15
+ - "Working with Glider Data" section in README (sensor discovery, time conversion, fill values)
16
+ - Tests for `get_CTD_sync`, `determine_ctd_type`, `get_global_time_range`, file ordering, batch boundaries
17
+
18
+ ### Changed
19
+
20
+ - `get_sync()` logs interpolation failures at WARNING level instead of INFO
21
+ - Streaming writer logs summary when batches are skipped due to errors
22
+ - `set_time_limits()` accepts numeric epoch seconds in addition to date strings
23
+ - C++ `SensorsMap::setUpForData()` validates sensor byte sizes across files
24
+
25
+ ### Fixed
26
+
27
+ - **Data loss in streaming writer**: removed Python-side double-skip at batch boundaries (C++ already handles `skip_first_record`)
28
+ - **dbdreader2 file ordering**: pass `presorted=True` to `read_dbd_files` so C++ respects chronological order from `DBDList.sort()`
29
+ - **mkone worker error propagation**: workers now exit non-zero on failure so parent detects errors
30
+ - **`_get_with_source` time ordering**: results now sorted by time for consistency with normal `get()` path
31
+ - **`sci_extensions` missing `.sbd`**: file pairing now recognizes `.sbd` as a science file type
32
+ - **`set_time_limits` falsy check**: epoch time 0 no longer causes spurious ValueError
33
+ - **inf-to-NaN for repeated values**: code=1 (repeat) now converts infinity consistently with code=2 (new value)
34
+ - Removed unused `"j"` dimension from `DBDDataStore.get_dimensions()`
35
+ - Fixed `--skip-first` help text (was stale after skip semantics change)
36
+ - Fixed README: CLI command names, removed false wildcard `to_keep` claim
37
+
8
38
  ## [0.2.5] - 2026-03-30
9
39
 
10
40
  ### Added
@@ -35,7 +35,6 @@ pybind11_add_module(_dbd_cpp
35
35
  csrc/SensorsMap.C
36
36
  csrc/KnownBytes.C
37
37
  csrc/Decompress.C
38
- csrc/Data.C
39
38
  csrc/lz4.c
40
39
  )
41
40
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xarray-dbd
3
- Version: 0.2.5
3
+ Version: 0.2.7
4
4
  Summary: Efficient xarray backend for reading glider DBD files
5
5
  Keywords: glider,oceanography,dbd,slocum,xarray,netcdf
6
6
  Author-Email: Pat Welch <pat@mousebrains.com>
@@ -74,7 +74,7 @@ pip install xarray-dbd
74
74
  For the CLI tools only:
75
75
 
76
76
  ```bash
77
- pipx install xarray-dbd # installs dbd2nc and mkone commands
77
+ pipx install xarray-dbd # installs xdbd command (xdbd 2nc, xdbd mkone, etc.)
78
78
  ```
79
79
 
80
80
  Or install from source (requires a C++ compiler and CMake):
@@ -167,8 +167,8 @@ ds = xdbd.open_multi_dbd_dataset(files, sort="none")
167
167
  The `--sort` flag is also available on all CLI commands:
168
168
 
169
169
  ```bash
170
- dbd2nc --sort lexicographic -C cache -o output.nc *.dbd
171
- mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
170
+ xdbd 2nc --sort lexicographic -C cache -o output.nc *.dbd
171
+ xdbd mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
172
172
  ```
173
173
 
174
174
  ### Advanced options
@@ -178,11 +178,15 @@ ds = xdbd.open_dbd_dataset(
178
178
  'test.sbd',
179
179
  skip_first_record=True, # Skip first record (default)
180
180
  repair=True, # Attempt to repair corrupted data
181
- to_keep=['m_*'], # Keep sensors matching pattern (future feature)
181
+ to_keep=['m_depth', 'm_lat'], # Keep only these sensors
182
182
  criteria=['m_present_time'], # Sensors for record selection
183
183
  )
184
184
  ```
185
185
 
186
+ The corresponding CLI flag on `xdbd 2nc` and `xdbd 2csv` is `--keep-first`
187
+ (default is to skip the first record of every file, matching `mkone` and
188
+ `dbdreader`). Use `--skip-first` to be explicit or `--keep-first` to invert.
189
+
186
190
  ## DBD File Format
187
191
 
188
192
  DBD (Dinkum Binary Data) files are the native format used by Slocum ocean gliders. The format consists of:
@@ -234,6 +238,27 @@ Open multiple DBD files as a single concatenated xarray Dataset.
234
238
 
235
239
  **Returns:** `xarray.Dataset`
236
240
 
241
+ ### `write_multi_dbd_netcdf(filenames, output, **kwargs)`
242
+
243
+ Stream multiple DBD files directly to a NetCDF file without loading all data
244
+ into memory. Preferred for large datasets (100+ files).
245
+
246
+ **Parameters:**
247
+ - `filenames` (iterable): Paths to DBD files (duplicates removed automatically)
248
+ - `output` (str or Path): Output NetCDF file path (parent directory created if needed)
249
+ - `skip_first_record` (bool): Skip first record in each file (default: True)
250
+ - `repair` (bool): Attempt to repair corrupted records (default: False)
251
+ - `to_keep` (list of str): Sensor names to keep (default: all)
252
+ - `criteria` (list of str): Sensor names for selection criteria
253
+ - `skip_missions` (list of str): Mission names to skip
254
+ - `keep_missions` (list of str): Mission names to keep
255
+ - `cache_dir` (str, Path, or None): Directory for sensor cache files
256
+ - `compression` (int): Zlib compression level 0-9 (default: 5, 0 disables)
257
+ - `sort` (str): File sort order (default: `"header_time"`)
258
+ - `batch_size` (int): Files per batch (default: 100; smaller reduces peak memory)
259
+
260
+ **Returns:** `tuple[int, int]` — (n_records, n_files)
261
+
237
262
  ## Migration from dbdreader
238
263
 
239
264
  The dbdreader2 API is derived from Lucas Merckelbach's
@@ -498,6 +523,7 @@ print(f"Depth units: {ds['m_depth'].attrs['units']}")
498
523
  ### Working with trajectories
499
524
 
500
525
  ```python
526
+ from pathlib import Path
501
527
  import xarray_dbd as xdbd
502
528
  import matplotlib.pyplot as plt
503
529
 
@@ -518,6 +544,7 @@ plt.show()
518
544
  ### Extracting science data
519
545
 
520
546
  ```python
547
+ from pathlib import Path
521
548
  # Read full resolution science data
522
549
  files = sorted(Path('.').glob('*.ebd'))
523
550
  ds = xdbd.open_multi_dbd_dataset(
@@ -538,7 +565,7 @@ print(df.describe())
538
565
  | Multiple files, < 1 GB | `xdbd.open_multi_dbd_dataset(files, to_keep=[...])` |
539
566
  | Multiple files, large dataset | `xdbd.write_multi_dbd_netcdf(files, "out.nc")` |
540
567
  | Interactive / Jupyter | `xdbd.MultiDBD(filenames=files)` with `.get()` (lazy) |
541
- | Batch processing 1000+ files | `mkone` CLI (multiprocessing) |
568
+ | Batch processing 1000+ files | `xdbd mkone` CLI (multiprocessing) |
542
569
  | Drop-in dbdreader replacement | `import xarray_dbd.dbdreader2 as dbdreader` |
543
570
 
544
571
  ## Slocum File Types
@@ -554,6 +581,50 @@ print(df.describe())
554
581
 
555
582
  Compressed variants (`.?cd`) use LZ4 framing and are handled transparently.
556
583
 
584
+ ## Working with Glider Data
585
+
586
+ ### Discovering available sensors
587
+
588
+ ```python
589
+ import xarray_dbd as xdbd
590
+
591
+ # xarray API
592
+ ds = xdbd.open_dbd_dataset("file.dbd", cache_dir="cache")
593
+ for var in sorted(ds.data_vars):
594
+ print(f" {var:30s} {ds[var].attrs.get('units', '')}")
595
+
596
+ # dbdreader2 API
597
+ dbd = xdbd.MultiDBD(pattern="*.dbd", cacheDir="cache")
598
+ for name in sorted(dbd.parameterNames["eng"]):
599
+ print(f" {name:30s} {dbd.parameterUnits.get(name, '')}")
600
+ ```
601
+
602
+ Sensor naming conventions are documented in
603
+ [TWR's masterdata files](https://gliderfs2.ceoas.oregonstate.edu/gliderweb/masterdata/).
604
+
605
+ ### Time conversion
606
+
607
+ `m_present_time` contains UTC seconds since 1970-01-01 (Unix epoch, float64):
608
+
609
+ ```python
610
+ import pandas as pd
611
+
612
+ time = pd.to_datetime(ds["m_present_time"].values, unit="s", utc=True)
613
+ ```
614
+
615
+ ### Handling fill values
616
+
617
+ Float sensors use NaN for missing data. Integer sensors use sentinel fill
618
+ values (-127 for int8, -32768 for int16). Filter them out:
619
+
620
+ ```python
621
+ # xarray — replace sentinels with NaN
622
+ ds = ds.where(ds != -32768)
623
+
624
+ # dbdreader2 — automatic filtering (default)
625
+ t, v = dbd.get("m_depth") # return_nans=False by default
626
+ ```
627
+
557
628
  ## Known Limitations
558
629
 
559
630
  - **Python 3.10+ required** — uses `from __future__ import annotations` for modern type-hint syntax.
@@ -38,7 +38,7 @@ pip install xarray-dbd
38
38
  For the CLI tools only:
39
39
 
40
40
  ```bash
41
- pipx install xarray-dbd # installs dbd2nc and mkone commands
41
+ pipx install xarray-dbd # installs xdbd command (xdbd 2nc, xdbd mkone, etc.)
42
42
  ```
43
43
 
44
44
  Or install from source (requires a C++ compiler and CMake):
@@ -131,8 +131,8 @@ ds = xdbd.open_multi_dbd_dataset(files, sort="none")
131
131
  The `--sort` flag is also available on all CLI commands:
132
132
 
133
133
  ```bash
134
- dbd2nc --sort lexicographic -C cache -o output.nc *.dbd
135
- mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
134
+ xdbd 2nc --sort lexicographic -C cache -o output.nc *.dbd
135
+ xdbd mkone --sort none --output-prefix /path/to/output/ /path/to/raw/
136
136
  ```
137
137
 
138
138
  ### Advanced options
@@ -142,11 +142,15 @@ ds = xdbd.open_dbd_dataset(
142
142
  'test.sbd',
143
143
  skip_first_record=True, # Skip first record (default)
144
144
  repair=True, # Attempt to repair corrupted data
145
- to_keep=['m_*'], # Keep sensors matching pattern (future feature)
145
+ to_keep=['m_depth', 'm_lat'], # Keep only these sensors
146
146
  criteria=['m_present_time'], # Sensors for record selection
147
147
  )
148
148
  ```
149
149
 
150
+ The corresponding CLI flag on `xdbd 2nc` and `xdbd 2csv` is `--keep-first`
151
+ (default is to skip the first record of every file, matching `mkone` and
152
+ `dbdreader`). Use `--skip-first` to be explicit or `--keep-first` to invert.
153
+
150
154
  ## DBD File Format
151
155
 
152
156
  DBD (Dinkum Binary Data) files are the native format used by Slocum ocean gliders. The format consists of:
@@ -198,6 +202,27 @@ Open multiple DBD files as a single concatenated xarray Dataset.
198
202
 
199
203
  **Returns:** `xarray.Dataset`
200
204
 
205
+ ### `write_multi_dbd_netcdf(filenames, output, **kwargs)`
206
+
207
+ Stream multiple DBD files directly to a NetCDF file without loading all data
208
+ into memory. Preferred for large datasets (100+ files).
209
+
210
+ **Parameters:**
211
+ - `filenames` (iterable): Paths to DBD files (duplicates removed automatically)
212
+ - `output` (str or Path): Output NetCDF file path (parent directory created if needed)
213
+ - `skip_first_record` (bool): Skip first record in each file (default: True)
214
+ - `repair` (bool): Attempt to repair corrupted records (default: False)
215
+ - `to_keep` (list of str): Sensor names to keep (default: all)
216
+ - `criteria` (list of str): Sensor names for selection criteria
217
+ - `skip_missions` (list of str): Mission names to skip
218
+ - `keep_missions` (list of str): Mission names to keep
219
+ - `cache_dir` (str, Path, or None): Directory for sensor cache files
220
+ - `compression` (int): Zlib compression level 0-9 (default: 5, 0 disables)
221
+ - `sort` (str): File sort order (default: `"header_time"`)
222
+ - `batch_size` (int): Files per batch (default: 100; smaller reduces peak memory)
223
+
224
+ **Returns:** `tuple[int, int]` — (n_records, n_files)
225
+
201
226
  ## Migration from dbdreader
202
227
 
203
228
  The dbdreader2 API is derived from Lucas Merckelbach's
@@ -462,6 +487,7 @@ print(f"Depth units: {ds['m_depth'].attrs['units']}")
462
487
  ### Working with trajectories
463
488
 
464
489
  ```python
490
+ from pathlib import Path
465
491
  import xarray_dbd as xdbd
466
492
  import matplotlib.pyplot as plt
467
493
 
@@ -482,6 +508,7 @@ plt.show()
482
508
  ### Extracting science data
483
509
 
484
510
  ```python
511
+ from pathlib import Path
485
512
  # Read full resolution science data
486
513
  files = sorted(Path('.').glob('*.ebd'))
487
514
  ds = xdbd.open_multi_dbd_dataset(
@@ -502,7 +529,7 @@ print(df.describe())
502
529
  | Multiple files, < 1 GB | `xdbd.open_multi_dbd_dataset(files, to_keep=[...])` |
503
530
  | Multiple files, large dataset | `xdbd.write_multi_dbd_netcdf(files, "out.nc")` |
504
531
  | Interactive / Jupyter | `xdbd.MultiDBD(filenames=files)` with `.get()` (lazy) |
505
- | Batch processing 1000+ files | `mkone` CLI (multiprocessing) |
532
+ | Batch processing 1000+ files | `xdbd mkone` CLI (multiprocessing) |
506
533
  | Drop-in dbdreader replacement | `import xarray_dbd.dbdreader2 as dbdreader` |
507
534
 
508
535
  ## Slocum File Types
@@ -518,6 +545,50 @@ print(df.describe())
518
545
 
519
546
  Compressed variants (`.?cd`) use LZ4 framing and are handled transparently.
520
547
 
548
+ ## Working with Glider Data
549
+
550
+ ### Discovering available sensors
551
+
552
+ ```python
553
+ import xarray_dbd as xdbd
554
+
555
+ # xarray API
556
+ ds = xdbd.open_dbd_dataset("file.dbd", cache_dir="cache")
557
+ for var in sorted(ds.data_vars):
558
+ print(f" {var:30s} {ds[var].attrs.get('units', '')}")
559
+
560
+ # dbdreader2 API
561
+ dbd = xdbd.MultiDBD(pattern="*.dbd", cacheDir="cache")
562
+ for name in sorted(dbd.parameterNames["eng"]):
563
+ print(f" {name:30s} {dbd.parameterUnits.get(name, '')}")
564
+ ```
565
+
566
+ Sensor naming conventions are documented in
567
+ [TWR's masterdata files](https://gliderfs2.ceoas.oregonstate.edu/gliderweb/masterdata/).
568
+
569
+ ### Time conversion
570
+
571
+ `m_present_time` contains UTC seconds since 1970-01-01 (Unix epoch, float64):
572
+
573
+ ```python
574
+ import pandas as pd
575
+
576
+ time = pd.to_datetime(ds["m_present_time"].values, unit="s", utc=True)
577
+ ```
578
+
579
+ ### Handling fill values
580
+
581
+ Float sensors use NaN for missing data. Integer sensors use sentinel fill
582
+ values (-127 for int8, -32768 for int16). Filter them out:
583
+
584
+ ```python
585
+ # xarray — replace sentinels with NaN
586
+ ds = ds.where(ds != -32768)
587
+
588
+ # dbdreader2 — automatic filtering (default)
589
+ t, v = dbd.get("m_depth") # return_nans=False by default
590
+ ```
591
+
521
592
  ## Known Limitations
522
593
 
523
594
  - **Python 3.10+ required** — uses `from __future__ import annotations` for modern type-hint syntax.
@@ -36,11 +36,11 @@ def measure_command(cmd, desc):
36
36
  break
37
37
  time.sleep(0.01)
38
38
 
39
- # Final check
39
+ # Final check — process may already have exited between the loop and here
40
40
  try:
41
41
  mem_info = process.memory_info()
42
42
  peak_memory = max(peak_memory, mem_info.rss)
43
- except:
43
+ except (psutil.NoSuchProcess, psutil.AccessDenied, OSError):
44
44
  pass
45
45
 
46
46
  except KeyboardInterrupt:
@@ -6,7 +6,7 @@ schema_version: 1
6
6
 
7
7
  context:
8
8
  name: xarray-dbd
9
- version: "0.1.0"
9
+ version: "0.2.7"
10
10
 
11
11
  package:
12
12
  name: ${{ name }}
@@ -37,6 +37,7 @@ requirements:
37
37
  - python >=3.10
38
38
  - numpy >=1.23,<3.0
39
39
  - xarray >=2023.6.0
40
+ - netcdf4 >=1.6
40
41
 
41
42
  tests:
42
43
  - python:
@@ -45,8 +46,9 @@ tests:
45
46
  - xarray_dbd._dbd_cpp
46
47
  pip_check: true
47
48
  - script:
48
- - dbd2nc --help
49
- - mkone --help
49
+ - xdbd --help
50
+ - xdbd 2nc --help
51
+ - xdbd mkone --help
50
52
 
51
53
  about:
52
54
  summary: Efficient xarray backend for reading Slocum ocean glider DBD files
@@ -123,7 +123,7 @@ ColumnDataResult read_columns(std::istream& is,
123
123
  qKeep |= sensor.qCriteria();
124
124
  const int oi = outIndex[i];
125
125
  if (oi >= 0) {
126
- // Copy previous value into current row
126
+ // Copy previous value into current row, converting inf to NaN
127
127
  std::visit([nRows, oi](auto& col_vec, const auto& prev_vec) {
128
128
  using T = typename std::decay_t<decltype(col_vec)>::value_type;
129
129
  using PT = typename std::decay_t<decltype(prev_vec)>::value_type;
@@ -136,7 +136,11 @@ ColumnDataResult read_columns(std::istream& is,
136
136
  else
137
137
  col_vec.resize(col_vec.size() * 2, NAN);
138
138
  }
139
- col_vec[nRows] = prev_vec[0];
139
+ T val = prev_vec[0];
140
+ if constexpr (std::is_floating_point_v<T>) {
141
+ if (std::isinf(val)) val = NAN;
142
+ }
143
+ col_vec[nRows] = val;
140
144
  }
141
145
  }, columns[oi], prevValues[oi]);
142
146
  }
@@ -38,17 +38,20 @@ int DecompressTWRBuf::underflow() {
38
38
  if (!this->mIS.read(frame.data(), n)) { // EOF
39
39
  return std::char_traits<char>::eof();
40
40
  }
41
- const int j = LZ4_decompress_safe(frame.data(), this->mBuffer, static_cast<int>(n), sizeof(this->mBuffer));
41
+ const int j(LZ4_decompress_safe(frame.data(), this->mBuffer, static_cast<int>(n), sizeof(this->mBuffer)));
42
42
  if (j < 0) { // LZ4 decompression error
43
+ LOG_ERROR("LZ4 decompression failed (error {}) in {} (block size {})",
44
+ j, this->mFilename, n);
43
45
  return std::char_traits<char>::eof();
44
46
  }
45
- if (static_cast<size_t>(j) > sizeof(this->mBuffer)) { // Probably a corrupted file
46
- return std::char_traits<char>::eof();
47
- }
48
- this->setg(this->mBuffer, this->mBuffer, this->mBuffer + j);
47
+ const size_t decompressedSize(static_cast<size_t>(j));
48
+ this->setg(this->mBuffer, this->mBuffer, this->mBuffer + decompressedSize);
49
+ this->mPos += decompressedSize;
49
50
  } else { // Not compressed
50
51
  if (this->mIS.read(this->mBuffer, sizeof(this->mBuffer)) || this->mIS.gcount()) {
51
- this->setg(this->mBuffer, this->mBuffer, this->mBuffer + this->mIS.gcount());
52
+ const auto n = this->mIS.gcount();
53
+ this->setg(this->mBuffer, this->mBuffer, this->mBuffer + n);
54
+ this->mPos += static_cast<size_t>(n);
52
55
  } else {
53
56
  return std::char_traits<char>::eof();
54
57
  }
@@ -57,6 +60,18 @@ int DecompressTWRBuf::underflow() {
57
60
  return std::char_traits<char>::to_int_type(*this->gptr());
58
61
  }
59
62
 
63
+ DecompressTWRBuf::pos_type
64
+ DecompressTWRBuf::seekoff(off_type off, std::ios_base::seekdir dir,
65
+ std::ios_base::openmode /*which*/) {
66
+ // Only support tellg(): seekoff(0, cur)
67
+ if (dir == std::ios_base::cur && off == 0) {
68
+ // mPos is total bytes loaded; subtract unread bytes remaining in buffer
69
+ const auto remaining = this->egptr() - this->gptr();
70
+ return static_cast<pos_type>(this->mPos - static_cast<size_t>(remaining));
71
+ }
72
+ return pos_type(off_type(-1)); // Seeking not supported
73
+ }
74
+
60
75
  bool qCompressed(const std::string& fn) {
61
76
  const std::string suffix(fs::path(fn).extension().string());
62
77
  const bool q((suffix.size() == 4) && (std::tolower(static_cast<unsigned char>(suffix[2])) == 'c'));
@@ -9,21 +9,23 @@
9
9
  class DecompressTWRBuf: public std::streambuf {
10
10
  std::ifstream mIS;
11
11
  const bool mqCompressed;
12
- char mBuffer[65536];
12
+ char mBuffer[65536]{}; // zero-init so cppcheck across versions stays happy
13
13
  const std::string mFilename;
14
+ size_t mPos = 0; // Total decompressed bytes loaded into buffer
14
15
  public:
15
16
  DecompressTWRBuf(const std::string& fn, const bool qCompressed)
16
17
  : mIS(fn.c_str(), std::ios::binary)
17
18
  , mqCompressed(qCompressed)
18
19
  , mFilename(fn)
19
-
20
20
  {}
21
21
 
22
- ~DecompressTWRBuf() {mIS.close();}
23
-
24
22
  void close() {mIS.close();}
25
23
 
26
- int underflow();
24
+ int underflow() override;
25
+
26
+ protected:
27
+ pos_type seekoff(off_type off, std::ios_base::seekdir dir,
28
+ std::ios_base::openmode which = std::ios_base::in) override;
27
29
  };
28
30
 
29
31
  class DecompressTWR: public std::istream {
@@ -21,6 +21,10 @@
21
21
  #include "MyException.H"
22
22
  #include "Logger.H"
23
23
  #include <iostream>
24
+ #include <sstream>
25
+ #include <iomanip>
26
+ #include <algorithm>
27
+ #include <limits>
24
28
  #include <cstdlib>
25
29
 
26
30
  namespace {
@@ -35,7 +39,9 @@ namespace {
35
39
  Header::Header(std::istream& is, const char *fn)
36
40
  {
37
41
  size_t cnt = 0;
38
- for (tRecords::size_type nLines(10); mRecords.size() < nLines;) {
42
+ // 14 is the typical ASCII header length in DBD files; num_ascii_tags (if
43
+ // present in the first 14 lines) overrides this with the exact count.
44
+ for (tRecords::size_type nLines(14); mRecords.size() < nLines;) {
39
45
  std::string line;
40
46
  if (!getline(is, line)) {
41
47
  break;
@@ -52,7 +58,10 @@ Header::Header(std::istream& is, const char *fn)
52
58
  mRecords.insert(std::make_pair(key, value));
53
59
  if (key == "num_ascii_tags") {
54
60
  try {
55
- nLines = std::stoi(value);
61
+ const int parsed = std::stoi(value);
62
+ nLines = (parsed > 0 && parsed <= 10000)
63
+ ? static_cast<tRecords::size_type>(parsed)
64
+ : 0;
56
65
  } catch (const std::exception&) {
57
66
  nLines = 0; // Default to 0 on parse error
58
67
  }
@@ -87,19 +96,13 @@ Header::trim(std::string str)
87
96
  {
88
97
  const std::string whitespace(" \t\n");
89
98
 
90
- std::string::size_type index(str.find_first_not_of(whitespace));
91
-
92
- if (index != str.npos) {
93
- str = str.substr(index);
94
- }
95
-
96
- index = str.find_last_not_of(whitespace);
97
-
98
- if (index != str.npos) {
99
- str = str.substr(0, index + 1);
99
+ const std::string::size_type first(str.find_first_not_of(whitespace));
100
+ if (first == str.npos) {
101
+ return std::string();
100
102
  }
101
103
 
102
- return str;
104
+ const std::string::size_type last(str.find_last_not_of(whitespace));
105
+ return str.substr(first, last - first + 1);
103
106
  }
104
107
 
105
108
  void
@@ -126,6 +129,28 @@ Header::qProcessMission(const tMissions& toSkip,
126
129
  return toKeep.empty() || (toKeep.find(mission) != toKeep.end());
127
130
  }
128
131
 
132
+ // Parse a DBD fileopen_time header value (format: "Day_Mon_DD_HH:MM:SS_YYYY")
133
+ // to UTC epoch seconds. Slocum gliders record fileopen_time in UTC, so we
134
+ // use timegm / _mkgmtime rather than mktime (which would apply local TZ).
135
+ // Returns std::numeric_limits<time_t>::max() on empty input or parse failure so the file sorts to
136
+ // the end of a by-time file list.
137
+ time_t
138
+ Header::parseFileOpenTime(const std::string& timeStr)
139
+ {
140
+ if (timeStr.empty()) return std::numeric_limits<time_t>::max();
141
+ std::string s(timeStr);
142
+ std::replace(s.begin(), s.end(), '_', ' ');
143
+ struct tm tm = {};
144
+ std::istringstream iss(s);
145
+ iss >> std::get_time(&tm, "%a %b %d %H:%M:%S %Y");
146
+ if (iss.fail()) return std::numeric_limits<time_t>::max();
147
+ #ifdef _WIN32
148
+ return _mkgmtime(&tm);
149
+ #else
150
+ return timegm(&tm);
151
+ #endif
152
+ }
153
+
129
154
  std::ostream&
130
155
  operator << (std::ostream& os,
131
156
  const Header& hdr)
@@ -7,6 +7,7 @@
7
7
  #include <string>
8
8
  #include <map>
9
9
  #include <set>
10
+ #include <ctime>
10
11
 
11
12
  class Header {
12
13
  private:
@@ -30,6 +31,8 @@ public:
30
31
  static void addMission(std::string name, tMissions& missionList);
31
32
  bool qProcessMission(const tMissions& toSkip, const tMissions& toProcess) const;
32
33
 
34
+ static time_t parseFileOpenTime(const std::string& timeStr);
35
+
33
36
  friend std::ostream& operator << (std::ostream& os, const Header& hdr);
34
37
  }; // Header
35
38