pypff 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. pypff-1.0.0/.coverage +0 -0
  2. pypff-1.0.0/.git +1 -0
  3. pypff-1.0.0/.github/workflows/egg.yml +23 -0
  4. pypff-1.0.0/.gitignore +5 -0
  5. pypff-1.0.0/CLAUDE.md +136 -0
  6. pypff-1.0.0/LICENSE +21 -0
  7. pypff-1.0.0/PKG-INFO +253 -0
  8. pypff-1.0.0/README.md +206 -0
  9. pypff-1.0.0/docs/zarr_v3_spec.md +259 -0
  10. pypff-1.0.0/example/example-data/collect_complete +1 -0
  11. pypff-1.0.0/example/example-data/daq_config.json +13 -0
  12. pypff-1.0.0/example/example-data/data_config.json +10 -0
  13. pypff-1.0.0/example/example-data/hk.pff +73 -0
  14. pypff-1.0.0/example/example-data/hp_stdout_192.168.1.100 +57 -0
  15. pypff-1.0.0/example/example-data/log.txt +125 -0
  16. pypff-1.0.0/example/example-data/obs_config.json +29 -0
  17. pypff-1.0.0/example/example-data/quabo_config_192.168.3.248.json +311 -0
  18. pypff-1.0.0/example/example-data/quabo_config_192.168.3.249.json +311 -0
  19. pypff-1.0.0/example/example-data/quabo_config_192.168.3.250.json +12 -0
  20. pypff-1.0.0/example/example-data/quabo_config_192.168.3.251.json +12 -0
  21. pypff-1.0.0/example/example-data/quabo_config_debug.json +326 -0
  22. pypff-1.0.0/example/example-data/quabo_ph_baseline.json +266 -0
  23. pypff-1.0.0/example/example-data/quabo_uids.json +34 -0
  24. pypff-1.0.0/example/example-data/recording_ended +1 -0
  25. pypff-1.0.0/example/example-data/run_complete +1 -0
  26. pypff-1.0.0/example/example-data/start_2023-06-08T04:30:29Z.dp_img16.bpp_2.module_1.seqno_0.pff +0 -0
  27. pypff-1.0.0/example/example-data/start_2023-08-02T00:39:53Z.dp_ph256.bpp_2.module_254.seqno_0.pff +0 -0
  28. pypff-1.0.0/example/example-data/sw_info.json +6 -0
  29. pypff-1.0.0/example/notebook/data-check.ipynb +202 -0
  30. pypff-1.0.0/example/notebook/start_2025-08-02T05:31:36Z.dp_ph256.bpp_2.module_254.seqno_0.pff +0 -0
  31. pypff-1.0.0/example/pypff_example.py +38 -0
  32. pypff-1.0.0/example/pypff_io2_demo.ipynb +1651 -0
  33. pypff-1.0.0/pyproject.toml +112 -0
  34. pypff-1.0.0/src/ci/Dockerfile.ci +59 -0
  35. pypff-1.0.0/src/ci/legacy_tests/config-data/daq_config.json +13 -0
  36. pypff-1.0.0/src/ci/legacy_tests/config-data/data_config.json +20 -0
  37. pypff-1.0.0/src/ci/legacy_tests/config-data/network_config.json +33 -0
  38. pypff-1.0.0/src/ci/legacy_tests/config-data/obs_config.json +30 -0
  39. pypff-1.0.0/src/ci/legacy_tests/hk-data/hk.pff +215 -0
  40. pypff-1.0.0/src/ci/legacy_tests/sci-data/start_2025-08-02T04-31-52Z.dp_ph256.bpp_2.module_254.seqno_0.pff +0 -0
  41. pypff-1.0.0/src/ci/legacy_tests/test_pypff.py +90 -0
  42. pypff-1.0.0/src/ci/test_io2_new_features.py +87 -0
  43. pypff-1.0.0/src/ci/test_io2_vectorization.py +50 -0
  44. pypff-1.0.0/src/ci/tier1_unit/test_models.py +60 -0
  45. pypff-1.0.0/src/ci/tier1_unit/test_utils.py +49 -0
  46. pypff-1.0.0/src/ci/tier2_logic/test_io.py +67 -0
  47. pypff-1.0.0/src/ci/tier2_logic/test_io2_comprehensive.py +183 -0
  48. pypff-1.0.0/src/ci/tier2_logic/test_io2_streaming.py +373 -0
  49. pypff-1.0.0/src/ci/tier2_logic/test_io_comparison.py +85 -0
  50. pypff-1.0.0/src/ci/tier2_logic/test_io_parsing_methods.py +91 -0
  51. pypff-1.0.0/src/ci/tier2_logic/test_zarr_roundtrip.py +453 -0
  52. pypff-1.0.0/src/pypff/__init__.py +8 -0
  53. pypff-1.0.0/src/pypff/_cli/__init__.py +0 -0
  54. pypff-1.0.0/src/pypff/_cli/profile.py +54 -0
  55. pypff-1.0.0/src/pypff/_cli/root.py +19 -0
  56. pypff-1.0.0/src/pypff/_cli/test.py +57 -0
  57. pypff-1.0.0/src/pypff/_cli/zarr.py +46 -0
  58. pypff-1.0.0/src/pypff/cli.py +55 -0
  59. pypff-1.0.0/src/pypff/io.py +336 -0
  60. pypff-1.0.0/src/pypff/io2.py +1234 -0
  61. pypff-1.0.0/src/pypff/models.py +438 -0
  62. pypff-1.0.0/src/pypff/pixelmap.py +74 -0
  63. pypff-1.0.0/src/pypff/pixelmap_maroc2phys_bga.py +1156 -0
  64. pypff-1.0.0/src/pypff/pixelmap_maroc2phys_qfp.py +1156 -0
  65. pypff-1.0.0/src/pypff/pixelmap_phys2maroc_bga.py +1060 -0
  66. pypff-1.0.0/src/pypff/pixelmap_phys2maroc_qfp.py +1060 -0
  67. pypff-1.0.0/src/pypff/profiling.py +192 -0
  68. pypff-1.0.0/src/pypff/util/__init__.py +0 -0
  69. pypff-1.0.0/src/pypff/util/cli.py +160 -0
  70. pypff-1.0.0/src/pypff/utils.py +61 -0
  71. pypff-1.0.0/src/pypff/zarr/__init__.py +518 -0
  72. pypff-1.0.0/src/pypff/zarr/_reader.py +231 -0
  73. pypff-1.0.0/uv.lock +2051 -0
pypff-1.0.0/.coverage ADDED
Binary file
pypff-1.0.0/.git ADDED
@@ -0,0 +1 @@
1
+ gitdir: ../.git/modules/pypff
pypff-1.0.0/.github/workflows/egg.yml ADDED
@@ -0,0 +1,23 @@
1
+ name: pypff-CI
2
+
3
+ on:
4
+   push:
5
+     branches: ["dev", "opt", "master"]
6
+
7
+ jobs:
8
+   pypff-test:
9
+     name: pypff-test
10
+     runs-on: ubuntu-latest
11
+
12
+     steps:
13
+       - name: clone pypff
14
+         uses: actions/checkout@v4
15
+
16
+       - name: Install uv
17
+         uses: astral-sh/setup-uv@v5
18
+
19
+       - name: Install dependencies
20
+         run: uv sync --all-extras
21
+
22
+       - name: run test
23
+         run: uv run pypff test all --cov
pypff-1.0.0/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ __pycache__
2
+ .DS_Store
3
+ *.egg-info/
4
+ **/.ipynb_checkpoints**
5
+ **__pycache__**
pypff-1.0.0/CLAUDE.md ADDED
@@ -0,0 +1,136 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ `pypff` is a Python I/O library for PanoSETI File Format (PFF) data files — binary science data produced by the PanoSETI telescope array. It provides zero-copy mmap-based access, Pydantic-validated headers, and a high-level run discovery API.
8
+
9
+ ## Commands
10
+
11
+ All commands assume `uv sync` has been run first.
12
+
13
+ ```bash
14
+ uv sync # Install all dependencies
15
+ uv run pypff test all # Run full test suite (unit + logic + legacy)
16
+ uv run pypff test unit # Tier 1 unit tests only
17
+ uv run pypff test logic # Tier 2 logic/IO tests only
18
+ uv run pypff test legacy # Legacy integration tests only
19
+ uv run pypff test all --lint # Also run Ruff + MyPy
20
+ uv run pypff test all --cov # With coverage reporting
21
+ ```
22
+
23
+ Run a single test file directly:
24
+ ```bash
25
+ uv run pytest src/ci/tier1_unit/test_utils.py
26
+ uv run pytest src/ci/tier2_logic/test_io.py::test_name
27
+ ```
28
+
29
+ Lint manually:
30
+ ```bash
31
+ uv run ruff check .
32
+ uv run mypy src
33
+ ```
34
+
35
+ ## Architecture
36
+
37
+ ### Two-Layer I/O Design
38
+
39
+ **Legacy layer (`src/pypff/io.py`):** Original byte-offset based reader. Uses hardcoded byte positions (`loc_arr`) to extract metadata fields from fixed-format PFF JSON headers. Exposes `datapff`, `hkpff`, `qconfig`. Kept for backward compatibility.
40
+
41
+ **Modern layer (`src/pypff/io2.py`):** `PFFSequence` and `PanosetiRun` — the primary API. `PFFSequence` wraps one or more sequential `.pff` files for a single data product, providing:
42
+ - Streaming-by-default API: `for img in seq` and `for batch in seq.iter_batches(size=256)` are zero-copy within a single file.
43
+ - `iter_byte_range(file_idx, byte_start, byte_end, batch_size)` for distributed chunked reads (Nextflow/Dask workers); see the partitioning sketch at the end of this subsection.
44
+ - `read_images(indices)` for sorted-with-inverse-permutation random access; `read_images_range(start, count)` for sequential bulk.
45
+ - `seq[i]` returns a zero-copy view; `seq[start:stop:step]` uses `read_images`.
46
+ - Single-pass metadata extraction via NumPy composite structured dtype (`get_metadata_arrays`). Supports virtual `unix_t_ns` key.
47
+ - `timestamps(indices=None)` returns cached `int64` ns array; `timestamps(as_datetime=True)` returns a zero-copy `datetime64[ns]` view for matplotlib/pandas. `timestamp_at(i)` and `seek_time(ns)` use a two-level binary search (file bounds, then within-file).
48
+ - LRU-bounded mmap handles (`_MmapLRU`, default capacity 16). `PFFSequence` is a context manager.
49
+ - Pickle-safe for multiprocessing (handles dropped on `__getstate__`, lazily reopened).
50
+ - `get_frame(i)` returns `(dict, ndarray_view)` by default; `get_frame_validated(i)` returns `(QuaboHeader|ModuleHeader, ndarray_view)`.
51
+
52
+ `PanosetiRun` is a lazy-loaded directory scanner over a `.pffd` run directory. It discovers and groups `.pff` files by data product and exposes typed config loading via Pydantic models.
53
+
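+ As a rough illustration of the distributed read path, the helper below splits one file into naive byte ranges for parallel workers. It is not part of pypff, and it assumes (as the feature descriptions suggest) that `iter_byte_range` takes care of frame alignment for arbitrary byte bounds.
+
+ ```python
+ import os
+
+ def byte_partitions(pff_path: str, n_workers: int) -> list[tuple[int, int]]:
+     """Illustrative helper: contiguous (byte_start, byte_end) ranges, one per worker."""
+     size = os.path.getsize(pff_path)
+     step = max(1, size // n_workers)
+     bounds = [(i * step, min((i + 1) * step, size)) for i in range(n_workers)]
+     bounds[-1] = (bounds[-1][0], size)  # last worker absorbs the remainder
+     return bounds
+
+ # Each worker then streams only its slice of file 0:
+ # for batch in seq.iter_byte_range(file_idx=0, byte_start=lo, byte_end=hi, batch_size=256):
+ #     process(batch)
+ ```
+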
54
+ ### Models (`src/pypff/models.py`)
55
+
56
+ Pydantic v2 models validate all PFF headers and PANOSETI config JSON files. `BaseStrictModel` (`extra='forbid'`) is the base for all models to catch config key typos. Key models: `PFFHeader`, `QuaboHeader`, `ModuleHeader`, `FrameConfig`.
57
+
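+ The strict-base pattern, in miniature (the `ExampleConfig` fields below are invented for illustration; the real schemas live in `models.py`):
+
+ ```python
+ from pydantic import BaseModel, ConfigDict
+
+ class BaseStrictModel(BaseModel):
+     model_config = ConfigDict(extra="forbid")  # unknown keys raise ValidationError
+
+ class ExampleConfig(BaseStrictModel):  # hypothetical model, for illustration only
+     run_name: str
+     integration_time_us: int
+
+ ExampleConfig(run_name="obs1", integration_time_us=20)      # ok
+ # ExampleConfig(run_name="obs1", integration_time_uss=20)   # typo'd key -> ValidationError
+ ```
+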
58
+ ### Timing
59
+
60
+ All timestamps are **`int64` nanoseconds since the Unix epoch** — never Python floats. Float64 has only ~15–16 significant digits; a Unix timestamp in nanoseconds is ~19 digits, so `float(ns) / 1e9` loses nanosecond precision at the point of division.
61
+
62
+ - `timestamps()` → `np.ndarray[int64]` — for arithmetic, diffs, and storage.
63
+ - `timestamps(as_datetime=True)` → `np.ndarray[datetime64[ns]]` — zero-copy view of the same `int64` data; natively understood by matplotlib date axes, pandas, and xarray. Use for display only.
64
+ - `timestamp_at(i)` → `int` — single frame, nanoseconds.
65
+
66
+ **Integer-space epoch rule:** when converting to float seconds for plotting, subtract the reference epoch first in integer space, then divide — the resulting relative values are small so float64 retains sub-nanosecond resolution:
67
+ ```python
68
+ # CORRECT — subtract first (int64), then divide (small values, no precision loss)
69
+ rel_s = (times_ns - t0_ns) / 1e9
70
+
71
+ # WRONG — 1.7e18 ÷ 1e9 ≈ 1.7e9 s, only ~6 decimal digits remain after the decimal
72
+ times_ns / 1e9 - t0_s
73
+ ```
74
+
75
+ `utils.get_precise_time_ns()` reconciles `tv_sec`/`tv_usec` (system clock) against `pkt_nsec`/`pkt_tai` (quabo hardware clock) using 10-bit TAI counter wraparound logic.
76
+
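+ The wraparound idea, sketched below (not the actual implementation: the helper name is made up, and it ignores the TAI-UTC offset and edge handling):
+
+ ```python
+ def reconcile_ns_sketch(tv_sec: int, tv_usec: int, pkt_tai: int, pkt_nsec: int) -> int:
+     """Pick the wrap cycle of the 10-bit pkt_tai counter closest to the system clock."""
+     WRAP = 1024                       # 10-bit seconds counter wraps every 1024 s
+     sys_s = tv_sec + tv_usec * 1e-6   # coarse system-clock time (float is fine for comparison)
+     base = tv_sec - (tv_sec % WRAP)   # start of the wrap cycle containing tv_sec
+     candidates = (base - WRAP + pkt_tai, base + pkt_tai, base + WRAP + pkt_tai)
+     hw_sec = min(candidates, key=lambda s: abs(s - sys_s))
+     return hw_sec * 1_000_000_000 + int(pkt_nsec)
+ ```
+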
77
+ ### Pixel Maps
78
+
79
+ `src/pypff/pixelmap.py` and the four `pixelmap_*.py` files provide MAROC↔physical pixel coordinate conversions for BGA and QFP package variants. These are static lookup tables.
80
+
81
+ ### Zarr v3 Conversion (`src/pypff/zarr/`, optional extra `pypff[zarr]`)
82
+
83
+ Install with `uv sync --extra zarr`. The zarr extra requires zarr-python ≥ 3, xarray, and dask.
84
+
85
+ **Conversion:**
86
+ ```bash
87
+ uv run pypff zarr <obs.pffd> <out_dir> # CLI
88
+ ```
89
+ ```python
90
+ from pypff import PanosetiRun
91
+ from pypff.zarr import convert_run
92
+ stores = convert_run(PanosetiRun("obs.pffd"), "out/")
93
+ ```
94
+
95
+ **Reading:**
96
+ ```python
97
+ from pypff.zarr import PanosetiZarrRun
98
+ zrun = PanosetiZarrRun("out/")
99
+ store = zrun.get_product("dp_ph256.bpp_2.module_254")
100
+ store.timestamps() # int64 ns
101
+ store.to_dataset() # xarray.Dataset — images + unix_t_ns + all header fields
102
+ ```
103
+
104
+ **Key design decisions (see [`docs/zarr_v3_spec.md`](docs/zarr_v3_spec.md) for the full spec):**
105
+ - **Flat root layout**: all arrays (`images`, `unix_t_ns`, header fields) live at the zarr store root — no sub-groups. `xr.open_zarr(store)` surfaces every variable automatically. Sub-groups are invisible to `xr.open_zarr` without `group=`, so the hierarchical layout was rejected.
106
+ - **Module-level header naming**: `quabo_0.pkt_num` → `quabo_0_pkt_num` (dots replaced by underscores) for xarray compatibility.
107
+ - **Discoverability attrs**: `header_fields` and `quabo_fields` lists in root attrs let consumers separate image columns from metadata columns without re-deriving naming conventions.
108
+ - **No consolidated metadata**: `zarr.consolidate_metadata()` is not called by default — it is non-spec for Zarr v3 and emits `ZarrUserWarning`. Use `xr.open_zarr(store, consolidated=False)`.
109
+ - **Sidecar bundle**: `convert_run` writes a sibling `<run>.panoseti-meta/` directory with all non-PFF ancillary files (configs/, logs/, hk.pff, sentinels/). Run configs are also embedded in each store's root attrs under `run_configs`.
110
+
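+ A short reading sketch that leans on the flat layout and the discoverability attrs above (the store path and attr contents are illustrative):
+
+ ```python
+ import xarray as xr
+
+ ds = xr.open_zarr("out/dp_ph256.bpp_2.module_254.zarr", consolidated=False)
+ header_cols = list(ds.attrs.get("header_fields", []))  # e.g. ["pkt_num", "pkt_nsec", ...]
+ quabo_cols = list(ds.attrs.get("quabo_fields", []))    # e.g. ["quabo_0_pkt_num", ...]
+
+ images = ds["images"]                   # (time, H, W)
+ headers = ds[header_cols + quabo_cols]  # per-frame metadata as its own Dataset
+ ```
+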
111
+ **Key files:**
112
+ - `src/pypff/zarr/__init__.py` — `ZarrWriter` protocol, `ZarrPythonWriter`, `PFFToZarrConverter`, `convert_run`
113
+ - `src/pypff/zarr/_reader.py` — `PanosetiZarrStore`, `PanosetiZarrRun`, `open_zarr_run`
114
+ - `docs/zarr_v3_spec.md` — full store layout specification
115
+
116
+ ### CLI (`src/pypff/cli.py`, `src/pypff/_cli/`)
117
+
118
+ Built with `typer`. Entry point is `pypff` (defined in `pyproject.toml`). Sub-commands live in `src/pypff/_cli/`. The `test` sub-command (`_cli/test.py`) delegates to `pytest` via `subprocess`. The `zarr` sub-command (`_cli/zarr.py`) wraps `convert_run`.
119
+
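+ The delegation pattern looks roughly like this (a standalone sketch, not the actual `_cli/test.py`; the command name, option, and test path are illustrative):
+
+ ```python
+ import subprocess
+ import sys
+
+ import typer
+
+ app = typer.Typer()
+
+ @app.command()
+ def unit(cov: bool = typer.Option(False, "--cov", help="Collect coverage")) -> None:
+     """Run tier-1 unit tests by shelling out to pytest."""
+     cmd = [sys.executable, "-m", "pytest", "src/ci/tier1_unit"]
+     if cov:
+         cmd.append("--cov=pypff")
+     raise typer.Exit(code=subprocess.call(cmd))
+ ```
+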
120
+ ### Test Structure
121
+
122
+ ```
123
+ src/ci/
124
+   tier1_unit/    # Fast unit tests (models, utils)
125
+   tier2_logic/   # IO correctness, slicing, concurrency
126
+   legacy_tests/  # Original test suite against sample .pff files
127
+ ```
128
+
129
+ Test data for legacy tests lives alongside the tests in `src/ci/legacy_tests/{hk-data,sci-data,config-data}/`.
130
+
131
+ ## Key Conventions
132
+
133
+ - `ruff` is the linter/formatter; `mypy` runs in strict mode with pydantic plugin.
134
+ - The ruff line-length limit is 100, but E501 is ignored, so line length is not hard-enforced; strictness comes primarily from mypy's strict mode.
135
+ - `orjson` (not `json`) is used for all JSON parsing in the hot path — it's faster and handles bytes directly.
136
+ - Data product names follow the pattern `dp_<type>.bpp_<bits>.module_<id>`, e.g. `dp_img16.bpp_2.module_1`.
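+ For reference, the pattern can be unpacked with a regular expression (illustrative only):
+
+ ```python
+ import re
+
+ name = "dp_img16.bpp_2.module_1"
+ m = re.fullmatch(r"dp_(?P<type>\w+)\.bpp_(?P<bpp>\d+)\.module_(?P<module>\d+)", name)
+ assert m is not None
+ dp_type, bpp, module = m["type"], int(m["bpp"]), int(m["module"])  # 'img16', 2, 1
+ ```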
pypff-1.0.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 PANOSETI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pypff-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,253 @@
1
+ Metadata-Version: 2.4
2
+ Name: pypff
3
+ Version: 1.0.0
4
+ Summary: High-performance PanoSETI File Format (PFF) I/O library
5
+ Project-URL: Homepage, https://github.com/panoseti/pypff
6
+ Project-URL: Repository, https://github.com/panoseti/pypff
7
+ Project-URL: Issues, https://github.com/panoseti/pypff/issues
8
+ Author-email: Wei Liu <liuwei_berkeley@berkeley.edu>, Nicolas Rault-Wang <nraultwang@berkeley.edu>
9
+ License-File: LICENSE
10
+ Keywords: astronomy,dask,high-performance,io,panoseti,pff,xarray,zarr
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Programming Language :: Python :: 3.14
18
+ Classifier: Topic :: Scientific/Engineering :: Astronomy
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: astropy
21
+ Requires-Dist: dask[distributed]>=2024.1
22
+ Requires-Dist: matplotlib
23
+ Requires-Dist: numcodecs>=0.12
24
+ Requires-Dist: numpy>=1.24.0
25
+ Requires-Dist: orjson>=3.9.0
26
+ Requires-Dist: plotly
27
+ Requires-Dist: pydantic>=2.5.0
28
+ Requires-Dist: rich>=13.0.0
29
+ Requires-Dist: scipy
30
+ Requires-Dist: tqdm
31
+ Requires-Dist: typer>=0.9.0
32
+ Requires-Dist: xarray>=2024.1
33
+ Requires-Dist: zarr>=3.0
34
+ Provides-Extra: dev
35
+ Requires-Dist: dask; extra == 'dev'
36
+ Requires-Dist: hatchling; extra == 'dev'
37
+ Requires-Dist: mypy; extra == 'dev'
38
+ Requires-Dist: pytest-asyncio; extra == 'dev'
39
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
40
+ Requires-Dist: ruff; extra == 'dev'
41
+ Provides-Extra: zarr
42
+ Requires-Dist: dask[distributed]>=2024.1; extra == 'zarr'
43
+ Requires-Dist: numcodecs>=0.12; extra == 'zarr'
44
+ Requires-Dist: xarray>=2024.1; extra == 'zarr'
45
+ Requires-Dist: zarr>=3.0; extra == 'zarr'
46
+ Description-Content-Type: text/markdown
47
+
48
+ # pypff - High-performance PanoSETI I/O Library
49
+
50
+ [![pypff-CI](https://github.com/panoseti/pypff/actions/workflows/egg.yml/badge.svg)](https://github.com/panoseti/pypff/actions)
51
+ [![Version](https://img.shields.io/badge/version-1.0.0-blue)](https://github.com/panoseti/pypff)
52
+ [![Python](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
53
+ [![Coverage](https://img.shields.io/badge/coverage-65%25-green)](https://github.com/panoseti/pypff)
54
+ [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
55
+
56
+ A high-performance Python package for reading and analyzing data files generated by PanoSETI (PanoSETI File Format - PFF).
57
+
58
+ ## Features
59
+ - **Streaming by default:** `iter_batches(size=256)` and `__iter__` yield zero-copy views without materializing the full sequence into RAM — safe for Jupyter and HPC pipelines alike.
60
+ - **Distributed chunked reads:** `iter_byte_range(file_idx, byte_start, byte_end)` lets Dask/Nextflow workers parse frame-aligned byte ranges in parallel without coordination.
61
+ - **Zero-copy random access:** `seq[i]` returns a strided mmap view; slicing and `read_images(indices)` use a sort + inverse-permutation for disk locality.
62
+ - **Single-pass metadata:** `get_metadata_arrays(keys)` extracts any number of header fields in one `np.frombuffer` pass per file via a composite NumPy structured dtype. Supports virtual `unix_t_ns` key.
63
+ - **Nanosecond-precise timestamps:** `timestamps()` returns `int64` ns (no float precision loss); `timestamps(as_datetime=True)` returns a zero-copy `datetime64[ns]` view for matplotlib and pandas.
64
+ - **PFF → Zarr v3 conversion:** `pypff[zarr]` optional extra converts any `.pffd` run to Zarr v3 stores readable by xarray, dask, numpy, TensorStore, and Julia — lossless, compressed, HPC/ML-ready. See [Zarr v3 spec](docs/zarr_v3_spec.md).
65
+ - **Bounded resources:** LRU mmap handle cache (default 16 files); `PFFSequence` is a context manager.
66
+ - **Multiprocessing-safe:** pickle-compatible — file handles are dropped on serialisation and lazily reopened in workers.
67
+ - **Pydantic validation:** strict schema validation for all PFF headers and PANOSETI config files.
68
+ - **Run discovery:** `PanosetiRun` lazily scans a `.pffd` directory and exposes typed configs, housekeeping, and data products.
69
+
70
+ ## Installation
71
+ The package uses `uv` for dependency management.
72
+
73
+ ```bash
74
+ cd pypff
75
+ uv sync # core library
76
+ uv sync --extra zarr # + Zarr v3 conversion (zarr-python, xarray, dask)
77
+ ```
78
+
79
+ ## Quick Start
80
+
81
+ ### Run discovery
82
+ ```python
83
+ from pypff import PanosetiRun
84
+
85
+ run = PanosetiRun("path/to/run_directory.pffd")
86
+ run.show() # pretty-print structure
87
+
88
+ print(run.list_products()) # ['dp_img16.bpp_2.module_1', ...]
89
+ seq = run.get_product("dp_img16.bpp_2.module_1")
90
+
91
+ obs_cfg = run.get_config("obs_config") # returns Pydantic model
92
+ data_cfg = run.get_config("data_config")
93
+ hk = run.get_hk() # dict[device, dict[field, np.ndarray]]
94
+ ```
95
+
96
+ ### Streaming (memory-bounded)
97
+ ```python
98
+ # Iterate frame-by-frame (zero-copy views into mmap)
99
+ for img in seq:
100
+     process(img)
101
+
102
+ # Batch iteration — never holds more than one batch in RAM
103
+ for batch in seq.iter_batches(size=256):
104
+     batch # shape (256, H, W), dtype matches file
105
+
106
+ # With timestamps and headers in one pass
107
+ for batch, ts in seq.iter_batches(size=256, with_timestamps=True):
108
+     ts # int64 nanoseconds, shape (256,)
109
+
110
+ # Distributed byte-range reads (Dask / Nextflow workers)
111
+ for batch in seq.iter_byte_range(file_idx=0, byte_start=0, byte_end=size):
112
+     ...
113
+ ```
114
+
115
+ ### Random access and slicing
116
+ ```python
117
+ import numpy as np
+
+ img = seq[42] # zero-copy view, shape (H, W)
118
+ imgs = seq[0:100:2] # every other frame, shape (50, H, W)
119
+ imgs = seq.read_images(np.array([5, 0, 3])) # unsorted — sorted internally for locality
120
+ imgs = seq.read_images_range(start=0, count=500)
121
+ ```
122
+
123
+ ### Timestamps — always `int64` nanoseconds
124
+ ```python
125
+ # Full cached array — int64 ns, no float precision loss
126
+ ts = seq.timestamps() # np.ndarray[int64]
127
+
128
+ # Zero-copy datetime64[ns] view for matplotlib / pandas
129
+ ts_dt = seq.timestamps(as_datetime=True) # np.ndarray[datetime64[ns]]
130
+
131
+ # Indexed subset
132
+ ts_sub = seq.timestamps(indices=np.arange(0, len(seq), 100))
133
+
134
+ # Single frame
135
+ t_ns = seq.timestamp_at(42) # int, nanoseconds
136
+
137
+ # Time-based navigation
138
+ idx = seq.seek_time(t_ns + 1_000_000_000) # 1 s later
139
+
140
+ # Arithmetic rule: subtract epoch in integer space before dividing
141
+ rel_s = (ts - ts[0]) / 1e9 # CORRECT — small values, no precision loss
142
+ # ts / 1e9 - t0_s # WRONG — loses ns precision at ~1.7e9 s
143
+ ```
144
+
145
+ ### Metadata extraction (single pass)
146
+ ```python
147
+ # One np.frombuffer call per file regardless of number of keys
148
+ meta = seq.get_metadata_arrays(["pkt_num", "tv_sec", "unix_t_ns"])
149
+ meta["unix_t_ns"] # int64 ns, same as timestamps()
150
+ meta["pkt_num"] # int64
151
+
152
+ # All fields at once
153
+ all_meta = seq.get_all_metadata()
154
+ ```
155
+
156
+ ### Headers and Pydantic validation
157
+ ```python
158
+ # Fast path — raw dict, no Pydantic overhead
159
+ header, img = seq.get_frame(0)
160
+ header["pkt_num"] # quabo file
161
+ header["quabo_0"]["pkt_tai"] # module file
162
+
163
+ # Validated path — full Pydantic model
164
+ from pypff.models import ModuleHeader
165
+ hdr, img = seq.get_frame_validated(0)
166
+ assert isinstance(hdr, ModuleHeader)
167
+ ```
168
+
169
+ ### Context manager and multiprocessing
170
+ ```python
171
+ # Deterministic handle cleanup
172
+ with run.get_product("dp_ph256.bpp_2.module_254") as seq:
173
+     data = seq.read_images_range(0, 1000)
174
+
175
+ # PFFSequence is pickle-safe — handles are dropped on serialisation
176
+ import concurrent.futures
+
+ def frame_sum(s, i):          # named top-level function; a lambda can't be pickled to worker processes
+     return s[i].sum()
+
+ with concurrent.futures.ProcessPoolExecutor() as ex:
+     futures = [ex.submit(frame_sum, seq, i) for i in range(len(seq))]
179
+ ```
180
+
181
+ ## PFF → Zarr v3 conversion (`pypff[zarr]`)
182
+
183
+ Convert a `.pffd` observation run to Zarr v3 stores readable by xarray, dask,
184
+ numpy, TensorStore, Julia, and Rust. See [docs/zarr_v3_spec.md](docs/zarr_v3_spec.md)
185
+ for the full layout specification.
186
+
187
+ ### Write
188
+
189
+ ```python
190
+ from pypff.io2 import PanosetiRun
191
+ from pypff.zarr import convert_run
192
+
193
+ run = PanosetiRun("path/to/obs.pffd")
194
+ stores = convert_run(run, "output/L0_zarr")
195
+ # → one .zarr per (data_product, module)
196
+ # → one .panoseti-meta/ sidecar bundle (configs, logs, hk.pff)
197
+ ```
198
+
199
+ Or via the CLI:
200
+
201
+ ```bash
202
+ uv run pypff zarr path/to/obs.pffd output/L0_zarr
203
+ ```
204
+
205
+ ### Read
206
+
207
+ ```python
208
+ from pypff.zarr import PanosetiZarrRun
209
+ import xarray as xr
210
+
211
+ # High-level wrapper (mirrors PanosetiRun)
212
+ zrun = PanosetiZarrRun("output/L0_zarr")
213
+ store = zrun.get_product("dp_ph256.bpp_2.module_254")
214
+ ts = store.timestamps() # int64 ns array
215
+ ds = store.to_dataset() # xarray.Dataset
216
+ cfgs = zrun.configs # parsed run configs
217
+
218
+ # Or open directly with xarray — all arrays visible as variables
219
+ ds = xr.open_zarr(str(stores[0]), consolidated=False)
220
+ # ds.images (T, H, W) int16 / uint16
221
+ # ds.unix_t_ns (T,) int64 — nanosecond timestamps
222
+ # ds.pkt_num (T,) uint32 ─┐ per-frame header fields
223
+ # ds.pkt_nsec (T,) uint32 │ (single-level: ph256)
224
+ # ds.quabo_num (T,) uint8 ─┘
225
+ # ds.quabo_0_pkt_num (T,) uint32 ─┐ module-level headers
226
+ # … ─┘ (img16, ph1024)
227
+ ```
228
+
229
+ ### Zarr store layout
230
+
231
+ All arrays live at the **root** of each store (no sub-groups), so
232
+ `xr.open_zarr(store)` surfaces every variable — images, timestamps, and all
233
+ header fields — as a single Dataset aligned on the shared `time` dimension.
234
+ Logical grouping of headers is expressed via `header_fields` and `quabo_fields`
235
+ root attributes. Full specification: [docs/zarr_v3_spec.md](docs/zarr_v3_spec.md).
236
+
237
+ ## Testing
238
+ Run the test suite via the built-in CLI:
239
+
240
+ ```bash
241
+ uv run pypff test all
242
+ ```
243
+
244
+ The test suite includes:
245
+ - **Tier 1 (Unit):** Basic logic and timing tests.
246
+ - **Tier 2 (Logic):** Higher-level I/O, slicing, and concurrency tests.
247
+ - **Legacy Integration:** The original `pypff` test suite using provided sample data.
248
+
249
+ ## Dockerized CI
250
+ Build the CI environment:
251
+ ```bash
252
+ docker build -t pypff-ci -f src/ci/Dockerfile.ci .
253
+ ```
pypff-1.0.0/README.md ADDED
@@ -0,0 +1,206 @@
1
+ # pypff - High-performance PanoSETI I/O Library
2
+
3
+ [![pypff-CI](https://github.com/panoseti/pypff/actions/workflows/egg.yml/badge.svg)](https://github.com/panoseti/pypff/actions)
4
+ [![Version](https://img.shields.io/badge/version-1.0.0-blue)](https://github.com/panoseti/pypff)
5
+ [![Python](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
6
+ [![Coverage](https://img.shields.io/badge/coverage-65%25-green)](https://github.com/panoseti/pypff)
7
+ [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
8
+
9
+ A high-performance Python package for reading and analyzing data files generated by PanoSETI (PanoSETI File Format - PFF).
10
+
11
+ ## Features
12
+ - **Streaming by default:** `iter_batches(size=256)` and `__iter__` yield zero-copy views without materializing the full sequence into RAM — safe for Jupyter and HPC pipelines alike.
13
+ - **Distributed chunked reads:** `iter_byte_range(file_idx, byte_start, byte_end)` lets Dask/Nextflow workers parse frame-aligned byte ranges in parallel without coordination.
14
+ - **Zero-copy random access:** `seq[i]` returns a strided mmap view; slicing and `read_images(indices)` use a sort + inverse-permutation for disk locality.
15
+ - **Single-pass metadata:** `get_metadata_arrays(keys)` extracts any number of header fields in one `np.frombuffer` pass per file via a composite NumPy structured dtype. Supports virtual `unix_t_ns` key.
16
+ - **Nanosecond-precise timestamps:** `timestamps()` returns `int64` ns (no float precision loss); `timestamps(as_datetime=True)` returns a zero-copy `datetime64[ns]` view for matplotlib and pandas.
17
+ - **PFF → Zarr v3 conversion:** `pypff[zarr]` optional extra converts any `.pffd` run to Zarr v3 stores readable by xarray, dask, numpy, TensorStore, and Julia — lossless, compressed, HPC/ML-ready. See [Zarr v3 spec](docs/zarr_v3_spec.md).
18
+ - **Bounded resources:** LRU mmap handle cache (default 16 files); `PFFSequence` is a context manager.
19
+ - **Multiprocessing-safe:** pickle-compatible — file handles are dropped on serialisation and lazily reopened in workers.
20
+ - **Pydantic validation:** strict schema validation for all PFF headers and PANOSETI config files.
21
+ - **Run discovery:** `PanosetiRun` lazily scans a `.pffd` directory and exposes typed configs, housekeeping, and data products.
22
+
23
+ ## Installation
24
+ The package uses `uv` for dependency management.
25
+
26
+ ```bash
27
+ cd pypff
28
+ uv sync # core library
29
+ uv sync --extra zarr # + Zarr v3 conversion (zarr-python, xarray, dask)
30
+ ```
31
+
32
+ ## Quick Start
33
+
34
+ ### Run discovery
35
+ ```python
36
+ from pypff import PanosetiRun
37
+
38
+ run = PanosetiRun("path/to/run_directory.pffd")
39
+ run.show() # pretty-print structure
40
+
41
+ print(run.list_products()) # ['dp_img16.bpp_2.module_1', ...]
42
+ seq = run.get_product("dp_img16.bpp_2.module_1")
43
+
44
+ obs_cfg = run.get_config("obs_config") # returns Pydantic model
45
+ data_cfg = run.get_config("data_config")
46
+ hk = run.get_hk() # dict[device, dict[field, np.ndarray]]
47
+ ```
48
+
49
+ ### Streaming (memory-bounded)
50
+ ```python
51
+ # Iterate frame-by-frame (zero-copy views into mmap)
52
+ for img in seq:
53
+     process(img)
54
+
55
+ # Batch iteration — never holds more than one batch in RAM
56
+ for batch in seq.iter_batches(size=256):
57
+     batch # shape (256, H, W), dtype matches file
58
+
59
+ # With timestamps and headers in one pass
60
+ for batch, ts in seq.iter_batches(size=256, with_timestamps=True):
61
+     ts # int64 nanoseconds, shape (256,)
62
+
63
+ # Distributed byte-range reads (Dask / Nextflow workers)
64
+ for batch in seq.iter_byte_range(file_idx=0, byte_start=0, byte_end=size):
65
+     ...
66
+ ```
67
+
68
+ ### Random access and slicing
69
+ ```python
70
+ import numpy as np
+
+ img = seq[42] # zero-copy view, shape (H, W)
71
+ imgs = seq[0:100:2] # every other frame, shape (50, H, W)
72
+ imgs = seq.read_images(np.array([5, 0, 3])) # unsorted — sorted internally for locality
73
+ imgs = seq.read_images_range(start=0, count=500)
74
+ ```
75
+
76
+ ### Timestamps — always `int64` nanoseconds
77
+ ```python
78
+ # Full cached array — int64 ns, no float precision loss
79
+ ts = seq.timestamps() # np.ndarray[int64]
80
+
81
+ # Zero-copy datetime64[ns] view for matplotlib / pandas
82
+ ts_dt = seq.timestamps(as_datetime=True) # np.ndarray[datetime64[ns]]
83
+
84
+ # Indexed subset
85
+ ts_sub = seq.timestamps(indices=np.arange(0, len(seq), 100))
86
+
87
+ # Single frame
88
+ t_ns = seq.timestamp_at(42) # int, nanoseconds
89
+
90
+ # Time-based navigation
91
+ idx = seq.seek_time(t_ns + 1_000_000_000) # 1 s later
92
+
93
+ # Arithmetic rule: subtract epoch in integer space before dividing
94
+ rel_s = (ts - ts[0]) / 1e9 # CORRECT — small values, no precision loss
95
+ # ts / 1e9 - t0_s # WRONG — loses ns precision at ~1.7e9 s
96
+ ```
97
+
98
+ ### Metadata extraction (single pass)
99
+ ```python
100
+ # One np.frombuffer call per file regardless of number of keys
101
+ meta = seq.get_metadata_arrays(["pkt_num", "tv_sec", "unix_t_ns"])
102
+ meta["unix_t_ns"] # int64 ns, same as timestamps()
103
+ meta["pkt_num"] # int64
104
+
105
+ # All fields at once
106
+ all_meta = seq.get_all_metadata()
107
+ ```
108
+
109
+ ### Headers and Pydantic validation
110
+ ```python
111
+ # Fast path — raw dict, no Pydantic overhead
112
+ header, img = seq.get_frame(0)
113
+ header["pkt_num"] # quabo file
114
+ header["quabo_0"]["pkt_tai"] # module file
115
+
116
+ # Validated path — full Pydantic model
117
+ from pypff.models import ModuleHeader
118
+ hdr, img = seq.get_frame_validated(0)
119
+ assert isinstance(hdr, ModuleHeader)
120
+ ```
121
+
122
+ ### Context manager and multiprocessing
123
+ ```python
124
+ # Deterministic handle cleanup
125
+ with run.get_product("dp_ph256.bpp_2.module_254") as seq:
126
+     data = seq.read_images_range(0, 1000)
127
+
128
+ # PFFSequence is pickle-safe — handles are dropped on serialisation
129
+ import concurrent.futures
+
+ def frame_sum(s, i):          # named top-level function; a lambda can't be pickled to worker processes
+     return s[i].sum()
+
+ with concurrent.futures.ProcessPoolExecutor() as ex:
+     futures = [ex.submit(frame_sum, seq, i) for i in range(len(seq))]
132
+ ```
133
+
134
+ ## PFF → Zarr v3 conversion (`pypff[zarr]`)
135
+
136
+ Convert a `.pffd` observation run to Zarr v3 stores readable by xarray, dask,
137
+ numpy, TensorStore, Julia, and Rust. See [docs/zarr_v3_spec.md](docs/zarr_v3_spec.md)
138
+ for the full layout specification.
139
+
140
+ ### Write
141
+
142
+ ```python
143
+ from pypff.io2 import PanosetiRun
144
+ from pypff.zarr import convert_run
145
+
146
+ run = PanosetiRun("path/to/obs.pffd")
147
+ stores = convert_run(run, "output/L0_zarr")
148
+ # → one .zarr per (data_product, module)
149
+ # → one .panoseti-meta/ sidecar bundle (configs, logs, hk.pff)
150
+ ```
151
+
152
+ Or via the CLI:
153
+
154
+ ```bash
155
+ uv run pypff zarr path/to/obs.pffd output/L0_zarr
156
+ ```
157
+
158
+ ### Read
159
+
160
+ ```python
161
+ from pypff.zarr import PanosetiZarrRun
162
+ import xarray as xr
163
+
164
+ # High-level wrapper (mirrors PanosetiRun)
165
+ zrun = PanosetiZarrRun("output/L0_zarr")
166
+ store = zrun.get_product("dp_ph256.bpp_2.module_254")
167
+ ts = store.timestamps() # int64 ns array
168
+ ds = store.to_dataset() # xarray.Dataset
169
+ cfgs = zrun.configs # parsed run configs
170
+
171
+ # Or open directly with xarray — all arrays visible as variables
172
+ ds = xr.open_zarr(str(stores[0]), consolidated=False)
173
+ # ds.images (T, H, W) int16 / uint16
174
+ # ds.unix_t_ns (T,) int64 — nanosecond timestamps
175
+ # ds.pkt_num (T,) uint32 ─┐ per-frame header fields
176
+ # ds.pkt_nsec (T,) uint32 │ (single-level: ph256)
177
+ # ds.quabo_num (T,) uint8 ─┘
178
+ # ds.quabo_0_pkt_num (T,) uint32 ─┐ module-level headers
179
+ # … ─┘ (img16, ph1024)
180
+ ```
181
+
182
+ ### Zarr store layout
183
+
184
+ All arrays live at the **root** of each store (no sub-groups), so
185
+ `xr.open_zarr(store)` surfaces every variable — images, timestamps, and all
186
+ header fields — as a single Dataset aligned on the shared `time` dimension.
187
+ Logical grouping of headers is expressed via `header_fields` and `quabo_fields`
188
+ root attributes. Full specification: [docs/zarr_v3_spec.md](docs/zarr_v3_spec.md).
189
+
190
+ ## Testing
191
+ Run the test suite via the built-in CLI:
192
+
193
+ ```bash
194
+ uv run pypff test all
195
+ ```
196
+
197
+ The test suite includes:
198
+ - **Tier 1 (Unit):** Basic logic and timing tests.
199
+ - **Tier 2 (Logic):** Higher-level I/O, slicing, and concurrency tests.
200
+ - **Legacy Integration:** The original `pypff` test suite using provided sample data.
201
+
202
+ ## Dockerized CI
203
+ Build the CI environment:
204
+ ```bash
205
+ docker build -t pypff-ci -f src/ci/Dockerfile.ci .
206
+ ```