zdata-py 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zdata_py-0.1.0/LICENSE +21 -0
- zdata_py-0.1.0/MANIFEST.in +30 -0
- zdata_py-0.1.0/PKG-INFO +332 -0
- zdata_py-0.1.0/README.md +286 -0
- zdata_py-0.1.0/__init__.py +29 -0
- zdata_py-0.1.0/_settings.py +501 -0
- zdata_py-0.1.0/build_zdata/__init__.py +4 -0
- zdata_py-0.1.0/build_zdata/align_mtx.py +715 -0
- zdata_py-0.1.0/build_zdata/build_x.py +520 -0
- zdata_py-0.1.0/build_zdata/build_zdata.py +465 -0
- zdata_py-0.1.0/build_zdata/check_directory.py +155 -0
- zdata_py-0.1.0/build_zdata/concat_obs.py +539 -0
- zdata_py-0.1.0/core/__init__.py +23 -0
- zdata_py-0.1.0/core/index.py +458 -0
- zdata_py-0.1.0/core/utils.py +131 -0
- zdata_py-0.1.0/core/zdata.py +1098 -0
- zdata_py-0.1.0/ctools/mtx_to_zdata +0 -0
- zdata_py-0.1.0/ctools/mtx_to_zdata.c +793 -0
- zdata_py-0.1.0/ctools/zdata_read +0 -0
- zdata_py-0.1.0/ctools/zdata_read.c +296 -0
- zdata_py-0.1.0/files/2ks10c_genes.txt +35804 -0
- zdata_py-0.1.0/files/__init__.py +2 -0
- zdata_py-0.1.0/pyproject.toml +15 -0
- zdata_py-0.1.0/setup.cfg +4 -0
- zdata_py-0.1.0/setup.py +388 -0
- zdata_py-0.1.0/tests/test_full_pipeline_at_scale.py +487 -0
- zdata_py-0.1.0/zdata_py.egg-info/PKG-INFO +332 -0
- zdata_py-0.1.0/zdata_py.egg-info/SOURCES.txt +48 -0
- zdata_py-0.1.0/zdata_py.egg-info/dependency_links.txt +1 -0
- zdata_py-0.1.0/zdata_py.egg-info/requires.txt +16 -0
- zdata_py-0.1.0/zdata_py.egg-info/top_level.txt +1 -0
zdata_py-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Sam Cooper
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Include C source files
|
|
2
|
+
include ctools/*.c
|
|
3
|
+
|
|
4
|
+
# Include README and LICENSE
|
|
5
|
+
include README.md
|
|
6
|
+
include LICENSE
|
|
7
|
+
|
|
8
|
+
# Include default gene list (required for alignment)
|
|
9
|
+
include files/2ks10c_genes.txt
|
|
10
|
+
|
|
11
|
+
# Include package data
|
|
12
|
+
recursive-include core *.py
|
|
13
|
+
recursive-include build_zdata *.py
|
|
14
|
+
recursive-include ctools *.c
|
|
15
|
+
|
|
16
|
+
# Include pre-compiled binaries if they exist (for fallback)
|
|
17
|
+
# Platform-specific binaries
|
|
18
|
+
include ctools/mtx_to_zdata
|
|
19
|
+
include ctools/mtx_to_zdata.exe
|
|
20
|
+
include ctools/zdata_read
|
|
21
|
+
include ctools/zdata_read.exe
|
|
22
|
+
|
|
23
|
+
# Exclude Python bytecode, caches, and shared libraries from the distribution
|
|
24
|
+
global-exclude *.pyc
|
|
25
|
+
global-exclude *.pyo
|
|
26
|
+
global-exclude __pycache__
|
|
27
|
+
global-exclude *.so
|
|
28
|
+
global-exclude *.dylib
|
|
29
|
+
global-exclude *.dll
|
|
30
|
+
|
zdata_py-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: zdata-py
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Efficient storage and access for large single-cell RNA datasets (supports Zarr and H5AD formats)
|
|
5
|
+
Home-page:
|
|
6
|
+
Author: Sam Cooper
|
|
7
|
+
Author-email:
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
19
|
+
Requires-Python: >=3.8
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: numpy>=1.20.0
|
|
23
|
+
Requires-Dist: scipy>=1.7.0
|
|
24
|
+
Requires-Dist: polars>=0.18.0
|
|
25
|
+
Requires-Dist: pandas>=1.3.0
|
|
26
|
+
Requires-Dist: anndata>=0.8.0
|
|
27
|
+
Requires-Dist: zarr>=2.10.0
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest-xdist>=3.0.0; extra == "dev"
|
|
32
|
+
Provides-Extra: test
|
|
33
|
+
Requires-Dist: pytest>=7.0.0; extra == "test"
|
|
34
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "test"
|
|
35
|
+
Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
|
|
36
|
+
Dynamic: author
|
|
37
|
+
Dynamic: classifier
|
|
38
|
+
Dynamic: description
|
|
39
|
+
Dynamic: description-content-type
|
|
40
|
+
Dynamic: license
|
|
41
|
+
Dynamic: license-file
|
|
42
|
+
Dynamic: provides-extra
|
|
43
|
+
Dynamic: requires-dist
|
|
44
|
+
Dynamic: requires-python
|
|
45
|
+
Dynamic: summary
|
|
46
|
+
|
|
47
|
+
# zdata
|
|
48
|
+
|
|
49
|
+
Efficient sparse matrix storage and retrieval for large-scale transcriptomics datasets using seekable zstd compression.
|
|
50
|
+
|
|
51
|
+
## Overview
|
|
52
|
+
|
|
53
|
+
`zdata` is a high-performance library for storing and querying large sparse matrices (e.g., single-cell RNA-seq data) with efficient random row access. It uses a custom format based on:
|
|
54
|
+
|
|
55
|
+
- **Block-compressed sparse row (CSR) format** - Organized in 16-row blocks for efficient access
|
|
56
|
+
- **Zstd seekable compression** - Enables random access to compressed data without full decompression
|
|
57
|
+
- **Chunked storage** - Large matrices are split into 4096-row chunks stored as separate `.bin` files
|
|
58
|
+
|
|
59
|
+
This approach provides excellent compression ratios while maintaining fast random row retrieval performance, making it ideal for querying subsets of large datasets.
|
|
60
|
+
|
|
61
|
+
## Features
|
|
62
|
+
|
|
63
|
+
- **Fast random row access** - Retrieve arbitrary rows without loading the entire dataset
|
|
64
|
+
- **Efficient compression** - Zstd compression with seekable format for space savings
|
|
65
|
+
- **Scalable** - Handles datasets with millions of rows and columns
|
|
66
|
+
- **Python API** - Simple, intuitive interface for data access
|
|
67
|
+
- **C-based backend** - High-performance C implementation for core operations
|
|
68
|
+
- **Multiple input formats** - Supports both Zarr and H5AD (AnnData) file formats
|
|
69
|
+
- **Auto-detection** - Automatically detects and processes mixed file types in a directory
|
|
70
|
+
|
|
71
|
+
## Quick Start
|
|
72
|
+
|
|
73
|
+
### Building zdata from Zarr or H5AD Files
|
|
74
|
+
|
|
75
|
+
The easiest way to create a zdata object is from a directory of zarr files or h5ad files:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from zdata import build_zdata_from_zarr
|
|
79
|
+
|
|
80
|
+
# Build zdata from a directory containing .zarr files or .h5/.hdf5/.h5ad files
|
|
81
|
+
# The function auto-detects file types based on extensions
|
|
82
|
+
zdata_dir = build_zdata_from_zarr(
|
|
83
|
+
zarr_dir='/path/to/data/directory', # Directory containing .zarr or .h5/.hdf5/.h5ad files
|
|
84
|
+
output_name='my_dataset.zdata', # Output zdata directory name
|
|
85
|
+
block_rows=16, # Rows per block (default: 16)
|
|
86
|
+
max_rows=8192, # Max rows per chunk (default: 8192)
|
|
87
|
+
obs_join_strategy="outer" # How to join obs metadata: "inner", "outer", or "columns"
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# The function returns the path to the created zdata directory
|
|
91
|
+
print(f"Created zdata directory at: {zdata_dir}")
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
This single function:
|
|
95
|
+
1. Auto-detects file types (`.zarr` directories or `.h5`/`.hdf5`/`.h5ad` files)
|
|
96
|
+
2. Aligns all files to a standard gene list
|
|
97
|
+
3. Converts them to zdata format with efficient compression
|
|
98
|
+
4. Concatenates observation metadata from all files
|
|
99
|
+
5. Creates a complete `.zdata/` directory ready for querying
|
|
100
|
+
|
|
101
|
+
**Supported Input Formats:**
|
|
102
|
+
- **Zarr**: Directories ending in `.zarr` (e.g., `data.zarr/`)
|
|
103
|
+
- **H5AD**: Files with extensions `.h5`, `.hdf5`, or `.h5ad` (e.g., `data.h5ad`)
|
|
104
|
+
|
|
105
|
+
### Reading from zdata
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from zdata import ZData
|
|
109
|
+
|
|
110
|
+
# Open the zdata directory
|
|
111
|
+
reader = ZData("my_dataset.zdata")
|
|
112
|
+
|
|
113
|
+
# Query specific rows
|
|
114
|
+
rows_data = reader.read_rows([100, 200, 300])
|
|
115
|
+
for row_id, cols, vals in rows_data:
|
|
116
|
+
print(f"Row {row_id}: {len(cols)} non-zero values")
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Installation
|
|
120
|
+
|
|
121
|
+
### Prerequisites
|
|
122
|
+
|
|
123
|
+
**Required:**
|
|
124
|
+
- **Python 3.8+**
|
|
125
|
+
- **GCC compiler** (for compiling C tools)
|
|
126
|
+
- **ZSTD source code** (not just the library) - The ZSTD source directory must contain:
|
|
127
|
+
- `lib/libzstd.a` (static library)
|
|
128
|
+
- `lib/common/xxhash.c` (source file)
|
|
129
|
+
- `contrib/seekable_format/zstdseek_compress.c` (source file)
|
|
130
|
+
- `contrib/seekable_format/zstdseek_decompress.c` (source file)
|
|
131
|
+
|
|
132
|
+
**Note:** C tools compilation is **required** for the package to work. The installation will fail if ZSTD is not found or if compilation fails.
|
|
133
|
+
|
|
134
|
+
### Setting up ZSTD
|
|
135
|
+
|
|
136
|
+
If you don't have ZSTD source code, clone and build it:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
git clone https://github.com/facebook/zstd.git
|
|
140
|
+
cd zstd
|
|
141
|
+
make
|
|
142
|
+
export ZSTD_BASE=$(pwd)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### From PyPI
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
# Set ZSTD_BASE before installation
|
|
149
|
+
export ZSTD_BASE=/path/to/zstd-source
|
|
150
|
+
pip install zdata-py
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
The C tools will be automatically compiled during installation.
|
|
154
|
+
|
|
155
|
+
### From Source
|
|
156
|
+
|
|
157
|
+
1. Clone the repository:
|
|
158
|
+
```bash
|
|
159
|
+
git clone <repository-url>
|
|
160
|
+
cd zdata
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
2. Set ZSTD_BASE:
|
|
164
|
+
```bash
|
|
165
|
+
export ZSTD_BASE=/path/to/zstd-source
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
3. Install in development mode:
|
|
169
|
+
```bash
|
|
170
|
+
pip install -e .
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Or install normally:
|
|
174
|
+
```bash
|
|
175
|
+
pip install .
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
The C tools will be automatically compiled during installation.
|
|
179
|
+
|
|
180
|
+
## Usage
|
|
181
|
+
|
|
182
|
+
### Converting MTX Files to zdata Format
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
./ctools/mtx_to_zdata matrix.mtx output_name
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
This creates a directory `output_name.zdata/` containing numbered `.bin` files (0.bin, 1.bin, etc.), each containing up to 4096 rows.
|
|
189
|
+
|
|
190
|
+
### Python API
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
from zdata.core import ZData
|
|
194
|
+
|
|
195
|
+
# Initialize reader
|
|
196
|
+
reader = ZData("andrews") # Looks for andrews.zdata/
|
|
197
|
+
|
|
198
|
+
# Get dataset info
|
|
199
|
+
print(f"Rows: {reader.num_rows}, Columns: {reader.num_columns}")
|
|
200
|
+
|
|
201
|
+
# Read specific rows
|
|
202
|
+
rows_data = reader.read_rows([100, 200, 300])
|
|
203
|
+
for row_id, cols, vals in rows_data:
|
|
204
|
+
print(f"Row {row_id}: {len(cols)} non-zeros")
|
|
205
|
+
|
|
206
|
+
# Read rows as CSR matrix
|
|
207
|
+
csr = reader.read_rows_csr([100, 200, 300])
|
|
208
|
+
|
|
209
|
+
# Get random rows
|
|
210
|
+
random_rows = reader.get_random_rows(10, seed=42)
|
|
211
|
+
data = reader.read_rows(random_rows)
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### Command-Line Tools
|
|
215
|
+
|
|
216
|
+
**Convert MTX to zdata:**
|
|
217
|
+
```bash
|
|
218
|
+
./ctools/mtx_to_zdata input.mtx output_name
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
**Read rows from zdata (binary output):**
|
|
222
|
+
```bash
|
|
223
|
+
./ctools/zdata_read --binary output_name.zdata/0.bin "100,200,300"
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
## Project Structure
|
|
227
|
+
|
|
228
|
+
```
|
|
229
|
+
zdata/
|
|
230
|
+
├── core/ # Python core module
|
|
231
|
+
│ ├── zdata.py # ZData class implementation
|
|
232
|
+
│ └── __init__.py
|
|
233
|
+
├── build_zdata/ # Build and preprocessing utilities
|
|
234
|
+
│ ├── build_x.py # Build zdata from MTX files
|
|
235
|
+
│ ├── build_zdata.py # Main build function for zarr/h5ad directories
|
|
236
|
+
│ ├── align_mtx.py # Align zarr/h5ad files to standard gene list
|
|
237
|
+
│ ├── check_directory.py # Check zarr directory structure
|
|
238
|
+
│ └── concat_obs.py # Concatenate obs/metadata from zarr/h5ad files
|
|
239
|
+
├── ctools/ # C command-line tools
|
|
240
|
+
│ ├── mtx_to_zdata.c # MTX to zdata converter
|
|
241
|
+
│ ├── zdata_read.c # Row reader
|
|
242
|
+
│ ├── mtx_to_zdata # Compiled binary (generated during install)
|
|
243
|
+
│ └── zdata_read # Compiled binary (generated during install)
|
|
244
|
+
├── files/ # Package data files
|
|
245
|
+
│ └── 2ks10c_genes.txt # Default gene list for alignment (required)
|
|
246
|
+
└── tests/ # Test suite
|
|
247
|
+
├── test_random_rows.py # Random row extraction test
|
|
248
|
+
├── test_fast_queries.py # Performance benchmark
|
|
249
|
+
└── test_full_pipeline_at_scale.py # Full pipeline test
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
### Default Gene List
|
|
253
|
+
|
|
254
|
+
The package includes a default gene list (`files/2ks10c_genes.txt`) that is used as the standard gene set for aligning zarr and h5ad files. This file is:
|
|
255
|
+
- **Required**: Must be included in the package distribution
|
|
256
|
+
- **Default**: Used automatically when building zdata from zarr or h5ad files
|
|
257
|
+
- **Overridable**: Can be replaced with a custom gene list path if needed
|
|
258
|
+
|
|
259
|
+
## Testing
|
|
260
|
+
|
|
261
|
+
Run all tests with pytest:
|
|
262
|
+
|
|
263
|
+
```bash
|
|
264
|
+
pytest tests/
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
Run the full pipeline test (compiles, builds, and tests):
|
|
268
|
+
|
|
269
|
+
```bash
|
|
270
|
+
# With zarr files (default)
|
|
271
|
+
python tests/test_full_pipeline_at_scale.py [zarr_directory] [output_name]
|
|
272
|
+
|
|
273
|
+
# With h5ad files
|
|
274
|
+
python tests/test_full_pipeline_at_scale.py --h5ad [h5ad_directory] [output_name]
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
Run specific test modules:
|
|
278
|
+
|
|
279
|
+
```bash
|
|
280
|
+
# Test core functionality
|
|
281
|
+
pytest tests/test_core/
|
|
282
|
+
|
|
283
|
+
# Test h5ad support
|
|
284
|
+
pytest tests/test_core/test_h5ad.py
|
|
285
|
+
|
|
286
|
+
# Test with coverage
|
|
287
|
+
pytest tests/ --cov=zdata --cov-report=html
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
## Performance
|
|
291
|
+
|
|
292
|
+
The zdata format is optimized for:
|
|
293
|
+
- **Random row queries** - Fast retrieval of arbitrary row subsets
|
|
294
|
+
- **Compression** - Significant space savings compared to uncompressed formats
|
|
295
|
+
- **Scalability** - Efficient handling of datasets with millions of cells/genes
|
|
296
|
+
|
|
297
|
+
Benchmark results can be obtained by running `test_fast_queries.py`.
|
|
298
|
+
|
|
299
|
+
## Development
|
|
300
|
+
|
|
301
|
+
### Building for PyPI
|
|
302
|
+
|
|
303
|
+
```bash
|
|
304
|
+
# Install build tools
|
|
305
|
+
pip install build twine
|
|
306
|
+
|
|
307
|
+
# Build distribution packages
|
|
308
|
+
python -m build
|
|
309
|
+
|
|
310
|
+
# Test locally
|
|
311
|
+
pip install dist/zdata_py-*.whl
|
|
312
|
+
|
|
313
|
+
# Upload to PyPI
|
|
314
|
+
twine upload dist/*
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
### Cross-Platform Wheel Building
|
|
318
|
+
|
|
319
|
+
The project uses `cibuildwheel` to build platform-specific wheels. See `.github/workflows/build_wheels.yml` for the CI configuration.
|
|
320
|
+
|
|
321
|
+
**Building wheels locally:**
|
|
322
|
+
```bash
|
|
323
|
+
pip install cibuildwheel
|
|
324
|
+
export ZSTD_BASE=/path/to/zstd-source
|
|
325
|
+
cibuildwheel --output-dir wheelhouse
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
**Note:** C tools compilation is required. The setup script will compile C tools during installation if ZSTD is available, or use pre-compiled binaries from the wheel if available.
|
|
329
|
+
|
|
330
|
+
## License
|
|
331
|
+
|
|
332
|
+
See LICENSE file for details.
|
zdata_py-0.1.0/README.md
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
# zdata
|
|
2
|
+
|
|
3
|
+
Efficient sparse matrix storage and retrieval for large-scale transcriptomics datasets using seekable zstd compression.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
`zdata` is a high-performance library for storing and querying large sparse matrices (e.g., single-cell RNA-seq data) with efficient random row access. It uses a custom format based on:
|
|
8
|
+
|
|
9
|
+
- **Block-compressed sparse row (CSR) format** - Organized in 16-row blocks for efficient access
|
|
10
|
+
- **Zstd seekable compression** - Enables random access to compressed data without full decompression
|
|
11
|
+
- **Chunked storage** - Large matrices are split into 4096-row chunks stored as separate `.bin` files
|
|
12
|
+
|
|
13
|
+
This approach provides excellent compression ratios while maintaining fast random row retrieval performance, making it ideal for querying subsets of large datasets.
|
|
14
|
+
|
|
15
|
+
## Features
|
|
16
|
+
|
|
17
|
+
- **Fast random row access** - Retrieve arbitrary rows without loading the entire dataset
|
|
18
|
+
- **Efficient compression** - Zstd compression with seekable format for space savings
|
|
19
|
+
- **Scalable** - Handles datasets with millions of rows and columns
|
|
20
|
+
- **Python API** - Simple, intuitive interface for data access
|
|
21
|
+
- **C-based backend** - High-performance C implementation for core operations
|
|
22
|
+
- **Multiple input formats** - Supports both Zarr and H5AD (AnnData) file formats
|
|
23
|
+
- **Auto-detection** - Automatically detects and processes mixed file types in a directory
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
### Building zdata from Zarr or H5AD Files
|
|
28
|
+
|
|
29
|
+
The easiest way to create a zdata object is from a directory of zarr files or h5ad files:
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from zdata import build_zdata_from_zarr
|
|
33
|
+
|
|
34
|
+
# Build zdata from a directory containing .zarr files or .h5/.hdf5/.h5ad files
|
|
35
|
+
# The function auto-detects file types based on extensions
|
|
36
|
+
zdata_dir = build_zdata_from_zarr(
|
|
37
|
+
zarr_dir='/path/to/data/directory', # Directory containing .zarr or .h5/.hdf5/.h5ad files
|
|
38
|
+
output_name='my_dataset.zdata', # Output zdata directory name
|
|
39
|
+
block_rows=16, # Rows per block (default: 16)
|
|
40
|
+
max_rows=8192, # Max rows per chunk (default: 8192)
|
|
41
|
+
obs_join_strategy="outer" # How to join obs metadata: "inner", "outer", or "columns"
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
# The function returns the path to the created zdata directory
|
|
45
|
+
print(f"Created zdata directory at: {zdata_dir}")
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
This single function:
|
|
49
|
+
1. Auto-detects file types (`.zarr` directories or `.h5`/`.hdf5`/`.h5ad` files)
|
|
50
|
+
2. Aligns all files to a standard gene list
|
|
51
|
+
3. Converts them to zdata format with efficient compression
|
|
52
|
+
4. Concatenates observation metadata from all files
|
|
53
|
+
5. Creates a complete `.zdata/` directory ready for querying
|
|
54
|
+
|
|
55
|
+
**Supported Input Formats:**
|
|
56
|
+
- **Zarr**: Directories ending in `.zarr` (e.g., `data.zarr/`)
|
|
57
|
+
- **H5AD**: Files with extensions `.h5`, `.hdf5`, or `.h5ad` (e.g., `data.h5ad`)
|
|
58
|
+
|
|
59
|
+
### Reading from zdata
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from zdata import ZData
|
|
63
|
+
|
|
64
|
+
# Open the zdata directory
|
|
65
|
+
reader = ZData("my_dataset.zdata")
|
|
66
|
+
|
|
67
|
+
# Query specific rows
|
|
68
|
+
rows_data = reader.read_rows([100, 200, 300])
|
|
69
|
+
for row_id, cols, vals in rows_data:
|
|
70
|
+
print(f"Row {row_id}: {len(cols)} non-zero values")
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Installation
|
|
74
|
+
|
|
75
|
+
### Prerequisites
|
|
76
|
+
|
|
77
|
+
**Required:**
|
|
78
|
+
- **Python 3.8+**
|
|
79
|
+
- **GCC compiler** (for compiling C tools)
|
|
80
|
+
- **ZSTD source code** (not just the library) - The ZSTD source directory must contain:
|
|
81
|
+
- `lib/libzstd.a` (static library)
|
|
82
|
+
- `lib/common/xxhash.c` (source file)
|
|
83
|
+
- `contrib/seekable_format/zstdseek_compress.c` (source file)
|
|
84
|
+
- `contrib/seekable_format/zstdseek_decompress.c` (source file)
|
|
85
|
+
|
|
86
|
+
**Note:** C tools compilation is **required** for the package to work. The installation will fail if ZSTD is not found or if compilation fails.
|
|
87
|
+
|
|
88
|
+
### Setting up ZSTD
|
|
89
|
+
|
|
90
|
+
If you don't have ZSTD source code, clone and build it:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
git clone https://github.com/facebook/zstd.git
|
|
94
|
+
cd zstd
|
|
95
|
+
make
|
|
96
|
+
export ZSTD_BASE=$(pwd)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### From PyPI
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
# Set ZSTD_BASE before installation
|
|
103
|
+
export ZSTD_BASE=/path/to/zstd-source
|
|
104
|
+
pip install zdata-py
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
The C tools will be automatically compiled during installation.
|
|
108
|
+
|
|
109
|
+
### From Source
|
|
110
|
+
|
|
111
|
+
1. Clone the repository:
|
|
112
|
+
```bash
|
|
113
|
+
git clone <repository-url>
|
|
114
|
+
cd zdata
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
2. Set ZSTD_BASE:
|
|
118
|
+
```bash
|
|
119
|
+
export ZSTD_BASE=/path/to/zstd-source
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
3. Install in development mode:
|
|
123
|
+
```bash
|
|
124
|
+
pip install -e .
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Or install normally:
|
|
128
|
+
```bash
|
|
129
|
+
pip install .
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
The C tools will be automatically compiled during installation.
|
|
133
|
+
|
|
134
|
+
## Usage
|
|
135
|
+
|
|
136
|
+
### Converting MTX Files to zdata Format
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
./ctools/mtx_to_zdata matrix.mtx output_name
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
This creates a directory `output_name.zdata/` containing numbered `.bin` files (0.bin, 1.bin, etc.), each containing up to 4096 rows.
|
|
143
|
+
|
|
144
|
+
### Python API
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
from zdata.core import ZData
|
|
148
|
+
|
|
149
|
+
# Initialize reader
|
|
150
|
+
reader = ZData("andrews") # Looks for andrews.zdata/
|
|
151
|
+
|
|
152
|
+
# Get dataset info
|
|
153
|
+
print(f"Rows: {reader.num_rows}, Columns: {reader.num_columns}")
|
|
154
|
+
|
|
155
|
+
# Read specific rows
|
|
156
|
+
rows_data = reader.read_rows([100, 200, 300])
|
|
157
|
+
for row_id, cols, vals in rows_data:
|
|
158
|
+
print(f"Row {row_id}: {len(cols)} non-zeros")
|
|
159
|
+
|
|
160
|
+
# Read rows as CSR matrix
|
|
161
|
+
csr = reader.read_rows_csr([100, 200, 300])
|
|
162
|
+
|
|
163
|
+
# Get random rows
|
|
164
|
+
random_rows = reader.get_random_rows(10, seed=42)
|
|
165
|
+
data = reader.read_rows(random_rows)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Command-Line Tools
|
|
169
|
+
|
|
170
|
+
**Convert MTX to zdata:**
|
|
171
|
+
```bash
|
|
172
|
+
./ctools/mtx_to_zdata input.mtx output_name
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Read rows from zdata (binary output):**
|
|
176
|
+
```bash
|
|
177
|
+
./ctools/zdata_read --binary output_name.zdata/0.bin "100,200,300"
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Project Structure
|
|
181
|
+
|
|
182
|
+
```
|
|
183
|
+
zdata/
|
|
184
|
+
├── core/ # Python core module
|
|
185
|
+
│ ├── zdata.py # ZData class implementation
|
|
186
|
+
│ └── __init__.py
|
|
187
|
+
├── build_zdata/ # Build and preprocessing utilities
|
|
188
|
+
│ ├── build_x.py # Build zdata from MTX files
|
|
189
|
+
│ ├── build_zdata.py # Main build function for zarr/h5ad directories
|
|
190
|
+
│ ├── align_mtx.py # Align zarr/h5ad files to standard gene list
|
|
191
|
+
│ ├── check_directory.py # Check zarr directory structure
|
|
192
|
+
│ └── concat_obs.py # Concatenate obs/metadata from zarr/h5ad files
|
|
193
|
+
├── ctools/ # C command-line tools
|
|
194
|
+
│ ├── mtx_to_zdata.c # MTX to zdata converter
|
|
195
|
+
│ ├── zdata_read.c # Row reader
|
|
196
|
+
│ ├── mtx_to_zdata # Compiled binary (generated during install)
|
|
197
|
+
│ └── zdata_read # Compiled binary (generated during install)
|
|
198
|
+
├── files/ # Package data files
|
|
199
|
+
│ └── 2ks10c_genes.txt # Default gene list for alignment (required)
|
|
200
|
+
└── tests/ # Test suite
|
|
201
|
+
├── test_random_rows.py # Random row extraction test
|
|
202
|
+
├── test_fast_queries.py # Performance benchmark
|
|
203
|
+
└── test_full_pipeline_at_scale.py # Full pipeline test
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### Default Gene List
|
|
207
|
+
|
|
208
|
+
The package includes a default gene list (`files/2ks10c_genes.txt`) that is used as the standard gene set for aligning zarr and h5ad files. This file is:
|
|
209
|
+
- **Required**: Must be included in the package distribution
|
|
210
|
+
- **Default**: Used automatically when building zdata from zarr or h5ad files
|
|
211
|
+
- **Overridable**: Can be replaced with a custom gene list path if needed
|
|
212
|
+
|
|
213
|
+
## Testing
|
|
214
|
+
|
|
215
|
+
Run all tests with pytest:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
pytest tests/
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
Run the full pipeline test (compiles, builds, and tests):
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
# With zarr files (default)
|
|
225
|
+
python tests/test_full_pipeline_at_scale.py [zarr_directory] [output_name]
|
|
226
|
+
|
|
227
|
+
# With h5ad files
|
|
228
|
+
python tests/test_full_pipeline_at_scale.py --h5ad [h5ad_directory] [output_name]
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
Run specific test modules:
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
# Test core functionality
|
|
235
|
+
pytest tests/test_core/
|
|
236
|
+
|
|
237
|
+
# Test h5ad support
|
|
238
|
+
pytest tests/test_core/test_h5ad.py
|
|
239
|
+
|
|
240
|
+
# Test with coverage
|
|
241
|
+
pytest tests/ --cov=zdata --cov-report=html
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
## Performance
|
|
245
|
+
|
|
246
|
+
The zdata format is optimized for:
|
|
247
|
+
- **Random row queries** - Fast retrieval of arbitrary row subsets
|
|
248
|
+
- **Compression** - Significant space savings compared to uncompressed formats
|
|
249
|
+
- **Scalability** - Efficient handling of datasets with millions of cells/genes
|
|
250
|
+
|
|
251
|
+
Benchmark results can be obtained by running `test_fast_queries.py`.
|
|
252
|
+
|
|
253
|
+
## Development
|
|
254
|
+
|
|
255
|
+
### Building for PyPI
|
|
256
|
+
|
|
257
|
+
```bash
|
|
258
|
+
# Install build tools
|
|
259
|
+
pip install build twine
|
|
260
|
+
|
|
261
|
+
# Build distribution packages
|
|
262
|
+
python -m build
|
|
263
|
+
|
|
264
|
+
# Test locally
|
|
265
|
+
pip install dist/zdata_py-*.whl
|
|
266
|
+
|
|
267
|
+
# Upload to PyPI
|
|
268
|
+
twine upload dist/*
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
### Cross-Platform Wheel Building
|
|
272
|
+
|
|
273
|
+
The project uses `cibuildwheel` to build platform-specific wheels. See `.github/workflows/build_wheels.yml` for the CI configuration.
|
|
274
|
+
|
|
275
|
+
**Building wheels locally:**
|
|
276
|
+
```bash
|
|
277
|
+
pip install cibuildwheel
|
|
278
|
+
export ZSTD_BASE=/path/to/zstd-source
|
|
279
|
+
cibuildwheel --output-dir wheelhouse
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
**Note:** C tools compilation is required. The setup script will compile C tools during installation if ZSTD is available, or use pre-compiled binaries from the wheel if available.
|
|
283
|
+
|
|
284
|
+
## License
|
|
285
|
+
|
|
286
|
+
See LICENSE file for details.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""
|
|
2
|
+
zdata - Efficient sparse matrix storage and retrieval using seekable zstd compression.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
|
|
9
|
+
from zdata._settings import settings
|
|
10
|
+
from zdata.core import ObsWrapper, ZData
|
|
11
|
+
from zdata.build_zdata.build_zdata import build_zdata_from_zarr
|
|
12
|
+
from zdata.build_zdata.build_x import build_zdata
|
|
13
|
+
from zdata.build_zdata.align_mtx import align_zarr_directory_to_mtx, get_default_gene_list_path
|
|
14
|
+
from zdata.build_zdata.concat_obs import concat_obs_from_zarr_directory
|
|
15
|
+
from zdata.build_zdata.check_directory import check_zarr_directory
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"ObsWrapper",
|
|
19
|
+
"ZData",
|
|
20
|
+
"__version__",
|
|
21
|
+
"settings",
|
|
22
|
+
"build_zdata_from_zarr",
|
|
23
|
+
"build_zdata",
|
|
24
|
+
"align_zarr_directory_to_mtx",
|
|
25
|
+
"get_default_gene_list_path",
|
|
26
|
+
"concat_obs_from_zarr_directory",
|
|
27
|
+
"check_zarr_directory",
|
|
28
|
+
]
|
|
29
|
+
|