zdata-py 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
zdata_py-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Sam Cooper
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,30 @@
1
+ # Include C source files
2
+ include ctools/*.c
3
+
4
+ # Include README and LICENSE
5
+ include README.md
6
+ include LICENSE
7
+
8
+ # Include default gene list (required for alignment)
9
+ include files/2ks10c_genes.txt
10
+
11
+ # Include package data
12
+ recursive-include core *.py
13
+ recursive-include build *.py
14
+ recursive-include ctools *.c
15
+
16
+ # Include pre-compiled binaries if they exist (for fallback)
17
+ # Platform-specific binaries
18
+ include ctools/mtx_to_zdata
19
+ include ctools/mtx_to_zdata.exe
20
+ include ctools/zdata_read
21
+ include ctools/zdata_read.exe
22
+
23
+ # Exclude bytecode caches and compiled shared libraries from the source distribution
24
+ global-exclude *.pyc
25
+ global-exclude *.pyo
26
+ global-exclude __pycache__
27
+ global-exclude *.so
28
+ global-exclude *.dylib
29
+ global-exclude *.dll
30
+
@@ -0,0 +1,332 @@
1
+ Metadata-Version: 2.4
2
+ Name: zdata-py
3
+ Version: 0.1.0
4
+ Summary: Efficient storage and access for large single-cell RNA datasets (supports Zarr and H5AD formats)
5
+ Home-page:
6
+ Author: Sam Cooper
7
+ Author-email:
8
+ License: MIT
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: License :: OSI Approved :: MIT License
19
+ Requires-Python: >=3.8
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: numpy>=1.20.0
23
+ Requires-Dist: scipy>=1.7.0
24
+ Requires-Dist: polars>=0.18.0
25
+ Requires-Dist: pandas>=1.3.0
26
+ Requires-Dist: anndata>=0.8.0
27
+ Requires-Dist: zarr>=2.10.0
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
30
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
31
+ Requires-Dist: pytest-xdist>=3.0.0; extra == "dev"
32
+ Provides-Extra: test
33
+ Requires-Dist: pytest>=7.0.0; extra == "test"
34
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
35
+ Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
36
+ Dynamic: author
37
+ Dynamic: classifier
38
+ Dynamic: description
39
+ Dynamic: description-content-type
40
+ Dynamic: license
41
+ Dynamic: license-file
42
+ Dynamic: provides-extra
43
+ Dynamic: requires-dist
44
+ Dynamic: requires-python
45
+ Dynamic: summary
46
+
47
+ # zdata
48
+
49
+ Efficient sparse matrix storage and retrieval for large-scale transcriptomics datasets using seekable zstd compression.
50
+
51
+ ## Overview
52
+
53
+ `zdata` is a high-performance library for storing and querying large sparse matrices (e.g., single-cell RNA-seq data) with efficient random row access. It uses a custom format based on:
54
+
55
+ - **Block-compressed sparse row (CSR) format** - Organized in 16-row blocks for efficient access
56
+ - **Zstd seekable compression** - Enables random access to compressed data without full decompression
57
+ - **Chunked storage** - Large matrices are split into 4096-row chunks stored as separate `.bin` files
58
+
59
+ This approach provides excellent compression ratios while maintaining fast random row retrieval performance, making it ideal for querying subsets of large datasets.
60
+
61
+ ## Features
62
+
63
+ - **Fast random row access** - Retrieve arbitrary rows without loading the entire dataset
64
+ - **Efficient compression** - Zstd compression with seekable format for space savings
65
+ - **Scalable** - Handles datasets with millions of rows and columns
66
+ - **Python API** - Simple, intuitive interface for data access
67
+ - **C-based backend** - High-performance C implementation for core operations
68
+ - **Multiple input formats** - Supports both Zarr and H5AD (AnnData) file formats
69
+ - **Auto-detection** - Automatically detects and processes mixed file types in a directory
70
+
71
+ ## Quick Start
72
+
73
+ ### Building zdata from Zarr or H5AD Files
74
+
75
+ The easiest way to create a zdata object is from a directory of zarr files or h5ad files:
76
+
77
+ ```python
78
+ from zdata import build_zdata_from_zarr
79
+
80
+ # Build zdata from a directory containing .zarr files or .h5/.hdf5/.h5ad files
81
+ # The function auto-detects file types based on extensions
82
+ zdata_dir = build_zdata_from_zarr(
83
+ zarr_dir='/path/to/data/directory', # Directory containing .zarr or .h5/.hdf5/.h5ad files
84
+ output_name='my_dataset.zdata', # Output zdata directory name
85
+ block_rows=16, # Rows per block (default: 16)
86
+ max_rows=8192, # Max rows per chunk (default: 8192)
87
+ obs_join_strategy="outer" # How to join obs metadata: "inner", "outer", or "columns"
88
+ )
89
+
90
+ # The function returns the path to the created zdata directory
91
+ print(f"Created zdata directory at: {zdata_dir}")
92
+ ```
93
+
94
+ This single function:
95
+ 1. Auto-detects file types (`.zarr` directories or `.h5`/`.hdf5`/`.h5ad` files)
96
+ 2. Aligns all files to a standard gene list
97
+ 3. Converts them to zdata format with efficient compression
98
+ 4. Concatenates observation metadata from all files
99
+ 5. Creates a complete `.zdata/` directory ready for querying
100
+
101
+ **Supported Input Formats:**
102
+ - **Zarr**: Directories ending in `.zarr` (e.g., `data.zarr/`)
103
+ - **H5AD**: Files with extensions `.h5`, `.hdf5`, or `.h5ad` (e.g., `data.h5ad`)
104
+
105
+ ### Reading from zdata
106
+
107
+ ```python
108
+ from zdata import ZData
109
+
110
+ # Open the zdata directory
111
+ reader = ZData("my_dataset.zdata")
112
+
113
+ # Query specific rows
114
+ rows_data = reader.read_rows([100, 200, 300])
115
+ for row_id, cols, vals in rows_data:
116
+ print(f"Row {row_id}: {len(cols)} non-zero values")
117
+ ```
118
+
119
+ ## Installation
120
+
121
+ ### Prerequisites
122
+
123
+ **Required:**
124
+ - **Python 3.8+**
125
+ - **GCC compiler** (for compiling C tools)
126
+ - **ZSTD source code** (not just the library) - The ZSTD source directory must contain:
127
+ - `lib/libzstd.a` (static library)
128
+ - `lib/common/xxhash.c` (source file)
129
+ - `contrib/seekable_format/zstdseek_compress.c` (source file)
130
+ - `contrib/seekable_format/zstdseek_decompress.c` (source file)
131
+
132
+ **Note:** C tools compilation is **required** for the package to work. The installation will fail if ZSTD is not found or if compilation fails.
133
+
134
+ ### Setting up ZSTD
135
+
136
+ If you don't have ZSTD source code, clone and build it:
137
+
138
+ ```bash
139
+ git clone https://github.com/facebook/zstd.git
140
+ cd zstd
141
+ make
142
+ export ZSTD_BASE=$(pwd)
143
+ ```
144
+
145
+ ### From PyPI
146
+
147
+ ```bash
148
+ # Set ZSTD_BASE before installation
149
+ export ZSTD_BASE=/path/to/zstd-source
150
+ pip install zdata-py
151
+ ```
152
+
153
+ The C tools will be automatically compiled during installation.
154
+
155
+ ### From Source
156
+
157
+ 1. Clone the repository:
158
+ ```bash
159
+ git clone <repository-url>
160
+ cd zdata
161
+ ```
162
+
163
+ 2. Set ZSTD_BASE:
164
+ ```bash
165
+ export ZSTD_BASE=/path/to/zstd-source
166
+ ```
167
+
168
+ 3. Install in development mode:
169
+ ```bash
170
+ pip install -e .
171
+ ```
172
+
173
+ Or install normally:
174
+ ```bash
175
+ pip install .
176
+ ```
177
+
178
+ The C tools will be automatically compiled during installation.
179
+
180
+ ## Usage
181
+
182
+ ### Converting MTX Files to zdata Format
183
+
184
+ ```bash
185
+ ./ctools/mtx_to_zdata matrix.mtx output_name
186
+ ```
187
+
188
+ This creates a directory `output_name.zdata/` containing numbered `.bin` files (0.bin, 1.bin, etc.), each containing up to 4096 rows.
189
+
190
+ ### Python API
191
+
192
+ ```python
193
+ from zdata.core import ZData
194
+
195
+ # Initialize reader
196
+ reader = ZData("andrews") # Looks for andrews.zdata/
197
+
198
+ # Get dataset info
199
+ print(f"Rows: {reader.num_rows}, Columns: {reader.num_columns}")
200
+
201
+ # Read specific rows
202
+ rows_data = reader.read_rows([100, 200, 300])
203
+ for row_id, cols, vals in rows_data:
204
+ print(f"Row {row_id}: {len(cols)} non-zeros")
205
+
206
+ # Read rows as CSR matrix
207
+ csr = reader.read_rows_csr([100, 200, 300])
208
+
209
+ # Get random rows
210
+ random_rows = reader.get_random_rows(10, seed=42)
211
+ data = reader.read_rows(random_rows)
212
+ ```
213
+
214
+ ### Command-Line Tools
215
+
216
+ **Convert MTX to zdata:**
217
+ ```bash
218
+ ./ctools/mtx_to_zdata input.mtx output_name
219
+ ```
220
+
221
+ **Read rows from zdata (binary output):**
222
+ ```bash
223
+ ./ctools/zdata_read --binary output_name.zdata/0.bin "100,200,300"
224
+ ```
225
+
226
+ ## Project Structure
227
+
228
+ ```
229
+ zdata/
230
+ ├── core/ # Python core module
231
+ │ ├── zdata.py # ZData class implementation
232
+ │ └── __init__.py
233
+ ├── build_zdata/ # Build and preprocessing utilities
234
+ │ ├── build_x.py # Build zdata from MTX files
235
+ │ ├── build_zdata.py # Main build function for zarr/h5ad directories
236
+ │ ├── align_mtx.py # Align zarr/h5ad files to standard gene list
237
+ │ ├── check_directory.py # Check zarr directory structure
238
+ │ └── concat_obs.py # Concatenate obs/metadata from zarr/h5ad files
239
+ ├── ctools/ # C command-line tools
240
+ │ ├── mtx_to_zdata.c # MTX to zdata converter
241
+ │ ├── zdata_read.c # Row reader
242
+ │ ├── mtx_to_zdata # Compiled binary (generated during install)
243
+ │ └── zdata_read # Compiled binary (generated during install)
244
+ ├── files/ # Package data files
245
+ │ └── 2ks10c_genes.txt # Default gene list for alignment (required)
246
+ └── tests/ # Test suite
247
+ ├── test_random_rows.py # Random row extraction test
248
+ ├── test_fast_queries.py # Performance benchmark
249
+ └── test_full_pipeline.py # Full pipeline test
250
+ ```
251
+
252
+ ### Default Gene List
253
+
254
+ The package includes a default gene list (`files/2ks10c_genes.txt`) that is used as the standard gene set for aligning zarr and h5ad files. This file is:
255
+ - **Required**: Must be included in the package distribution
256
+ - **Default**: Used automatically when building zdata from zarr or h5ad files
257
+ - **Overridable**: Can be replaced with a custom gene list path if needed
258
+
259
+ ## Testing
260
+
261
+ Run all tests with pytest:
262
+
263
+ ```bash
264
+ pytest tests/
265
+ ```
266
+
267
+ Run the full pipeline test (compiles, builds, and tests):
268
+
269
+ ```bash
270
+ # With zarr files (default)
271
+ python tests/test_full_pipeline_at_scale.py [zarr_directory] [output_name]
272
+
273
+ # With h5ad files
274
+ python tests/test_full_pipeline_at_scale.py --h5ad [h5ad_directory] [output_name]
275
+ ```
276
+
277
+ Run specific test modules:
278
+
279
+ ```bash
280
+ # Test core functionality
281
+ pytest tests/test_core/
282
+
283
+ # Test h5ad support
284
+ pytest tests/test_core/test_h5ad.py
285
+
286
+ # Test with coverage
287
+ pytest tests/ --cov=zdata --cov-report=html
288
+ ```
289
+
290
+ ## Performance
291
+
292
+ The zdata format is optimized for:
293
+ - **Random row queries** - Fast retrieval of arbitrary row subsets
294
+ - **Compression** - Significant space savings compared to uncompressed formats
295
+ - **Scalability** - Efficient handling of datasets with millions of cells/genes
296
+
297
+ Benchmark results can be obtained by running `tests/test_fast_queries.py`.
298
+
299
+ ## Development
300
+
301
+ ### Building for PyPI
302
+
303
+ ```bash
304
+ # Install build tools
305
+ pip install build twine
306
+
307
+ # Build distribution packages
308
+ python -m build
309
+
310
+ # Test locally
311
+ pip install dist/zdata_py-*.whl
312
+
313
+ # Upload to PyPI
314
+ twine upload dist/*
315
+ ```
316
+
317
+ ### Cross-Platform Wheel Building
318
+
319
+ The project uses `cibuildwheel` to build platform-specific wheels. See `.github/workflows/build_wheels.yml` for the CI configuration.
320
+
321
+ **Building wheels locally:**
322
+ ```bash
323
+ pip install cibuildwheel
324
+ export ZSTD_BASE=/path/to/zstd-source
325
+ cibuildwheel --output-dir wheelhouse
326
+ ```
327
+
328
+ **Note:** C tools compilation is required. The setup script will compile C tools during installation if ZSTD is available, or use pre-compiled binaries from the wheel if available.
329
+
330
+ ## License
331
+
332
+ See LICENSE file for details.
@@ -0,0 +1,286 @@
1
+ # zdata
2
+
3
+ Efficient sparse matrix storage and retrieval for large-scale transcriptomics datasets using seekable zstd compression.
4
+
5
+ ## Overview
6
+
7
+ `zdata` is a high-performance library for storing and querying large sparse matrices (e.g., single-cell RNA-seq data) with efficient random row access. It uses a custom format based on:
8
+
9
+ - **Block-compressed sparse row (CSR) format** - Organized in 16-row blocks for efficient access
10
+ - **Zstd seekable compression** - Enables random access to compressed data without full decompression
11
+ - **Chunked storage** - Large matrices are split into 4096-row chunks stored as separate `.bin` files
12
+
13
+ This approach provides excellent compression ratios while maintaining fast random row retrieval performance, making it ideal for querying subsets of large datasets.
14
+
15
+ ## Features
16
+
17
+ - **Fast random row access** - Retrieve arbitrary rows without loading the entire dataset
18
+ - **Efficient compression** - Zstd compression with seekable format for space savings
19
+ - **Scalable** - Handles datasets with millions of rows and columns
20
+ - **Python API** - Simple, intuitive interface for data access
21
+ - **C-based backend** - High-performance C implementation for core operations
22
+ - **Multiple input formats** - Supports both Zarr and H5AD (AnnData) file formats
23
+ - **Auto-detection** - Automatically detects and processes mixed file types in a directory
24
+
25
+ ## Quick Start
26
+
27
+ ### Building zdata from Zarr or H5AD Files
28
+
29
+ The easiest way to create a zdata object is from a directory of zarr files or h5ad files:
30
+
31
+ ```python
32
+ from zdata import build_zdata_from_zarr
33
+
34
+ # Build zdata from a directory containing .zarr files or .h5/.hdf5/.h5ad files
35
+ # The function auto-detects file types based on extensions
36
+ zdata_dir = build_zdata_from_zarr(
37
+ zarr_dir='/path/to/data/directory', # Directory containing .zarr or .h5/.hdf5/.h5ad files
38
+ output_name='my_dataset.zdata', # Output zdata directory name
39
+ block_rows=16, # Rows per block (default: 16)
40
+ max_rows=8192, # Max rows per chunk (default: 8192)
41
+ obs_join_strategy="outer" # How to join obs metadata: "inner", "outer", or "columns"
42
+ )
43
+
44
+ # The function returns the path to the created zdata directory
45
+ print(f"Created zdata directory at: {zdata_dir}")
46
+ ```
47
+
48
+ This single function:
49
+ 1. Auto-detects file types (`.zarr` directories or `.h5`/`.hdf5`/`.h5ad` files)
50
+ 2. Aligns all files to a standard gene list
51
+ 3. Converts them to zdata format with efficient compression
52
+ 4. Concatenates observation metadata from all files
53
+ 5. Creates a complete `.zdata/` directory ready for querying
54
+
55
+ **Supported Input Formats:**
56
+ - **Zarr**: Directories ending in `.zarr` (e.g., `data.zarr/`)
57
+ - **H5AD**: Files with extensions `.h5`, `.hdf5`, or `.h5ad` (e.g., `data.h5ad`)
58
+
59
+ ### Reading from zdata
60
+
61
+ ```python
62
+ from zdata import ZData
63
+
64
+ # Open the zdata directory
65
+ reader = ZData("my_dataset.zdata")
66
+
67
+ # Query specific rows
68
+ rows_data = reader.read_rows([100, 200, 300])
69
+ for row_id, cols, vals in rows_data:
70
+ print(f"Row {row_id}: {len(cols)} non-zero values")
71
+ ```
72
+
73
+ ## Installation
74
+
75
+ ### Prerequisites
76
+
77
+ **Required:**
78
+ - **Python 3.8+**
79
+ - **GCC compiler** (for compiling C tools)
80
+ - **ZSTD source code** (not just the library) - The ZSTD source directory must contain:
81
+ - `lib/libzstd.a` (static library)
82
+ - `lib/common/xxhash.c` (source file)
83
+ - `contrib/seekable_format/zstdseek_compress.c` (source file)
84
+ - `contrib/seekable_format/zstdseek_decompress.c` (source file)
85
+
86
+ **Note:** C tools compilation is **required** for the package to work. The installation will fail if ZSTD is not found or if compilation fails.
87
+
88
+ ### Setting up ZSTD
89
+
90
+ If you don't have ZSTD source code, clone and build it:
91
+
92
+ ```bash
93
+ git clone https://github.com/facebook/zstd.git
94
+ cd zstd
95
+ make
96
+ export ZSTD_BASE=$(pwd)
97
+ ```
98
+
99
+ ### From PyPI
100
+
101
+ ```bash
102
+ # Set ZSTD_BASE before installation
103
+ export ZSTD_BASE=/path/to/zstd-source
104
+ pip install zdata-py
105
+ ```
106
+
107
+ The C tools will be automatically compiled during installation.
108
+
109
+ ### From Source
110
+
111
+ 1. Clone the repository:
112
+ ```bash
113
+ git clone <repository-url>
114
+ cd zdata
115
+ ```
116
+
117
+ 2. Set ZSTD_BASE:
118
+ ```bash
119
+ export ZSTD_BASE=/path/to/zstd-source
120
+ ```
121
+
122
+ 3. Install in development mode:
123
+ ```bash
124
+ pip install -e .
125
+ ```
126
+
127
+ Or install normally:
128
+ ```bash
129
+ pip install .
130
+ ```
131
+
132
+ The C tools will be automatically compiled during installation.
133
+
134
+ ## Usage
135
+
136
+ ### Converting MTX Files to zdata Format
137
+
138
+ ```bash
139
+ ./ctools/mtx_to_zdata matrix.mtx output_name
140
+ ```
141
+
142
+ This creates a directory `output_name.zdata/` containing numbered `.bin` files (0.bin, 1.bin, etc.), each containing up to 4096 rows.
143
+
144
+ ### Python API
145
+
146
+ ```python
147
+ from zdata.core import ZData
148
+
149
+ # Initialize reader
150
+ reader = ZData("andrews") # Looks for andrews.zdata/
151
+
152
+ # Get dataset info
153
+ print(f"Rows: {reader.num_rows}, Columns: {reader.num_columns}")
154
+
155
+ # Read specific rows
156
+ rows_data = reader.read_rows([100, 200, 300])
157
+ for row_id, cols, vals in rows_data:
158
+ print(f"Row {row_id}: {len(cols)} non-zeros")
159
+
160
+ # Read rows as CSR matrix
161
+ csr = reader.read_rows_csr([100, 200, 300])
162
+
163
+ # Get random rows
164
+ random_rows = reader.get_random_rows(10, seed=42)
165
+ data = reader.read_rows(random_rows)
166
+ ```
167
+
168
+ ### Command-Line Tools
169
+
170
+ **Convert MTX to zdata:**
171
+ ```bash
172
+ ./ctools/mtx_to_zdata input.mtx output_name
173
+ ```
174
+
175
+ **Read rows from zdata (binary output):**
176
+ ```bash
177
+ ./ctools/zdata_read --binary output_name.zdata/0.bin "100,200,300"
178
+ ```
179
+
180
+ ## Project Structure
181
+
182
+ ```
183
+ zdata/
184
+ ├── core/ # Python core module
185
+ │ ├── zdata.py # ZData class implementation
186
+ │ └── __init__.py
187
+ ├── build_zdata/ # Build and preprocessing utilities
188
+ │ ├── build_x.py # Build zdata from MTX files
189
+ │ ├── build_zdata.py # Main build function for zarr/h5ad directories
190
+ │ ├── align_mtx.py # Align zarr/h5ad files to standard gene list
191
+ │ ├── check_directory.py # Check zarr directory structure
192
+ │ └── concat_obs.py # Concatenate obs/metadata from zarr/h5ad files
193
+ ├── ctools/ # C command-line tools
194
+ │ ├── mtx_to_zdata.c # MTX to zdata converter
195
+ │ ├── zdata_read.c # Row reader
196
+ │ ├── mtx_to_zdata # Compiled binary (generated during install)
197
+ │ └── zdata_read # Compiled binary (generated during install)
198
+ ├── files/ # Package data files
199
+ │ └── 2ks10c_genes.txt # Default gene list for alignment (required)
200
+ └── tests/ # Test suite
201
+ ├── test_random_rows.py # Random row extraction test
202
+ ├── test_fast_queries.py # Performance benchmark
203
+ └── test_full_pipeline.py # Full pipeline test
204
+ ```
205
+
206
+ ### Default Gene List
207
+
208
+ The package includes a default gene list (`files/2ks10c_genes.txt`) that is used as the standard gene set for aligning zarr and h5ad files. This file is:
209
+ - **Required**: Must be included in the package distribution
210
+ - **Default**: Used automatically when building zdata from zarr or h5ad files
211
+ - **Overridable**: Can be replaced with a custom gene list path if needed
212
+
213
+ ## Testing
214
+
215
+ Run all tests with pytest:
216
+
217
+ ```bash
218
+ pytest tests/
219
+ ```
220
+
221
+ Run the full pipeline test (compiles, builds, and tests):
222
+
223
+ ```bash
224
+ # With zarr files (default)
225
+ python tests/test_full_pipeline_at_scale.py [zarr_directory] [output_name]
226
+
227
+ # With h5ad files
228
+ python tests/test_full_pipeline_at_scale.py --h5ad [h5ad_directory] [output_name]
229
+ ```
230
+
231
+ Run specific test modules:
232
+
233
+ ```bash
234
+ # Test core functionality
235
+ pytest tests/test_core/
236
+
237
+ # Test h5ad support
238
+ pytest tests/test_core/test_h5ad.py
239
+
240
+ # Test with coverage
241
+ pytest tests/ --cov=zdata --cov-report=html
242
+ ```
243
+
244
+ ## Performance
245
+
246
+ The zdata format is optimized for:
247
+ - **Random row queries** - Fast retrieval of arbitrary row subsets
248
+ - **Compression** - Significant space savings compared to uncompressed formats
249
+ - **Scalability** - Efficient handling of datasets with millions of cells/genes
250
+
251
+ Benchmark results can be obtained by running `tests/test_fast_queries.py`.
252
+
253
+ ## Development
254
+
255
+ ### Building for PyPI
256
+
257
+ ```bash
258
+ # Install build tools
259
+ pip install build twine
260
+
261
+ # Build distribution packages
262
+ python -m build
263
+
264
+ # Test locally
265
+ pip install dist/zdata_py-*.whl
266
+
267
+ # Upload to PyPI
268
+ twine upload dist/*
269
+ ```
270
+
271
+ ### Cross-Platform Wheel Building
272
+
273
+ The project uses `cibuildwheel` to build platform-specific wheels. See `.github/workflows/build_wheels.yml` for the CI configuration.
274
+
275
+ **Building wheels locally:**
276
+ ```bash
277
+ pip install cibuildwheel
278
+ export ZSTD_BASE=/path/to/zstd-source
279
+ cibuildwheel --output-dir wheelhouse
280
+ ```
281
+
282
+ **Note:** C tools compilation is required. The setup script will compile C tools during installation if ZSTD is available, or use pre-compiled binaries from the wheel if available.
283
+
284
+ ## License
285
+
286
+ See LICENSE file for details.
@@ -0,0 +1,29 @@
1
+ """
2
+ zdata - Efficient sparse matrix storage and retrieval using seekable zstd compression.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ __version__ = "0.1.0"
8
+
9
+ from zdata._settings import settings
10
+ from zdata.core import ObsWrapper, ZData
11
+ from zdata.build_zdata.build_zdata import build_zdata_from_zarr
12
+ from zdata.build_zdata.build_x import build_zdata
13
+ from zdata.build_zdata.align_mtx import align_zarr_directory_to_mtx, get_default_gene_list_path
14
+ from zdata.build_zdata.concat_obs import concat_obs_from_zarr_directory
15
+ from zdata.build_zdata.check_directory import check_zarr_directory
16
+
17
+ __all__ = [
18
+ "ObsWrapper",
19
+ "ZData",
20
+ "__version__",
21
+ "settings",
22
+ "build_zdata_from_zarr",
23
+ "build_zdata",
24
+ "align_zarr_directory_to_mtx",
25
+ "get_default_gene_list_path",
26
+ "concat_obs_from_zarr_directory",
27
+ "check_zarr_directory",
28
+ ]
29
+