h5adify 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- h5adify-0.1.1/LICENSE +21 -0
- h5adify-0.1.1/PKG-INFO +305 -0
- h5adify-0.1.1/README.md +267 -0
- h5adify-0.1.1/pyproject.toml +56 -0
- h5adify-0.1.1/setup.cfg +4 -0
- h5adify-0.1.1/src/h5adify/__init__.py +10 -0
- h5adify-0.1.1/src/h5adify/cli.py +301 -0
- h5adify-0.1.1/src/h5adify/config.py +31 -0
- h5adify-0.1.1/src/h5adify/highlevel.py +77 -0
- h5adify-0.1.1/src/h5adify/inspect.py +105 -0
- h5adify-0.1.1/src/h5adify/local_query.py +34 -0
- h5adify-0.1.1/src/h5adify/manifest.py +189 -0
- h5adify-0.1.1/src/h5adify/merge.py +18 -0
- h5adify-0.1.1/src/h5adify/metadata.py +46 -0
- h5adify-0.1.1/src/h5adify/registry.py +49 -0
- h5adify-0.1.1/src/h5adify/sources/base.py +30 -0
- h5adify-0.1.1/src/h5adify/sources/cellxgene.py +101 -0
- h5adify-0.1.1/src/h5adify/sources/ema.py +196 -0
- h5adify-0.1.1/src/h5adify/sources/geo.py +198 -0
- h5adify-0.1.1/src/h5adify/sources/scp.py +113 -0
- h5adify-0.1.1/src/h5adify/sources/sodb.py +142 -0
- h5adify-0.1.1/src/h5adify/sources/ucsc.py +198 -0
- h5adify-0.1.1/src/h5adify/utils.py +167 -0
- h5adify-0.1.1/src/h5adify.egg-info/PKG-INFO +305 -0
- h5adify-0.1.1/src/h5adify.egg-info/SOURCES.txt +27 -0
- h5adify-0.1.1/src/h5adify.egg-info/dependency_links.txt +1 -0
- h5adify-0.1.1/src/h5adify.egg-info/entry_points.txt +2 -0
- h5adify-0.1.1/src/h5adify.egg-info/requires.txt +25 -0
- h5adify-0.1.1/src/h5adify.egg-info/top_level.txt +2 -0
h5adify-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 h5adify contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
h5adify-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: h5adify
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Download, normalize metadata, and convert public sc/snRNA-seq + spatial datasets to standardized .h5ad (AnnData).
|
|
5
|
+
Author: h5adify contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: single-cell,spatial,anndata,h5ad,GEO,cellxgene
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.9
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: anndata>=0.10.7
|
|
15
|
+
Requires-Dist: numpy>=1.23
|
|
16
|
+
Requires-Dist: pandas>=1.5
|
|
17
|
+
Requires-Dist: scipy>=1.10
|
|
18
|
+
Requires-Dist: requests>=2.31
|
|
19
|
+
Requires-Dist: tqdm>=4.66
|
|
20
|
+
Requires-Dist: GEOparse>=2.0.4
|
|
21
|
+
Requires-Dist: scanpy>=1.9.8
|
|
22
|
+
Requires-Dist: h5py>=3.9
|
|
23
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
24
|
+
Requires-Dist: lxml>=5.0
|
|
25
|
+
Requires-Dist: packaging>=23.2
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.4; extra == "dev"
|
|
28
|
+
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
29
|
+
Requires-Dist: mypy>=1.8; extra == "dev"
|
|
30
|
+
Requires-Dist: packaging>=23.2; extra == "dev"
|
|
31
|
+
Requires-Dist: build; extra == "dev"
|
|
32
|
+
Requires-Dist: twine; extra == "dev"
|
|
33
|
+
Provides-Extra: docs
|
|
34
|
+
Requires-Dist: sphinx>=7.2; extra == "docs"
|
|
35
|
+
Requires-Dist: myst-parser>=2.0; extra == "docs"
|
|
36
|
+
Requires-Dist: sphinx-rtd-theme>=2.0; extra == "docs"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
# h5adify
|
|
40
|
+
|
|
41
|
+
<p align="center">
|
|
42
|
+
<img src="https://img.shields.io/badge/Python-3.10%E2%80%933.11-informational.svg" />
|
|
43
|
+
<img src="https://img.shields.io/badge/AnnData-.h5ad%20native-blueviolet.svg" />
|
|
44
|
+
<img src="https://img.shields.io/badge/Scanpy-compatible-brightgreen.svg" />
|
|
45
|
+
<img src="https://img.shields.io/badge/Modality-single--cell%20%2B%20spatial-success.svg" />
|
|
46
|
+
<img src="https://img.shields.io/badge/Sources-GEO%20%7C%20CELLxGENE%20%7C%20Zenodo%20%7C%20UCSC%20%7C%20EMA-orange.svg" />
|
|
47
|
+
</p>
|
|
48
|
+
|
|
49
|
+
`h5adify` is a small Python library + CLI to **search**, **download**, and **convert** public single-cell / spatial datasets into **standardized `.h5ad` (AnnData)** with consistent metadata fields (`.obs`).
|
|
50
|
+
It can also **merge** multiple datasets (even across sources) into a single `.h5ad`.
|
|
51
|
+
|
|
52
|
+
> **Best-effort by design**: public portals vary wildly. Some provide direct `.h5ad`, others provide 10x MTX/H5, and many clinical datasets are controlled-access. `h5adify` focuses on workflows that can be automated reliably without proprietary tooling, so that a very large number of datasets can be downloaded and annotated automatically and homogeneously.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Supported sources
|
|
57
|
+
|
|
58
|
+
- **GEO (GSE/GSM)**
|
|
59
|
+
Downloads *processed supplementary matrices* (10x MTX/H5, etc.) and converts to `.h5ad` (**does not require SRA**).
|
|
60
|
+
|
|
61
|
+
- **CZ CELLxGENE Discover**
|
|
62
|
+
Accepts **dataset UUIDs** or direct **`.h5ad` URLs**.
|
|
63
|
+
Search is best-effort (API schema can vary and may return different JSON shapes depending on endpoint/proxy).
|
|
64
|
+
|
|
65
|
+
- **Zenodo**
|
|
66
|
+
Best-effort download via public endpoints / direct file links (when available).
|
|
67
|
+
|
|
68
|
+
- **UCSC Cell Browser (single-cell + some spatial datasets)**
|
|
69
|
+
Search via UCSC dataset registry, and download when a dataset exposes a direct `.h5ad` in the dataset directory.
|
|
70
|
+
|
|
71
|
+
- **EMA (EBI) — BioStudies / ArrayExpress**
|
|
72
|
+
Search via EBI BioStudies API (ArrayExpress collection).
|
|
73
|
+
Download works **only** when a study provides an attached **`.h5ad`** file.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Install (local)
|
|
78
|
+
|
|
79
|
+
### 1) Clone + venv
|
|
80
|
+
```bash
|
|
81
|
+
git clone <your-fork-or-local-repo>
|
|
82
|
+
cd h5adify
|
|
83
|
+
python -m venv .venv
|
|
84
|
+
source .venv/bin/activate
|
|
85
|
+
pip install -U pip
```
|
|
86
|
+
|
|
87
|
+
### 2) Install h5adify
|
|
88
|
+
```bash
|
|
89
|
+
pip install -e . # core
|
|
90
|
+
pip install -e ".[docs]" # docs build dependencies (optional)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Install (from pip)
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
pip install h5adify
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Quickstart (CLI)
|
|
100
|
+
### 1) Search datasets
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
# GEO
|
|
104
|
+
h5adify search geo --query "human brain spatial transcriptomics" --max-results 20
|
|
105
|
+
|
|
106
|
+
# CELLxGENE
|
|
107
|
+
h5adify search cellxgene --query "human brain spatial transcriptomics" --max-results 20
|
|
108
|
+
|
|
109
|
+
# UCSC Cell Browser
|
|
110
|
+
h5adify search ucsc --query "human hippocampus" --max-results 20
|
|
111
|
+
|
|
112
|
+
# EMA / EBI (BioStudies / ArrayExpress)
|
|
113
|
+
h5adify search ema --query "single cell brain" --max-results 20
|
|
114
|
+
```
|
|
115
|
+
### 2) Download + convert (per dataset -> one .h5ad)
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
# GEO: converts all samples with parseable supplementary matrices
|
|
119
|
+
h5adify download geo --gse GSE229409 --outdir data/out
|
|
120
|
+
|
|
121
|
+
# CELLxGENE: dataset UUID or direct .h5ad URL
|
|
122
|
+
h5adify download cellxgene --id e52ed1cc-d59f-4bf5-9716-8d81f14a89fd --outdir data/out
|
|
123
|
+
h5adify download cellxgene --id https://datasets.cellxgene.cziscience.com/e52ed1cc-d59f-4bf5-9716-8d81f14a89fd.h5ad --outdir data/out
|
|
124
|
+
|
|
125
|
+
# SODB: dataset-level (downloads all experiments -> one merged file)
|
|
126
|
+
h5adify download sodb --id "Mouse brain atlas" --outdir data/out
|
|
127
|
+
|
|
128
|
+
# SODB: single experiment
|
|
129
|
+
h5adify download sodb --id "Mouse brain atlas::exp_001" --outdir data/out
|
|
130
|
+
|
|
131
|
+
# UCSC: dataset id from search results (download works when a .h5ad is exposed)
|
|
132
|
+
h5adify download ucsc --id human-hippo-axis --outdir data/out
|
|
133
|
+
|
|
134
|
+
# EMA: E-MTAB / E-XXXX study accession (download works when an attached .h5ad is present)
|
|
135
|
+
h5adify download ema --id E-MTAB-XXXX --outdir data/out
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### 3) Multi-source batch + merge
|
|
139
|
+
```bash
|
|
140
|
+
h5adify batch \
|
|
141
|
+
--ids geo:GSE229409 \
|
|
142
|
+
cellxgene:e52ed1cc-d59f-4bf5-9716-8d81f14a89fd \
|
|
143
|
+
sodb:"Mouse brain atlas::exp_001" \
|
|
144
|
+
--outdir data/out \
|
|
145
|
+
--merge-out data/out/merged_all.h5ad
|
|
146
|
+
```
|
|
147
|
+
### 4) Batch multiple files from different databases
|
|
148
|
+
```bash
|
|
149
|
+
h5adify batch --ids geo:GSE229409 \
|
|
150
|
+
cellxgene:e52ed1cc-d59f-4bf5-9716-8d81f14a89fd \
|
|
151
|
+
--outdir data/out \
|
|
152
|
+
--merge-out data/out/merged.h5ad
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### 5) Provide a manifest of a list of h5ad files
|
|
156
|
+
```bash
|
|
157
|
+
h5adify manifest --root data/stereo_seq_mouse_embryo/ \
|
|
158
|
+
--out data/stereo_seq_mouse_embryo/out
|
|
159
|
+
```
|
|
160
|
+
This produces `.csv` and `.jsonl` files, allowing you to analyze the metadata of a large list of samples.
|
|
161
|
+
|
|
162
|
+
### 6) Query the metadata of a list of h5ad files
|
|
163
|
+
|
|
164
|
+
There are 2 `.h5ad` files in this folder:
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
h5adify query --root data/stereo_seq_mouse_embryo/
|
|
168
|
+
UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
|
|
169
|
+
utils.warn_names_duplicates("obs")
|
|
170
|
+
[
|
|
171
|
+
{
|
|
172
|
+
"path": "data/stereo_seq_mouse_embryo/mouse_embryo_all_slices.h5ad",
|
|
173
|
+
"filename": "mouse_embryo_all_slices.h5ad",
|
|
174
|
+
"n_obs": 176711,
|
|
175
|
+
"n_vars": 1923,
|
|
176
|
+
"x_dtype": "float32",
|
|
177
|
+
"is_sparse": false,
|
|
178
|
+
"has_raw_counts": false,
|
|
179
|
+
"has_spatial": true,
|
|
180
|
+
"layers": "count,norm",
|
|
181
|
+
"obsm": "spatial,spatial_aligned,spatial_pair",
|
|
182
|
+
"source": "",
|
|
183
|
+
"dataset_id": "",
|
|
184
|
+
"species": "",
|
|
185
|
+
"technology": "",
|
|
186
|
+
"condition": "",
|
|
187
|
+
"disease": "",
|
|
188
|
+
"batch": "real",
|
|
189
|
+
"checksum_sha256": ""
|
|
190
|
+
},
|
|
191
|
+
{
|
|
192
|
+
"path": "data/stereo_seq_mouse_embryo/E16.5_E1S3_cell_bin.h5ad",
|
|
193
|
+
"filename": "E16.5_E1S3_cell_bin.h5ad",
|
|
194
|
+
"n_obs": 281377,
|
|
195
|
+
"n_vars": 28103,
|
|
196
|
+
"x_dtype": "float32",
|
|
197
|
+
"is_sparse": false,
|
|
198
|
+
"has_raw_counts": false,
|
|
199
|
+
"has_spatial": true,
|
|
200
|
+
"layers": "counts",
|
|
201
|
+
"obsm": "spatial",
|
|
202
|
+
"source": "",
|
|
203
|
+
"dataset_id": "",
|
|
204
|
+
"species": "",
|
|
205
|
+
"technology": "",
|
|
206
|
+
"condition": "",
|
|
207
|
+
"disease": "",
|
|
208
|
+
"batch": "",
|
|
209
|
+
"checksum_sha256": ""
|
|
210
|
+
}
|
|
211
|
+
]
|
|
212
|
+
```
|
|
213
|
+
### 7) Inspect the metadata of h5ad
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
h5adify inspect --path data/stereo_seq_mouse_embryo/mouse_embryo_all_slices.h5ad
|
|
217
|
+
UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
|
|
218
|
+
utils.warn_names_duplicates("obs")
|
|
219
|
+
|
|
220
|
+
{
|
|
221
|
+
"path": "/home/aalentorn/Téléchargements/data/stereo_seq_mouse_embryo/mouse_embryo_all_slices.h5ad",
|
|
222
|
+
"n_obs": 176711,
|
|
223
|
+
"n_vars": 1923,
|
|
224
|
+
"obs_cols": [
|
|
225
|
+
"n_genes_by_counts",
|
|
226
|
+
"log1p_n_genes_by_counts",
|
|
227
|
+
"total_counts",
|
|
228
|
+
"log1p_total_counts",
|
|
229
|
+
"annotation"
|
|
230
|
+
],
|
|
231
|
+
"var_cols": [],
|
|
232
|
+
"layers": [
|
|
233
|
+
"count",
|
|
234
|
+
"norm"
|
|
235
|
+
],
|
|
236
|
+
"obsm": [
|
|
237
|
+
"spatial",
|
|
238
|
+
"spatial_aligned",
|
|
239
|
+
"spatial_pair"
|
|
240
|
+
],
|
|
241
|
+
"uns": [],
|
|
242
|
+
"has_spatial": true,
|
|
243
|
+
"has_raw_counts": false,
|
|
244
|
+
"x_dtype": "float32",
|
|
245
|
+
"x_is_sparse": false,
|
|
246
|
+
"missing_std_fields": {
|
|
247
|
+
"source": 1.0,
|
|
248
|
+
"dataset_id": 1.0,
|
|
249
|
+
"species": 1.0,
|
|
250
|
+
"technology": 1.0,
|
|
251
|
+
"sex": 1.0,
|
|
252
|
+
"age": 1.0,
|
|
253
|
+
"condition": 1.0,
|
|
254
|
+
"disease": 1.0,
|
|
255
|
+
"batch": 0.0
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
### Standardized metadata (`.obs`)
|
|
261
|
+
|
|
262
|
+
By default, h5adify tries to fill a standard set of .obs fields where possible, e.g.:
|
|
263
|
+
|
|
264
|
+
`species`
|
|
265
|
+
`technology`
|
|
266
|
+
`sex`
|
|
267
|
+
`age`
|
|
268
|
+
`condition`
|
|
269
|
+
`disease`
|
|
270
|
+
`batch`
|
|
271
|
+
`source`
|
|
272
|
+
`dataset_id`
|
|
273
|
+
|
|
274
|
+
You can override any fields via repeatable `--set`:
|
|
275
|
+
|
|
276
|
+
```bash
|
|
277
|
+
h5adify download geo --gse GSE229409 --outdir data/out \
|
|
278
|
+
--set species=human --set condition=control --set technology=10x_visium
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
### Python usage (notebook)
|
|
282
|
+
|
|
283
|
+
```python
|
|
284
|
+
from h5adify import download, merge_h5ads
|
|
285
|
+
|
|
286
|
+
# Download one dataset into standardized .h5ad
|
|
287
|
+
paths = download("geo", gse="GSE229409", outdir="data/out")
|
|
288
|
+
|
|
289
|
+
# Merge multiple .h5ad files
|
|
290
|
+
merged = merge_h5ads(["data/out/A.h5ad", "data/out/B.h5ad"], join="outer")
|
|
291
|
+
merged.write_h5ad("data/out/merged.h5ad")
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
### Notes on GEO (GSE) conversion
|
|
295
|
+
|
|
296
|
+
`h5adify download geo` focuses on processed supplementary matrices (e.g., 10x MTX/H5).
|
|
297
|
+
|
|
298
|
+
If a GEO series only provides raw SRA, you’ll need a dedicated pipeline (SRA → FASTQ → CellRanger/STARsolo → matrix).
|
|
299
|
+
h5adify will detect “raw-only” cases and explain what’s missing.
|
|
300
|
+
|
|
301
|
+
---
|
|
302
|
+
|
|
303
|
+
## License
|
|
304
|
+
|
|
305
|
+
MIT
|
h5adify-0.1.1/README.md
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
# h5adify
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img src="https://img.shields.io/badge/Python-3.10%E2%80%933.11-informational.svg" />
|
|
5
|
+
<img src="https://img.shields.io/badge/AnnData-.h5ad%20native-blueviolet.svg" />
|
|
6
|
+
<img src="https://img.shields.io/badge/Scanpy-compatible-brightgreen.svg" />
|
|
7
|
+
<img src="https://img.shields.io/badge/Modality-single--cell%20%2B%20spatial-success.svg" />
|
|
8
|
+
<img src="https://img.shields.io/badge/Sources-GEO%20%7C%20CELLxGENE%20%7C%20Zenodo%20%7C%20UCSC%20%7C%20EMA-orange.svg" />
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
`h5adify` is a small Python library + CLI to **search**, **download**, and **convert** public single-cell / spatial datasets into **standardized `.h5ad` (AnnData)** with consistent metadata fields (`.obs`).
|
|
12
|
+
It can also **merge** multiple datasets (even across sources) into a single `.h5ad`.
|
|
13
|
+
|
|
14
|
+
> **Best-effort by design**: public portals vary wildly. Some provide direct `.h5ad`, others provide 10x MTX/H5, and many clinical datasets are controlled-access. `h5adify` focuses on workflows that can be automated reliably without proprietary tooling, so that a very large number of datasets can be downloaded and annotated automatically and homogeneously.
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## Supported sources
|
|
19
|
+
|
|
20
|
+
- **GEO (GSE/GSM)**
|
|
21
|
+
Downloads *processed supplementary matrices* (10x MTX/H5, etc.) and converts to `.h5ad` (**does not require SRA**).
|
|
22
|
+
|
|
23
|
+
- **CZ CELLxGENE Discover**
|
|
24
|
+
Accepts **dataset UUIDs** or direct **`.h5ad` URLs**.
|
|
25
|
+
Search is best-effort (API schema can vary and may return different JSON shapes depending on endpoint/proxy).
|
|
26
|
+
|
|
27
|
+
- **Zenodo**
|
|
28
|
+
Best-effort download via public endpoints / direct file links (when available).
|
|
29
|
+
|
|
30
|
+
- **UCSC Cell Browser (single-cell + some spatial datasets)**
|
|
31
|
+
Search via UCSC dataset registry, and download when a dataset exposes a direct `.h5ad` in the dataset directory.
|
|
32
|
+
|
|
33
|
+
- **EMA (EBI) — BioStudies / ArrayExpress**
|
|
34
|
+
Search via EBI BioStudies API (ArrayExpress collection).
|
|
35
|
+
Download works **only** when a study provides an attached **`.h5ad`** file.
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Install (local)
|
|
40
|
+
|
|
41
|
+
### 1) Clone + venv
|
|
42
|
+
```bash
|
|
43
|
+
git clone <your-fork-or-local-repo>
|
|
44
|
+
cd h5adify
|
|
45
|
+
python -m venv .venv
|
|
46
|
+
source .venv/bin/activate
|
|
47
|
+
pip install -U pip
```
|
|
48
|
+
|
|
49
|
+
### 2) Install h5adify
|
|
50
|
+
```bash
|
|
51
|
+
pip install -e . # core
|
|
52
|
+
pip install -e ".[docs]" # docs build dependencies (optional)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Install (from pip)
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install h5adify
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Quickstart (CLI)
|
|
62
|
+
### 1) Search datasets
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
# GEO
|
|
66
|
+
h5adify search geo --query "human brain spatial transcriptomics" --max-results 20
|
|
67
|
+
|
|
68
|
+
# CELLxGENE
|
|
69
|
+
h5adify search cellxgene --query "human brain spatial transcriptomics" --max-results 20
|
|
70
|
+
|
|
71
|
+
# UCSC Cell Browser
|
|
72
|
+
h5adify search ucsc --query "human hippocampus" --max-results 20
|
|
73
|
+
|
|
74
|
+
# EMA / EBI (BioStudies / ArrayExpress)
|
|
75
|
+
h5adify search ema --query "single cell brain" --max-results 20
|
|
76
|
+
```
|
|
77
|
+
### 2) Download + convert (per dataset -> one .h5ad)
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# GEO: converts all samples with parseable supplementary matrices
|
|
81
|
+
h5adify download geo --gse GSE229409 --outdir data/out
|
|
82
|
+
|
|
83
|
+
# CELLxGENE: dataset UUID or direct .h5ad URL
|
|
84
|
+
h5adify download cellxgene --id e52ed1cc-d59f-4bf5-9716-8d81f14a89fd --outdir data/out
|
|
85
|
+
h5adify download cellxgene --id https://datasets.cellxgene.cziscience.com/e52ed1cc-d59f-4bf5-9716-8d81f14a89fd.h5ad --outdir data/out
|
|
86
|
+
|
|
87
|
+
# SODB: dataset-level (downloads all experiments -> one merged file)
|
|
88
|
+
h5adify download sodb --id "Mouse brain atlas" --outdir data/out
|
|
89
|
+
|
|
90
|
+
# SODB: single experiment
|
|
91
|
+
h5adify download sodb --id "Mouse brain atlas::exp_001" --outdir data/out
|
|
92
|
+
|
|
93
|
+
# UCSC: dataset id from search results (download works when a .h5ad is exposed)
|
|
94
|
+
h5adify download ucsc --id human-hippo-axis --outdir data/out
|
|
95
|
+
|
|
96
|
+
# EMA: E-MTAB / E-XXXX study accession (download works when an attached .h5ad is present)
|
|
97
|
+
h5adify download ema --id E-MTAB-XXXX --outdir data/out
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### 3) Multi-source batch + merge
|
|
101
|
+
```bash
|
|
102
|
+
h5adify batch \
|
|
103
|
+
--ids geo:GSE229409 \
|
|
104
|
+
cellxgene:e52ed1cc-d59f-4bf5-9716-8d81f14a89fd \
|
|
105
|
+
sodb:"Mouse brain atlas::exp_001" \
|
|
106
|
+
--outdir data/out \
|
|
107
|
+
--merge-out data/out/merged_all.h5ad
|
|
108
|
+
```
|
|
109
|
+
### 4) Batch multiple files from different databases
|
|
110
|
+
```bash
|
|
111
|
+
h5adify batch --ids geo:GSE229409 \
|
|
112
|
+
cellxgene:e52ed1cc-d59f-4bf5-9716-8d81f14a89fd \
|
|
113
|
+
--outdir data/out \
|
|
114
|
+
--merge-out data/out/merged.h5ad
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### 5) Provide a manifest of a list of h5ad files
|
|
118
|
+
```bash
|
|
119
|
+
h5adify manifest --root data/stereo_seq_mouse_embryo/ \
|
|
120
|
+
--out data/stereo_seq_mouse_embryo/out
|
|
121
|
+
```
|
|
122
|
+
This produces `.csv` and `.jsonl` files, allowing you to analyze the metadata of a large list of samples.
|
|
123
|
+
|
|
124
|
+
### 6) Query the metadata of a list of h5ad files
|
|
125
|
+
|
|
126
|
+
There are 2 `.h5ad` files in this folder:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
h5adify query --root data/stereo_seq_mouse_embryo/
|
|
130
|
+
UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
|
|
131
|
+
utils.warn_names_duplicates("obs")
|
|
132
|
+
[
|
|
133
|
+
{
|
|
134
|
+
"path": "data/stereo_seq_mouse_embryo/mouse_embryo_all_slices.h5ad",
|
|
135
|
+
"filename": "mouse_embryo_all_slices.h5ad",
|
|
136
|
+
"n_obs": 176711,
|
|
137
|
+
"n_vars": 1923,
|
|
138
|
+
"x_dtype": "float32",
|
|
139
|
+
"is_sparse": false,
|
|
140
|
+
"has_raw_counts": false,
|
|
141
|
+
"has_spatial": true,
|
|
142
|
+
"layers": "count,norm",
|
|
143
|
+
"obsm": "spatial,spatial_aligned,spatial_pair",
|
|
144
|
+
"source": "",
|
|
145
|
+
"dataset_id": "",
|
|
146
|
+
"species": "",
|
|
147
|
+
"technology": "",
|
|
148
|
+
"condition": "",
|
|
149
|
+
"disease": "",
|
|
150
|
+
"batch": "real",
|
|
151
|
+
"checksum_sha256": ""
|
|
152
|
+
},
|
|
153
|
+
{
|
|
154
|
+
"path": "data/stereo_seq_mouse_embryo/E16.5_E1S3_cell_bin.h5ad",
|
|
155
|
+
"filename": "E16.5_E1S3_cell_bin.h5ad",
|
|
156
|
+
"n_obs": 281377,
|
|
157
|
+
"n_vars": 28103,
|
|
158
|
+
"x_dtype": "float32",
|
|
159
|
+
"is_sparse": false,
|
|
160
|
+
"has_raw_counts": false,
|
|
161
|
+
"has_spatial": true,
|
|
162
|
+
"layers": "counts",
|
|
163
|
+
"obsm": "spatial",
|
|
164
|
+
"source": "",
|
|
165
|
+
"dataset_id": "",
|
|
166
|
+
"species": "",
|
|
167
|
+
"technology": "",
|
|
168
|
+
"condition": "",
|
|
169
|
+
"disease": "",
|
|
170
|
+
"batch": "",
|
|
171
|
+
"checksum_sha256": ""
|
|
172
|
+
}
|
|
173
|
+
]
|
|
174
|
+
```
|
|
175
|
+
### 7) Inspect the metadata of h5ad
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
h5adify inspect --path data/stereo_seq_mouse_embryo/mouse_embryo_all_slices.h5ad
|
|
179
|
+
UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
|
|
180
|
+
utils.warn_names_duplicates("obs")
|
|
181
|
+
|
|
182
|
+
{
|
|
183
|
+
"path": "/home/aalentorn/Téléchargements/data/stereo_seq_mouse_embryo/mouse_embryo_all_slices.h5ad",
|
|
184
|
+
"n_obs": 176711,
|
|
185
|
+
"n_vars": 1923,
|
|
186
|
+
"obs_cols": [
|
|
187
|
+
"n_genes_by_counts",
|
|
188
|
+
"log1p_n_genes_by_counts",
|
|
189
|
+
"total_counts",
|
|
190
|
+
"log1p_total_counts",
|
|
191
|
+
"annotation"
|
|
192
|
+
],
|
|
193
|
+
"var_cols": [],
|
|
194
|
+
"layers": [
|
|
195
|
+
"count",
|
|
196
|
+
"norm"
|
|
197
|
+
],
|
|
198
|
+
"obsm": [
|
|
199
|
+
"spatial",
|
|
200
|
+
"spatial_aligned",
|
|
201
|
+
"spatial_pair"
|
|
202
|
+
],
|
|
203
|
+
"uns": [],
|
|
204
|
+
"has_spatial": true,
|
|
205
|
+
"has_raw_counts": false,
|
|
206
|
+
"x_dtype": "float32",
|
|
207
|
+
"x_is_sparse": false,
|
|
208
|
+
"missing_std_fields": {
|
|
209
|
+
"source": 1.0,
|
|
210
|
+
"dataset_id": 1.0,
|
|
211
|
+
"species": 1.0,
|
|
212
|
+
"technology": 1.0,
|
|
213
|
+
"sex": 1.0,
|
|
214
|
+
"age": 1.0,
|
|
215
|
+
"condition": 1.0,
|
|
216
|
+
"disease": 1.0,
|
|
217
|
+
"batch": 0.0
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
### Standardized metadata (`.obs`)
|
|
223
|
+
|
|
224
|
+
By default, h5adify tries to fill a standard set of .obs fields where possible, e.g.:
|
|
225
|
+
|
|
226
|
+
`species`
|
|
227
|
+
`technology`
|
|
228
|
+
`sex`
|
|
229
|
+
`age`
|
|
230
|
+
`condition`
|
|
231
|
+
`disease`
|
|
232
|
+
`batch`
|
|
233
|
+
`source`
|
|
234
|
+
`dataset_id`
|
|
235
|
+
|
|
236
|
+
You can override any fields via repeatable `--set`:
|
|
237
|
+
|
|
238
|
+
```bash
|
|
239
|
+
h5adify download geo --gse GSE229409 --outdir data/out \
|
|
240
|
+
--set species=human --set condition=control --set technology=10x_visium
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### Python usage (notebook)
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
from h5adify import download, merge_h5ads
|
|
247
|
+
|
|
248
|
+
# Download one dataset into standardized .h5ad
|
|
249
|
+
paths = download("geo", gse="GSE229409", outdir="data/out")
|
|
250
|
+
|
|
251
|
+
# Merge multiple .h5ad files
|
|
252
|
+
merged = merge_h5ads(["data/out/A.h5ad", "data/out/B.h5ad"], join="outer")
|
|
253
|
+
merged.write_h5ad("data/out/merged.h5ad")
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### Notes on GEO (GSE) conversion
|
|
257
|
+
|
|
258
|
+
`h5adify download geo` focuses on processed supplementary matrices (e.g., 10x MTX/H5).
|
|
259
|
+
|
|
260
|
+
If a GEO series only provides raw SRA, you’ll need a dedicated pipeline (SRA → FASTQ → CellRanger/STARsolo → matrix).
|
|
261
|
+
h5adify will detect “raw-only” cases and explain what’s missing.
|
|
262
|
+
|
|
263
|
+
---
|
|
264
|
+
|
|
265
|
+
## License
|
|
266
|
+
|
|
267
|
+
MIT
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "h5adify"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "Download, normalize metadata, and convert public sc/snRNA-seq + spatial datasets to standardized .h5ad (AnnData)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [{name="h5adify contributors"}]
|
|
13
|
+
keywords = ["single-cell", "spatial", "anndata", "h5ad", "GEO", "cellxgene"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
dependencies = [
|
|
21
|
+
"anndata>=0.10.7",
|
|
22
|
+
"numpy>=1.23",
|
|
23
|
+
"pandas>=1.5",
|
|
24
|
+
"scipy>=1.10",
|
|
25
|
+
"requests>=2.31",
|
|
26
|
+
"tqdm>=4.66",
|
|
27
|
+
"GEOparse>=2.0.4",
|
|
28
|
+
"scanpy>=1.9.8",
|
|
29
|
+
"h5py>=3.9",
|
|
30
|
+
"beautifulsoup4>=4.12",
|
|
31
|
+
"lxml>=5.0",
|
|
32
|
+
"packaging>=23.2",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.optional-dependencies]
|
|
36
|
+
dev = ["pytest>=7.4", "ruff>=0.5", "mypy>=1.8", "packaging>=23.2",
|
|
37
|
+
"build",
|
|
38
|
+
"twine"]
|
|
39
|
+
docs = [
|
|
40
|
+
"sphinx>=7.2",
|
|
41
|
+
"myst-parser>=2.0",
|
|
42
|
+
"sphinx-rtd-theme>=2.0",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[project.scripts]
|
|
46
|
+
h5adify = "h5adify.cli:main"
|
|
47
|
+
|
|
48
|
+
[tool.setuptools]
|
|
49
|
+
package-dir = {"" = "src"}
|
|
50
|
+
|
|
51
|
+
[tool.setuptools.packages.find]
|
|
52
|
+
where = ["src"]
|
|
53
|
+
|
|
54
|
+
[tool.ruff]
|
|
55
|
+
line-length = 100
|
|
56
|
+
target-version = "py39"
|
h5adify-0.1.1/setup.cfg
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""h5adify public API."""
|
|
2
|
+
|
|
3
|
+
from .highlevel import download, batch_download
|
|
4
|
+
from .merge import merge_h5ads
|
|
5
|
+
|
|
6
|
+
# These are imported from sources.base
|
|
7
|
+
from .sources.base import SearchResult, Source
|
|
8
|
+
|
|
9
|
+
__all__ = ["download", "batch_download", "merge_h5ads", "SearchResult", "Source"]  # names re-exported by the imports above as the package's public API
|
|
10
|
+
__version__ = "0.1.1"  # must match the version declared in pyproject.toml / PKG-INFO for this release
|