h5adify 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
h5adify-0.1.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 h5adify contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
h5adify-0.1.1/PKG-INFO ADDED
@@ -0,0 +1,305 @@
1
+ Metadata-Version: 2.4
2
+ Name: h5adify
3
+ Version: 0.1.1
4
+ Summary: Download, normalize metadata, and convert public sc/snRNA-seq + spatial datasets to standardized .h5ad (AnnData).
5
+ Author: h5adify contributors
6
+ License: MIT
7
+ Keywords: single-cell,spatial,anndata,h5ad,GEO,cellxgene
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: anndata>=0.10.7
15
+ Requires-Dist: numpy>=1.23
16
+ Requires-Dist: pandas>=1.5
17
+ Requires-Dist: scipy>=1.10
18
+ Requires-Dist: requests>=2.31
19
+ Requires-Dist: tqdm>=4.66
20
+ Requires-Dist: GEOparse>=2.0.4
21
+ Requires-Dist: scanpy>=1.9.8
22
+ Requires-Dist: h5py>=3.9
23
+ Requires-Dist: beautifulsoup4>=4.12
24
+ Requires-Dist: lxml>=5.0
25
+ Requires-Dist: packaging>=23.2
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=7.4; extra == "dev"
28
+ Requires-Dist: ruff>=0.5; extra == "dev"
29
+ Requires-Dist: mypy>=1.8; extra == "dev"
30
+ Requires-Dist: packaging>=23.2; extra == "dev"
31
+ Requires-Dist: build; extra == "dev"
32
+ Requires-Dist: twine; extra == "dev"
33
+ Provides-Extra: docs
34
+ Requires-Dist: sphinx>=7.2; extra == "docs"
35
+ Requires-Dist: myst-parser>=2.0; extra == "docs"
36
+ Requires-Dist: sphinx-rtd-theme>=2.0; extra == "docs"
37
+ Dynamic: license-file
38
+
39
+ # h5adify
40
+
41
+ <p align="center">
42
+ <img src="https://img.shields.io/badge/Python-3.10%E2%80%933.11-informational.svg" />
43
+ <img src="https://img.shields.io/badge/AnnData-.h5ad%20native-blueviolet.svg" />
44
+ <img src="https://img.shields.io/badge/Scanpy-compatible-brightgreen.svg" />
45
+ <img src="https://img.shields.io/badge/Modality-single--cell%20%2B%20spatial-success.svg" />
46
+ <img src="https://img.shields.io/badge/Sources-GEO%20%7C%20CELLxGENE%20%7C%20Zenodo%20%7C%20UCSC%20%7C%20EMA-orange.svg" />
47
+ </p>
48
+
49
+ `h5adify` is a small Python library + CLI to **search**, **download**, and **convert** public single-cell / spatial datasets into **standardized `.h5ad` (AnnData)** with consistent metadata fields (`.obs`).
50
+ It can also **merge** multiple datasets (even across sources) into a single `.h5ad`.
51
+
52
+ > **Best-effort by design**: public portals vary wildly. Some provide direct `.h5ad`, others provide 10x MTX/H5, and many clinical datasets are controlled-access. `h5adify` focuses on workflows that can be automated reliably without proprietary tooling, so that a very large number of datasets can be downloaded and annotated automatically and homogeneously.
53
+
54
+ ---
55
+
56
+ ## Supported sources
57
+
58
+ - **GEO (GSE/GSM)**
59
+ Downloads *processed supplementary matrices* (10x MTX/H5, etc.) and converts to `.h5ad` (**does not require SRA**).
60
+
61
+ - **CZ CELLxGENE Discover**
62
+ Accepts **dataset UUIDs** or direct **`.h5ad` URLs**.
63
+ Search is best-effort (API schema can vary and may return different JSON shapes depending on endpoint/proxy).
64
+
65
+ - **Zenodo**
66
+ Best-effort download via public endpoints / direct file links (when available).
67
+
68
+ - **UCSC Cell Browser (single-cell + some spatial datasets)**
69
+ Search via UCSC dataset registry, and download when a dataset exposes a direct `.h5ad` in the dataset directory.
70
+
71
+ - **EMA (EBI) — BioStudies / ArrayExpress**
72
+ Search via EBI BioStudies API (ArrayExpress collection).
73
+ Download works **only** when a study provides an attached **`.h5ad`** file.
74
+
75
+ ---
76
+
77
+ ## Install (local)
78
+
79
+ ### 1) Clone + venv
80
+ ```bash
81
+ git clone <your-fork-or-local-repo>
82
+ cd h5adify
83
+ python -m venv .venv
84
+ source .venv/bin/activate
85
+ pip install -U pip
86
+ ```
87
+ ### 2) Install h5adify
88
+ ```bash
89
+ pip install -e . # core
90
+ pip install -e ".[docs]" # docs build dependencies (optional)
91
+ ```
92
+
93
+ ## Install (from pip)
94
+
95
+ ```bash
96
+ pip install h5adify
97
+ ```
98
+
99
+ ## Quickstart (CLI)
100
+ ### 1) Search datasets
101
+
102
+ ```bash
103
+ # GEO
104
+ h5adify search geo --query "human brain spatial transcriptomics" --max-results 20
105
+
106
+ # CELLxGENE
107
+ h5adify search cellxgene --query "human brain spatial transcriptomics" --max-results 20
108
+
109
+ # UCSC Cell Browser
110
+ h5adify search ucsc --query "human hippocampus" --max-results 20
111
+
112
+ # EMA / EBI (BioStudies / ArrayExpress)
113
+ h5adify search ema --query "single cell brain" --max-results 20
114
+ ```
115
+ ### 2) Download + convert (per dataset -> one .h5ad)
116
+
117
+ ```bash
118
+ # GEO: converts all samples with parseable supplementary matrices
119
+ h5adify download geo --gse GSE229409 --outdir data/out
120
+
121
+ # CELLxGENE: dataset UUID or direct .h5ad URL
122
+ h5adify download cellxgene --id e52ed1cc-d59f-4bf5-9716-8d81f14a89fd --outdir data/out
123
+ h5adify download cellxgene --id https://datasets.cellxgene.cziscience.com/e52ed1cc-d59f-4bf5-9716-8d81f14a89fd.h5ad --outdir data/out
124
+
125
+ # SODB: dataset-level (downloads all experiments -> one merged file)
126
+ h5adify download sodb --id "Mouse brain atlas" --outdir data/out
127
+
128
+ # SODB: single experiment
129
+ h5adify download sodb --id "Mouse brain atlas::exp_001" --outdir data/out
130
+
131
+ # UCSC: dataset id from search results (download works when a .h5ad is exposed)
132
+ h5adify download ucsc --id human-hippo-axis --outdir data/out
133
+
134
+ # EMA: E-MTAB / E-XXXX study accession (download works when an attached .h5ad is present)
135
+ h5adify download ema --id E-MTAB-XXXX --outdir data/out
136
+ ```
137
+
138
+ ### 3) Multi-source batch + merge
139
+ ```bash
140
+ h5adify batch \
141
+ --ids geo:GSE229409 \
142
+ cellxgene:e52ed1cc-d59f-4bf5-9716-8d81f14a89fd \
143
+ sodb:"Mouse brain atlas::exp_001" \
144
+ --outdir data/out \
145
+ --merge-out data/out/merged_all.h5ad
146
+ ```
147
+ ### 4) Batch multiple files from different databases
148
+ ```bash
149
+ h5adify batch --ids geo:GSE229409 \
150
+ cellxgene:e52ed1cc-d59f-4bf5-9716-8d81f14a89fd \
151
+ --outdir data/out \
152
+ --merge-out data/out/merged.h5ad
153
+ ```
154
+
155
+ ### 5) Provide a manifest of a list of h5ad files
156
+ ```bash
157
+ h5adify manifest --root data/stereo_seq_mouse_embryo/ \
158
+ --out data/stereo_seq_mouse_embryo/out
159
+ ```
160
+ It produces a `.csv` file and a `.jsonl` file, which let you analyze the metadata of a large list of samples.
161
+
162
+ ### 6) Query the metadata of a list of h5ad files
163
+
164
+ There are 2 `.h5ad` files in this folder:
165
+
166
+ ```bash
167
+ h5adify query --root data/stereo_seq_mouse_embryo/
168
+ UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
169
+ utils.warn_names_duplicates("obs")
170
+ [
171
+ {
172
+ "path": "data/stereo_seq_mouse_embryo/mouse_embryo_all_slices.h5ad",
173
+ "filename": "mouse_embryo_all_slices.h5ad",
174
+ "n_obs": 176711,
175
+ "n_vars": 1923,
176
+ "x_dtype": "float32",
177
+ "is_sparse": false,
178
+ "has_raw_counts": false,
179
+ "has_spatial": true,
180
+ "layers": "count,norm",
181
+ "obsm": "spatial,spatial_aligned,spatial_pair",
182
+ "source": "",
183
+ "dataset_id": "",
184
+ "species": "",
185
+ "technology": "",
186
+ "condition": "",
187
+ "disease": "",
188
+ "batch": "real",
189
+ "checksum_sha256": ""
190
+ },
191
+ {
192
+ "path": "data/stereo_seq_mouse_embryo/E16.5_E1S3_cell_bin.h5ad",
193
+ "filename": "E16.5_E1S3_cell_bin.h5ad",
194
+ "n_obs": 281377,
195
+ "n_vars": 28103,
196
+ "x_dtype": "float32",
197
+ "is_sparse": false,
198
+ "has_raw_counts": false,
199
+ "has_spatial": true,
200
+ "layers": "counts",
201
+ "obsm": "spatial",
202
+ "source": "",
203
+ "dataset_id": "",
204
+ "species": "",
205
+ "technology": "",
206
+ "condition": "",
207
+ "disease": "",
208
+ "batch": "",
209
+ "checksum_sha256": ""
210
+ }
211
+ ]
212
+ ```
213
+ ### 7) Inspect the metadata of h5ad
214
+
215
+ ```bash
216
+ h5adify inspect --path data/stereo_seq_mouse_embryo/mouse_embryo_all_slices.h5ad
217
+ UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
218
+ utils.warn_names_duplicates("obs")
219
+
220
+ {
221
+ "path": "/home/aalentorn/Téléchargements/data/stereo_seq_mouse_embryo/mouse_embryo_all_slices.h5ad",
222
+ "n_obs": 176711,
223
+ "n_vars": 1923,
224
+ "obs_cols": [
225
+ "n_genes_by_counts",
226
+ "log1p_n_genes_by_counts",
227
+ "total_counts",
228
+ "log1p_total_counts",
229
+ "annotation"
230
+ ],
231
+ "var_cols": [],
232
+ "layers": [
233
+ "count",
234
+ "norm"
235
+ ],
236
+ "obsm": [
237
+ "spatial",
238
+ "spatial_aligned",
239
+ "spatial_pair"
240
+ ],
241
+ "uns": [],
242
+ "has_spatial": true,
243
+ "has_raw_counts": false,
244
+ "x_dtype": "float32",
245
+ "x_is_sparse": false,
246
+ "missing_std_fields": {
247
+ "source": 1.0,
248
+ "dataset_id": 1.0,
249
+ "species": 1.0,
250
+ "technology": 1.0,
251
+ "sex": 1.0,
252
+ "age": 1.0,
253
+ "condition": 1.0,
254
+ "disease": 1.0,
255
+ "batch": 0.0
256
+ }
257
+ }
258
+ ```
259
+
260
+ ### Standardized metadata (`.obs`)
261
+
262
+ By default, h5adify tries to fill a standard set of .obs fields where possible, e.g.:
263
+
264
+ `species`
265
+ `technology`
266
+ `sex`
267
+ `age`
268
+ `condition`
269
+ `disease`
270
+ `batch`
271
+ `source`
272
+ `dataset_id`
273
+
274
+ You can override any fields via repeatable `--set`:
275
+
276
+ ```bash
277
+ h5adify download geo --gse GSE229409 --outdir data/out \
278
+ --set species=human --set condition=control --set technology=10x_visium
279
+ ```
280
+
281
+ ### Python usage (notebook)
282
+
283
+ ```python
284
+ from h5adify import download, merge_h5ads
285
+
286
+ # Download one dataset into standardized .h5ad
287
+ paths = download("geo", gse="GSE229409", outdir="data/out")
288
+
289
+ # Merge multiple .h5ad files
290
+ merged = merge_h5ads(["data/out/A.h5ad", "data/out/B.h5ad"], join="outer")
291
+ merged.write_h5ad("data/out/merged.h5ad")
292
+ ```
293
+
294
+ ### Notes on GEO (GSE) conversion
295
+
296
+ `h5adify download geo` focuses on processed supplementary matrices (e.g., 10x MTX/H5).
297
+
298
+ If a GEO series only provides raw SRA, you’ll need a dedicated pipeline (SRA → FASTQ → CellRanger/STARsolo → matrix).
299
+ h5adify will detect “raw-only” cases and explain what’s missing.
300
+
301
+ ---
302
+
303
+ ## License
304
+
305
+ MIT
@@ -0,0 +1,267 @@
1
+ # h5adify
2
+
3
+ <p align="center">
4
+ <img src="https://img.shields.io/badge/Python-3.10%E2%80%933.11-informational.svg" />
5
+ <img src="https://img.shields.io/badge/AnnData-.h5ad%20native-blueviolet.svg" />
6
+ <img src="https://img.shields.io/badge/Scanpy-compatible-brightgreen.svg" />
7
+ <img src="https://img.shields.io/badge/Modality-single--cell%20%2B%20spatial-success.svg" />
8
+ <img src="https://img.shields.io/badge/Sources-GEO%20%7C%20CELLxGENE%20%7C%20Zenodo%20%7C%20UCSC%20%7C%20EMA-orange.svg" />
9
+ </p>
10
+
11
+ `h5adify` is a small Python library + CLI to **search**, **download**, and **convert** public single-cell / spatial datasets into **standardized `.h5ad` (AnnData)** with consistent metadata fields (`.obs`).
12
+ It can also **merge** multiple datasets (even across sources) into a single `.h5ad`.
13
+
14
+ > **Best-effort by design**: public portals vary wildly. Some provide direct `.h5ad`, others provide 10x MTX/H5, and many clinical datasets are controlled-access. `h5adify` focuses on workflows that can be automated reliably without proprietary tooling, so that a very large number of datasets can be downloaded and annotated automatically and homogeneously.
15
+
16
+ ---
17
+
18
+ ## Supported sources
19
+
20
+ - **GEO (GSE/GSM)**
21
+ Downloads *processed supplementary matrices* (10x MTX/H5, etc.) and converts to `.h5ad` (**does not require SRA**).
22
+
23
+ - **CZ CELLxGENE Discover**
24
+ Accepts **dataset UUIDs** or direct **`.h5ad` URLs**.
25
+ Search is best-effort (API schema can vary and may return different JSON shapes depending on endpoint/proxy).
26
+
27
+ - **Zenodo**
28
+ Best-effort download via public endpoints / direct file links (when available).
29
+
30
+ - **UCSC Cell Browser (single-cell + some spatial datasets)**
31
+ Search via UCSC dataset registry, and download when a dataset exposes a direct `.h5ad` in the dataset directory.
32
+
33
+ - **EMA (EBI) — BioStudies / ArrayExpress**
34
+ Search via EBI BioStudies API (ArrayExpress collection).
35
+ Download works **only** when a study provides an attached **`.h5ad`** file.
36
+
37
+ ---
38
+
39
+ ## Install (local)
40
+
41
+ ### 1) Clone + venv
42
+ ```bash
43
+ git clone <your-fork-or-local-repo>
44
+ cd h5adify
45
+ python -m venv .venv
46
+ source .venv/bin/activate
47
+ pip install -U pip
48
+ ```
49
+ ### 2) Install h5adify
50
+ ```bash
51
+ pip install -e . # core
52
+ pip install -e ".[docs]" # docs build dependencies (optional)
53
+ ```
54
+
55
+ ## Install (from pip)
56
+
57
+ ```bash
58
+ pip install h5adify
59
+ ```
60
+
61
+ ## Quickstart (CLI)
62
+ ### 1) Search datasets
63
+
64
+ ```bash
65
+ # GEO
66
+ h5adify search geo --query "human brain spatial transcriptomics" --max-results 20
67
+
68
+ # CELLxGENE
69
+ h5adify search cellxgene --query "human brain spatial transcriptomics" --max-results 20
70
+
71
+ # UCSC Cell Browser
72
+ h5adify search ucsc --query "human hippocampus" --max-results 20
73
+
74
+ # EMA / EBI (BioStudies / ArrayExpress)
75
+ h5adify search ema --query "single cell brain" --max-results 20
76
+ ```
77
+ ### 2) Download + convert (per dataset -> one .h5ad)
78
+
79
+ ```bash
80
+ # GEO: converts all samples with parseable supplementary matrices
81
+ h5adify download geo --gse GSE229409 --outdir data/out
82
+
83
+ # CELLxGENE: dataset UUID or direct .h5ad URL
84
+ h5adify download cellxgene --id e52ed1cc-d59f-4bf5-9716-8d81f14a89fd --outdir data/out
85
+ h5adify download cellxgene --id https://datasets.cellxgene.cziscience.com/e52ed1cc-d59f-4bf5-9716-8d81f14a89fd.h5ad --outdir data/out
86
+
87
+ # SODB: dataset-level (downloads all experiments -> one merged file)
88
+ h5adify download sodb --id "Mouse brain atlas" --outdir data/out
89
+
90
+ # SODB: single experiment
91
+ h5adify download sodb --id "Mouse brain atlas::exp_001" --outdir data/out
92
+
93
+ # UCSC: dataset id from search results (download works when a .h5ad is exposed)
94
+ h5adify download ucsc --id human-hippo-axis --outdir data/out
95
+
96
+ # EMA: E-MTAB / E-XXXX study accession (download works when an attached .h5ad is present)
97
+ h5adify download ema --id E-MTAB-XXXX --outdir data/out
98
+ ```
99
+
100
+ ### 3) Multi-source batch + merge
101
+ ```bash
102
+ h5adify batch \
103
+ --ids geo:GSE229409 \
104
+ cellxgene:e52ed1cc-d59f-4bf5-9716-8d81f14a89fd \
105
+ sodb:"Mouse brain atlas::exp_001" \
106
+ --outdir data/out \
107
+ --merge-out data/out/merged_all.h5ad
108
+ ```
109
+ ### 4) Batch multiple files from different databases
110
+ ```bash
111
+ h5adify batch --ids geo:GSE229409 \
112
+ cellxgene:e52ed1cc-d59f-4bf5-9716-8d81f14a89fd \
113
+ --outdir data/out \
114
+ --merge-out data/out/merged.h5ad
115
+ ```
116
+
117
+ ### 5) Provide a manifest of a list of h5ad files
118
+ ```bash
119
+ h5adify manifest --root data/stereo_seq_mouse_embryo/ \
120
+ --out data/stereo_seq_mouse_embryo/out
121
+ ```
122
+ It produces a `.csv` file and a `.jsonl` file, which let you analyze the metadata of a large list of samples.
123
+
124
+ ### 6) Query the metadata of a list of h5ad files
125
+
126
+ There are 2 `.h5ad` files in this folder:
127
+
128
+ ```bash
129
+ h5adify query --root data/stereo_seq_mouse_embryo/
130
+ UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
131
+ utils.warn_names_duplicates("obs")
132
+ [
133
+ {
134
+ "path": "data/stereo_seq_mouse_embryo/mouse_embryo_all_slices.h5ad",
135
+ "filename": "mouse_embryo_all_slices.h5ad",
136
+ "n_obs": 176711,
137
+ "n_vars": 1923,
138
+ "x_dtype": "float32",
139
+ "is_sparse": false,
140
+ "has_raw_counts": false,
141
+ "has_spatial": true,
142
+ "layers": "count,norm",
143
+ "obsm": "spatial,spatial_aligned,spatial_pair",
144
+ "source": "",
145
+ "dataset_id": "",
146
+ "species": "",
147
+ "technology": "",
148
+ "condition": "",
149
+ "disease": "",
150
+ "batch": "real",
151
+ "checksum_sha256": ""
152
+ },
153
+ {
154
+ "path": "data/stereo_seq_mouse_embryo/E16.5_E1S3_cell_bin.h5ad",
155
+ "filename": "E16.5_E1S3_cell_bin.h5ad",
156
+ "n_obs": 281377,
157
+ "n_vars": 28103,
158
+ "x_dtype": "float32",
159
+ "is_sparse": false,
160
+ "has_raw_counts": false,
161
+ "has_spatial": true,
162
+ "layers": "counts",
163
+ "obsm": "spatial",
164
+ "source": "",
165
+ "dataset_id": "",
166
+ "species": "",
167
+ "technology": "",
168
+ "condition": "",
169
+ "disease": "",
170
+ "batch": "",
171
+ "checksum_sha256": ""
172
+ }
173
+ ]
174
+ ```
175
+ ### 7) Inspect the metadata of h5ad
176
+
177
+ ```bash
178
+ h5adify inspect --path data/stereo_seq_mouse_embryo/mouse_embryo_all_slices.h5ad
179
+ UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
180
+ utils.warn_names_duplicates("obs")
181
+
182
+ {
183
+ "path": "/home/aalentorn/Téléchargements/data/stereo_seq_mouse_embryo/mouse_embryo_all_slices.h5ad",
184
+ "n_obs": 176711,
185
+ "n_vars": 1923,
186
+ "obs_cols": [
187
+ "n_genes_by_counts",
188
+ "log1p_n_genes_by_counts",
189
+ "total_counts",
190
+ "log1p_total_counts",
191
+ "annotation"
192
+ ],
193
+ "var_cols": [],
194
+ "layers": [
195
+ "count",
196
+ "norm"
197
+ ],
198
+ "obsm": [
199
+ "spatial",
200
+ "spatial_aligned",
201
+ "spatial_pair"
202
+ ],
203
+ "uns": [],
204
+ "has_spatial": true,
205
+ "has_raw_counts": false,
206
+ "x_dtype": "float32",
207
+ "x_is_sparse": false,
208
+ "missing_std_fields": {
209
+ "source": 1.0,
210
+ "dataset_id": 1.0,
211
+ "species": 1.0,
212
+ "technology": 1.0,
213
+ "sex": 1.0,
214
+ "age": 1.0,
215
+ "condition": 1.0,
216
+ "disease": 1.0,
217
+ "batch": 0.0
218
+ }
219
+ }
220
+ ```
221
+
222
+ ### Standardized metadata (`.obs`)
223
+
224
+ By default, h5adify tries to fill a standard set of .obs fields where possible, e.g.:
225
+
226
+ `species`
227
+ `technology`
228
+ `sex`
229
+ `age`
230
+ `condition`
231
+ `disease`
232
+ `batch`
233
+ `source`
234
+ `dataset_id`
235
+
236
+ You can override any fields via repeatable `--set`:
237
+
238
+ ```bash
239
+ h5adify download geo --gse GSE229409 --outdir data/out \
240
+ --set species=human --set condition=control --set technology=10x_visium
241
+ ```
242
+
243
+ ### Python usage (notebook)
244
+
245
+ ```python
246
+ from h5adify import download, merge_h5ads
247
+
248
+ # Download one dataset into standardized .h5ad
249
+ paths = download("geo", gse="GSE229409", outdir="data/out")
250
+
251
+ # Merge multiple .h5ad files
252
+ merged = merge_h5ads(["data/out/A.h5ad", "data/out/B.h5ad"], join="outer")
253
+ merged.write_h5ad("data/out/merged.h5ad")
254
+ ```
255
+
256
+ ### Notes on GEO (GSE) conversion
257
+
258
+ `h5adify download geo` focuses on processed supplementary matrices (e.g., 10x MTX/H5).
259
+
260
+ If a GEO series only provides raw SRA, you’ll need a dedicated pipeline (SRA → FASTQ → CellRanger/STARsolo → matrix).
261
+ h5adify will detect “raw-only” cases and explain what’s missing.
262
+
263
+ ---
264
+
265
+ ## License
266
+
267
+ MIT
@@ -0,0 +1,56 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "h5adify"
7
+ version = "0.1.1"
8
+ description = "Download, normalize metadata, and convert public sc/snRNA-seq + spatial datasets to standardized .h5ad (AnnData)."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = {text = "MIT"}
12
+ authors = [{name="h5adify contributors"}]
13
+ keywords = ["single-cell", "spatial", "anndata", "h5ad", "GEO", "cellxgene"]
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Operating System :: OS Independent",
18
+ ]
19
+
20
+ dependencies = [
21
+ "anndata>=0.10.7",
22
+ "numpy>=1.23",
23
+ "pandas>=1.5",
24
+ "scipy>=1.10",
25
+ "requests>=2.31",
26
+ "tqdm>=4.66",
27
+ "GEOparse>=2.0.4",
28
+ "scanpy>=1.9.8",
29
+ "h5py>=3.9",
30
+ "beautifulsoup4>=4.12",
31
+ "lxml>=5.0",
32
+ "packaging>=23.2",
33
+ ]
34
+
35
+ [project.optional-dependencies]
36
+ dev = ["pytest>=7.4", "ruff>=0.5", "mypy>=1.8", "packaging>=23.2",
37
+ "build",
38
+ "twine"]
39
+ docs = [
40
+ "sphinx>=7.2",
41
+ "myst-parser>=2.0",
42
+ "sphinx-rtd-theme>=2.0",
43
+ ]
44
+
45
+ [project.scripts]
46
+ h5adify = "h5adify.cli:main"
47
+
48
+ [tool.setuptools]
49
+ package-dir = {"" = "src"}
50
+
51
+ [tool.setuptools.packages.find]
52
+ where = ["src"]
53
+
54
+ [tool.ruff]
55
+ line-length = 100
56
+ target-version = "py39"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,10 @@
1
+ """h5adify public API."""
2
+
3
+ from .highlevel import download, batch_download
4
+ from .merge import merge_h5ads
5
+
6
+ # These are imported from sources.base
7
+ from .sources.base import SearchResult, Source
8
+
9
+ __all__ = ["download", "batch_download", "merge_h5ads", "SearchResult", "Source"]
10
+ __version__ = "0.1.1"