scmora-db 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scmora_db-0.1.0/LICENSE +21 -0
- scmora_db-0.1.0/MANIFEST.in +7 -0
- scmora_db-0.1.0/PKG-INFO +173 -0
- scmora_db-0.1.0/README.md +146 -0
- scmora_db-0.1.0/metadata.csv +278 -0
- scmora_db-0.1.0/metadata_summary.json +61 -0
- scmora_db-0.1.0/pyproject.toml +47 -0
- scmora_db-0.1.0/setup.cfg +4 -0
- scmora_db-0.1.0/src/scmora_db/__init__.py +51 -0
- scmora_db-0.1.0/src/scmora_db/_version.py +3 -0
- scmora_db-0.1.0/src/scmora_db/catalog.py +288 -0
- scmora_db-0.1.0/src/scmora_db/cli.py +204 -0
- scmora_db-0.1.0/src/scmora_db/download.py +86 -0
- scmora_db-0.1.0/src/scmora_db/exceptions.py +34 -0
- scmora_db-0.1.0/src/scmora_db/io.py +72 -0
- scmora_db-0.1.0/src/scmora_db/metadata.csv +278 -0
- scmora_db-0.1.0/src/scmora_db/py.typed +1 -0
- scmora_db-0.1.0/src/scmora_db.egg-info/PKG-INFO +173 -0
- scmora_db-0.1.0/src/scmora_db.egg-info/SOURCES.txt +25 -0
- scmora_db-0.1.0/src/scmora_db.egg-info/dependency_links.txt +1 -0
- scmora_db-0.1.0/src/scmora_db.egg-info/entry_points.txt +2 -0
- scmora_db-0.1.0/src/scmora_db.egg-info/requires.txt +8 -0
- scmora_db-0.1.0/src/scmora_db.egg-info/top_level.txt +1 -0
- scmora_db-0.1.0/tests/conftest.py +8 -0
- scmora_db-0.1.0/tests/test_catalog.py +46 -0
- scmora_db-0.1.0/tests/test_cli.py +31 -0
- scmora_db-0.1.0/tests/test_download.py +11 -0
scmora_db-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 SCMORA DB contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
scmora_db-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scmora-db
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Search, download, and load SCMORA .h5mu datasets from Hugging Face.
|
|
5
|
+
Author: SCMORA DB contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Keywords: single-cell,multiome,mudata,h5mu,huggingface
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: huggingface_hub>=0.23
|
|
20
|
+
Requires-Dist: mudata>=0.2.4
|
|
21
|
+
Requires-Dist: pandas>=2.0
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: build; extra == "dev"
|
|
24
|
+
Requires-Dist: pytest; extra == "dev"
|
|
25
|
+
Requires-Dist: twine; extra == "dev"
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# scmora-db
|
|
29
|
+
|
|
30
|
+
`scmora-db` is a small Python package for searching, downloading, and loading
|
|
31
|
+
SCMORA `.h5mu` datasets from the Hugging Face dataset repository
|
|
32
|
+
`shiny321/genome-db`.
|
|
33
|
+
|
|
34
|
+
The package ships with a lightweight metadata catalog. Large `.h5mu` files stay
|
|
35
|
+
on Hugging Face and are downloaded only when requested.
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
|
|
39
|
+
Python 3.10 or newer is required.
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install scmora-db
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
This installs everything needed to search, download, and load `.h5mu` files as
|
|
46
|
+
MuData objects.
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
For local development:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
cd path/to/pkg
|
|
53
|
+
pip install -e ".[dev]"
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Python API
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from scmora_db import search_datasets, download_datasets, load_datasets
|
|
60
|
+
|
|
61
|
+
catalog = search_datasets(dataset_id="GSM5085810_GM12878_rep1")
|
|
62
|
+
|
|
63
|
+
paths = download_datasets(
|
|
64
|
+
detailed_condition="Control",
|
|
65
|
+
usage_tag="control",
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
mdata = load_datasets(
|
|
69
|
+
dataset_id="GSM5085810_GM12878_rep1",
|
|
70
|
+
backed="r",
|
|
71
|
+
)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Supported filters:
|
|
75
|
+
|
|
76
|
+
- `dataset_id`
|
|
77
|
+
- `dataset_uid`
|
|
78
|
+
- `gse_id`
|
|
79
|
+
- `detailed_condition`
|
|
80
|
+
- `usage_tag`
|
|
81
|
+
- `detail_source`
|
|
82
|
+
- `condition`
|
|
83
|
+
- `sample_type`
|
|
84
|
+
- `species`
|
|
85
|
+
- `reference`
|
|
86
|
+
|
|
87
|
+
`dataset_uid` is the safest unique identifier and is formatted as
|
|
88
|
+
`gse_id/dataset_id` (for example, `GSE166797/GSM5085810_GM12878_rep1`).
|
|
89
|
+
|
|
90
|
+
## Multi-Match Rule
|
|
91
|
+
|
|
92
|
+
`download_datasets()` and `load_datasets()` use this default rule:
|
|
93
|
+
|
|
94
|
+
- one match: return one path or one MuData object
|
|
95
|
+
- two to five matches: return a list
|
|
96
|
+
- more than five matches: stop and report all matched `dataset_uid` values
|
|
97
|
+
|
|
98
|
+
This prevents accidental large downloads.
|
|
99
|
+
|
|
100
|
+
## Command Line
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
scmora-db search --usage-tag control
|
|
104
|
+
scmora-db search --detailed-condition Control
|
|
105
|
+
scmora-db search --detail-source "GM12878 (Cell Line)"
|
|
106
|
+
scmora-db download --dataset-id GSM5085810_GM12878_rep1
|
|
107
|
+
scmora-db load --dataset-id GSM5085810_GM12878_rep1 --backed r
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
List available metadata values:
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
scmora-db list dataset-ids
|
|
114
|
+
scmora-db list dataset-uids
|
|
115
|
+
scmora-db list gse-ids
|
|
116
|
+
scmora-db list usage-tags
|
|
117
|
+
scmora-db list groups
|
|
118
|
+
scmora-db list condition
|
|
119
|
+
scmora-db list detailed-conditions
|
|
120
|
+
scmora-db list detail-sources
|
|
121
|
+
scmora-db list sample-types
|
|
122
|
+
scmora-db list species
|
|
123
|
+
scmora-db list references
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Useful options:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
scmora-db download --cache-dir ./hf-cache --local-dir ./data
|
|
130
|
+
scmora-db search --prefer-remote
|
|
131
|
+
scmora-db download --revision main
|
|
132
|
+
scmora-db download --token hf_xxx
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Metadata
|
|
136
|
+
|
|
137
|
+
The bundled metadata file contains 277 datasets and these core columns:
|
|
138
|
+
|
|
139
|
+
- `dataset_uid`
|
|
140
|
+
- `dataset_id`
|
|
141
|
+
- `gse_id`
|
|
142
|
+
- `file_path`
|
|
143
|
+
- `file_name`
|
|
144
|
+
- `species`
|
|
145
|
+
- `reference`
|
|
146
|
+
- `group`
|
|
147
|
+
- `usage_primary`
|
|
148
|
+
- `usage_tags`
|
|
149
|
+
- `sample_type`
|
|
150
|
+
- `detail_source`
|
|
151
|
+
- `condition`
|
|
152
|
+
- `detailed_condition`
|
|
153
|
+
|
|
154
|
+
The `file_path` values are relative to `shiny321/genome-db`, for example:
|
|
155
|
+
|
|
156
|
+
```text
|
|
157
|
+
GSE166797/GSM5085810_GM12878_rep1.h5mu
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Build and Publish
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
python -m pip install -U build twine
|
|
164
|
+
python -m build
|
|
165
|
+
python -m twine check dist/*
|
|
166
|
+
python -m twine upload --repository testpypi dist/*
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
After testing on TestPyPI:
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
python -m twine upload dist/*
|
|
173
|
+
```
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# scmora-db
|
|
2
|
+
|
|
3
|
+
`scmora-db` is a small Python package for searching, downloading, and loading
|
|
4
|
+
SCMORA `.h5mu` datasets from the Hugging Face dataset repository
|
|
5
|
+
`shiny321/genome-db`.
|
|
6
|
+
|
|
7
|
+
The package ships with a lightweight metadata catalog. Large `.h5mu` files stay
|
|
8
|
+
on Hugging Face and are downloaded only when requested.
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
Python 3.10 or newer is required.
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install scmora-db
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
This installs everything needed to search, download, and load `.h5mu` files as
|
|
19
|
+
MuData objects.
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
For local development:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
cd path/to/pkg
|
|
26
|
+
pip install -e ".[dev]"
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Python API
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from scmora_db import search_datasets, download_datasets, load_datasets
|
|
33
|
+
|
|
34
|
+
catalog = search_datasets(dataset_id="GSM5085810_GM12878_rep1")
|
|
35
|
+
|
|
36
|
+
paths = download_datasets(
|
|
37
|
+
detailed_condition="Control",
|
|
38
|
+
usage_tag="control",
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
mdata = load_datasets(
|
|
42
|
+
dataset_id="GSM5085810_GM12878_rep1",
|
|
43
|
+
backed="r",
|
|
44
|
+
)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Supported filters:
|
|
48
|
+
|
|
49
|
+
- `dataset_id`
|
|
50
|
+
- `dataset_uid`
|
|
51
|
+
- `gse_id`
|
|
52
|
+
- `detailed_condition`
|
|
53
|
+
- `usage_tag`
|
|
54
|
+
- `detail_source`
|
|
55
|
+
- `condition`
|
|
56
|
+
- `sample_type`
|
|
57
|
+
- `species`
|
|
58
|
+
- `reference`
|
|
59
|
+
|
|
60
|
+
`dataset_uid` is the safest unique identifier and is formatted as
|
|
61
|
+
`gse_id/dataset_id` (for example, `GSE166797/GSM5085810_GM12878_rep1`).
|
|
62
|
+
|
|
63
|
+
## Multi-Match Rule
|
|
64
|
+
|
|
65
|
+
`download_datasets()` and `load_datasets()` use this default rule:
|
|
66
|
+
|
|
67
|
+
- one match: return one path or one MuData object
|
|
68
|
+
- two to five matches: return a list
|
|
69
|
+
- more than five matches: stop and report all matched `dataset_uid` values
|
|
70
|
+
|
|
71
|
+
This prevents accidental large downloads.
|
|
72
|
+
|
|
73
|
+
## Command Line
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
scmora-db search --usage-tag control
|
|
77
|
+
scmora-db search --detailed-condition Control
|
|
78
|
+
scmora-db search --detail-source "GM12878 (Cell Line)"
|
|
79
|
+
scmora-db download --dataset-id GSM5085810_GM12878_rep1
|
|
80
|
+
scmora-db load --dataset-id GSM5085810_GM12878_rep1 --backed r
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
List available metadata values:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
scmora-db list dataset-ids
|
|
87
|
+
scmora-db list dataset-uids
|
|
88
|
+
scmora-db list gse-ids
|
|
89
|
+
scmora-db list usage-tags
|
|
90
|
+
scmora-db list groups
|
|
91
|
+
scmora-db list condition
|
|
92
|
+
scmora-db list detailed-conditions
|
|
93
|
+
scmora-db list detail-sources
|
|
94
|
+
scmora-db list sample-types
|
|
95
|
+
scmora-db list species
|
|
96
|
+
scmora-db list references
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Useful options:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
scmora-db download --cache-dir ./hf-cache --local-dir ./data
|
|
103
|
+
scmora-db search --prefer-remote
|
|
104
|
+
scmora-db download --revision main
|
|
105
|
+
scmora-db download --token hf_xxx
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Metadata
|
|
109
|
+
|
|
110
|
+
The bundled metadata file contains 277 datasets and these core columns:
|
|
111
|
+
|
|
112
|
+
- `dataset_uid`
|
|
113
|
+
- `dataset_id`
|
|
114
|
+
- `gse_id`
|
|
115
|
+
- `file_path`
|
|
116
|
+
- `file_name`
|
|
117
|
+
- `species`
|
|
118
|
+
- `reference`
|
|
119
|
+
- `group`
|
|
120
|
+
- `usage_primary`
|
|
121
|
+
- `usage_tags`
|
|
122
|
+
- `sample_type`
|
|
123
|
+
- `detail_source`
|
|
124
|
+
- `condition`
|
|
125
|
+
- `detailed_condition`
|
|
126
|
+
|
|
127
|
+
The `file_path` values are relative to `shiny321/genome-db`, for example:
|
|
128
|
+
|
|
129
|
+
```text
|
|
130
|
+
GSE166797/GSM5085810_GM12878_rep1.h5mu
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Build and Publish
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
python -m pip install -U build twine
|
|
137
|
+
python -m build
|
|
138
|
+
python -m twine check dist/*
|
|
139
|
+
python -m twine upload --repository testpypi dist/*
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
After testing on TestPyPI:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
python -m twine upload dist/*
|
|
146
|
+
```
|