sampledisco 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sampledisco-0.1.0/LICENSE +21 -0
- sampledisco-0.1.0/PKG-INFO +254 -0
- sampledisco-0.1.0/README.md +178 -0
- sampledisco-0.1.0/pyproject.toml +49 -0
- sampledisco-0.1.0/requirements.txt +37 -0
- sampledisco-0.1.0/setup.cfg +4 -0
- sampledisco-0.1.0/src/sampledisco/__init__.py +27 -0
- sampledisco-0.1.0/src/sampledisco/cli.py +70 -0
- sampledisco-0.1.0/src/sampledisco/gene_activity/ATAC_ArchR.py +745 -0
- sampledisco-0.1.0/src/sampledisco/gene_activity/ATAC_RNA_harmony.py +1107 -0
- sampledisco-0.1.0/src/sampledisco/gene_activity/ATAC_gene_activity.py +547 -0
- sampledisco-0.1.0/src/sampledisco/gene_activity/ATAC_peak_annotation.py +441 -0
- sampledisco-0.1.0/src/sampledisco/gene_activity/RNA_name_convertor.py +330 -0
- sampledisco-0.1.0/src/sampledisco/gene_activity/__init__.py +0 -0
- sampledisco-0.1.0/src/sampledisco/gene_activity/pseudo_correlation.py +1112 -0
- sampledisco-0.1.0/src/sampledisco/gene_activity/test_pseudo_correlation.py +694 -0
- sampledisco-0.1.0/src/sampledisco/gene_activity/validation.py +913 -0
- sampledisco-0.1.0/src/sampledisco/parameter_selection/__init__.py +0 -0
- sampledisco-0.1.0/src/sampledisco/parameter_selection/autotune.py +877 -0
- sampledisco-0.1.0/src/sampledisco/preparation/ATAC_cell_type.py +493 -0
- sampledisco-0.1.0/src/sampledisco/preparation/ATAC_cell_type_gpu.py +425 -0
- sampledisco-0.1.0/src/sampledisco/preparation/__init__.py +0 -0
- sampledisco-0.1.0/src/sampledisco/preparation/atac_preprocess_cpu.py +316 -0
- sampledisco-0.1.0/src/sampledisco/preparation/atac_preprocess_gpu.py +306 -0
- sampledisco-0.1.0/src/sampledisco/preparation/cell_type_cpu.py +189 -0
- sampledisco-0.1.0/src/sampledisco/preparation/cell_type_gpu.py +202 -0
- sampledisco-0.1.0/src/sampledisco/preparation/multi_omics_batch_correction.py +111 -0
- sampledisco-0.1.0/src/sampledisco/preparation/multi_omics_cell_type_cpu.py +447 -0
- sampledisco-0.1.0/src/sampledisco/preparation/multi_omics_cell_type_gpu.py +482 -0
- sampledisco-0.1.0/src/sampledisco/preparation/multi_omics_glue.py +1045 -0
- sampledisco-0.1.0/src/sampledisco/preparation/multi_omics_merge.py +404 -0
- sampledisco-0.1.0/src/sampledisco/preparation/rna_preprocess_cpu.py +303 -0
- sampledisco-0.1.0/src/sampledisco/preparation/rna_preprocess_gpu.py +292 -0
- sampledisco-0.1.0/src/sampledisco/sample_association/__init__.py +0 -0
- sampledisco-0.1.0/src/sampledisco/sample_association/association.py +649 -0
- sampledisco-0.1.0/src/sampledisco/sample_clustering/HRA_VEC.py +15 -0
- sampledisco-0.1.0/src/sampledisco/sample_clustering/HRC_VEC.py +15 -0
- sampledisco-0.1.0/src/sampledisco/sample_clustering/NN.py +108 -0
- sampledisco-0.1.0/src/sampledisco/sample_clustering/RAISIN.py +847 -0
- sampledisco-0.1.0/src/sampledisco/sample_clustering/RAISIN_TEST.py +787 -0
- sampledisco-0.1.0/src/sampledisco/sample_clustering/UPGMA.py +84 -0
- sampledisco-0.1.0/src/sampledisco/sample_clustering/__init__.py +0 -0
- sampledisco-0.1.0/src/sampledisco/sample_clustering/cluster.py +117 -0
- sampledisco-0.1.0/src/sampledisco/sample_clustering/cluster_helper.py +136 -0
- sampledisco-0.1.0/src/sampledisco/sample_clustering/cluster_severity_reconcile.py +130 -0
- sampledisco-0.1.0/src/sampledisco/sample_clustering/consensus.py +222 -0
- sampledisco-0.1.0/src/sampledisco/sample_clustering/proportion_test.py +624 -0
- sampledisco-0.1.0/src/sampledisco/sample_clustering/tree_cut.py +90 -0
- sampledisco-0.1.0/src/sampledisco/sample_distance/ChiSquare.py +143 -0
- sampledisco-0.1.0/src/sampledisco/sample_distance/__init__.py +0 -0
- sampledisco-0.1.0/src/sampledisco/sample_distance/distance_test.py +147 -0
- sampledisco-0.1.0/src/sampledisco/sample_distance/jensenshannon.py +142 -0
- sampledisco-0.1.0/src/sampledisco/sample_distance/sample_distance.py +609 -0
- sampledisco-0.1.0/src/sampledisco/sample_embedding/__init__.py +73 -0
- sampledisco-0.1.0/src/sampledisco/sample_embedding/blocks.py +528 -0
- sampledisco-0.1.0/src/sampledisco/sample_embedding/sample_embedding.py +346 -0
- sampledisco-0.1.0/src/sampledisco/sample_embedding/sample_embedding_gpu.py +359 -0
- sampledisco-0.1.0/src/sampledisco/sample_trajectory/CCA.py +482 -0
- sampledisco-0.1.0/src/sampledisco/sample_trajectory/CCA_test.py +611 -0
- sampledisco-0.1.0/src/sampledisco/sample_trajectory/TSCAN.py +998 -0
- sampledisco-0.1.0/src/sampledisco/sample_trajectory/__init__.py +0 -0
- sampledisco-0.1.0/src/sampledisco/sample_trajectory/trajectory_DGE_visualization.py +1600 -0
- sampledisco-0.1.0/src/sampledisco/sample_trajectory/trajectory_diff_gene.py +1184 -0
- sampledisco-0.1.0/src/sampledisco/utils/Grouping.py +112 -0
- sampledisco-0.1.0/src/sampledisco/utils/__init__.py +0 -0
- sampledisco-0.1.0/src/sampledisco/utils/batch_regress.py +49 -0
- sampledisco-0.1.0/src/sampledisco/utils/imbalance_cell_type_handler.py +79 -0
- sampledisco-0.1.0/src/sampledisco/utils/inspector.py +391 -0
- sampledisco-0.1.0/src/sampledisco/utils/limma.py +54 -0
- sampledisco-0.1.0/src/sampledisco/utils/merge_sample_meta.py +102 -0
- sampledisco-0.1.0/src/sampledisco/utils/random_seed.py +36 -0
- sampledisco-0.1.0/src/sampledisco/utils/safe_save.py +132 -0
- sampledisco-0.1.0/src/sampledisco/utils/slim_adata.py +24 -0
- sampledisco-0.1.0/src/sampledisco/utils/subsample.py +117 -0
- sampledisco-0.1.0/src/sampledisco/utils/subsample_generator.py +418 -0
- sampledisco-0.1.0/src/sampledisco/utils/tf_idf.py +299 -0
- sampledisco-0.1.0/src/sampledisco/utils/unify_optimal.py +339 -0
- sampledisco-0.1.0/src/sampledisco/visualization/ATAC_visualization.py +359 -0
- sampledisco-0.1.0/src/sampledisco/visualization/DEG_visualization.py +359 -0
- sampledisco-0.1.0/src/sampledisco/visualization/__init__.py +0 -0
- sampledisco-0.1.0/src/sampledisco/visualization/multi_omics_visualization.py +1266 -0
- sampledisco-0.1.0/src/sampledisco/visualization/visualization_embedding.py +603 -0
- sampledisco-0.1.0/src/sampledisco/visualization/visualization_helper.py +439 -0
- sampledisco-0.1.0/src/sampledisco/visualization/visualization_other.py +121 -0
- sampledisco-0.1.0/src/sampledisco/wrapper/__init__.py +0 -0
- sampledisco-0.1.0/src/sampledisco/wrapper/atac_wrapper.py +234 -0
- sampledisco-0.1.0/src/sampledisco/wrapper/multiomics_wrapper.py +483 -0
- sampledisco-0.1.0/src/sampledisco/wrapper/rna_wrapper.py +233 -0
- sampledisco-0.1.0/src/sampledisco/wrapper/wrapper.py +1646 -0
- sampledisco-0.1.0/src/sampledisco.egg-info/PKG-INFO +254 -0
- sampledisco-0.1.0/src/sampledisco.egg-info/SOURCES.txt +93 -0
- sampledisco-0.1.0/src/sampledisco.egg-info/dependency_links.txt +1 -0
- sampledisco-0.1.0/src/sampledisco.egg-info/entry_points.txt +2 -0
- sampledisco-0.1.0/src/sampledisco.egg-info/requires.txt +34 -0
- sampledisco-0.1.0/src/sampledisco.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Harry Jiang, Hongkai Ji, and the SampleDisco authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sampledisco
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Cross-omics, cross-condition sample embedding for single-cell data (RNA + ATAC)
|
|
5
|
+
Author: Hongkai Ji
|
|
6
|
+
Author-email: Harry Jiang <hjiang55@jh.edu>
|
|
7
|
+
License: MIT License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2026 Harry Jiang, Hongkai Ji, and the SampleDisco authors
|
|
10
|
+
|
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
+
in the Software without restriction, including without limitation the rights
|
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
+
furnished to do so, subject to the following conditions:
|
|
17
|
+
|
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
|
19
|
+
copies or substantial portions of the Software.
|
|
20
|
+
|
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
+
SOFTWARE.
|
|
28
|
+
|
|
29
|
+
Project-URL: Homepage, https://github.com/J041120h/GenoDistance
|
|
30
|
+
Project-URL: Documentation, https://j041120h.github.io/SampleDisco_tutorial/
|
|
31
|
+
Project-URL: Repository, https://github.com/J041120h/GenoDistance
|
|
32
|
+
Keywords: single-cell,sample-embedding,multi-omics,scRNA-seq,scATAC-seq,bioinformatics
|
|
33
|
+
Classifier: Development Status :: 3 - Alpha
|
|
34
|
+
Classifier: Intended Audience :: Science/Research
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
38
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
39
|
+
Requires-Python: >=3.10
|
|
40
|
+
Description-Content-Type: text/markdown
|
|
41
|
+
License-File: LICENSE
|
|
42
|
+
Requires-Dist: numpy>=1.23
|
|
43
|
+
Requires-Dist: pandas>=1.5
|
|
44
|
+
Requires-Dist: scipy>=1.9
|
|
45
|
+
Requires-Dist: scikit-learn>=1.2
|
|
46
|
+
Requires-Dist: scikit-misc>=0.3
|
|
47
|
+
Requires-Dist: statsmodels>=0.13
|
|
48
|
+
Requires-Dist: patsy>=0.5
|
|
49
|
+
Requires-Dist: scanpy>=1.11
|
|
50
|
+
Requires-Dist: anndata>=0.10
|
|
51
|
+
Requires-Dist: muon>=0.1
|
|
52
|
+
Requires-Dist: harmony-pytorch>=0.1
|
|
53
|
+
Requires-Dist: harmonypy>=0.0.9
|
|
54
|
+
Requires-Dist: scglue>=0.3
|
|
55
|
+
Requires-Dist: torch>=2.0
|
|
56
|
+
Requires-Dist: python-igraph>=0.10
|
|
57
|
+
Requires-Dist: leidenalg>=0.9
|
|
58
|
+
Requires-Dist: networkx>=3.0
|
|
59
|
+
Requires-Dist: POT>=0.9
|
|
60
|
+
Requires-Dist: pygam>=0.9
|
|
61
|
+
Requires-Dist: numba>=0.57
|
|
62
|
+
Requires-Dist: matplotlib>=3.6
|
|
63
|
+
Requires-Dist: seaborn>=0.12
|
|
64
|
+
Requires-Dist: pyyaml>=6.0
|
|
65
|
+
Requires-Dist: psutil>=5.9
|
|
66
|
+
Requires-Dist: ruamel.yaml>=0.17
|
|
67
|
+
Requires-Dist: pybedtools>=0.9
|
|
68
|
+
Requires-Dist: pyensembl>=2.3
|
|
69
|
+
Requires-Dist: combat>=0.3
|
|
70
|
+
Requires-Dist: adjustText>=1.0
|
|
71
|
+
Provides-Extra: trees
|
|
72
|
+
Requires-Dist: biopython>=1.80; extra == "trees"
|
|
73
|
+
Requires-Dist: scikit-bio>=0.5; extra == "trees"
|
|
74
|
+
Requires-Dist: dendropy>=4.5; extra == "trees"
|
|
75
|
+
Dynamic: license-file
|
|
76
|
+
|
|
77
|
+
# SampleDisco
|
|
78
|
+
|
|
79
|
+
A cross-omics, cross-condition **sample embedding** tool for single-cell data.
|
|
80
|
+
|
|
81
|
+
SampleDisco takes a cell-level embedding (from any standard scRNA / scATAC / multi-omics integration method) and lifts it to a **sample-level embedding** that captures both cell-type composition and the per-cell-type state of each sample. Every downstream analysis — sample-to-sample distance, clustering, trajectory inference, phenotype association — then runs on that single shared sample embedding, regardless of modality.
|
|
82
|
+
|
|
83
|
+
Paper draft: [`SampleDisco_Draft-11.pdf`](../SampleDisco_Draft-11.pdf)
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## What the method does
|
|
88
|
+
|
|
89
|
+
For each modality (RNA, ATAC, or integrated multi-omics) the pipeline produces two cell-level views:
|
|
90
|
+
|
|
91
|
+
| Key | Role | Source |
|
|
92
|
+
|---|---|---|
|
|
93
|
+
| **`Z_clust`** | sample-removed embedding — used for clustering and composition blocks | Harmony (single-omics) / Harmony post-pass on scGLUE (multi-omics) |
|
|
94
|
+
| **`Z_cmd`** | sample-preserved embedding — used for the counterfactual displacement (CMD) block | second Harmony pass (single-omics) / scGLUE primary output (multi-omics) |
|
|
95
|
+
|
|
96
|
+
It then assembles **four blocks** per sample (or per sample × modality for multi-omics):
|
|
97
|
+
|
|
98
|
+
1. **A1** — one-hot cell-type composition
|
|
99
|
+
2. **A2** — soft k-means composition at K_med (≈120)
|
|
100
|
+
3. **A3** — soft k-means composition at K_fine (≈300)
|
|
101
|
+
4. **CMD** — leave-one-out cell-type-resolved displacement on `Z_cmd`
|
|
102
|
+
|
|
103
|
+
The four blocks are inverse-variance weighted, Frobenius-stacked, PCA-reduced to 10 dimensions, and Harmony-corrected at the sample level. The result is stored as `adata.uns['X_DR_sample']` and feeds every downstream module.
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## Repository layout
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
code/
|
|
111
|
+
├── SampleDisc.py # CLI entry point (simple or complex mode)
|
|
112
|
+
├── config/ # 9 YAML configs covering covid / blood / eye / heart / ENCODE / tabula / long_covid / unpaired / default
|
|
113
|
+
├── wrapper/ # Orchestration
|
|
114
|
+
│ ├── wrapper.py # Master wrapper; gates RNA + ATAC + multiomics + shared downstream
|
|
115
|
+
│ ├── rna_wrapper.py
|
|
116
|
+
│ ├── atac_wrapper.py
|
|
117
|
+
│ └── multiomics_wrapper.py
|
|
118
|
+
├── preparation/ # Preprocessing
|
|
119
|
+
│ ├── rna_preprocess_{cpu,gpu}.py # QC → HVG → PCA → dual Harmony → Z_clust + Z_cmd
|
|
120
|
+
│ ├── atac_preprocess_{cpu,gpu}.py # QC → TF-IDF → HVF → LSI → dual Harmony → Z_clust + Z_cmd
|
|
121
|
+
│ ├── cell_type_{cpu,gpu}.py # Leiden clustering on Z_clust (RNA or ATAC)
|
|
122
|
+
│ ├── ATAC_cell_type{,_gpu}.py # ATAC-specific cell typing variants
|
|
123
|
+
│ ├── multi_omics_glue.py # scGLUE integration (cross-modality VAE + guidance graph)
|
|
124
|
+
│ ├── multi_omics_batch_correction.py # Harmony post-pass on X_glue → Z_clust
|
|
125
|
+
│ ├── multi_omics_merge.py # post-GLUE merge + per-modality preprocess/slimming
|
|
126
|
+
│ └── multi_omics_cell_type_{cpu,gpu}.py # RNA-Leiden + k-NN label transfer to ATAC
|
|
127
|
+
├── sample_embedding/ # Core method
|
|
128
|
+
│ ├── blocks.py # composition, CMD, weighting, Frobenius stack, final PCA + Harmony
|
|
129
|
+
│ ├── sample_embedding.py # CPU pipeline
|
|
130
|
+
│ └── sample_embedding_gpu.py # GPU pipeline (cuML + cupy)
|
|
131
|
+
├── parameter_selection/
|
|
132
|
+
│ └── autotune.py # Bayesian GP sweep over CMD α; adaptive proxy ensemble
|
|
133
|
+
├── sample_distance/ # Pairwise sample distances (DR / EMD / chi-square / JS)
|
|
134
|
+
├── sample_clustering/ # Hierarchical (HRA / HRC / NN / UPGMA / consensus), K-means, proportion test, RAISIN
|
|
135
|
+
├── sample_trajectory/ # CCA (supervised) and TSCAN (unsupervised) + GAM-based trajectory DGE
|
|
136
|
+
├── sample_association/ # Per-PC variance explained vs sample-level covariates (permutation FDR)
|
|
137
|
+
├── visualization/ # Embedding plots, dendrograms, DGE volcanos, modality-aware multi-omics scatters
|
|
138
|
+
├── utils/ # Shared helpers: seed, safe h5ad I/O, limma, TF-IDF, batch regress, Grouping
|
|
139
|
+
├── gene_activity/ # ATAC peak → gene activity inference + RNA-ATAC validation
|
|
140
|
+
└── claude/ # Active one-off run scripts (rerun launchers, monitored SE, parameter sweeps)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## Usage
|
|
146
|
+
|
|
147
|
+
### Complex mode (recommended) — YAML-driven
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
python SampleDisc.py -m complex --config config/config.yaml
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
The YAML drives every flag and parameter for all three pipelines:
|
|
154
|
+
|
|
155
|
+
- **Pipeline gates** (top-level): `run_rna_pipeline`, `run_atac_pipeline`, `run_multiomics_pipeline`
|
|
156
|
+
- **Per-modality phase gates** (Phase 1): `*_preprocessing`, `*_cell_type_cluster`, `*_derive_sample_embedding`
|
|
157
|
+
- **Per-modality downstream gates** (Phase 2): `*_sample_distance_calculation`, `*_trajectory_analysis`, `*_trajectory_dge`, `*_sample_cluster`, `*_proportion_test`, `*_cluster_dge`, `*_visualize_data`, `*_dimension_association_analysis`
|
|
158
|
+
- **Multi-omics-specific**: `multiomics_run_glue_*`, `multiomics_treat_sample_as_batch`, `multiomics_run_glue_twice_for_sample_removal`
|
|
159
|
+
|
|
160
|
+
The 9 ready-to-use configs in `config/` are point-in-time snapshots for the datasets used in the paper; copy one and adjust paths / column names for your own data.
|
|
161
|
+
|
|
162
|
+
### Simple mode — one input file, defaults everywhere
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
python SampleDisc.py -m simple -c <count_data.h5ad> -o <output_dir>
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Inputs
|
|
171
|
+
|
|
172
|
+
A standard scanpy AnnData file with at minimum:
|
|
173
|
+
- `.X` — count matrix (genes for RNA, peaks for ATAC)
|
|
174
|
+
- `.obs['sample']` — sample column (required)
|
|
175
|
+
- Optional: `.obs['batch']`, `.obs['cell_type']`, and a sample-level metadata file (CSV) to merge
|
|
176
|
+
|
|
177
|
+
For multi-omics, the pipeline takes two separate h5ads (RNA + ATAC) and integrates them via scGLUE; samples may be **paired** (1:1 cell correspondence) or **unpaired**.
|
|
178
|
+
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
## Outputs (under `output_dir`)
|
|
182
|
+
|
|
183
|
+
```
|
|
184
|
+
<output_dir>/
|
|
185
|
+
├── rna/
|
|
186
|
+
│ ├── preprocess/adata_preprocessed.h5ad
|
|
187
|
+
│ ├── sample_embedding/sample_embedding.csv
|
|
188
|
+
│ ├── Sample_distance/{cosine,correlation}/*
|
|
189
|
+
│ ├── CCA/ or TSCAN/ # whichever trajectory mode
|
|
190
|
+
│ ├── trajectoryDEG/
|
|
191
|
+
│ ├── sample_cluster/{kmeans_*,proportion_test/}
|
|
192
|
+
│ ├── sample_association/variance_explained_sample.csv + figures/
|
|
193
|
+
│ └── visualization/*.png
|
|
194
|
+
├── atac/ (parallel structure)
|
|
195
|
+
├── multiomics/
|
|
196
|
+
│ ├── integration/glue/{rna-pp,atac-pp,guidance.graphml.gz}
|
|
197
|
+
│ ├── preprocess/adata_sample.h5ad # post-GLUE merged adata with Z_clust + Z_cmd
|
|
198
|
+
│ ├── sample_embedding/sample_embedding.csv
|
|
199
|
+
│ └── (same downstream subdirs as single-omics)
|
|
200
|
+
└── sys_log/main_process_status.json # which stages completed
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Installation
|
|
206
|
+
|
|
207
|
+
SampleDisco is **one package**. The CPU install is pip-only; **GPU acceleration is
|
|
208
|
+
activated simply by installing the GPU libraries separately** — the same package
|
|
209
|
+
detects and uses them at runtime. There is no separate "GPU build" of SampleDisco.
|
|
210
|
+
|
|
211
|
+
### 1. Core install (CPU)
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
pip install sampledisco # once published — or `pip install -e .` from a clone
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### 2. System prerequisite — bedtools
|
|
218
|
+
|
|
219
|
+
scGLUE (the multi-omics integrator) calls the `bedtools` binary, which pip cannot
|
|
220
|
+
provide:
|
|
221
|
+
|
|
222
|
+
```bash
|
|
223
|
+
conda install -c bioconda bedtools
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### 3. GPU acceleration (optional, install yourself)
|
|
227
|
+
|
|
228
|
+
The GPU functions (RAPIDS-accelerated normalization, Harmony, k-means / PCA, Leiden,
|
|
229
|
+
scGLUE training) turn on **only when the RAPIDS stack is present** in your
|
|
230
|
+
environment. RAPIDS is CUDA-driver-specific and conda-only, so you install it
|
|
231
|
+
separately, matching your driver (the pins below target a CUDA-12.5 driver such as
|
|
232
|
+
the cluster's GPU nodes):
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
conda install -c rapidsai -c conda-forge -c nvidia \
|
|
236
|
+
cuml=24.12 cudf=24.12 cugraph=24.12 rmm=24.12 cuvs=24.12 cupy=13 cuda-version=12.5
|
|
237
|
+
pip install rapids-singlecell==0.13.1 --no-deps
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
Then set `use_gpu: true` in your config. **You do not reinstall SampleDisco** — once
|
|
241
|
+
those packages are importable the GPU paths activate automatically; if they are
|
|
242
|
+
missing or the driver is too old, SampleDisco falls back to CPU equivalents
|
|
243
|
+
(`harmonypy` / linear regression, scikit-learn k-means, PyTorch CPU).
|
|
244
|
+
|
|
245
|
+
### One-command environments (recommended)
|
|
246
|
+
|
|
247
|
+
For a fully reproducible environment (including bedtools), use the provided conda
|
|
248
|
+
files instead of the manual steps above — see `INSTALL.md` for the driver/version
|
|
249
|
+
notes:
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
conda env create -f environment-cpu.yml # CPU
|
|
253
|
+
conda env create -f environment-gpu.yml # GPU (RAPIDS 24.12)
|
|
254
|
+
```
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# SampleDisco
|
|
2
|
+
|
|
3
|
+
A cross-omics, cross-condition **sample embedding** tool for single-cell data.
|
|
4
|
+
|
|
5
|
+
SampleDisco takes a cell-level embedding (from any standard scRNA / scATAC / multi-omics integration method) and lifts it to a **sample-level embedding** that captures both cell-type composition and the per-cell-type state of each sample. Every downstream analysis — sample-to-sample distance, clustering, trajectory inference, phenotype association — then runs on that single shared sample embedding, regardless of modality.
|
|
6
|
+
|
|
7
|
+
Paper draft: [`SampleDisco_Draft-11.pdf`](../SampleDisco_Draft-11.pdf)
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## What the method does
|
|
12
|
+
|
|
13
|
+
For each modality (RNA, ATAC, or integrated multi-omics) the pipeline produces two cell-level views:
|
|
14
|
+
|
|
15
|
+
| Key | Role | Source |
|
|
16
|
+
|---|---|---|
|
|
17
|
+
| **`Z_clust`** | sample-removed embedding — used for clustering and composition blocks | Harmony (single-omics) / Harmony post-pass on scGLUE (multi-omics) |
|
|
18
|
+
| **`Z_cmd`** | sample-preserved embedding — used for the counterfactual displacement (CMD) block | second Harmony pass (single-omics) / scGLUE primary output (multi-omics) |
|
|
19
|
+
|
|
20
|
+
It then assembles **four blocks** per sample (or per sample × modality for multi-omics):
|
|
21
|
+
|
|
22
|
+
1. **A1** — one-hot cell-type composition
|
|
23
|
+
2. **A2** — soft k-means composition at K_med (≈120)
|
|
24
|
+
3. **A3** — soft k-means composition at K_fine (≈300)
|
|
25
|
+
4. **CMD** — leave-one-out cell-type-resolved displacement on `Z_cmd`
|
|
26
|
+
|
|
27
|
+
The four blocks are inverse-variance weighted, Frobenius-stacked, PCA-reduced to 10 dimensions, and Harmony-corrected at the sample level. The result is stored as `adata.uns['X_DR_sample']` and feeds every downstream module.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Repository layout
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
code/
|
|
35
|
+
├── SampleDisc.py # CLI entry point (simple or complex mode)
|
|
36
|
+
├── config/ # 9 YAML configs covering covid / blood / eye / heart / ENCODE / tabula / long_covid / unpaired / default
|
|
37
|
+
├── wrapper/ # Orchestration
|
|
38
|
+
│ ├── wrapper.py # Master wrapper; gates RNA + ATAC + multiomics + shared downstream
|
|
39
|
+
│ ├── rna_wrapper.py
|
|
40
|
+
│ ├── atac_wrapper.py
|
|
41
|
+
│ └── multiomics_wrapper.py
|
|
42
|
+
├── preparation/ # Preprocessing
|
|
43
|
+
│ ├── rna_preprocess_{cpu,gpu}.py # QC → HVG → PCA → dual Harmony → Z_clust + Z_cmd
|
|
44
|
+
│ ├── atac_preprocess_{cpu,gpu}.py # QC → TF-IDF → HVF → LSI → dual Harmony → Z_clust + Z_cmd
|
|
45
|
+
│ ├── cell_type_{cpu,gpu}.py # Leiden clustering on Z_clust (RNA or ATAC)
|
|
46
|
+
│ ├── ATAC_cell_type{,_gpu}.py # ATAC-specific cell typing variants
|
|
47
|
+
│ ├── multi_omics_glue.py # scGLUE integration (cross-modality VAE + guidance graph)
|
|
48
|
+
│ ├── multi_omics_batch_correction.py # Harmony post-pass on X_glue → Z_clust
|
|
49
|
+
│ ├── multi_omics_merge.py # post-GLUE merge + per-modality preprocess/slimming
|
|
50
|
+
│ └── multi_omics_cell_type_{cpu,gpu}.py # RNA-Leiden + k-NN label transfer to ATAC
|
|
51
|
+
├── sample_embedding/ # Core method
|
|
52
|
+
│ ├── blocks.py # composition, CMD, weighting, Frobenius stack, final PCA + Harmony
|
|
53
|
+
│ ├── sample_embedding.py # CPU pipeline
|
|
54
|
+
│ └── sample_embedding_gpu.py # GPU pipeline (cuML + cupy)
|
|
55
|
+
├── parameter_selection/
|
|
56
|
+
│ └── autotune.py # Bayesian GP sweep over CMD α; adaptive proxy ensemble
|
|
57
|
+
├── sample_distance/ # Pairwise sample distances (DR / EMD / chi-square / JS)
|
|
58
|
+
├── sample_clustering/ # Hierarchical (HRA / HRC / NN / UPGMA / consensus), K-means, proportion test, RAISIN
|
|
59
|
+
├── sample_trajectory/ # CCA (supervised) and TSCAN (unsupervised) + GAM-based trajectory DGE
|
|
60
|
+
├── sample_association/ # Per-PC variance explained vs sample-level covariates (permutation FDR)
|
|
61
|
+
├── visualization/ # Embedding plots, dendrograms, DGE volcanos, modality-aware multi-omics scatters
|
|
62
|
+
├── utils/ # Shared helpers: seed, safe h5ad I/O, limma, TF-IDF, batch regress, Grouping
|
|
63
|
+
├── gene_activity/ # ATAC peak → gene activity inference + RNA-ATAC validation
|
|
64
|
+
└── claude/ # Active one-off run scripts (rerun launchers, monitored SE, parameter sweeps)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Usage
|
|
70
|
+
|
|
71
|
+
### Complex mode (recommended) — YAML-driven
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
python SampleDisc.py -m complex --config config/config.yaml
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
The YAML drives every flag and parameter for all three pipelines:
|
|
78
|
+
|
|
79
|
+
- **Pipeline gates** (top-level): `run_rna_pipeline`, `run_atac_pipeline`, `run_multiomics_pipeline`
|
|
80
|
+
- **Per-modality phase gates** (Phase 1): `*_preprocessing`, `*_cell_type_cluster`, `*_derive_sample_embedding`
|
|
81
|
+
- **Per-modality downstream gates** (Phase 2): `*_sample_distance_calculation`, `*_trajectory_analysis`, `*_trajectory_dge`, `*_sample_cluster`, `*_proportion_test`, `*_cluster_dge`, `*_visualize_data`, `*_dimension_association_analysis`
|
|
82
|
+
- **Multi-omics-specific**: `multiomics_run_glue_*`, `multiomics_treat_sample_as_batch`, `multiomics_run_glue_twice_for_sample_removal`
|
|
83
|
+
|
|
84
|
+
The 9 ready-to-use configs in `config/` are point-in-time snapshots for the datasets used in the paper; copy one and adjust paths / column names for your own data.
|
|
85
|
+
|
|
86
|
+
### Simple mode — one input file, defaults everywhere
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
python SampleDisc.py -m simple -c <count_data.h5ad> -o <output_dir>
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Inputs
|
|
95
|
+
|
|
96
|
+
A standard scanpy AnnData file with at minimum:
|
|
97
|
+
- `.X` — count matrix (genes for RNA, peaks for ATAC)
|
|
98
|
+
- `.obs['sample']` — sample column (required)
|
|
99
|
+
- Optional: `.obs['batch']`, `.obs['cell_type']`, and a sample-level metadata file (CSV) to merge
|
|
100
|
+
|
|
101
|
+
For multi-omics, the pipeline takes two separate h5ads (RNA + ATAC) and integrates them via scGLUE; samples may be **paired** (1:1 cell correspondence) or **unpaired**.
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Outputs (under `output_dir`)
|
|
106
|
+
|
|
107
|
+
```
|
|
108
|
+
<output_dir>/
|
|
109
|
+
├── rna/
|
|
110
|
+
│ ├── preprocess/adata_preprocessed.h5ad
|
|
111
|
+
│ ├── sample_embedding/sample_embedding.csv
|
|
112
|
+
│ ├── Sample_distance/{cosine,correlation}/*
|
|
113
|
+
│ ├── CCA/ or TSCAN/ # whichever trajectory mode
|
|
114
|
+
│ ├── trajectoryDEG/
|
|
115
|
+
│ ├── sample_cluster/{kmeans_*,proportion_test/}
|
|
116
|
+
│ ├── sample_association/variance_explained_sample.csv + figures/
|
|
117
|
+
│ └── visualization/*.png
|
|
118
|
+
├── atac/ (parallel structure)
|
|
119
|
+
├── multiomics/
|
|
120
|
+
│ ├── integration/glue/{rna-pp,atac-pp,guidance.graphml.gz}
|
|
121
|
+
│ ├── preprocess/adata_sample.h5ad # post-GLUE merged adata with Z_clust + Z_cmd
|
|
122
|
+
│ ├── sample_embedding/sample_embedding.csv
|
|
123
|
+
│ └── (same downstream subdirs as single-omics)
|
|
124
|
+
└── sys_log/main_process_status.json # which stages completed
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## Installation
|
|
130
|
+
|
|
131
|
+
SampleDisco is **one package**. The CPU install is pip-only; **GPU acceleration is
|
|
132
|
+
activated simply by installing the GPU libraries separately** — the same package
|
|
133
|
+
detects and uses them at runtime. There is no separate "GPU build" of SampleDisco.
|
|
134
|
+
|
|
135
|
+
### 1. Core install (CPU)
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
pip install sampledisco # once published — or `pip install -e .` from a clone
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### 2. System prerequisite — bedtools
|
|
142
|
+
|
|
143
|
+
scGLUE (the multi-omics integrator) calls the `bedtools` binary, which pip cannot
|
|
144
|
+
provide:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
conda install -c bioconda bedtools
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### 3. GPU acceleration (optional, install yourself)
|
|
151
|
+
|
|
152
|
+
The GPU functions (RAPIDS-accelerated normalization, Harmony, k-means / PCA, Leiden,
|
|
153
|
+
scGLUE training) turn on **only when the RAPIDS stack is present** in your
|
|
154
|
+
environment. RAPIDS is CUDA-driver-specific and conda-only, so you install it
|
|
155
|
+
separately, matching your driver (the pins below target a CUDA-12.5 driver such as
|
|
156
|
+
the cluster's GPU nodes):
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
conda install -c rapidsai -c conda-forge -c nvidia \
|
|
160
|
+
cuml=24.12 cudf=24.12 cugraph=24.12 rmm=24.12 cuvs=24.12 cupy=13 cuda-version=12.5
|
|
161
|
+
pip install rapids-singlecell==0.13.1 --no-deps
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
Then set `use_gpu: true` in your config. **You do not reinstall SampleDisco** — once
|
|
165
|
+
those packages are importable the GPU paths activate automatically; if they are
|
|
166
|
+
missing or the driver is too old, SampleDisco falls back to CPU equivalents
|
|
167
|
+
(`harmonypy` / linear regression, scikit-learn k-means, PyTorch CPU).
|
|
168
|
+
|
|
169
|
+
### One-command environments (recommended)
|
|
170
|
+
|
|
171
|
+
For a fully reproducible environment (including bedtools), use the provided conda
|
|
172
|
+
files instead of the manual steps above — see `INSTALL.md` for the driver/version
|
|
173
|
+
notes:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
conda env create -f environment-cpu.yml # CPU
|
|
177
|
+
conda env create -f environment-gpu.yml # GPU (RAPIDS 24.12)
|
|
178
|
+
```
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Packaging metadata for SampleDisco.
|
|
2
|
+
# Installable as a normal package (src/ layout): `pip install -e .` exposes the
|
|
3
|
+
# `sampledisco` import and the `sampledisco` console command from anywhere — no
|
|
4
|
+
# more `cd code && python SampleDisc.py`. Core deps are declared in
|
|
5
|
+
# requirements.txt; optional GPU acceleration is in requirements-gpu.txt.
|
|
6
|
+
[build-system]
|
|
7
|
+
requires = ["setuptools>=61"]
|
|
8
|
+
build-backend = "setuptools.build_meta"
|
|
9
|
+
|
|
10
|
+
[project]
|
|
11
|
+
name = "sampledisco"
|
|
12
|
+
version = "0.1.0"
|
|
13
|
+
description = "Cross-omics, cross-condition sample embedding for single-cell data (RNA + ATAC)"
|
|
14
|
+
readme = "README.md"
|
|
15
|
+
requires-python = ">=3.10"
|
|
16
|
+
license = { file = "LICENSE" }
|
|
17
|
+
authors = [
|
|
18
|
+
{ name = "Harry Jiang", email = "hjiang55@jh.edu" },
|
|
19
|
+
{ name = "Hongkai Ji" },
|
|
20
|
+
]
|
|
21
|
+
keywords = ["single-cell", "sample-embedding", "multi-omics", "scRNA-seq", "scATAC-seq", "bioinformatics"]
|
|
22
|
+
classifiers = [
|
|
23
|
+
"Development Status :: 3 - Alpha",
|
|
24
|
+
"Intended Audience :: Science/Research",
|
|
25
|
+
"License :: OSI Approved :: MIT License",
|
|
26
|
+
"Operating System :: POSIX :: Linux",
|
|
27
|
+
"Programming Language :: Python :: 3.10",
|
|
28
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
29
|
+
]
|
|
30
|
+
dynamic = ["dependencies"]
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Homepage = "https://github.com/J041120h/GenoDistance"
|
|
34
|
+
Documentation = "https://j041120h.github.io/SampleDisco_tutorial/"
|
|
35
|
+
Repository = "https://github.com/J041120h/GenoDistance"
|
|
36
|
+
|
|
37
|
+
[project.scripts]
|
|
38
|
+
sampledisco = "sampledisco.cli:main"
|
|
39
|
+
|
|
40
|
+
[project.optional-dependencies]
|
|
41
|
+
# Phylogenetic-tree sample clustering (NN / UPGMA / consensus). Off the default
|
|
42
|
+
# wrapper path, so its heavy deps are optional: pip install -e '.[trees]'
|
|
43
|
+
trees = ["biopython>=1.80", "scikit-bio>=0.5", "dendropy>=4.5"]
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.packages.find]
|
|
46
|
+
where = ["src"]
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.dynamic]
|
|
49
|
+
dependencies = { file = ["requirements.txt"] }
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# SampleDisco core (CPU) dependencies.
|
|
2
|
+
# GPU acceleration is optional and environment/driver-specific: see
|
|
3
|
+
# requirements-gpu.txt and .claude/CLAUDE.md.
|
|
4
|
+
numpy>=1.23
|
|
5
|
+
pandas>=1.5
|
|
6
|
+
scipy>=1.9
|
|
7
|
+
scikit-learn>=1.2
|
|
8
|
+
scikit-misc>=0.3
|
|
9
|
+
statsmodels>=0.13
|
|
10
|
+
patsy>=0.5
|
|
11
|
+
scanpy>=1.11
|
|
12
|
+
anndata>=0.10
|
|
13
|
+
muon>=0.1
|
|
14
|
+
harmony-pytorch>=0.1
|
|
15
|
+
harmonypy>=0.0.9
|
|
16
|
+
scglue>=0.3
|
|
17
|
+
torch>=2.0
|
|
18
|
+
python-igraph>=0.10
|
|
19
|
+
leidenalg>=0.9
|
|
20
|
+
networkx>=3.0
|
|
21
|
+
POT>=0.9
|
|
22
|
+
pygam>=0.9
|
|
23
|
+
numba>=0.57
|
|
24
|
+
matplotlib>=3.6
|
|
25
|
+
seaborn>=0.12
|
|
26
|
+
pyyaml>=6.0
|
|
27
|
+
psutil>=5.9
|
|
28
|
+
ruamel.yaml>=0.17
|
|
29
|
+
pybedtools>=0.9
|
|
30
|
+
|
|
31
|
+
# multi-omics gene annotation (scGLUE path, imported at module top)
|
|
32
|
+
pyensembl>=2.3
|
|
33
|
+
# cluster-DGE (RAISIN) — both on the live downstream path
|
|
34
|
+
combat>=0.3
|
|
35
|
+
adjustText>=1.0
|
|
36
|
+
# NOTE: scGLUE must be installed from PyPI (`pip install scglue`), NOT bioconda —
|
|
37
|
+
# the bioconda recipe caps numpy<1.22 and breaks the rest of the stack.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""SampleDisco — cross-omics, cross-condition sample embedding for single-cell data.
|
|
2
|
+
|
|
3
|
+
Public API (imported lazily so ``import sampledisco`` stays light and does not
|
|
4
|
+
pull scanpy / torch / scGLUE until you actually call into the pipeline):
|
|
5
|
+
|
|
6
|
+
import sampledisco
|
|
7
|
+
sampledisco.wrapper(...) # full pipeline (RNA / ATAC / multi-omics)
|
|
8
|
+
sampledisco.compute_sample_embedding(...) # the core method only
|
|
9
|
+
|
|
10
|
+
The CLI entry point is ``sampledisco --mode complex --config <yaml>`` (see ``sampledisco.cli``).
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
__version__ = "0.1.0"
|
|
15
|
+
|
|
16
|
+
__all__ = ["wrapper", "compute_sample_embedding"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def __getattr__(name: str):
    """Resolve the public API lazily on first attribute access (PEP 562).

    ``wrapper`` is imported from ``sampledisco.wrapper.wrapper`` and
    ``compute_sample_embedding`` from ``sampledisco.sample_embedding`` only
    when the attribute is actually requested, so a bare ``import sampledisco``
    does not trigger those imports.

    Raises:
        AttributeError: for any other attribute name.
    """
    if name == "compute_sample_embedding":
        from sampledisco.sample_embedding import compute_sample_embedding as resolved
    elif name == "wrapper":
        from sampledisco.wrapper.wrapper import wrapper as resolved
    else:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return resolved
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
import traceback
|
|
4
|
+
import yaml
|
|
5
|
+
import os
|
|
6
|
+
import inspect
|
|
7
|
+
from sampledisco.wrapper.wrapper import wrapper
|
|
8
|
+
|
|
9
|
+
def parse_args():
    """Parse the command line of the ``sampledisco`` console command.

    ``-m/--mode`` is mandatory and must be ``simple`` or ``complex``. The
    remaining options are all optional at the parser level; which of them are
    actually needed depends on the mode and is checked by the caller.

    Returns:
        argparse.Namespace with ``mode``, ``count_data``, ``sample_meta_data``,
        ``output_directory`` and ``config`` attributes.
    """
    cli = argparse.ArgumentParser(description="Run the data processing wrapper.")

    # (flags, extra keyword arguments) in the order they appear in --help.
    option_specs = (
        (
            ("-m", "--mode"),
            {
                "required": True,
                "choices": ["simple", "complex"],
                "help": "Run mode. Choose 'simple' or 'complex'.",
            },
        ),
        # Simple mode args
        (("-c", "--count_data"), {"help": "Path to count data file"}),
        (
            ("-s", "--sample_meta_data"),
            {"help": "(Optional) Path to sample metadata file"},
        ),
        (("-o", "--output_directory"), {"help": "Path to output directory"}),
        # Complex mode args
        (("--config",), {"help": "Path to YAML config file"}),
    )
    for flags, extra in option_specs:
        cli.add_argument(*flags, type=str, **extra)

    return cli.parse_args()
|
|
24
|
+
|
|
25
|
+
def load_config(config_path):
    """Read the YAML file at ``config_path`` and return its parsed content.

    If the path does not exist, an error message is written to stderr and the
    process exits with status 1. Parsing uses ``yaml.safe_load``.
    """
    if os.path.exists(config_path):
        with open(config_path, 'r') as handle:
            return yaml.safe_load(handle)

    print(f"Error: Config file '{config_path}' does not exist.", file=sys.stderr)
    sys.exit(1)
|
|
31
|
+
|
|
32
|
+
def validate_config(config, func):
    """Check that ``config`` can be passed to ``func`` as ``func(**config)``.

    Args:
        config: Mapping of parameter names to values (e.g. a parsed YAML file).
        func: Callable whose signature the config is validated against.

    Raises:
        ValueError: if ``config`` is not a mapping (an empty YAML file parses
            to ``None``), if it contains a key that ``func`` does not accept,
            or if it omits a parameter of ``func`` that has no default value.
    """
    from collections.abc import Mapping

    if not isinstance(config, Mapping):
        raise ValueError(
            "Config must be a mapping of parameter names to values, "
            f"got {type(config).__name__}"
        )

    signature_params = inspect.signature(func).parameters
    variadic_kinds = (
        inspect.Parameter.VAR_POSITIONAL,
        inspect.Parameter.VAR_KEYWORD,
    )
    # *args / **kwargs are not names a config can (or must) supply.
    named_params = {
        name: param
        for name, param in signature_params.items()
        if param.kind not in variadic_kinds
    }
    accepts_extra_keywords = any(
        param.kind is inspect.Parameter.VAR_KEYWORD
        for param in signature_params.values()
    )

    # Unknown keys are only an error when func has no **kwargs to absorb them.
    if not accepts_extra_keywords:
        for key in config:
            if key not in named_params:
                raise ValueError(f"Unexpected parameter in config: '{key}'")

    # Only parameters without a default are genuinely required.
    for name, param in named_params.items():
        if param.default is inspect.Parameter.empty and name not in config:
            raise ValueError(f"Missing required parameter in config: '{name}'")
|
|
40
|
+
|
|
41
|
+
def main():
    """Run the pipeline from the command line.

    In ``simple`` mode ``-c`` and ``-o`` are mandatory; ``wrapper`` is called
    with the count data, the optional sample metadata and the output
    directory. In ``complex`` mode ``--config`` is mandatory; the YAML file is
    loaded, validated against the signature of ``wrapper`` and expanded into
    keyword arguments. Usage errors, and any exception raised while validating
    or running in complex mode, are reported on stderr and end the process
    with exit status 1.
    """
    args = parse_args()

    if args.mode == "simple":
        if not (args.count_data and args.output_directory):
            print("Error: In 'simple' mode, -c and -o must be provided.", file=sys.stderr)
            sys.exit(1)

        if args.sample_meta_data:
            wrapper(args.count_data, args.sample_meta_data, args.output_directory)
        else:
            wrapper(args.count_data, output_directory=args.output_directory)
        return

    if args.mode == "complex":
        if not args.config:
            print("Error: In 'complex' mode, --config must be provided.", file=sys.stderr)
            sys.exit(1)

        config = load_config(args.config)

        try:
            validate_config(config, wrapper)
            wrapper(**config)
        except Exception as e:
            # Top-level boundary: show the message first, then the full
            # traceback, and signal failure through the exit status.
            print(f"Error: {e}", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
            sys.exit(1)


# Allow running the module directly as a script.
if __name__ == "__main__":
    main()
|