sampledisco 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. sampledisco-0.1.0/LICENSE +21 -0
  2. sampledisco-0.1.0/PKG-INFO +254 -0
  3. sampledisco-0.1.0/README.md +178 -0
  4. sampledisco-0.1.0/pyproject.toml +49 -0
  5. sampledisco-0.1.0/requirements.txt +37 -0
  6. sampledisco-0.1.0/setup.cfg +4 -0
  7. sampledisco-0.1.0/src/sampledisco/__init__.py +27 -0
  8. sampledisco-0.1.0/src/sampledisco/cli.py +70 -0
  9. sampledisco-0.1.0/src/sampledisco/gene_activity/ATAC_ArchR.py +745 -0
  10. sampledisco-0.1.0/src/sampledisco/gene_activity/ATAC_RNA_harmony.py +1107 -0
  11. sampledisco-0.1.0/src/sampledisco/gene_activity/ATAC_gene_activity.py +547 -0
  12. sampledisco-0.1.0/src/sampledisco/gene_activity/ATAC_peak_annotation.py +441 -0
  13. sampledisco-0.1.0/src/sampledisco/gene_activity/RNA_name_convertor.py +330 -0
  14. sampledisco-0.1.0/src/sampledisco/gene_activity/__init__.py +0 -0
  15. sampledisco-0.1.0/src/sampledisco/gene_activity/pseudo_correlation.py +1112 -0
  16. sampledisco-0.1.0/src/sampledisco/gene_activity/test_pseudo_correlation.py +694 -0
  17. sampledisco-0.1.0/src/sampledisco/gene_activity/validation.py +913 -0
  18. sampledisco-0.1.0/src/sampledisco/parameter_selection/__init__.py +0 -0
  19. sampledisco-0.1.0/src/sampledisco/parameter_selection/autotune.py +877 -0
  20. sampledisco-0.1.0/src/sampledisco/preparation/ATAC_cell_type.py +493 -0
  21. sampledisco-0.1.0/src/sampledisco/preparation/ATAC_cell_type_gpu.py +425 -0
  22. sampledisco-0.1.0/src/sampledisco/preparation/__init__.py +0 -0
  23. sampledisco-0.1.0/src/sampledisco/preparation/atac_preprocess_cpu.py +316 -0
  24. sampledisco-0.1.0/src/sampledisco/preparation/atac_preprocess_gpu.py +306 -0
  25. sampledisco-0.1.0/src/sampledisco/preparation/cell_type_cpu.py +189 -0
  26. sampledisco-0.1.0/src/sampledisco/preparation/cell_type_gpu.py +202 -0
  27. sampledisco-0.1.0/src/sampledisco/preparation/multi_omics_batch_correction.py +111 -0
  28. sampledisco-0.1.0/src/sampledisco/preparation/multi_omics_cell_type_cpu.py +447 -0
  29. sampledisco-0.1.0/src/sampledisco/preparation/multi_omics_cell_type_gpu.py +482 -0
  30. sampledisco-0.1.0/src/sampledisco/preparation/multi_omics_glue.py +1045 -0
  31. sampledisco-0.1.0/src/sampledisco/preparation/multi_omics_merge.py +404 -0
  32. sampledisco-0.1.0/src/sampledisco/preparation/rna_preprocess_cpu.py +303 -0
  33. sampledisco-0.1.0/src/sampledisco/preparation/rna_preprocess_gpu.py +292 -0
  34. sampledisco-0.1.0/src/sampledisco/sample_association/__init__.py +0 -0
  35. sampledisco-0.1.0/src/sampledisco/sample_association/association.py +649 -0
  36. sampledisco-0.1.0/src/sampledisco/sample_clustering/HRA_VEC.py +15 -0
  37. sampledisco-0.1.0/src/sampledisco/sample_clustering/HRC_VEC.py +15 -0
  38. sampledisco-0.1.0/src/sampledisco/sample_clustering/NN.py +108 -0
  39. sampledisco-0.1.0/src/sampledisco/sample_clustering/RAISIN.py +847 -0
  40. sampledisco-0.1.0/src/sampledisco/sample_clustering/RAISIN_TEST.py +787 -0
  41. sampledisco-0.1.0/src/sampledisco/sample_clustering/UPGMA.py +84 -0
  42. sampledisco-0.1.0/src/sampledisco/sample_clustering/__init__.py +0 -0
  43. sampledisco-0.1.0/src/sampledisco/sample_clustering/cluster.py +117 -0
  44. sampledisco-0.1.0/src/sampledisco/sample_clustering/cluster_helper.py +136 -0
  45. sampledisco-0.1.0/src/sampledisco/sample_clustering/cluster_severity_reconcile.py +130 -0
  46. sampledisco-0.1.0/src/sampledisco/sample_clustering/consensus.py +222 -0
  47. sampledisco-0.1.0/src/sampledisco/sample_clustering/proportion_test.py +624 -0
  48. sampledisco-0.1.0/src/sampledisco/sample_clustering/tree_cut.py +90 -0
  49. sampledisco-0.1.0/src/sampledisco/sample_distance/ChiSquare.py +143 -0
  50. sampledisco-0.1.0/src/sampledisco/sample_distance/__init__.py +0 -0
  51. sampledisco-0.1.0/src/sampledisco/sample_distance/distance_test.py +147 -0
  52. sampledisco-0.1.0/src/sampledisco/sample_distance/jensenshannon.py +142 -0
  53. sampledisco-0.1.0/src/sampledisco/sample_distance/sample_distance.py +609 -0
  54. sampledisco-0.1.0/src/sampledisco/sample_embedding/__init__.py +73 -0
  55. sampledisco-0.1.0/src/sampledisco/sample_embedding/blocks.py +528 -0
  56. sampledisco-0.1.0/src/sampledisco/sample_embedding/sample_embedding.py +346 -0
  57. sampledisco-0.1.0/src/sampledisco/sample_embedding/sample_embedding_gpu.py +359 -0
  58. sampledisco-0.1.0/src/sampledisco/sample_trajectory/CCA.py +482 -0
  59. sampledisco-0.1.0/src/sampledisco/sample_trajectory/CCA_test.py +611 -0
  60. sampledisco-0.1.0/src/sampledisco/sample_trajectory/TSCAN.py +998 -0
  61. sampledisco-0.1.0/src/sampledisco/sample_trajectory/__init__.py +0 -0
  62. sampledisco-0.1.0/src/sampledisco/sample_trajectory/trajectory_DGE_visualization.py +1600 -0
  63. sampledisco-0.1.0/src/sampledisco/sample_trajectory/trajectory_diff_gene.py +1184 -0
  64. sampledisco-0.1.0/src/sampledisco/utils/Grouping.py +112 -0
  65. sampledisco-0.1.0/src/sampledisco/utils/__init__.py +0 -0
  66. sampledisco-0.1.0/src/sampledisco/utils/batch_regress.py +49 -0
  67. sampledisco-0.1.0/src/sampledisco/utils/imbalance_cell_type_handler.py +79 -0
  68. sampledisco-0.1.0/src/sampledisco/utils/inspector.py +391 -0
  69. sampledisco-0.1.0/src/sampledisco/utils/limma.py +54 -0
  70. sampledisco-0.1.0/src/sampledisco/utils/merge_sample_meta.py +102 -0
  71. sampledisco-0.1.0/src/sampledisco/utils/random_seed.py +36 -0
  72. sampledisco-0.1.0/src/sampledisco/utils/safe_save.py +132 -0
  73. sampledisco-0.1.0/src/sampledisco/utils/slim_adata.py +24 -0
  74. sampledisco-0.1.0/src/sampledisco/utils/subsample.py +117 -0
  75. sampledisco-0.1.0/src/sampledisco/utils/subsample_generator.py +418 -0
  76. sampledisco-0.1.0/src/sampledisco/utils/tf_idf.py +299 -0
  77. sampledisco-0.1.0/src/sampledisco/utils/unify_optimal.py +339 -0
  78. sampledisco-0.1.0/src/sampledisco/visualization/ATAC_visualization.py +359 -0
  79. sampledisco-0.1.0/src/sampledisco/visualization/DEG_visualization.py +359 -0
  80. sampledisco-0.1.0/src/sampledisco/visualization/__init__.py +0 -0
  81. sampledisco-0.1.0/src/sampledisco/visualization/multi_omics_visualization.py +1266 -0
  82. sampledisco-0.1.0/src/sampledisco/visualization/visualization_embedding.py +603 -0
  83. sampledisco-0.1.0/src/sampledisco/visualization/visualization_helper.py +439 -0
  84. sampledisco-0.1.0/src/sampledisco/visualization/visualization_other.py +121 -0
  85. sampledisco-0.1.0/src/sampledisco/wrapper/__init__.py +0 -0
  86. sampledisco-0.1.0/src/sampledisco/wrapper/atac_wrapper.py +234 -0
  87. sampledisco-0.1.0/src/sampledisco/wrapper/multiomics_wrapper.py +483 -0
  88. sampledisco-0.1.0/src/sampledisco/wrapper/rna_wrapper.py +233 -0
  89. sampledisco-0.1.0/src/sampledisco/wrapper/wrapper.py +1646 -0
  90. sampledisco-0.1.0/src/sampledisco.egg-info/PKG-INFO +254 -0
  91. sampledisco-0.1.0/src/sampledisco.egg-info/SOURCES.txt +93 -0
  92. sampledisco-0.1.0/src/sampledisco.egg-info/dependency_links.txt +1 -0
  93. sampledisco-0.1.0/src/sampledisco.egg-info/entry_points.txt +2 -0
  94. sampledisco-0.1.0/src/sampledisco.egg-info/requires.txt +34 -0
  95. sampledisco-0.1.0/src/sampledisco.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Harry Jiang, Hongkai Ji, and the SampleDisco authors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,254 @@
1
+ Metadata-Version: 2.4
2
+ Name: sampledisco
3
+ Version: 0.1.0
4
+ Summary: Cross-omics, cross-condition sample embedding for single-cell data (RNA + ATAC)
5
+ Author: Hongkai Ji
6
+ Author-email: Harry Jiang <hjiang55@jh.edu>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2026 Harry Jiang, Hongkai Ji, and the SampleDisco authors
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+
29
+ Project-URL: Homepage, https://github.com/J041120h/GenoDistance
30
+ Project-URL: Documentation, https://j041120h.github.io/SampleDisco_tutorial/
31
+ Project-URL: Repository, https://github.com/J041120h/GenoDistance
32
+ Keywords: single-cell,sample-embedding,multi-omics,scRNA-seq,scATAC-seq,bioinformatics
33
+ Classifier: Development Status :: 3 - Alpha
34
+ Classifier: Intended Audience :: Science/Research
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Operating System :: POSIX :: Linux
37
+ Classifier: Programming Language :: Python :: 3.10
38
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
39
+ Requires-Python: >=3.10
40
+ Description-Content-Type: text/markdown
41
+ License-File: LICENSE
42
+ Requires-Dist: numpy>=1.23
43
+ Requires-Dist: pandas>=1.5
44
+ Requires-Dist: scipy>=1.9
45
+ Requires-Dist: scikit-learn>=1.2
46
+ Requires-Dist: scikit-misc>=0.3
47
+ Requires-Dist: statsmodels>=0.13
48
+ Requires-Dist: patsy>=0.5
49
+ Requires-Dist: scanpy>=1.11
50
+ Requires-Dist: anndata>=0.10
51
+ Requires-Dist: muon>=0.1
52
+ Requires-Dist: harmony-pytorch>=0.1
53
+ Requires-Dist: harmonypy>=0.0.9
54
+ Requires-Dist: scglue>=0.3
55
+ Requires-Dist: torch>=2.0
56
+ Requires-Dist: python-igraph>=0.10
57
+ Requires-Dist: leidenalg>=0.9
58
+ Requires-Dist: networkx>=3.0
59
+ Requires-Dist: POT>=0.9
60
+ Requires-Dist: pygam>=0.9
61
+ Requires-Dist: numba>=0.57
62
+ Requires-Dist: matplotlib>=3.6
63
+ Requires-Dist: seaborn>=0.12
64
+ Requires-Dist: pyyaml>=6.0
65
+ Requires-Dist: psutil>=5.9
66
+ Requires-Dist: ruamel.yaml>=0.17
67
+ Requires-Dist: pybedtools>=0.9
68
+ Requires-Dist: pyensembl>=2.3
69
+ Requires-Dist: combat>=0.3
70
+ Requires-Dist: adjustText>=1.0
71
+ Provides-Extra: trees
72
+ Requires-Dist: biopython>=1.80; extra == "trees"
73
+ Requires-Dist: scikit-bio>=0.5; extra == "trees"
74
+ Requires-Dist: dendropy>=4.5; extra == "trees"
75
+ Dynamic: license-file
76
+
77
+ # SampleDisco
78
+
79
+ A cross-omics, cross-condition **sample embedding** tool for single-cell data.
80
+
81
+ SampleDisco takes a cell-level embedding (from any standard scRNA / scATAC / multi-omics integration method) and lifts it to a **sample-level embedding** that captures both cell-type composition and the per-cell-type state of each sample. Every downstream analysis — sample-to-sample distance, clustering, trajectory inference, phenotype association — then runs on that single shared sample embedding, regardless of modality.
82
+
83
+ Paper draft: [`SampleDisco_Draft-11.pdf`](../SampleDisco_Draft-11.pdf)
84
+
85
+ ---
86
+
87
+ ## What the method does
88
+
89
+ For each modality (RNA, ATAC, or integrated multi-omics) the pipeline produces two cell-level views:
90
+
91
+ | Key | Role | Source |
92
+ |---|---|---|
93
+ | **`Z_clust`** | sample-removed embedding — used for clustering and composition blocks | Harmony (single-omics) / Harmony post-pass on scGLUE (multi-omics) |
94
+ | **`Z_cmd`** | sample-preserved embedding — used for the counterfactual displacement (CMD) block | second Harmony pass (single-omics) / scGLUE primary output (multi-omics) |
95
+
96
+ It then assembles **four blocks** per sample (or per sample × modality for multi-omics):
97
+
98
+ 1. **A1** — one-hot cell-type composition
99
+ 2. **A2** — soft k-means composition at K_med (≈120)
100
+ 3. **A3** — soft k-means composition at K_fine (≈300)
101
+ 4. **CMD** — leave-one-out cell-type-resolved displacement on `Z_cmd`
102
+
103
+ The four blocks are inverse-variance weighted, Frobenius-stacked, PCA-reduced to 10 dimensions, and Harmony-corrected at sample level. The result is stored as `adata.uns['X_DR_sample']` and feeds every downstream module.
104
+
105
+ ---
106
+
107
+ ## Repository layout
108
+
109
+ ```
110
+ code/
111
+ ├── SampleDisc.py # CLI entry point (simple or complex mode)
112
+ ├── config/ # 9 YAML configs covering covid / blood / eye / heart / ENCODE / tabula / long_covid / unpaired / default
113
+ ├── wrapper/ # Orchestration
114
+ │ ├── wrapper.py # Master wrapper; gates RNA + ATAC + multiomics + shared downstream
115
+ │ ├── rna_wrapper.py
116
+ │ ├── atac_wrapper.py
117
+ │ └── multiomics_wrapper.py
118
+ ├── preparation/ # Preprocessing
119
+ │ ├── rna_preprocess_{cpu,gpu}.py # QC → HVG → PCA → dual Harmony → Z_clust + Z_cmd
120
+ │ ├── atac_preprocess_{cpu,gpu}.py # QC → TF-IDF → HVF → LSI → dual Harmony → Z_clust + Z_cmd
121
+ │ ├── cell_type_{cpu,gpu}.py # Leiden clustering on Z_clust (RNA or ATAC)
122
+ │ ├── ATAC_cell_type{,_gpu}.py # ATAC-specific cell typing variants
123
+ │ ├── multi_omics_glue.py # scGLUE integration (cross-modality VAE + guidance graph)
124
+ │ ├── multi_omics_batch_correction.py # Harmony post-pass on X_glue → Z_clust
125
+ │ ├── multi_omics_merge.py # post-GLUE merge + per-modality preprocess/slimming
126
+ │ └── multi_omics_cell_type_{cpu,gpu}.py # RNA-Leiden + k-NN label transfer to ATAC
127
+ ├── sample_embedding/ # Core method
128
+ │ ├── blocks.py # composition, CMD, weighting, Frobenius stack, final PCA + Harmony
129
+ │ ├── sample_embedding.py # CPU pipeline
130
+ │ └── sample_embedding_gpu.py # GPU pipeline (cuML + cupy)
131
+ ├── parameter_selection/
132
+ │ └── autotune.py # Bayesian GP sweep over CMD α; adaptive proxy ensemble
133
+ ├── sample_distance/ # Pairwise sample distances (DR / EMD / chi-square / JS)
134
+ ├── sample_clustering/ # Hierarchical (HRA / HRC / NN / UPGMA / consensus), K-means, proportion test, RAISIN
135
+ ├── sample_trajectory/ # CCA (supervised) and TSCAN (unsupervised) + GAM-based trajectory DGE
136
+ ├── sample_association/ # Per-PC variance explained vs sample-level covariates (permutation FDR)
137
+ ├── visualization/ # Embedding plots, dendrograms, DGE volcanos, modality-aware multi-omics scatters
138
+ ├── utils/ # Shared helpers: seed, safe h5ad I/O, limma, TF-IDF, batch regress, Grouping
139
+ ├── gene_activity/ # ATAC peak → gene activity inference + RNA-ATAC validation
140
+ └── claude/ # Active one-off run scripts (rerun launchers, monitored SE, parameter sweeps)
141
+ ```
142
+
143
+ ---
144
+
145
+ ## Usage
146
+
147
+ ### Complex mode (recommended) — YAML-driven
148
+
149
+ ```bash
150
+ python SampleDisc.py -m complex --config config/config.yaml
151
+ ```
152
+
153
+ The YAML drives every flag and parameter for all three pipelines:
154
+
155
+ - **Pipeline gates** (top-level): `run_rna_pipeline`, `run_atac_pipeline`, `run_multiomics_pipeline`
156
+ - **Per-modality phase gates** (Phase 1): `*_preprocessing`, `*_cell_type_cluster`, `*_derive_sample_embedding`
157
+ - **Per-modality downstream gates** (Phase 2): `*_sample_distance_calculation`, `*_trajectory_analysis`, `*_trajectory_dge`, `*_sample_cluster`, `*_proportion_test`, `*_cluster_dge`, `*_visualize_data`, `*_dimension_association_analysis`
158
+ - **Multi-omics-specific**: `multiomics_run_glue_*`, `multiomics_treat_sample_as_batch`, `multiomics_run_glue_twice_for_sample_removal`
159
+
160
+ The 9 ready-to-use configs in `config/` are point-in-time snapshots for the datasets used in the paper; copy one and adjust paths / column names for your own data.
161
+
162
+ ### Simple mode — one input file, defaults everywhere
163
+
164
+ ```bash
165
+ python SampleDisc.py -m simple -c <count_data.h5ad> -o <output_dir>
166
+ ```
167
+
168
+ ---
169
+
170
+ ## Inputs
171
+
172
+ A standard scanpy AnnData file with at minimum:
173
+ - `.X` — count matrix (genes for RNA, peaks for ATAC)
174
+ - `.obs['sample']` — sample column (required)
175
+ - Optional: `.obs['batch']`, `.obs['cell_type']`, sample-level metadata file (CSV) to merge
176
+
177
+ For multi-omics, the pipeline takes two separate h5ads (RNA + ATAC) and integrates them via scGLUE; samples may be **paired** (1:1 cell correspondence) or **unpaired**.
178
+
179
+ ---
180
+
181
+ ## Outputs (under `output_dir`)
182
+
183
+ ```
184
+ <output_dir>/
185
+ ├── rna/
186
+ │ ├── preprocess/adata_preprocessed.h5ad
187
+ │ ├── sample_embedding/sample_embedding.csv
188
+ │ ├── Sample_distance/{cosine,correlation}/*
189
+ │ ├── CCA/ or TSCAN/ # whichever trajectory mode
190
+ │ ├── trajectoryDEG/
191
+ │ ├── sample_cluster/{kmeans_*,proportion_test/}
192
+ │ ├── sample_association/variance_explained_sample.csv + figures/
193
+ │ └── visualization/*.png
194
+ ├── atac/ (parallel structure)
195
+ ├── multiomics/
196
+ │ ├── integration/glue/{rna-pp,atac-pp,guidance.graphml.gz}
197
+ │ ├── preprocess/adata_sample.h5ad # post-GLUE merged adata with Z_clust + Z_cmd
198
+ │ ├── sample_embedding/sample_embedding.csv
199
+ │ └── (same downstream subdirs as single-omics)
200
+ └── sys_log/main_process_status.json # which stages completed
201
+ ```
202
+
203
+ ---
204
+
205
+ ## Installation
206
+
207
+ SampleDisco is **one package**. The CPU install is pip-only; **GPU acceleration is
208
+ activated simply by installing the GPU libraries separately** — the same package
209
+ detects and uses them at runtime. There is no separate "GPU build" of SampleDisco.
210
+
211
+ ### 1. Core install (CPU)
212
+
213
+ ```bash
214
+ pip install sampledisco # once published — or `pip install -e .` from a clone
215
+ ```
216
+
217
+ ### 2. System prerequisite — bedtools
218
+
219
+ scGLUE (the multi-omics integrator) calls the `bedtools` binary, which pip cannot
220
+ provide:
221
+
222
+ ```bash
223
+ conda install -c bioconda bedtools
224
+ ```
225
+
226
+ ### 3. GPU acceleration (optional, install yourself)
227
+
228
+ The GPU functions (RAPIDS-accelerated normalization, Harmony, k-means / PCA, Leiden,
229
+ scGLUE training) turn on **only when the RAPIDS stack is present** in your
230
+ environment. RAPIDS is CUDA-driver-specific and conda-only, so you install it
231
+ separately, matching your driver (the pins below target a CUDA-12.5 driver such as
232
+ the one on the authors' cluster GPU nodes):
233
+
234
+ ```bash
235
+ conda install -c rapidsai -c conda-forge -c nvidia \
236
+ cuml=24.12 cudf=24.12 cugraph=24.12 rmm=24.12 cuvs=24.12 cupy=13 cuda-version=12.5
237
+ pip install rapids-singlecell==0.13.1 --no-deps
238
+ ```
239
+
240
+ Then set `use_gpu: true` in your config. **You do not reinstall SampleDisco** — once
241
+ those packages are importable the GPU paths activate automatically; if they are
242
+ missing or the driver is too old, SampleDisco falls back to CPU equivalents
243
+ (`harmonypy` / linear regression, scikit-learn k-means, PyTorch CPU).
244
+
245
+ ### One-command environments (recommended)
246
+
247
+ For a fully reproducible environment (including bedtools), use the provided conda
248
+ files instead of the manual steps above — see `INSTALL.md` for the driver/version
249
+ notes:
250
+
251
+ ```bash
252
+ conda env create -f environment-cpu.yml # CPU
253
+ conda env create -f environment-gpu.yml # GPU (RAPIDS 24.12)
254
+ ```
@@ -0,0 +1,178 @@
1
+ # SampleDisco
2
+
3
+ A cross-omics, cross-condition **sample embedding** tool for single-cell data.
4
+
5
+ SampleDisco takes a cell-level embedding (from any standard scRNA / scATAC / multi-omics integration method) and lifts it to a **sample-level embedding** that captures both cell-type composition and the per-cell-type state of each sample. Every downstream analysis — sample-to-sample distance, clustering, trajectory inference, phenotype association — then runs on that single shared sample embedding, regardless of modality.
6
+
7
+ Paper draft: [`SampleDisco_Draft-11.pdf`](../SampleDisco_Draft-11.pdf)
8
+
9
+ ---
10
+
11
+ ## What the method does
12
+
13
+ For each modality (RNA, ATAC, or integrated multi-omics) the pipeline produces two cell-level views:
14
+
15
+ | Key | Role | Source |
16
+ |---|---|---|
17
+ | **`Z_clust`** | sample-removed embedding — used for clustering and composition blocks | Harmony (single-omics) / Harmony post-pass on scGLUE (multi-omics) |
18
+ | **`Z_cmd`** | sample-preserved embedding — used for the counterfactual displacement (CMD) block | second Harmony pass (single-omics) / scGLUE primary output (multi-omics) |
19
+
20
+ It then assembles **four blocks** per sample (or per sample × modality for multi-omics):
21
+
22
+ 1. **A1** — one-hot cell-type composition
23
+ 2. **A2** — soft k-means composition at K_med (≈120)
24
+ 3. **A3** — soft k-means composition at K_fine (≈300)
25
+ 4. **CMD** — leave-one-out cell-type-resolved displacement on `Z_cmd`
26
+
27
+ The four blocks are inverse-variance weighted, Frobenius-stacked, PCA-reduced to 10 dimensions, and Harmony-corrected at sample level. The result is stored as `adata.uns['X_DR_sample']` and feeds every downstream module.
28
+
29
+ ---
30
+
31
+ ## Repository layout
32
+
33
+ ```
34
+ code/
35
+ ├── SampleDisc.py # CLI entry point (simple or complex mode)
36
+ ├── config/ # 9 YAML configs covering covid / blood / eye / heart / ENCODE / tabula / long_covid / unpaired / default
37
+ ├── wrapper/ # Orchestration
38
+ │ ├── wrapper.py # Master wrapper; gates RNA + ATAC + multiomics + shared downstream
39
+ │ ├── rna_wrapper.py
40
+ │ ├── atac_wrapper.py
41
+ │ └── multiomics_wrapper.py
42
+ ├── preparation/ # Preprocessing
43
+ │ ├── rna_preprocess_{cpu,gpu}.py # QC → HVG → PCA → dual Harmony → Z_clust + Z_cmd
44
+ │ ├── atac_preprocess_{cpu,gpu}.py # QC → TF-IDF → HVF → LSI → dual Harmony → Z_clust + Z_cmd
45
+ │ ├── cell_type_{cpu,gpu}.py # Leiden clustering on Z_clust (RNA or ATAC)
46
+ │ ├── ATAC_cell_type{,_gpu}.py # ATAC-specific cell typing variants
47
+ │ ├── multi_omics_glue.py # scGLUE integration (cross-modality VAE + guidance graph)
48
+ │ ├── multi_omics_batch_correction.py # Harmony post-pass on X_glue → Z_clust
49
+ │ ├── multi_omics_merge.py # post-GLUE merge + per-modality preprocess/slimming
50
+ │ └── multi_omics_cell_type_{cpu,gpu}.py # RNA-Leiden + k-NN label transfer to ATAC
51
+ ├── sample_embedding/ # Core method
52
+ │ ├── blocks.py # composition, CMD, weighting, Frobenius stack, final PCA + Harmony
53
+ │ ├── sample_embedding.py # CPU pipeline
54
+ │ └── sample_embedding_gpu.py # GPU pipeline (cuML + cupy)
55
+ ├── parameter_selection/
56
+ │ └── autotune.py # Bayesian GP sweep over CMD α; adaptive proxy ensemble
57
+ ├── sample_distance/ # Pairwise sample distances (DR / EMD / chi-square / JS)
58
+ ├── sample_clustering/ # Hierarchical (HRA / HRC / NN / UPGMA / consensus), K-means, proportion test, RAISIN
59
+ ├── sample_trajectory/ # CCA (supervised) and TSCAN (unsupervised) + GAM-based trajectory DGE
60
+ ├── sample_association/ # Per-PC variance explained vs sample-level covariates (permutation FDR)
61
+ ├── visualization/ # Embedding plots, dendrograms, DGE volcanos, modality-aware multi-omics scatters
62
+ ├── utils/ # Shared helpers: seed, safe h5ad I/O, limma, TF-IDF, batch regress, Grouping
63
+ ├── gene_activity/ # ATAC peak → gene activity inference + RNA-ATAC validation
64
+ └── claude/ # Active one-off run scripts (rerun launchers, monitored SE, parameter sweeps)
65
+ ```
66
+
67
+ ---
68
+
69
+ ## Usage
70
+
71
+ ### Complex mode (recommended) — YAML-driven
72
+
73
+ ```bash
74
+ python SampleDisc.py -m complex --config config/config.yaml
75
+ ```
76
+
77
+ The YAML drives every flag and parameter for all three pipelines:
78
+
79
+ - **Pipeline gates** (top-level): `run_rna_pipeline`, `run_atac_pipeline`, `run_multiomics_pipeline`
80
+ - **Per-modality phase gates** (Phase 1): `*_preprocessing`, `*_cell_type_cluster`, `*_derive_sample_embedding`
81
+ - **Per-modality downstream gates** (Phase 2): `*_sample_distance_calculation`, `*_trajectory_analysis`, `*_trajectory_dge`, `*_sample_cluster`, `*_proportion_test`, `*_cluster_dge`, `*_visualize_data`, `*_dimension_association_analysis`
82
+ - **Multi-omics-specific**: `multiomics_run_glue_*`, `multiomics_treat_sample_as_batch`, `multiomics_run_glue_twice_for_sample_removal`
83
+
84
+ The 9 ready-to-use configs in `config/` are point-in-time snapshots for the datasets used in the paper; copy one and adjust paths / column names for your own data.
85
+
86
+ ### Simple mode — one input file, defaults everywhere
87
+
88
+ ```bash
89
+ python SampleDisc.py -m simple -c <count_data.h5ad> -o <output_dir>
90
+ ```
91
+
92
+ ---
93
+
94
+ ## Inputs
95
+
96
+ A standard scanpy AnnData file with at minimum:
97
+ - `.X` — count matrix (genes for RNA, peaks for ATAC)
98
+ - `.obs['sample']` — sample column (required)
99
+ - Optional: `.obs['batch']`, `.obs['cell_type']`, sample-level metadata file (CSV) to merge
100
+
101
+ For multi-omics, the pipeline takes two separate h5ads (RNA + ATAC) and integrates them via scGLUE; samples may be **paired** (1:1 cell correspondence) or **unpaired**.
102
+
103
+ ---
104
+
105
+ ## Outputs (under `output_dir`)
106
+
107
+ ```
108
+ <output_dir>/
109
+ ├── rna/
110
+ │ ├── preprocess/adata_preprocessed.h5ad
111
+ │ ├── sample_embedding/sample_embedding.csv
112
+ │ ├── Sample_distance/{cosine,correlation}/*
113
+ │ ├── CCA/ or TSCAN/ # whichever trajectory mode
114
+ │ ├── trajectoryDEG/
115
+ │ ├── sample_cluster/{kmeans_*,proportion_test/}
116
+ │ ├── sample_association/variance_explained_sample.csv + figures/
117
+ │ └── visualization/*.png
118
+ ├── atac/ (parallel structure)
119
+ ├── multiomics/
120
+ │ ├── integration/glue/{rna-pp,atac-pp,guidance.graphml.gz}
121
+ │ ├── preprocess/adata_sample.h5ad # post-GLUE merged adata with Z_clust + Z_cmd
122
+ │ ├── sample_embedding/sample_embedding.csv
123
+ │ └── (same downstream subdirs as single-omics)
124
+ └── sys_log/main_process_status.json # which stages completed
125
+ ```
126
+
127
+ ---
128
+
129
+ ## Installation
130
+
131
+ SampleDisco is **one package**. The CPU install is pip-only; **GPU acceleration is
132
+ activated simply by installing the GPU libraries separately** — the same package
133
+ detects and uses them at runtime. There is no separate "GPU build" of SampleDisco.
134
+
135
+ ### 1. Core install (CPU)
136
+
137
+ ```bash
138
+ pip install sampledisco # once published — or `pip install -e .` from a clone
139
+ ```
140
+
141
+ ### 2. System prerequisite — bedtools
142
+
143
+ scGLUE (the multi-omics integrator) calls the `bedtools` binary, which pip cannot
144
+ provide:
145
+
146
+ ```bash
147
+ conda install -c bioconda bedtools
148
+ ```
149
+
150
+ ### 3. GPU acceleration (optional, install yourself)
151
+
152
+ The GPU functions (RAPIDS-accelerated normalization, Harmony, k-means / PCA, Leiden,
153
+ scGLUE training) turn on **only when the RAPIDS stack is present** in your
154
+ environment. RAPIDS is CUDA-driver-specific and conda-only, so you install it
155
+ separately, matching your driver (the pins below target a CUDA-12.5 driver such as
156
+ the one on the authors' cluster GPU nodes):
157
+
158
+ ```bash
159
+ conda install -c rapidsai -c conda-forge -c nvidia \
160
+ cuml=24.12 cudf=24.12 cugraph=24.12 rmm=24.12 cuvs=24.12 cupy=13 cuda-version=12.5
161
+ pip install rapids-singlecell==0.13.1 --no-deps
162
+ ```
163
+
164
+ Then set `use_gpu: true` in your config. **You do not reinstall SampleDisco** — once
165
+ those packages are importable the GPU paths activate automatically; if they are
166
+ missing or the driver is too old, SampleDisco falls back to CPU equivalents
167
+ (`harmonypy` / linear regression, scikit-learn k-means, PyTorch CPU).
168
+
169
+ ### One-command environments (recommended)
170
+
171
+ For a fully reproducible environment (including bedtools), use the provided conda
172
+ files instead of the manual steps above — see `INSTALL.md` for the driver/version
173
+ notes:
174
+
175
+ ```bash
176
+ conda env create -f environment-cpu.yml # CPU
177
+ conda env create -f environment-gpu.yml # GPU (RAPIDS 24.12)
178
+ ```
@@ -0,0 +1,49 @@
1
+ # Packaging metadata for SampleDisco.
2
+ # Installable as a normal package (src/ layout): `pip install -e .` exposes the
3
+ # `sampledisco` import and the `sampledisco` console command from anywhere — no
4
+ # more `cd code && python SampleDisc.py`. Core deps are declared in
5
+ # requirements.txt; optional GPU acceleration is in requirements-gpu.txt.
6
+ [build-system]
7
+ requires = ["setuptools>=61"]
8
+ build-backend = "setuptools.build_meta"
9
+
10
+ [project]
11
+ name = "sampledisco"
12
+ version = "0.1.0"
13
+ description = "Cross-omics, cross-condition sample embedding for single-cell data (RNA + ATAC)"
14
+ readme = "README.md"
15
+ requires-python = ">=3.10"
16
+ license = { file = "LICENSE" }
17
+ authors = [
18
+ { name = "Harry Jiang", email = "hjiang55@jh.edu" },
19
+ { name = "Hongkai Ji" },
20
+ ]
21
+ keywords = ["single-cell", "sample-embedding", "multi-omics", "scRNA-seq", "scATAC-seq", "bioinformatics"]
22
+ classifiers = [
23
+ "Development Status :: 3 - Alpha",
24
+ "Intended Audience :: Science/Research",
25
+ "License :: OSI Approved :: MIT License",
26
+ "Operating System :: POSIX :: Linux",
27
+ "Programming Language :: Python :: 3.10",
28
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
29
+ ]
30
+ dynamic = ["dependencies"]
31
+
32
+ [project.urls]
33
+ Homepage = "https://github.com/J041120h/GenoDistance"
34
+ Documentation = "https://j041120h.github.io/SampleDisco_tutorial/"
35
+ Repository = "https://github.com/J041120h/GenoDistance"
36
+
37
+ [project.scripts]
38
+ sampledisco = "sampledisco.cli:main"
39
+
40
+ [project.optional-dependencies]
41
+ # Phylogenetic-tree sample clustering (NN / UPGMA / consensus). Off the default
42
+ # wrapper path, so its heavy deps are optional: pip install -e '.[trees]'
43
+ trees = ["biopython>=1.80", "scikit-bio>=0.5", "dendropy>=4.5"]
44
+
45
+ [tool.setuptools.packages.find]
46
+ where = ["src"]
47
+
48
+ [tool.setuptools.dynamic]
49
+ dependencies = { file = ["requirements.txt"] }
@@ -0,0 +1,37 @@
1
+ # SampleDisco core (CPU) dependencies.
2
+ # GPU acceleration is optional and environment/driver-specific: see
3
+ # requirements-gpu.txt and .claude/CLAUDE.md.
4
+ numpy>=1.23
5
+ pandas>=1.5
6
+ scipy>=1.9
7
+ scikit-learn>=1.2
8
+ scikit-misc>=0.3
9
+ statsmodels>=0.13
10
+ patsy>=0.5
11
+ scanpy>=1.11
12
+ anndata>=0.10
13
+ muon>=0.1
14
+ harmony-pytorch>=0.1
15
+ harmonypy>=0.0.9
16
+ scglue>=0.3
17
+ torch>=2.0
18
+ python-igraph>=0.10
19
+ leidenalg>=0.9
20
+ networkx>=3.0
21
+ POT>=0.9
22
+ pygam>=0.9
23
+ numba>=0.57
24
+ matplotlib>=3.6
25
+ seaborn>=0.12
26
+ pyyaml>=6.0
27
+ psutil>=5.9
28
+ ruamel.yaml>=0.17
29
+ pybedtools>=0.9
30
+
31
+ # multi-omics gene annotation (scGLUE path, imported at module top)
32
+ pyensembl>=2.3
33
+ # cluster-DGE (RAISIN) — both on the live downstream path
34
+ combat>=0.3
35
+ adjustText>=1.0
36
+ # NOTE: scGLUE must be installed from PyPI (`pip install scglue`), NOT bioconda —
37
+ # the bioconda recipe caps numpy<1.22 and breaks the rest of the stack.
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,27 @@
1
+ """SampleDisco — cross-omics, cross-condition sample embedding for single-cell data.
2
+
3
+ Public API (imported lazily so ``import sampledisco`` stays light and does not
4
+ pull scanpy / torch / scGLUE until you actually call into the pipeline):
5
+
6
+ import sampledisco
7
+ sampledisco.wrapper(...) # full pipeline (RNA / ATAC / multi-omics)
8
+ sampledisco.compute_sample_embedding(...) # the core method only
9
+
10
+ The CLI entry point is ``sampledisco -m complex --config <yaml>`` (or ``sampledisco -m simple -c <counts> -o <outdir>``; see ``sampledisco.cli``).
11
+ """
12
+ from __future__ import annotations
13
+
14
+ __version__ = "0.1.0"
15
+
16
+ __all__ = ["wrapper", "compute_sample_embedding"]
17
+
18
+
19
def __getattr__(name: str):
    """Resolve the public API names lazily (PEP 562 module ``__getattr__``).

    The pipeline modules are imported only when ``wrapper`` or
    ``compute_sample_embedding`` is first requested, so that importing the
    package itself does not import them.

    Raises:
        AttributeError: if ``name`` is not one of the lazily exported names.
    """
    # Reject unknown names up front so the import branches below stay simple.
    if name not in ("wrapper", "compute_sample_embedding"):
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    if name == "compute_sample_embedding":
        from sampledisco.sample_embedding import compute_sample_embedding as resolved
    else:
        from sampledisco.wrapper.wrapper import wrapper as resolved
    return resolved
@@ -0,0 +1,70 @@
1
+ import argparse
2
+ import sys
3
+ import traceback
4
+ import yaml
5
+ import os
6
+ import inspect
7
+ from sampledisco.wrapper.wrapper import wrapper
8
+
9
def parse_args():
    """Parse the command line for the SampleDisco wrapper.

    ``-m/--mode`` is mandatory and must be ``simple`` or ``complex``. The
    remaining flags are all optional at the parser level; which of them are
    actually needed depends on the mode and is checked by the caller.

    Returns:
        argparse.Namespace with ``mode``, ``count_data``, ``sample_meta_data``,
        ``output_directory`` and ``config``.
    """
    cli = argparse.ArgumentParser(description="Run the data processing wrapper.")

    cli.add_argument(
        "-m",
        "--mode",
        type=str,
        required=True,
        choices=["simple", "complex"],
        help="Run mode. Choose 'simple' or 'complex'.",
    )

    # (flags, help) pairs, registered in this order: the three simple-mode
    # arguments first, then the complex-mode config path.
    string_options = (
        (("-c", "--count_data"), "Path to count data file"),
        (("-s", "--sample_meta_data"), "(Optional) Path to sample metadata file"),
        (("-o", "--output_directory"), "Path to output directory"),
        (("--config",), "Path to YAML config file"),
    )
    for flags, help_text in string_options:
        cli.add_argument(*flags, type=str, help=help_text)

    return cli.parse_args()
24
+
25
def load_config(config_path):
    """Load a YAML config file and return its parsed content.

    Args:
        config_path: Path to the YAML file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.

    Exits the process with status 1 (after printing a message to stderr) when
    the path does not exist or is not a regular file.
    """
    if not os.path.exists(config_path):
        print(f"Error: Config file '{config_path}' does not exist.", file=sys.stderr)
        sys.exit(1)
    # A directory passes the existence check but cannot be opened for reading;
    # report it the same way instead of letting open() raise an uncaught error.
    if not os.path.isfile(config_path):
        print(f"Error: Config path '{config_path}' is not a file.", file=sys.stderr)
        sys.exit(1)
    # Read as UTF-8 explicitly so parsing does not depend on the locale's
    # default text encoding.
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
31
+
32
def validate_config(config, func):
    """Check that ``config`` can be passed to ``func`` as keyword arguments.

    Args:
        config: Mapping of parameter names to values (e.g. a parsed YAML file).
        func: Callable whose signature the config is validated against.

    Raises:
        ValueError: if ``config`` is not a mapping (for example ``None`` from
            an empty YAML file), if it contains a key that is not a parameter
            of ``func``, or if a parameter of ``func`` that has no default
            value is absent from ``config``.
    """
    # ``func(**config)`` needs a mapping; an empty YAML document parses to None.
    if not hasattr(config, "keys"):
        raise ValueError(
            "Config must be a mapping of parameter names to values, "
            f"got {type(config).__name__}"
        )

    valid_params = inspect.signature(func).parameters
    for key in config:
        if key not in valid_params:
            raise ValueError(f"Unexpected parameter in config: '{key}'")

    variadic_kinds = (
        inspect.Parameter.VAR_POSITIONAL,
        inspect.Parameter.VAR_KEYWORD,
    )
    for key, param in valid_params.items():
        # *args / **kwargs never have to be supplied by name.
        if param.kind in variadic_kinds:
            continue
        # Parameters with a default are optional; only the rest are required.
        if param.default is not inspect.Parameter.empty:
            continue
        if key not in config:
            raise ValueError(f"Missing required parameter in config: '{key}'")
40
+
41
def main():
    """Command-line entry point: dispatch to ``wrapper`` according to ``--mode``.

    * ``simple``: requires ``-c`` and ``-o``; ``-s`` is passed through when given.
    * ``complex``: requires ``--config``; the loaded config is validated against
      the signature of ``wrapper`` and then expanded into keyword arguments.

    Usage errors, and any exception raised while validating or running in
    complex mode, are reported on stderr and end the process with status 1.
    """
    args = parse_args()
    mode = args.mode

    if mode == "simple":
        has_inputs = bool(args.count_data) and bool(args.output_directory)
        if not has_inputs:
            print("Error: In 'simple' mode, -c and -o must be provided.", file=sys.stderr)
            sys.exit(1)

        if not args.sample_meta_data:
            wrapper(args.count_data, output_directory=args.output_directory)
        else:
            wrapper(args.count_data, args.sample_meta_data, args.output_directory)
        return

    if mode != "complex":
        return

    if not args.config:
        print("Error: In 'complex' mode, --config must be provided.", file=sys.stderr)
        sys.exit(1)

    # Loading happens outside the try block; load_config reports a missing
    # file itself.
    config = load_config(args.config)

    try:
        validate_config(config, wrapper)
        wrapper(**config)
    except Exception as err:
        # Top-level boundary: show the message and the full traceback, then fail.
        print(f"Error: {err}", file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
        sys.exit(1)
68
+
69
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()