isoform-dominance 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isoform_dominance-2.0.0/LICENSE +21 -0
- isoform_dominance-2.0.0/PKG-INFO +132 -0
- isoform_dominance-2.0.0/README.md +108 -0
- isoform_dominance-2.0.0/pyproject.toml +36 -0
- isoform_dominance-2.0.0/setup.cfg +4 -0
- isoform_dominance-2.0.0/src/isoform_dominance/__init__.py +12 -0
- isoform_dominance-2.0.0/src/isoform_dominance/_selftest.py +101 -0
- isoform_dominance-2.0.0/src/isoform_dominance/annotate.py +99 -0
- isoform_dominance-2.0.0/src/isoform_dominance/cli.py +118 -0
- isoform_dominance-2.0.0/src/isoform_dominance/contamination.py +68 -0
- isoform_dominance-2.0.0/src/isoform_dominance/extract.py +46 -0
- isoform_dominance-2.0.0/src/isoform_dominance/identifiability.py +67 -0
- isoform_dominance-2.0.0/src/isoform_dominance/io.py +31 -0
- isoform_dominance-2.0.0/src/isoform_dominance/stats.py +68 -0
- isoform_dominance-2.0.0/src/isoform_dominance.egg-info/PKG-INFO +132 -0
- isoform_dominance-2.0.0/src/isoform_dominance.egg-info/SOURCES.txt +21 -0
- isoform_dominance-2.0.0/src/isoform_dominance.egg-info/dependency_links.txt +1 -0
- isoform_dominance-2.0.0/src/isoform_dominance.egg-info/entry_points.txt +2 -0
- isoform_dominance-2.0.0/src/isoform_dominance.egg-info/requires.txt +7 -0
- isoform_dominance-2.0.0/src/isoform_dominance.egg-info/top_level.txt +1 -0
- isoform_dominance-2.0.0/tests/test_annotate.py +27 -0
- isoform_dominance-2.0.0/tests/test_identifiability.py +25 -0
- isoform_dominance-2.0.0/tests/test_selftest.py +21 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Sangeon Kim
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: isoform-dominance
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Config-driven isoform-usage quantification and discrimination from bulk RNA-seq
|
|
5
|
+
Author: Sangeon Kim
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/charliekim97/isoform-dominance-pipeline
|
|
8
|
+
Project-URL: Repository, https://github.com/charliekim97/isoform-dominance-pipeline
|
|
9
|
+
Keywords: RNA-seq,alternative splicing,isoform quantification,Salmon,reproducible pipeline
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: numpy>=1.24
|
|
18
|
+
Requires-Dist: scipy>=1.10
|
|
19
|
+
Requires-Dist: matplotlib>=3.7
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
22
|
+
Requires-Dist: pytest-cov>=4; extra == "dev"
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# isoform-dominance
|
|
26
|
+
|
|
27
|
+
[CI](https://github.com/charliekim97/isoform-dominance-pipeline/actions/workflows/ci.yml)
|
|
28
|
+

|
|
29
|
+

|
|
30
|
+
[DOI: 10.5281/zenodo.20672052](https://doi.org/10.5281/zenodo.20672052)
|
|
31
|
+
|
|
32
|
+
**Isoform-usage quantification *and discrimination* from bulk RNA-seq — one config away for any gene.**
|
|
33
|
+
|
|
34
|
+
Most isoform analyses stumble on two things: deciding *which transcripts form a functional
|
|
35
|
+
group*, and knowing whether short reads can even *tell the groups apart*. `isoform-dominance`
|
|
36
|
+
handles both, then quantifies and tests the comparison end-to-end:
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
annotate gene symbol → proposed isoform groups (Ensembl, by 3' terminal exon)
|
|
40
|
+
identify config → are the groups distinguishable by short reads? (k-mer uniqueness)
|
|
41
|
+
extract Salmon quant.sf → per-donor isoform-group TPM
|
|
42
|
+
stats per-donor TPM → paired Wilcoxon, per cohort + combined, + figure
|
|
43
|
+
qc marker TPM → contamination control (is the signal a cell-type artifact?)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+

|
|
47
|
+
|
|
48
|
+
*Bundled self-test: short LEPR isoform (LepRa) predominates over the long isoform (LepRb) in
|
|
49
|
+
control human choroid plexus across two independent cohorts; combined n = 11, P = 1×10⁻³.*
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Install
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install -e ".[dev]" # from a clone
|
|
57
|
+
isoform-dominance --version
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Verify it works (no downloads, seconds)
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
isoform-dominance selftest
|
|
64
|
+
# or: pytest -q
|
|
65
|
+
```
|
|
66
|
+
This reproduces the published LEPR result (short > long in 5/5 and 6/6 donors per cohort; combined n = 11, P = 9.8e-4) on a clean machine.
|
|
67
|
+
|
|
68
|
+
## What makes it more than a quantifier
|
|
69
|
+
|
|
70
|
+
**1. Auto isoform grouping (`annotate`).** Give a gene symbol; it pulls the gene's
|
|
71
|
+
protein-coding transcripts from Ensembl, clusters them by their 3' terminal-exon splice
|
|
72
|
+
acceptor (the alternative last exon that defines functional isoform classes), and proposes a
|
|
73
|
+
comparison you review and rename.
|
|
74
|
+
```bash
|
|
75
|
+
isoform-dominance annotate --gene LEPR --out config.json
|
|
76
|
+
# iso_896aa : 7 transcripts (short / LepRa)
|
|
77
|
+
# iso_1165aa: 2 transcripts (long / LepRb, canonical)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**2. Identifiability guardrail (`identify`).** Short reads can only quantify an isoform group
|
|
81
|
+
that has *unique* sequence. This checks per-group unique k-mers and **refuses to pretend** a
|
|
82
|
+
group is measurable when it isn't.
|
|
83
|
+
```bash
|
|
84
|
+
isoform-dominance identifiability --config config.json
|
|
85
|
+
# [OK] iso_896aa : 3399 unique k-mers
|
|
86
|
+
# [OK] iso_1165aa: 5369 unique k-mers
|
|
87
|
+
# primary_comparison distinguishable by short reads: True
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Full workflow (real data)
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
# 0) build a decoy-aware index once (Salmon + GENCODE) — see scripts/01_salmon_quant.sbatch
|
|
94
|
+
# 1) quantify on an HPC cluster:
|
|
95
|
+
sbatch scripts/01_salmon_quant.sbatch # -> quant/<donor>/quant.sf
|
|
96
|
+
# 2) extract per cohort:
|
|
97
|
+
isoform-dominance extract --config config.json --quantdir quant \
|
|
98
|
+
--samplemap example/sample_map_GSE228458.csv --cohort GSE228458 --out perdonor_GSE228458.csv
|
|
99
|
+
# 3) stats + figure:
|
|
100
|
+
isoform-dominance stats --config config.json --condition control \
|
|
101
|
+
--perdonor GSE228458=perdonor_GSE228458.csv --perdonor GSE137619=perdonor_GSE137619.csv \
|
|
102
|
+
--out results/dominance
|
|
103
|
+
# 4) optional contamination control:
|
|
104
|
+
isoform-dominance qc --config config.json \
|
|
105
|
+
--markers GSE228458=markers_228.csv --target GSE228458=perdonor_GSE228458.csv --out results/qc
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Statistical notes
|
|
109
|
+
|
|
110
|
+
- Donor-level two-sided **exact Wilcoxon signed-rank** (`scipy.stats.wilcoxon`), per cohort + combined.
|
|
111
|
+
- **Small-n floor:** n = 5 cannot reach P < 0.05 in the exact two-sided test (floor 0.0625);
|
|
112
|
+
report exact P + direction (e.g. 5/5) and combine concordant cohorts for the summary statistic.
|
|
113
|
+
- Effect size = median fold-change, reported alongside significance.
|
|
114
|
+
|
|
115
|
+
## Layout
|
|
116
|
+
|
|
117
|
+
```
|
|
118
|
+
src/isoform_dominance/ annotate · identifiability · extract · stats · contamination · cli · _selftest
|
|
119
|
+
tests/ pytest (offline; reproduces the published result + unit tests)
|
|
120
|
+
scripts/01_salmon_quant.sbatch
|
|
121
|
+
example/ config + sample maps
|
|
122
|
+
.github/workflows/ci.yml docs/ pyproject.toml CITATION.cff LICENSE
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Citation
|
|
126
|
+
|
|
127
|
+
Cite this repository (see `CITATION.cff`, DOI 10.5281/zenodo.20672052) and Salmon:
|
|
128
|
+
Patro, R. et al. *Nat. Methods* **14**, 417–419 (2017). https://doi.org/10.1038/nmeth.4197
|
|
129
|
+
|
|
130
|
+
## License
|
|
131
|
+
|
|
132
|
+
MIT (see `LICENSE`).
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# isoform-dominance
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/charliekim97/isoform-dominance-pipeline/actions/workflows/ci.yml)
|
|
4
|
+

|
|
5
|
+

|
|
6
|
+
[DOI: 10.5281/zenodo.20672052](https://doi.org/10.5281/zenodo.20672052)
|
|
7
|
+
|
|
8
|
+
**Isoform-usage quantification *and discrimination* from bulk RNA-seq — one config away for any gene.**
|
|
9
|
+
|
|
10
|
+
Most isoform analyses stumble on two things: deciding *which transcripts form a functional
|
|
11
|
+
group*, and knowing whether short reads can even *tell the groups apart*. `isoform-dominance`
|
|
12
|
+
handles both, then quantifies and tests the comparison end-to-end:
|
|
13
|
+
|
|
14
|
+
```
|
|
15
|
+
annotate gene symbol → proposed isoform groups (Ensembl, by 3' terminal exon)
|
|
16
|
+
identify config → are the groups distinguishable by short reads? (k-mer uniqueness)
|
|
17
|
+
extract Salmon quant.sf → per-donor isoform-group TPM
|
|
18
|
+
stats per-donor TPM → paired Wilcoxon, per cohort + combined, + figure
|
|
19
|
+
qc marker TPM → contamination control (is the signal a cell-type artifact?)
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+

|
|
23
|
+
|
|
24
|
+
*Bundled self-test: short LEPR isoform (LepRa) predominates over the long isoform (LepRb) in
|
|
25
|
+
control human choroid plexus across two independent cohorts; combined n = 11, P = 1×10⁻³.*
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Install
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install -e ".[dev]" # from a clone
|
|
33
|
+
isoform-dominance --version
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Verify it works (no downloads, seconds)
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
isoform-dominance selftest
|
|
40
|
+
# or: pytest -q
|
|
41
|
+
```
|
|
42
|
+
This reproduces the published LEPR result (short > long in 5/5 and 6/6 donors per cohort; combined n = 11, P = 9.8e-4) on a clean machine.
|
|
43
|
+
|
|
44
|
+
## What makes it more than a quantifier
|
|
45
|
+
|
|
46
|
+
**1. Auto isoform grouping (`annotate`).** Give a gene symbol; it pulls the gene's
|
|
47
|
+
protein-coding transcripts from Ensembl, clusters them by their 3' terminal-exon splice
|
|
48
|
+
acceptor (the alternative last exon that defines functional isoform classes), and proposes a
|
|
49
|
+
comparison you review and rename.
|
|
50
|
+
```bash
|
|
51
|
+
isoform-dominance annotate --gene LEPR --out config.json
|
|
52
|
+
# iso_896aa : 7 transcripts (short / LepRa)
|
|
53
|
+
# iso_1165aa: 2 transcripts (long / LepRb, canonical)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
**2. Identifiability guardrail (`identify`).** Short reads can only quantify an isoform group
|
|
57
|
+
that has *unique* sequence. This checks per-group unique k-mers and **refuses to pretend** a
|
|
58
|
+
group is measurable when it isn't.
|
|
59
|
+
```bash
|
|
60
|
+
isoform-dominance identifiability --config config.json
|
|
61
|
+
# [OK] iso_896aa : 3399 unique k-mers
|
|
62
|
+
# [OK] iso_1165aa: 5369 unique k-mers
|
|
63
|
+
# primary_comparison distinguishable by short reads: True
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Full workflow (real data)
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# 0) build a decoy-aware index once (Salmon + GENCODE) — see scripts/01_salmon_quant.sbatch
|
|
70
|
+
# 1) quantify on an HPC cluster:
|
|
71
|
+
sbatch scripts/01_salmon_quant.sbatch # -> quant/<donor>/quant.sf
|
|
72
|
+
# 2) extract per cohort:
|
|
73
|
+
isoform-dominance extract --config config.json --quantdir quant \
|
|
74
|
+
--samplemap example/sample_map_GSE228458.csv --cohort GSE228458 --out perdonor_GSE228458.csv
|
|
75
|
+
# 3) stats + figure:
|
|
76
|
+
isoform-dominance stats --config config.json --condition control \
|
|
77
|
+
--perdonor GSE228458=perdonor_GSE228458.csv --perdonor GSE137619=perdonor_GSE137619.csv \
|
|
78
|
+
--out results/dominance
|
|
79
|
+
# 4) optional contamination control:
|
|
80
|
+
isoform-dominance qc --config config.json \
|
|
81
|
+
--markers GSE228458=markers_228.csv --target GSE228458=perdonor_GSE228458.csv --out results/qc
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Statistical notes
|
|
85
|
+
|
|
86
|
+
- Donor-level two-sided **exact Wilcoxon signed-rank** (`scipy.stats.wilcoxon`), per cohort + combined.
|
|
87
|
+
- **Small-n floor:** n = 5 cannot reach P < 0.05 in the exact two-sided test (floor 0.0625);
|
|
88
|
+
report exact P + direction (e.g. 5/5) and combine concordant cohorts for the summary statistic.
|
|
89
|
+
- Effect size = median fold-change, reported alongside significance.
|
|
90
|
+
|
|
91
|
+
## Layout
|
|
92
|
+
|
|
93
|
+
```
|
|
94
|
+
src/isoform_dominance/ annotate · identifiability · extract · stats · contamination · cli · _selftest
|
|
95
|
+
tests/ pytest (offline; reproduces the published result + unit tests)
|
|
96
|
+
scripts/01_salmon_quant.sbatch
|
|
97
|
+
example/ config + sample maps
|
|
98
|
+
.github/workflows/ci.yml docs/ pyproject.toml CITATION.cff LICENSE
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Citation
|
|
102
|
+
|
|
103
|
+
Cite this repository (see `CITATION.cff`, DOI 10.5281/zenodo.20672052) and Salmon:
|
|
104
|
+
Patro, R. et al. *Nat. Methods* **14**, 417–419 (2017). https://doi.org/10.1038/nmeth.4197
|
|
105
|
+
|
|
106
|
+
## License
|
|
107
|
+
|
|
108
|
+
MIT (see `LICENSE`).
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "isoform-dominance"
|
|
7
|
+
version = "2.0.0"
|
|
8
|
+
description = "Config-driven isoform-usage quantification and discrimination from bulk RNA-seq"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Sangeon Kim" }]
|
|
13
|
+
keywords = ["RNA-seq", "alternative splicing", "isoform quantification", "Salmon", "reproducible pipeline"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
19
|
+
]
|
|
20
|
+
dependencies = ["numpy>=1.24", "scipy>=1.10", "matplotlib>=3.7"]
|
|
21
|
+
|
|
22
|
+
[project.optional-dependencies]
|
|
23
|
+
dev = ["pytest>=7", "pytest-cov>=4"]
|
|
24
|
+
|
|
25
|
+
[project.urls]
|
|
26
|
+
Homepage = "https://github.com/charliekim97/isoform-dominance-pipeline"
|
|
27
|
+
Repository = "https://github.com/charliekim97/isoform-dominance-pipeline"
|
|
28
|
+
|
|
29
|
+
[project.scripts]
|
|
30
|
+
isoform-dominance = "isoform_dominance.cli:main"
|
|
31
|
+
|
|
32
|
+
[tool.setuptools.packages.find]
|
|
33
|
+
where = ["src"]
|
|
34
|
+
|
|
35
|
+
[tool.pytest.ini_options]
|
|
36
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""isoform-dominance: config-driven isoform-usage quantification and discrimination
|
|
2
|
+
from bulk RNA-seq.
|
|
3
|
+
|
|
4
|
+
Modules
|
|
5
|
+
-------
|
|
6
|
+
annotate gene symbol -> proposed isoform groups (Ensembl)
|
|
7
|
+
identifiability are two isoform groups distinguishable by short reads?
|
|
8
|
+
extract Salmon quant.sf -> per-donor isoform-group TPM
|
|
9
|
+
stats paired Wilcoxon + figure
|
|
10
|
+
contamination marker-based contamination control
|
|
11
|
+
"""
|
|
12
|
+
__version__ = "2.0.0"
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Download-free end-to-end self-test.
|
|
2
|
+
|
|
3
|
+
Generates synthetic Salmon quant.sf files from the real per-transcript LEPR TPM values of
|
|
4
|
+
the published analysis, runs extract + stats (+ contamination), and asserts the published
|
|
5
|
+
result is reproduced: combined n=11, 11/11 short>long, paired Wilcoxon P ~= 9.77e-4.
|
|
6
|
+
"""
|
|
7
|
+
import tempfile, os, csv, shutil
|
|
8
|
+
from . import extract, stats, contamination
|
|
9
|
+
|
|
10
|
+
# Per-donor TPM values (control choroid plexus), keyed by cohort name.
# Tuple layout: (donor, ENST00000371060, ENST00000616738, ENST00000349533).
DATA = {
    "GSE228458": [
        ("ctrl1", 6.521574, 36.556954, 1.314808),
        ("ctrl2", 1.250224, 8.637422, 0.349876),
        ("ctrl3", 2.238854, 13.496253, 0.704219),
        ("ctrl4", 3.684999, 15.015936, 0.721059),
        ("ctrl5", 0.495420, 6.599920, 1.440164),
    ],
    "GSE137619": [
        ("ctrl1", 9.130898, 9.898263, 0.351829),
        ("ctrl2", 5.896581, 5.928743, 0.216838),
        ("ctrl3", 7.516654, 20.317774, 1.033148),
        ("ctrl4", 1.804960, 3.059827, 0.143530),
        ("ctrl5", 3.730544, 2.848134, 0.203918),
        ("ctrl6", 2.582321, 4.337355, 0.166890),
    ],
}

# Pipeline configuration used by the self-test: the first two transcripts form
# the "short" group, the third one the "long" group.
CONFIG = {
    "gene": "LEPR",
    "groups": {
        "short": ["ENST00000371060", "ENST00000616738"],
        "long": ["ENST00000349533"],
    },
    "primary_comparison": ["short", "long"],
    "contamination_qc": {
        "target_group": "long",
        "marker_panels": {
            "tissue": ["TTR", "FOLR1", "OTX2", "AQP1"],
            "contaminant": ["RBFOX3", "SNAP25", "MAP2", "GAD1"],
        },
    },
}

# Expected outcome: (n, n short>long) per cohort, plus the combined P value.
EXPECT = {
    "GSE228458": (5, 5),
    "GSE137619": (6, 6),
    "COMBINED": (11, 11, 9.7656e-4),
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _write_quant(path, t1, t2, tl):
    """Write a three-transcript synthetic Salmon ``quant.sf`` into directory `path`.

    `path` is created if needed. `t1`, `t2` and `tl` are written into the TPM
    column of ENST00000371060.5, ENST00000616738.1 and ENST00000349533.11
    respectively; the other columns are fixed placeholder values.
    """
    os.makedirs(path, exist_ok=True)
    table = (
        "Name\tLength\tEffectiveLength\tTPM\tNumReads\n",
        "ENST00000371060.5\t3000\t2800\t%s\t100\n" % t1,
        "ENST00000616738.1\t3000\t2800\t%s\t100\n" % t2,
        "ENST00000349533.11\t4000\t3800\t%s\t10\n" % tl,
    )
    target = os.path.join(path, "quant.sf")
    with open(target, "w") as handle:
        handle.writelines(table)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def generate(base):
    """Materialise the synthetic inputs for every cohort in DATA under `base`.

    For each cohort this writes one ``quant.sf`` per donor, a sample map CSV
    (every donor labelled "control") and a marker-TPM CSV whose values grow
    linearly with the donor index. Returns
    ``{cohort: {"quantdir": ..., "samplemap": ..., "markers": ...}}``.
    """
    marker_header = ["donor", "TTR", "FOLR1", "OTX2", "AQP1",
                     "RBFOX3", "SNAP25", "MAP2", "GAD1"]
    layout = {}
    for cohort, rows in DATA.items():
        quant_root = os.path.join(base, "quant_%s" % cohort)
        for donor, t1, t2, tl in rows:
            _write_quant(os.path.join(quant_root, donor), t1, t2, tl)

        donors = [row[0] for row in rows]

        sample_map = os.path.join(base, "sample_map_%s.csv" % cohort)
        with open(sample_map, "w", newline="") as fh:
            writer = csv.writer(fh)
            writer.writerow(["donor", "condition", "SRR"])
            writer.writerows([name, "control", "SYNTHETIC"] for name in donors)

        marker_csv = os.path.join(base, "markers_%s.csv" % cohort)
        with open(marker_csv, "w", newline="") as fh:
            writer = csv.writer(fh)
            writer.writerow(marker_header)
            for idx, name in enumerate(donors):
                # First four columns: tissue panel; last four: contaminant panel.
                tissue = [40000 + idx * 3000, 1500 + idx * 90, 800 + idx * 40, 1200 + idx * 70]
                contaminant = [round(0.05 + 0.01 * idx, 3), round(0.30 + 0.03 * idx, 3),
                               round(0.10 + 0.02 * idx, 3), round(0.02 + 0.01 * idx, 3)]
                writer.writerow([name] + tissue + contaminant)

        layout[cohort] = {"quantdir": quant_root, "samplemap": sample_map, "markers": marker_csv}
    return layout
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def run(base):
    """Run extract, stats and contamination QC on synthetic data under `base`.

    Returns ``(ok, messages)``: `ok` is truthy only if every check passed and
    `messages` holds one "[OK] ..." / "[FAIL] ..." line per check.
    """
    def _tag(flag):
        return "OK" if flag else "FAIL"

    info = generate(base)

    # One per-donor table per cohort, written next to the synthetic inputs.
    perdonor = {}
    for cohort, paths in info.items():
        table = os.path.join(base, "perdonor_%s.csv" % cohort)
        extract.run(CONFIG, paths["quantdir"], paths["samplemap"], cohort, table)
        perdonor[cohort] = table

    res = stats.run(CONFIG, "control", perdonor, os.path.join(base, "result"))
    ok = True
    msgs = []

    # Per-cohort counts must match the expected (n, n short>long) pairs.
    observed = {row[0]: row for row in res["per_cohort"]}
    for coh in ("GSE228458", "GSE137619"):
        want_n, want_gt = EXPECT[coh]
        got_n = observed[coh][1]
        got_gt = observed[coh][2]
        good = got_n == want_n and got_gt == want_gt
        ok &= good
        msgs.append("[%s] %s %d/%d short>long" % (_tag(good), coh, got_gt, got_n))

    # Combined result: counts must match exactly, P within an absolute 1e-4.
    cn, cgt, cp, _ = res["combined"]
    en, eng, ep = EXPECT["COMBINED"]
    cgood = cn == en and cgt == eng and abs(cp - ep) < 1e-4
    ok &= cgood
    msgs.append("[%s] COMBINED %d/%d P=%.4g (expect %.4g)" % (_tag(cgood), cgt, cn, cp, ep))

    fig_ok = os.path.exists(os.path.join(base, "result.png"))
    ok &= fig_ok
    msgs.append("[%s] dominance figure produced" % _tag(fig_ok))

    marker_tables = {c: info[c]["markers"] for c in info}
    target_tables = {c: perdonor[c] for c in info}
    rows = contamination.run(CONFIG, marker_tables, target_tables, os.path.join(base, "qc"))
    ratios = [r[4] for r in rows]
    qc_ok = os.path.exists(os.path.join(base, "qc.png")) and all(x < 0.2 for x in ratios)
    ok &= qc_ok
    msgs.append("[%s] contamination-QC purity ratios %s < 0.2"
                % (_tag(qc_ok), [round(x, 3) for x in ratios]))
    return ok, msgs
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def main():
    """Run the self-test in a scratch directory and print a report.

    Returns 0 when every check passed and 1 otherwise. The scratch directory is
    removed afterwards, whether or not the run succeeded.
    """
    scratch = tempfile.mkdtemp(prefix="idp_selftest_")
    try:
        passed, report = run(scratch)
        for line in report:
            print(" " + line)
        if passed:
            verdict, code = "PASS - reproduces the published LEPR result.", 0
        else:
            verdict, code = "FAIL", 1
        print("\n%s" % verdict)
        return code
    finally:
        shutil.rmtree(scratch, ignore_errors=True)
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Gene symbol -> proposed isoform groups via the Ensembl REST API.
|
|
2
|
+
|
|
3
|
+
The hard part of isoform analysis is deciding which transcripts form a functional
|
|
4
|
+
group. This module clusters a gene's protein-coding transcripts by their 3' terminal-exon
|
|
5
|
+
splice acceptor — the alternative last exon that distinguishes functional isoform classes
|
|
6
|
+
(e.g. a long signalling form vs a short truncated form) — and proposes a two-group
|
|
7
|
+
comparison (canonical-isoform cluster vs the largest alternative cluster) that the user
|
|
8
|
+
reviews and renames before use.
|
|
9
|
+
"""
|
|
10
|
+
import json
|
|
11
|
+
import urllib.request
|
|
12
|
+
|
|
13
|
+
ENSEMBL = "https://rest.ensembl.org"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _get(path, timeout=30):
    """GET ``ENSEMBL + path`` and return the decoded JSON body.

    `path` is appended verbatim to the base URL; `timeout` is passed to
    ``urlopen`` (seconds). Errors raised by ``urlopen`` or the JSON decoder
    propagate to the caller.
    """
    req = urllib.request.Request(ENSEMBL + path, headers={"Content-Type": "application/json"})
    # Use the response as a context manager so the connection is closed as soon
    # as the body has been parsed instead of whenever the object is collected.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.load(resp)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def fetch_transcripts(gene, species="homo_sapiens"):
    """Look up `gene` on Ensembl and summarise its protein-coding transcripts.

    Returns {gene, species, strand, transcripts:[{id, protein_aa, terminal_acceptor, is_canonical}]}.

    Only transcripts with biotype "protein_coding" and a non-zero translation
    length are kept. Transcript ids are stored without their version suffix.
    Raises ValueError when no such transcript is found; request errors from
    `_get` propagate unchanged.
    """
    from urllib.parse import quote

    # Percent-encode both path segments: an unescaped space or reserved character
    # in the symbol would otherwise produce a malformed request URL.
    g = _get("/lookup/symbol/%s/%s?expand=1"
             % (quote(str(species), safe=""), quote(str(gene), safe="")))
    strand = g["strand"]
    canonical = (g.get("canonical_transcript") or "").split(".")[0]
    out = []
    for t in g.get("Transcript", []):
        if t.get("biotype") != "protein_coding":
            continue
        tl = t.get("Translation") or {}
        plen = tl.get("length")
        if not plen:
            continue
        exons = t["Exon"]
        if strand == 1:
            # Strand 1: terminal exon is the one ending furthest along; its
            # start coordinate is taken as the acceptor.
            term = max(exons, key=lambda e: e["end"])
            acc = term["start"]
        else:
            # Any other strand value: terminal exon has the smallest start; its
            # end coordinate is taken as the acceptor.
            term = min(exons, key=lambda e: e["start"])
            acc = term["end"]
        tid = t["id"].split(".")[0]
        out.append({"id": tid, "protein_aa": plen, "terminal_acceptor": acc,
                    "is_canonical": (t.get("is_canonical", 0) == 1) or (tid == canonical)})
    if not out:
        raise ValueError("No protein-coding transcripts with a translation found for %s" % gene)
    return {"gene": gene, "species": species, "strand": strand, "transcripts": out}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def cluster_by_terminal_exon(info):
    """Bucket ``info["transcripts"]`` by their ``terminal_acceptor`` value.

    Returns one summary dict per bucket with the acceptor, the upper-median
    protein length (``rep_aa``), the transcript count (``n``), whether any
    member is canonical, and the sorted member ids. Summaries are ordered by
    descending ``n``; ties keep first-seen order.
    """
    buckets = {}
    for tx in info["transcripts"]:
        key = tx["terminal_acceptor"]
        if key not in buckets:
            buckets[key] = []
        buckets[key].append(tx)

    def _summarise(acceptor, members):
        lengths = sorted(m["protein_aa"] for m in members)
        return {
            "acceptor": acceptor,
            "rep_aa": lengths[len(lengths) // 2],
            "n": len(members),
            "canonical": any(m["is_canonical"] for m in members),
            "ids": sorted(m["id"] for m in members),
        }

    summaries = [_summarise(acceptor, members) for acceptor, members in buckets.items()]
    # A reversed stable sort still keeps equal-sized clusters in insertion order.
    summaries.sort(key=lambda c: c["n"], reverse=True)
    return summaries
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def propose_groups(info):
    """Propose a two-group comparison from the terminal-exon clusters of `info`.

    The reference group is the first cluster containing a canonical transcript,
    or, if none does, the cluster with the largest ``rep_aa``. The alternative
    group is the largest remaining cluster (ties broken by larger ``rep_aa``).
    Groups are labelled ``iso_<rep_aa>aa``; a colliding alternative label gets
    an ``_alt`` suffix. Returns ``(groups, primary, clusters)`` where `primary`
    lists the alternative label first when an alternative exists.
    """
    clusters = cluster_by_terminal_exon(info)

    reference = None
    for candidate in clusters:
        if candidate["canonical"]:
            reference = candidate
            break
    if reference is None:
        reference = max(clusters, key=lambda c: c["rep_aa"])

    ref_label = "iso_%daa" % reference["rep_aa"]
    groups = {ref_label: reference["ids"]}

    rivals = [c for c in clusters if c is not reference]
    if not rivals:
        return groups, [ref_label], clusters

    # min() returns the first minimal element, matching a stable sort's head.
    challenger = min(rivals, key=lambda c: (-c["n"], -c["rep_aa"]))
    alt_label = "iso_%daa" % challenger["rep_aa"]
    if alt_label == ref_label:
        alt_label = alt_label + "_alt"
    groups[alt_label] = challenger["ids"]
    # alternative (often shorter) first
    return groups, [alt_label, ref_label], clusters
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def build_config(gene, species="homo_sapiens"):
    """Assemble a reviewable config dict for `gene` from live Ensembl annotation.

    The result carries the proposed ``groups`` and ``primary_comparison`` plus
    two review fields: ``_proposed`` (a note explaining how the groups were
    derived) and ``_clusters`` (every terminal-exon cluster that was found).
    """
    info = fetch_transcripts(gene, species)
    groups, primary, clusters = propose_groups(info)

    review_note = ("Auto-proposed by `isoform-dominance annotate`. Groups = protein-coding "
                   "transcripts sharing a 3' terminal-exon splice acceptor (isoform-defining "
                   "alternative last exon). REVIEW and rename to functional names "
                   "(e.g. short/long) before use; smaller clusters are listed under _clusters.")

    cluster_report = []
    for c in clusters:
        cluster_report.append({
            "terminal_acceptor": c["acceptor"],
            "rep_protein_aa": c["rep_aa"],
            "n_transcripts": c["n"],
            "contains_canonical": c["canonical"],
            "transcripts": c["ids"],
        })

    config = {
        "gene": gene,
        "species": species,
        "reference": "Ensembl REST (live annotation)",
        "groups": groups,
        "primary_comparison": primary,
        "_proposed": review_note,
        "_clusters": cluster_report,
    }
    return config
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def run(gene, out, species="homo_sapiens"):
    """Build the proposed config for `gene`, write it to `out` as JSON, and return it.

    The file is written with two-space indentation.
    """
    proposed = build_config(gene, species)
    with open(out, "w") as handle:
        json.dump(proposed, handle, indent=2)
    return proposed
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Unified command-line interface: isoform-dominance <subcommand>."""
|
|
2
|
+
import argparse, json, sys
|
|
3
|
+
from urllib.error import URLError, HTTPError
|
|
4
|
+
from . import __version__, io, extract, stats, contamination, annotate, identifiability
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _kv(items):
    """Parse repeated ``NAME=VALUE`` command-line arguments into a dict.

    `items` may be None or empty, giving ``{}``. Each item is split on its
    first ``=`` only, so values may themselves contain ``=``. If a name is
    repeated, the last value wins.

    Raises ValueError naming the offending argument when an item has no ``=``
    (previously this surfaced as an opaque dict-construction ValueError).
    """
    pairs = {}
    for item in items or []:
        name, sep, value = item.partition("=")
        if not sep:
            raise ValueError("expected NAME=VALUE, got %r" % (item,))
        pairs[name] = value
    return pairs
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _net_fail(e):
    """Report a failed Ensembl request `e` on stderr and return exit status 1."""
    template = ("Ensembl request failed (%s). Check your network connection, the gene symbol, "
                "and species, or supply sequences offline.")
    print(template % e, file=sys.stderr)
    return 1
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def cmd_annotate(a):
    """Handle ``annotate``: write the proposed config and print a summary.

    Reads ``a.gene``, ``a.out`` and ``a.species``. Returns the value of
    `_net_fail` when the Ensembl request fails, otherwise None.
    """
    try:
        proposed = annotate.run(a.gene, a.out, species=a.species)
    except (URLError, HTTPError) as err:
        return _net_fail(err)

    print("Proposed groups for %s -> %s" % (a.gene, a.out))
    for label, members in proposed["groups"].items():
        print(" %s: %d transcripts" % (label, len(members)))
    print(" primary_comparison:", proposed["primary_comparison"])
    print(" REVIEW _proposed/_clusters and rename groups before use.")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def cmd_identifiability(a):
    """Handle ``identifiability``: report unique k-mers per isoform group.

    Reads ``a.config``, ``a.k`` and the optional ``a.sequences`` (path to a
    JSON file of sequences). Returns 0 when the primary comparison is
    distinguishable, 2 (with a warning on stderr) when it is not, or the value
    of `_net_fail` when the Ensembl request fails.
    """
    cfg = io.load_config(a.config)
    seqs = None
    if a.sequences:
        # Close the sequences file as soon as it is parsed instead of leaking
        # the handle until garbage collection.
        with open(a.sequences) as fh:
            seqs = json.load(fh)
    try:
        res = identifiability.analyze(cfg, k=a.k, sequences=seqs)
    except (URLError, HTTPError) as e:
        return _net_fail(e)
    print("Identifiability (k=%d):" % res["k"])
    for g, r in res["groups"].items():
        flag = "OK" if r["distinguishable"] else "NOT DISTINGUISHABLE"
        print(" [%s] %s: %d unique k-mers (%d transcripts)"
              % (flag, g, r["n_unique_kmers"], r["n_transcripts"]))
    print(" primary_comparison distinguishable by short reads:", res["primary_distinguishable"])
    if not res["primary_distinguishable"]:
        print(" WARNING: a primary group has no unique k-mers - short-read quantification "
              "cannot resolve it. Reconsider the grouping or use long reads.", file=sys.stderr)
        return 2
    return 0
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def cmd_extract(a):
    """Handle ``extract``: build the per-donor table and report the donor count.

    Reads ``a.config``, ``a.quantdir``, ``a.samplemap``, ``a.cohort`` and ``a.out``.
    """
    cfg = io.load_config(a.config)
    donor_count = extract.run(cfg, a.quantdir, a.samplemap, a.cohort, a.out)
    print("wrote %s (n=%d donors)" % (a.out, donor_count))
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def cmd_stats(a):
    """Handle ``stats``: run the test and print per-cohort and combined rows.

    Reads ``a.config``, ``a.condition``, ``a.perdonor`` (NAME=path items) and ``a.out``.
    """
    cfg = io.load_config(a.config)
    tables = _kv(a.perdonor)
    res = stats.run(cfg, a.condition, tables, a.out)

    for row in res["per_cohort"]:
        name, n, ngt, p, fold = row
        print(" %-12s n=%d %d/%d fold=%.1fx P=%.4g" % (name, n, ngt, n, fold, p))

    total_n, total_gt, total_p, total_fold = res["combined"]
    print(" COMBINED n=%d %d/%d fold=%.1fx P=%.4g"
          % (total_n, total_gt, total_n, total_fold, total_p))
    print("wrote %s.{png,pdf,svg} + %s_stats.csv" % (a.out, a.out))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def cmd_qc(a):
    """Handle ``qc``: run the contamination control and print one row per result.

    Reads ``a.config``, ``a.markers`` and ``a.target`` (NAME=path items) and ``a.out``.
    """
    cfg = io.load_config(a.config)
    marker_tables = _kv(a.markers)
    target_tables = _kv(a.target)
    results = contamination.run(cfg, marker_tables, target_tables, a.out)

    for row in results:
        name, n, rho, p, ratio = row
        print(" %-12s n=%d rho=%+.3f P=%.3f contam/tissue=%.3f" % (name, n, rho, p, ratio))
    print("wrote %s.{png,pdf,svg} + %s_scores.csv" % (a.out, a.out))
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def cmd_selftest(a):
    """Handle ``selftest``: run the bundled self-test and return its status.

    `a` is unused. The self-test module is imported only when this command runs.
    """
    from ._selftest import main as selftest_main
    return selftest_main()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def build_parser():
    """Build the ``isoform-dominance`` command-line parser.

    A subcommand is required. Every subparser stores its handler in the
    ``func`` default so that ``main`` can dispatch with ``args.func(args)``.
    """
    parser = argparse.ArgumentParser(
        prog="isoform-dominance",
        description="Isoform-usage quantification and discrimination from bulk RNA-seq.",
    )
    parser.add_argument("--version", action="version",
                        version="isoform-dominance %s" % __version__)
    commands = parser.add_subparsers(dest="cmd", required=True)

    # annotate
    annotate_cmd = commands.add_parser(
        "annotate", help="gene symbol -> proposed isoform groups (Ensembl)")
    annotate_cmd.add_argument("--gene", required=True)
    annotate_cmd.add_argument("--species", default="homo_sapiens")
    annotate_cmd.add_argument("--out", required=True)
    annotate_cmd.set_defaults(func=cmd_annotate)

    # identifiability (alias: identify)
    identify_cmd = commands.add_parser(
        "identifiability", aliases=["identify"],
        help="are the groups distinguishable by short reads?")
    identify_cmd.add_argument("--config", required=True)
    identify_cmd.add_argument("--k", type=int, default=31)
    identify_cmd.add_argument("--sequences",
                              help="optional JSON {transcript_id: cdna} (offline)")
    identify_cmd.set_defaults(func=cmd_identifiability)

    # extract: every option is a required string
    extract_cmd = commands.add_parser(
        "extract", help="quant.sf -> per-donor isoform-group TPM")
    for option in ("config", "quantdir", "samplemap", "cohort", "out"):
        extract_cmd.add_argument("--" + option, required=True)
    extract_cmd.set_defaults(func=cmd_extract)

    # stats
    stats_cmd = commands.add_parser("stats", help="paired Wilcoxon + figure")
    stats_cmd.add_argument("--config", required=True)
    stats_cmd.add_argument("--condition", default="control")
    stats_cmd.add_argument("--perdonor", action="append", required=True,
                           help="NAME=perdonor.csv (repeatable)")
    stats_cmd.add_argument("--out", required=True)
    stats_cmd.set_defaults(func=cmd_stats)

    # qc
    qc_cmd = commands.add_parser("qc", help="contamination control")
    qc_cmd.add_argument("--config", required=True)
    qc_cmd.add_argument("--markers", action="append", required=True,
                        help="NAME=marker_tpm.csv")
    qc_cmd.add_argument("--target", action="append", required=True,
                        help="NAME=perdonor.csv")
    qc_cmd.add_argument("--out", required=True)
    qc_cmd.set_defaults(func=cmd_qc)

    # selftest
    selftest_cmd = commands.add_parser(
        "selftest", help="run the download-free reproducibility test")
    selftest_cmd.set_defaults(func=cmd_selftest)

    return parser
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def main(argv=None):
    """Parse ``argv`` and dispatch to the selected subcommand.

    Returns the handler's return value, or 0 when the handler returns a falsy
    value (handlers that only print return ``None``).
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    status = args.func(args)
    return status or 0


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Contamination control: does a target isoform's signal track a contaminating cell type?"""
|
|
2
|
+
import csv, os, math
|
|
3
|
+
import numpy as np
|
|
4
|
+
from scipy.stats import spearmanr
|
|
5
|
+
import matplotlib as mpl; mpl.use("Agg")
|
|
6
|
+
import matplotlib.pyplot as plt
|
|
7
|
+
|
|
8
|
+
# Scatter colours for the per-cohort panels; run() picks one per panel by index, wrapping around.
PALETTE = ["#4C72B0", "#DD8452", "#55A868", "#C44E52"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _log2p1(x):
|
|
12
|
+
return math.log2(x + 1.0)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def load_markers(path, tissue, contaminant):
    """Score each donor's tissue and contaminant marker panels.

    Reads a CSV that has a ``donor`` column and one numeric column per marker
    gene. A panel's score is the mean of log2(value + 1) over those panel genes
    that appear as columns in the row.

    Parameters:
        path: path of the marker CSV.
        tissue: iterable of tissue marker gene names.
        contaminant: iterable of contaminant marker gene names.

    Returns:
        ``{donor: (tissue_score, contaminant_score, ratio)}`` where ``ratio``
        is contaminant/tissue, or NaN when the tissue score is not positive.
        A panel with no matching columns scores NaN.
    """
    def _panel_score(row, genes):
        # Mean log2(value + 1) over the panel genes present in this row.
        values = [_log2p1(float(row[g])) for g in genes if g in row]
        # np.mean([]) also yields NaN but emits a RuntimeWarning; an absent
        # panel is simply undefined, so return NaN quietly.
        return np.mean(values) if values else float("nan")

    scores = {}
    # newline="" is what the csv module expects for files it parses.
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            t = _panel_score(row, tissue)
            c = _panel_score(row, contaminant)
            # NaN > 0 is False, so an undefined tissue score also gives a NaN ratio.
            ratio = c / t if t > 0 else float("nan")
            scores[row["donor"]] = (t, c, ratio)
    return scores
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def load_target(path, target_group):
    """Read the ``<target_group>_TPM`` column of a per-donor CSV.

    Returns ``{donor: value}`` with each value converted to float.
    """
    column = "%s_TPM" % target_group
    values = {}
    with open(path) as handle:
        for record in csv.DictReader(handle):
            donor = record["donor"]
            values[donor] = float(record[column])
    return values
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def run(config, markers, targets, out):
    """Correlate the target group's TPM with the contaminant score per cohort.

    ``markers`` and ``targets`` map cohort name -> CSV path. For every cohort
    in ``markers`` the donors present in both files are used to compute a
    Spearman correlation between contaminant score and target TPM, drawn as
    one scatter panel. Writes ``<out>.png``, ``<out>.pdf``, ``<out>.svg`` and
    ``<out>_scores.csv``.

    Returns a list of ``(cohort, n, rho, p, median contaminant/tissue ratio)``.
    """
    qc_cfg = config["contamination_qc"]
    target_group = qc_cfg["target_group"]
    panels = qc_cfg["marker_panels"]
    tissue_genes = panels["tissue"]
    contaminant_genes = panels["contaminant"]
    cohort_names = list(markers)

    mpl.rcParams.update({
        "font.family": "sans-serif",
        "font.sans-serif": ["Arial", "DejaVu Sans"],
        "font.size": 9,
        "pdf.fonttype": 42,
        "svg.fonttype": "none",
    })
    os.makedirs(os.path.dirname(out) or ".", exist_ok=True)

    n_panels = len(cohort_names)
    fig, axes = plt.subplots(1, n_panels, figsize=(3.6 * n_panels, 3.4), squeeze=False)

    score_rows = []
    for idx, cohort in enumerate(cohort_names):
        marker_scores = load_markers(markers[cohort], tissue_genes, contaminant_genes)
        target_tpm = load_target(targets[cohort], target_group)

        # Keep only donors present in both tables, in marker-file order.
        shared = [donor for donor in marker_scores if donor in target_tpm]
        n_shared = len(shared)
        contam_score = np.array([marker_scores[donor][1] for donor in shared])
        contam_ratio = np.array([marker_scores[donor][2] for donor in shared])
        target_vals = np.array([target_tpm[donor] for donor in shared])

        rho, p = spearmanr(contam_score, target_vals)
        median_ratio = float(np.nanmedian(contam_ratio))
        score_rows.append((cohort, n_shared, rho, p, median_ratio))

        ax = axes[0, idx]
        colour = PALETTE[idx % len(PALETTE)]
        ax.scatter(contam_score, target_vals, s=36, c=colour, edgecolor="white",
                   lw=0.5, zorder=3)
        ax.set_xlabel("contamination score\n[mean log2(TPM+1), contaminant]")
        ax.set_ylabel("%s (%s) TPM" % (config.get("gene", "target"), target_group))
        ax.set_title("%s (n=%d)" % (cohort, n_shared), fontsize=9.5)
        annotation_box = dict(boxstyle="round,pad=0.3", fc="white", ec="#ccc", lw=0.6)
        ax.text(0.03, 0.97, "rho=%.2f\nP=%.2f" % (rho, p), transform=ax.transAxes,
                va="top", fontsize=8, bbox=annotation_box)
        ax.spines[["top", "right"]].set_visible(False)

    fig.suptitle(
        "%s (%s) vs contamination - no positive dependence = genuine signal"
        % (config.get("gene", "target"), target_group),
        fontsize=10, fontweight="bold", y=1.04,
    )
    for ext in ("png", "pdf", "svg"):
        fig.savefig("%s.%s" % (out, ext), dpi=300, bbox_inches="tight")
    plt.close(fig)

    with open(out + "_scores.csv", "w", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(["cohort", "n", "spearman_rho", "spearman_P",
                         "median_contam_tissue_ratio"])
        for cohort, n_shared, rho, p, median_ratio in score_rows:
            writer.writerow([cohort, n_shared, "%.3f" % rho, "%.4g" % p,
                             "%.4f" % median_ratio])
    return score_rows
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Extract per-donor isoform-group TPM from Salmon quant.sf (stdlib only)."""
|
|
2
|
+
import csv, os, glob
|
|
3
|
+
from .io import transcript_to_group, load_sample_map, primary_pair
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def extract(config, quantdir, samplemap, cohort):
    """Sum transcript TPM into isoform groups for every donor under ``quantdir``.

    Each ``quantdir/<donor>/quant.sf`` is read as a tab-separated table.
    Transcript names are matched after cutting everything from the first "."
    onward, and their ``TPM`` values are added to the owning group. Donors
    missing from the sample map get the condition "NA". ``cohort`` is accepted
    but not used here.

    Returns ``[(donor, condition, {group: tpm}), ...]`` sorted by quant path.
    """
    groups = config["groups"]
    group_of = transcript_to_group(groups)
    condition_of = load_sample_map(samplemap)

    pattern = os.path.join(quantdir, "*", "quant.sf")
    results = []
    for quant_path in sorted(glob.glob(pattern)):
        # The donor ID is the name of the directory holding quant.sf.
        donor = os.path.basename(os.path.dirname(quant_path))
        totals = dict.fromkeys(groups, 0.0)
        with open(quant_path) as handle:
            for record in csv.DictReader(handle, delimiter="\t"):
                transcript = record["Name"].split(".")[0]
                group = group_of.get(transcript)
                if not group:
                    continue
                totals[group] += float(record["TPM"])
        results.append((donor, condition_of.get(donor, "NA"), totals))
    return results
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def write_perdonor(config, rows, cohort, out):
    """Write per-donor isoform-group TPM rows to a CSV file.

    Parameters:
        config: dict with ``groups`` ({group: [...]}) and optionally
            ``primary_comparison`` (defaults to the first two groups).
        rows: iterable of ``(donor, condition, {group: tpm})``.
        cohort: value written in the ``cohort`` column of every row.
        out: output CSV path.

    The columns are ``cohort, donor, condition`` followed by one
    ``<group>_TPM`` column per group. When the primary comparison names
    exactly two groups ``a`` and ``b``, an ``<a>_fraction`` column holding
    a / (a + b) is appended ("NA" when the sum is not positive).

    Returns ``out``.
    """
    groups = config["groups"]
    # Resolve the pair here and only unpack it when it really has two
    # members. Unpacking first raised IndexError for configs with fewer than
    # two groups, before the has_pair guard could skip the fraction column.
    pair = config.get("primary_comparison", list(groups)[:2])
    has_pair = len(pair) == 2
    if has_pair:
        a, b = pair

    with open(out, "w", newline="") as f:
        w = csv.writer(f)
        head = ["cohort", "donor", "condition"] + ["%s_TPM" % g for g in groups]
        if has_pair:
            head.append("%s_fraction" % a)
        w.writerow(head)
        for donor, c, gt in rows:
            line = [cohort, donor, c] + ["%.4f" % gt[g] for g in groups]
            if has_pair:
                s = gt[a] + gt[b]
                line.append("%.4f" % (gt[a] / s) if s > 0 else "NA")
            w.writerow(line)
    return out
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def run(config, quantdir, samplemap, cohort, out):
    """Extract per-donor group TPM, write it to ``out``, and return the donor count."""
    donor_rows = extract(config, quantdir, samplemap, cohort)
    write_perdonor(config, donor_rows, cohort, out)
    donor_total = len(donor_rows)
    return donor_total
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Are two isoform groups distinguishable by short reads?
|
|
2
|
+
|
|
3
|
+
Short-read quantifiers (e.g. Salmon) can only apportion isoforms using sequence that is
|
|
4
|
+
UNIQUE to one isoform group. If a group shares all of its sequence with another group, no
|
|
5
|
+
short read can be assigned to it unambiguously and its abundance is not identifiable.
|
|
6
|
+
|
|
7
|
+
This module fetches transcript cDNA (Ensembl, or supplied sequences), builds per-group
|
|
8
|
+
k-mer sets, and reports each group's group-unique k-mer count. A group with zero unique
|
|
9
|
+
k-mers is flagged as NOT distinguishable by short reads — an honest guardrail that most
|
|
10
|
+
isoform analyses skip.
|
|
11
|
+
"""
|
|
12
|
+
import json
|
|
13
|
+
import urllib.request
|
|
14
|
+
|
|
15
|
+
# Base URL of the Ensembl REST service; _get_text() prepends it to every request path.
ENSEMBL = "https://rest.ensembl.org"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _get_text(path, timeout=30):
    """Fetch ``ENSEMBL + path`` and return the response body as stripped text.

    Parameters:
        path: request path (including any query string) appended to ``ENSEMBL``.
        timeout: timeout in seconds passed to ``urlopen``.

    Errors raised by ``urlopen`` propagate to the caller unchanged.
    """
    req = urllib.request.Request(ENSEMBL + path, headers={"Content-Type": "text/plain"})
    # Use the response as a context manager so the connection is closed
    # deterministically instead of being left to garbage collection.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.read().decode().strip()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def fetch_cdna(transcript_id):
    """Return the cDNA sequence text for a transcript from the Ensembl REST service.

    Anything from the first "." onward in ``transcript_id`` is dropped before
    the request is made.
    """
    unversioned = transcript_id.split(".")[0]
    request_path = "/sequence/id/%s?type=cdna" % unversioned
    return _get_text(request_path)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def kmers(seq, k):
    """Return the set of distinct length-``k`` substrings of ``seq``, upper-cased.

    A sequence shorter than ``k`` yields an empty set.

    Raises:
        ValueError: if ``k`` is smaller than 1. Without this check ``k == 0``
            returned ``{""}`` for every sequence and a negative ``k`` produced
            arbitrary slices, both of which corrupt the uniqueness report
            built on top of this function.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    seq = seq.upper()
    return {seq[i:i + k] for i in range(len(seq) - k + 1)}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def analyze(config, k=31, sequences=None):
    """Count group-unique k-mers and decide whether groups are distinguishable.

    Parameters:
        config: dict with ``groups`` ({group: [transcript_id, ...]}) and
            optionally ``primary_comparison`` (list of group names; defaults
            to the first two groups).
        k: k-mer length.
        sequences: optional ``{transcript_id: cdna}``. Lookups use the ID with
            everything from the first "." removed; any needed transcript that
            is missing is fetched with ``fetch_cdna``.

    Returns:
        ``{"k", "groups", "primary_comparison", "primary_distinguishable"}``,
        where ``groups`` maps each group to ``n_unique_kmers``,
        ``n_transcripts`` and ``distinguishable``.
    """
    groups = config["groups"]
    needed = {t.split(".")[0] for ids in groups.values() for t in ids}
    seqs = dict(sequences or {})
    for tid in needed:
        if tid not in seqs:
            seqs[tid] = fetch_cdna(tid)

    # Union of k-mers over all transcripts of each group.
    group_kmers = {}
    for g, ids in groups.items():
        ks = set()
        for t in ids:
            ks |= kmers(seqs[t.split(".")[0]], k)
        group_kmers[g] = ks

    # A k-mer is unique to a group when no other group contains it.
    report = {}
    for g, ks in group_kmers.items():
        others = set().union(*(ks2 for g2, ks2 in group_kmers.items() if g2 != g))
        n_unique = len(ks - others)
        report[g] = {"n_unique_kmers": n_unique, "n_transcripts": len(groups[g]),
                     "distinguishable": n_unique > 0}

    pc = config.get("primary_comparison", list(groups)[:2])
    # Be conservative: a comparison that is empty or names a group missing
    # from "groups" (e.g. a typo) cannot be claimed distinguishable. Silently
    # filtering unknown names made all() vacuously True in that case.
    primary_ok = bool(pc) and all(
        g in report and report[g]["distinguishable"] for g in pc)
    return {"k": k, "groups": report, "primary_comparison": pc,
            "primary_distinguishable": primary_ok}
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def run(config, k=31, sequences=None):
    """Run ``analyze`` with the given arguments and return its result unchanged."""
    return analyze(config, k=k, sequences=sequences)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Shared IO helpers: config and sample-map loading."""
|
|
2
|
+
import json, csv, os
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def load_config(path):
    """Load the JSON configuration file at ``path`` and return the parsed object.

    The file is decoded as UTF-8 rather than with the platform's locale
    encoding, so a config containing non-ASCII text loads identically on
    every system.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def transcript_to_group(groups):
    """Invert ``{group: [transcript, ...]}`` into ``{transcript: group}``.

    Transcript IDs are cut at the first "." before being used as keys. If an
    ID appears in several groups, the group listed last wins.
    """
    return {
        transcript.split(".")[0]: group
        for group, transcripts in groups.items()
        for transcript in transcripts
    }
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def load_sample_map(path):
    """Read a sample-map CSV into ``{donor: condition}``.

    The file needs a ``donor`` column. When there is no ``condition`` column
    every donor maps to "NA".
    """
    with open(path) as handle:
        return {
            record["donor"]: record.get("condition", "NA")
            for record in csv.DictReader(handle)
        }
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def primary_pair(config):
    """Return the two group names of the primary comparison as a tuple.

    Uses ``config["primary_comparison"]`` when present, otherwise the first
    two keys of ``config["groups"]``. Only the first two entries are returned.
    """
    fallback = list(config["groups"])[:2]
    chosen = config.get("primary_comparison", fallback)
    first = chosen[0]
    second = chosen[1]
    return first, second
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Paired isoform-group statistics + figure."""
|
|
2
|
+
import csv, os
|
|
3
|
+
import numpy as np
|
|
4
|
+
from scipy.stats import wilcoxon
|
|
5
|
+
import matplotlib as mpl; mpl.use("Agg")
|
|
6
|
+
import matplotlib.pyplot as plt
|
|
7
|
+
from .io import primary_pair
|
|
8
|
+
|
|
9
|
+
# Marker colours for the first group in each cohort panel; run() picks one per panel by index, wrapping around.
PALETTE = ["#4C72B0", "#DD8452", "#55A868", "#C44E52", "#8172B3", "#937860"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def load_perdonor(path, condition, gA, gB):
    """Load the paired ``<gA>_TPM`` and ``<gB>_TPM`` columns of a per-donor CSV.

    Rows whose ``condition`` column differs from ``condition`` are skipped,
    unless ``condition`` is ``None``, "" or "all", in which case every row is
    kept.

    Returns ``(donors, A, B)``: a list of donor IDs and two NumPy arrays.
    """
    keep_all = condition in (None, "", "all")
    col_a = "%s_TPM" % gA
    col_b = "%s_TPM" % gB

    donors, a_vals, b_vals = [], [], []
    with open(path) as handle:
        for record in csv.DictReader(handle):
            if not keep_all and record["condition"] != condition:
                continue
            donors.append(record["donor"])
            a_vals.append(float(record[col_a]))
            b_vals.append(float(record[col_b]))
    return donors, np.array(a_vals), np.array(b_vals)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def paired_stat(A, B):
    """Summarise a paired comparison of two equal-length arrays.

    Returns ``(n, n_greater, p, fold)``: the number of pairs, how many have
    ``A > B``, the two-sided Wilcoxon signed-rank P-value (NaN if the test
    raises), and the median of the element-wise ratio ``A / B``.
    """
    n = len(A)
    n_greater = int(np.sum(A > B))

    try:
        result = wilcoxon(A, B, alternative="two-sided")
        p = result.pvalue
    except Exception:
        # Best effort: report an undefined P-value rather than abort.
        p = float("nan")

    # Zero denominators give inf/NaN ratios; silence the NumPy warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        ratios = A / B
        fold = float(np.median(ratios))

    return n, n_greater, p, fold
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def run(config, condition, cohorts, out):
    """Compute paired statistics per cohort and combined, and draw the figure.

    ``cohorts`` maps cohort name -> per-donor CSV path. One panel is drawn per
    cohort, with each donor's two group values joined by a line on a log axis.
    Writes ``<out>.png``, ``<out>.pdf``, ``<out>.svg`` and ``<out>_stats.csv``.

    Returns ``{"per_cohort": [(name, n, n_greater, p, fold), ...],
    "combined": (n, n_greater, p, fold)}``.
    """
    gene = config.get("gene", "gene")
    gA, gB = primary_pair(config)

    mpl.rcParams.update({
        "font.family": "sans-serif",
        "font.sans-serif": ["Arial", "DejaVu Sans"],
        "font.size": 9,
        "pdf.fonttype": 42,
        "svg.fonttype": "none",
    })
    os.makedirs(os.path.dirname(out) or ".", exist_ok=True)

    cohort_names = list(cohorts)
    n_panels = len(cohort_names)
    fig, axes = plt.subplots(1, n_panels, figsize=(3.6 * n_panels, 3.6), squeeze=False)

    pooled_a, pooled_b, stat_rows = [], [], []
    for idx, cohort in enumerate(cohort_names):
        _donors, A, B = load_perdonor(cohorts[cohort], condition, gA, gB)
        pooled_a.extend(A)
        pooled_b.extend(B)

        n, ngt, p, fold = paired_stat(A, B)
        stat_rows.append((cohort, n, ngt, p, fold))

        ax = axes[0, idx]
        colour = PALETTE[idx % len(PALETTE)]
        floor = 1e-3  # lower clamp applied to values drawn on the log axis

        # One connecting line per donor between the two group values.
        for a_val, b_val in zip(A, B):
            ax.plot([0, 1], [max(a_val, floor), max(b_val, floor)],
                    color="#999", lw=0.8, zorder=1)
        ax.scatter(np.zeros(n), np.clip(A, floor, None), s=34, c=colour,
                   edgecolor="white", lw=0.5, zorder=3)
        ax.scatter(np.ones(n), np.clip(B, floor, None), s=34, c="#9aa0a6",
                   edgecolor="white", lw=0.5, zorder=3)

        ax.set_yscale("log")
        ax.set_xlim(-0.4, 1.4)
        ax.set_xticks([0, 1])
        ax.set_xticklabels([gA, gB])
        ax.set_ylabel("%s TPM (log)" % gene)
        ax.set_title("%s (%s n=%d)" % (cohort, condition, n), fontsize=9.5)
        ax.text(0.5, 1.16, "%d/%d %.0fx P=%.3f" % (ngt, n, fold, p),
                transform=ax.transAxes, ha="center", va="top", fontsize=8,
                color="#333")
        ax.spines[["top", "right"]].set_visible(False)

    # Pool all donors across cohorts for the combined statistic.
    cn, cgt, cp, cfold = paired_stat(np.array(pooled_a), np.array(pooled_b))
    fig.suptitle(
        "%s: %s vs %s - combined %s n=%d, %d/%d, P=%.4g"
        % (gene, gA, gB, condition, cn, cgt, cn, cp),
        fontsize=10.5, fontweight="bold", y=1.04,
    )
    for ext in ("png", "pdf", "svg"):
        fig.savefig("%s.%s" % (out, ext), dpi=300, bbox_inches="tight")
    plt.close(fig)

    with open(out + "_stats.csv", "w", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(["cohort", "n", "%s>%s" % (gA, gB), "median_fold",
                         "paired_wilcoxon_P"])
        for cohort, n, ngt, p, fold in stat_rows:
            writer.writerow([cohort, n, "%d/%d" % (ngt, n), "%.2f" % fold,
                             "%.4g" % p])
        writer.writerow(["COMBINED", cn, "%d/%d" % (cgt, cn), "%.2f" % cfold,
                         "%.4g" % cp])

    return {"per_cohort": stat_rows, "combined": (cn, cgt, cp, cfold)}
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: isoform-dominance
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Config-driven isoform-usage quantification and discrimination from bulk RNA-seq
|
|
5
|
+
Author: Sangeon Kim
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/charliekim97/isoform-dominance-pipeline
|
|
8
|
+
Project-URL: Repository, https://github.com/charliekim97/isoform-dominance-pipeline
|
|
9
|
+
Keywords: RNA-seq,alternative splicing,isoform quantification,Salmon,reproducible pipeline
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: numpy>=1.24
|
|
18
|
+
Requires-Dist: scipy>=1.10
|
|
19
|
+
Requires-Dist: matplotlib>=3.7
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
22
|
+
Requires-Dist: pytest-cov>=4; extra == "dev"
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# isoform-dominance
|
|
26
|
+
|
|
27
|
+
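[![CI](https://github.com/charliekim97/isoform-dominance-pipeline/actions/workflows/ci.yml/badge.svg)](https://github.com/charliekim97/isoform-dominance-pipeline/actions/workflows/ci.yml)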
|
|
28
|
+

|
|
29
|
+

|
|
30
|
+
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.20672052.svg)](https://doi.org/10.5281/zenodo.20672052)
|
|
31
|
+
|
|
32
|
+
**Isoform-usage quantification *and discrimination* from bulk RNA-seq — one config away for any gene.**
|
|
33
|
+
|
|
34
|
+
Most isoform analyses stumble on two things: deciding *which transcripts form a functional
|
|
35
|
+
group*, and knowing whether short reads can even *tell the groups apart*. `isoform-dominance`
|
|
36
|
+
handles both, then quantifies and tests the comparison end-to-end:
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
annotate gene symbol → proposed isoform groups (Ensembl, by 3' terminal exon)
|
|
40
|
+
identify config → are the groups distinguishable by short reads? (k-mer uniqueness)
|
|
41
|
+
extract Salmon quant.sf → per-donor isoform-group TPM
|
|
42
|
+
stats per-donor TPM → paired Wilcoxon, per cohort + combined, + figure
|
|
43
|
+
qc marker TPM → contamination control (is the signal a cell-type artifact?)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+

|
|
47
|
+
|
|
48
|
+
*Bundled self-test: short LEPR isoform (LepRa) predominates over the long isoform (LepRb) in
|
|
49
|
+
control human choroid plexus across two independent cohorts; combined n = 11, P = 1×10⁻³.*
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Install
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install -e ".[dev]" # from a clone
|
|
57
|
+
isoform-dominance --version
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Verify it works (no downloads, seconds)
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
isoform-dominance selftest
|
|
64
|
+
# or: pytest -q
|
|
65
|
+
```
|
|
66
|
+
This reproduces the published LEPR result (5/5 and 6/6 donors in the two cohorts; combined n = 11, P = 9.8e-4) on a clean machine.
|
|
67
|
+
|
|
68
|
+
## What makes it more than a quantifier
|
|
69
|
+
|
|
70
|
+
**1. Auto isoform grouping (`annotate`).** Give a gene symbol; it pulls the gene's
|
|
71
|
+
protein-coding transcripts from Ensembl, clusters them by their 3' terminal-exon splice
|
|
72
|
+
acceptor (the alternative last exon that defines functional isoform classes), and proposes a
|
|
73
|
+
comparison you review and rename.
|
|
74
|
+
```bash
|
|
75
|
+
isoform-dominance annotate --gene LEPR --out config.json
|
|
76
|
+
# iso_896aa : 7 transcripts (short / LepRa)
|
|
77
|
+
# iso_1165aa: 2 transcripts (long / LepRb, canonical)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**2. Identifiability guardrail (`identify`).** Short reads can only quantify an isoform group
|
|
81
|
+
that has *unique* sequence. This checks per-group unique k-mers and **refuses to pretend** a
|
|
82
|
+
group is measurable when it isn't.
|
|
83
|
+
```bash
|
|
84
|
+
isoform-dominance identifiability --config config.json
|
|
85
|
+
# [OK] iso_896aa : 3399 unique k-mers
|
|
86
|
+
# [OK] iso_1165aa: 5369 unique k-mers
|
|
87
|
+
# primary_comparison distinguishable by short reads: True
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Full workflow (real data)
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
# 0) build a decoy-aware index once (Salmon + GENCODE) — see scripts/01_salmon_quant.sbatch
|
|
94
|
+
# 1) quantify on an HPC cluster:
|
|
95
|
+
sbatch scripts/01_salmon_quant.sbatch # -> quant/<donor>/quant.sf
|
|
96
|
+
# 2) extract per cohort:
|
|
97
|
+
isoform-dominance extract --config config.json --quantdir quant \
|
|
98
|
+
--samplemap example/sample_map_GSE228458.csv --cohort GSE228458 --out perdonor_GSE228458.csv
|
|
99
|
+
# 3) stats + figure:
|
|
100
|
+
isoform-dominance stats --config config.json --condition control \
|
|
101
|
+
--perdonor GSE228458=perdonor_GSE228458.csv --perdonor GSE137619=perdonor_GSE137619.csv \
|
|
102
|
+
--out results/dominance
|
|
103
|
+
# 4) optional contamination control:
|
|
104
|
+
isoform-dominance qc --config config.json \
|
|
105
|
+
--markers GSE228458=markers_228.csv --target GSE228458=perdonor_GSE228458.csv --out results/qc
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Statistical notes
|
|
109
|
+
|
|
110
|
+
- Donor-level two-sided **exact Wilcoxon signed-rank** (`scipy.stats.wilcoxon`), per cohort + combined.
|
|
111
|
+
- **Small-n floor:** n = 5 cannot reach P < 0.05 in the exact two-sided test (floor 0.0625);
|
|
112
|
+
report exact P + direction (e.g. 5/5) and combine concordant cohorts for the summary statistic.
|
|
113
|
+
- Effect size = median fold-change, reported alongside significance.
|
|
114
|
+
|
|
115
|
+
## Layout
|
|
116
|
+
|
|
117
|
+
```
|
|
118
|
+
src/isoform_dominance/ annotate · identifiability · extract · stats · contamination · cli · _selftest
|
|
119
|
+
tests/ pytest (offline; reproduces the published result + unit tests)
|
|
120
|
+
scripts/01_salmon_quant.sbatch
|
|
121
|
+
example/ config + sample maps
|
|
122
|
+
.github/workflows/ci.yml docs/ pyproject.toml CITATION.cff LICENSE
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Citation
|
|
126
|
+
|
|
127
|
+
Cite this repository (see `CITATION.cff`, DOI 10.5281/zenodo.20672052) and Salmon:
|
|
128
|
+
Patro, R. et al. *Nat. Methods* **14**, 417–419 (2017). https://doi.org/10.1038/nmeth.4197
|
|
129
|
+
|
|
130
|
+
## License
|
|
131
|
+
|
|
132
|
+
MIT (see `LICENSE`).
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/isoform_dominance/__init__.py
|
|
5
|
+
src/isoform_dominance/_selftest.py
|
|
6
|
+
src/isoform_dominance/annotate.py
|
|
7
|
+
src/isoform_dominance/cli.py
|
|
8
|
+
src/isoform_dominance/contamination.py
|
|
9
|
+
src/isoform_dominance/extract.py
|
|
10
|
+
src/isoform_dominance/identifiability.py
|
|
11
|
+
src/isoform_dominance/io.py
|
|
12
|
+
src/isoform_dominance/stats.py
|
|
13
|
+
src/isoform_dominance.egg-info/PKG-INFO
|
|
14
|
+
src/isoform_dominance.egg-info/SOURCES.txt
|
|
15
|
+
src/isoform_dominance.egg-info/dependency_links.txt
|
|
16
|
+
src/isoform_dominance.egg-info/entry_points.txt
|
|
17
|
+
src/isoform_dominance.egg-info/requires.txt
|
|
18
|
+
src/isoform_dominance.egg-info/top_level.txt
|
|
19
|
+
tests/test_annotate.py
|
|
20
|
+
tests/test_identifiability.py
|
|
21
|
+
tests/test_selftest.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
isoform_dominance
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Isoform-grouping logic (offline; no Ensembl call)."""
|
|
2
|
+
from isoform_dominance import annotate
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def _info():
|
|
6
|
+
# canonical 1000aa (acceptor 100); two 900aa share acceptor 200; one 500aa acceptor 300
|
|
7
|
+
return {"gene": "X", "species": "homo_sapiens", "strand": 1, "transcripts": [
|
|
8
|
+
{"id": "T1", "protein_aa": 1000, "terminal_acceptor": 100, "is_canonical": True},
|
|
9
|
+
{"id": "T2", "protein_aa": 900, "terminal_acceptor": 200, "is_canonical": False},
|
|
10
|
+
{"id": "T3", "protein_aa": 900, "terminal_acceptor": 200, "is_canonical": False},
|
|
11
|
+
{"id": "T4", "protein_aa": 500, "terminal_acceptor": 300, "is_canonical": False},
|
|
12
|
+
]}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def test_clusters_by_terminal_exon():
    """Transcripts that share a terminal acceptor fall into the same cluster."""
    clusters = annotate.cluster_by_terminal_exon(_info())
    size_by_acceptor = {}
    for cluster in clusters:
        size_by_acceptor[cluster["acceptor"]] = cluster["n"]
    assert size_by_acceptor == {200: 2, 100: 1, 300: 1}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_proposes_canonical_vs_largest_alt():
    """The proposal pairs the canonical cluster with the largest alternative one."""
    proposal = annotate.propose_groups(_info())
    groups, primary, _ = proposal
    # canonical cluster -> iso_1000aa; largest alternative (2 tx) -> iso_900aa
    canonical_members = groups["iso_1000aa"]
    alternative_members = groups["iso_900aa"]
    assert canonical_members == ["T1"]
    assert alternative_members == ["T2", "T3"]
    # alternative group is listed first in the comparison
    assert primary == ["iso_900aa", "iso_1000aa"]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Short-read distinguishability (offline; supplied sequences)."""
|
|
2
|
+
from isoform_dominance import identifiability
|
|
3
|
+
|
|
4
|
+
# Prefix common to every synthetic transcript in the tests below; only the tails appended to it differ.
SHARED = "ACGT" * 20 # 80 bp shared backbone
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_distinguishable_when_each_group_has_unique_region():
    """Two groups with different tails each get unique k-mers."""
    sequences = {
        "A1": SHARED + "GGGGGGGGCATCAT",
        "B1": SHARED + "TTTTTTTTAGAGAG",
    }
    config = {
        "groups": {"A": ["A1"], "B": ["B1"]},
        "primary_comparison": ["A", "B"],
    }
    result = identifiability.analyze(config, k=8, sequences=sequences)
    assert result["primary_distinguishable"] is True
    per_group = result["groups"]
    assert per_group["A"]["n_unique_kmers"] > 0
    assert per_group["B"]["n_unique_kmers"] > 0
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_flags_group_with_no_unique_sequence():
    """A group whose sequence is a prefix of another group's is flagged."""
    # 'sub' sequence is fully contained in 'super' -> sub has no unique k-mers
    contained = SHARED + "GGGGGGGG"
    containing = contained + "TTTTTTTTTT"
    sequences = {"S1": contained, "P1": containing}
    config = {
        "groups": {"sub": ["S1"], "super": ["P1"]},
        "primary_comparison": ["sub", "super"],
    }
    result = identifiability.analyze(config, k=8, sequences=sequences)
    sub_report = result["groups"]["sub"]
    assert sub_report["n_unique_kmers"] == 0
    assert sub_report["distinguishable"] is False
    assert result["primary_distinguishable"] is False
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""End-to-end: synthetic data must reproduce the published LEPR result."""
|
|
2
|
+
from isoform_dominance import _selftest
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_reproduces_published_result(tmp_path):
    """The bundled self-test must report success when run in a temp directory."""
    outcome = _selftest.run(str(tmp_path))
    ok, msgs = outcome
    assert ok, "self-test failed:\n" + "\n".join(msgs)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_combined_pvalue(tmp_path):
    """Extract then stats on the generated cohorts gives the expected combined result."""
    generated = _selftest.generate(str(tmp_path))
    from isoform_dominance import extract, stats

    perdonor_files = {}
    for cohort, paths in generated.items():
        csv_path = str(tmp_path / ("pd_%s.csv" % cohort))
        extract.run(_selftest.CONFIG, paths["quantdir"], paths["samplemap"], cohort, csv_path)
        perdonor_files[cohort] = csv_path

    summary = stats.run(_selftest.CONFIG, "control", perdonor_files, str(tmp_path / "r"))
    combined_n, combined_gt, combined_p, _ = summary["combined"]
    assert combined_n == 11
    assert combined_gt == 11
    assert abs(combined_p - 9.7656e-4) < 1e-4
|