deepvregulome 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepvregulome-0.1.0/PKG-INFO +139 -0
- deepvregulome-0.1.0/README.md +103 -0
- deepvregulome-0.1.0/pyproject.toml +51 -0
- deepvregulome-0.1.0/setup.cfg +4 -0
- deepvregulome-0.1.0/src/deepvregulome/__init__.py +32 -0
- deepvregulome-0.1.0/src/deepvregulome/cli.py +169 -0
- deepvregulome-0.1.0/src/deepvregulome/dnabert_data_generation.py +151 -0
- deepvregulome-0.1.0/src/deepvregulome/dvr.py +488 -0
- deepvregulome-0.1.0/src/deepvregulome/intersect.py +147 -0
- deepvregulome-0.1.0/src/deepvregulome/registry.py +572 -0
- deepvregulome-0.1.0/src/deepvregulome/utils.py +97 -0
- deepvregulome-0.1.0/src/deepvregulome/vcf_loader.py +80 -0
- deepvregulome-0.1.0/src/deepvregulome.egg-info/PKG-INFO +139 -0
- deepvregulome-0.1.0/src/deepvregulome.egg-info/SOURCES.txt +16 -0
- deepvregulome-0.1.0/src/deepvregulome.egg-info/dependency_links.txt +1 -0
- deepvregulome-0.1.0/src/deepvregulome.egg-info/entry_points.txt +2 -0
- deepvregulome-0.1.0/src/deepvregulome.egg-info/requires.txt +21 -0
- deepvregulome-0.1.0/src/deepvregulome.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: deepvregulome
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: DNABERT-based framework for predicting the functional impact of regulatory variants
|
|
5
|
+
Author: Ramana V. Davuluri
|
|
6
|
+
Author-email: Pratik Dutta <pratik.dutta@stonybrook.edu>
|
|
7
|
+
License: CC-BY-NC-4.0
|
|
8
|
+
Project-URL: Homepage, https://github.com/DavuluriLab/DeepVRegulome
|
|
9
|
+
Project-URL: Paper, https://arxiv.org/abs/2511.09026
|
|
10
|
+
Project-URL: Models, https://huggingface.co/duttaprat/DeepVRegulome
|
|
11
|
+
Project-URL: WebApp, https://deepvregulome.streamlit.app
|
|
12
|
+
Keywords: genomics,variant-effect-prediction,dnabert,regulome,transcription-factors,deep-learning
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Requires-Python: >=3.8
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
Requires-Dist: torch>=1.10
|
|
20
|
+
Requires-Dist: transformers>=4.20
|
|
21
|
+
Requires-Dist: huggingface-hub>=0.14
|
|
22
|
+
Requires-Dist: pandas>=1.3
|
|
23
|
+
Requires-Dist: numpy>=1.21
|
|
24
|
+
Provides-Extra: genome
|
|
25
|
+
Requires-Dist: pysam>=0.20; extra == "genome"
|
|
26
|
+
Provides-Extra: viz
|
|
27
|
+
Requires-Dist: matplotlib>=3.5; extra == "viz"
|
|
28
|
+
Requires-Dist: seaborn>=0.12; extra == "viz"
|
|
29
|
+
Provides-Extra: vcf
|
|
30
|
+
Requires-Dist: cyvcf2>=0.30; extra == "vcf"
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: pysam>=0.20; extra == "all"
|
|
33
|
+
Requires-Dist: matplotlib>=3.5; extra == "all"
|
|
34
|
+
Requires-Dist: seaborn>=0.12; extra == "all"
|
|
35
|
+
Requires-Dist: cyvcf2>=0.30; extra == "all"
|
|
36
|
+
|
|
37
|
+
# DeepVRegulome
|
|
38
|
+

|
|
39
|
+
|
|
40
|
+
DeepVRegulome is an end‑to‑end framework for predicting the functional impact of small somatic variants in non‑coding regulatory regions (splice sites and transcription‑factor‑binding sites) using fine‑tuned DNABERT models.
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## ✨ Key Features
|
|
45
|
+
|
|
46
|
+
- ✅ DNABERT-based classifiers for:
|
|
47
|
+
- Splice sites (acceptor, donor)
|
|
48
|
+
- ~700 TFBS models
|
|
49
|
+
- ✅ Region-aware scoring of somatic variants using Δp and log₂ odds
|
|
50
|
+
- ✅ Batch processing with multiprocessing and BED/VCF support
|
|
51
|
+
- ✅ Interactive Streamlit dashboard with:
|
|
52
|
+
- Variant tables, plots, and survival analysis
|
|
53
|
+
- Attention score visualizations
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## 📁 Repository Structure
|
|
58
|
+
```
|
|
59
|
+
DeepVRegulome/
|
|
60
|
+
├── .devcontainer/
|
|
61
|
+
├── .streamlit/
|
|
62
|
+
├── data/
|
|
63
|
+
│ └── Brain/
|
|
64
|
+
├── figures/ # Exported visualizations (e.g. attention maps)
|
|
65
|
+
│ └── attention/
|
|
66
|
+
│ ├── CTCFL/
|
|
67
|
+
│ └── ZNF384/
|
|
68
|
+
├── notebooks/ # Jupyter notebooks for key pipeline steps
|
|
69
|
+
│ ├── 01_parse_and_merge_vcfs.ipynb # Merge and parse VCFs
|
|
70
|
+
│ ├── 02_tfbs_intersection.ipynb # Intersect VCF with TFBS BEDs
|
|
71
|
+
│ ├── 03_dnabert_input_generation.ipynb # Generate sequences for DNABERT
|
|
72
|
+
│ ├── 04_scoring_candidate_variants.ipynb # Compute Δp / logOR & rank variants
|
|
73
|
+
│ └── 05_tfbs_attention_motif_visualization.ipynb # Plot attention scores & motifs
|
|
74
|
+
├── scripts/ # Shell scripts for batch inference
|
|
75
|
+
│ ├── run_prediction_tfbs.sh # Predict with TFBS models
|
|
76
|
+
│ └── run_prediction_splice_acceptor.sh # Predict with acceptor models
|
|
77
|
+
├── src/
|
|
78
|
+
│ └── deepvregulome/ # Core Python modules
|
|
79
|
+
│ ├── __init__.py
|
|
80
|
+
│ ├── dnabert_data_generation.py # Wild/mutated seq generation
|
|
81
|
+
│ ├── intersect.py # BED/VCF overlap engine
|
|
82
|
+
│ ├── vcf_loader.py # VCF parsing utilities
|
|
83
|
+
│ └── config.yaml # Centralized path config
|
|
84
|
+
├── streamlit_app/
|
|
85
|
+
│ └── app_variant_clinical_dashboard.py # Live clinical dashboard
|
|
86
|
+
├── LICENSE
|
|
87
|
+
├── README.md
|
|
88
|
+
├── requirements.txt
|
|
89
|
+
└── .gitignore
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
## 🧪 Installation
|
|
93
|
+
```bash
|
|
94
|
+
git clone https://github.com/DavuluriLab/DeepVRegulome.git
|
|
95
|
+
cd DeepVRegulome
|
|
96
|
+
python3 -m venv venv && source venv/bin/activate
|
|
97
|
+
pip install -r requirements.txt
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
## ⚙️ Typical Pipeline Flow
|
|
103
|
+
| Step | Description | Location |
|
|
104
|
+
|------|-------------|----------|
|
|
105
|
+
| 1️⃣ | Parse + merge somatic VCFs | `01_parse_and_merge_vcfs.ipynb` |
|
|
106
|
+
| 2️⃣ | Intersect variants with TFBS BEDs | `02_tfbs_intersection.ipynb` |
|
|
107
|
+
| 3️⃣ | Generate ref/mutated k-mers for DNABERT | `03_dnabert_input_generation.ipynb` |
|
|
108
|
+
| 4️⃣ | Predict with DNABERT models | `scripts/run_prediction_tfbs.sh` |
|
|
109
|
+
| 5️⃣ | Compute Δp, find candidate variants | `04_scoring_candidate_variants.ipynb` |
|
|
110
|
+
| 6️⃣ | Visualize attention scores and motifs | `05_tfbs_attention_motif_visualization.ipynb` |
|
|
111
|
+
| 7️⃣ | Browse results interactively | `streamlit_app/app_variant_clinical_dashboard.py` |
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
## 📊 Example Outputs
|
|
115
|
+
* Candidate variant count by TFBS
|
|
116
|
+
* DNABERT attention heatmaps
|
|
117
|
+
* High-impact motif shifts due to mutations
|
|
118
|
+
* Kaplan–Meier plots for clinical stratification
|
|
119
|
+
|
|
120
|
+
See figures/attention/ for examples like CTCFL.
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
## 🌐 Live Demo
|
|
124
|
+
|
|
125
|
+
An interactive instance of the DeepVRegulome dashboard is hosted here:
|
|
126
|
+
➡️ **https://davuluri-lab-brainved.streamlit.app/**
|
|
127
|
+
The deployed app lets you browse model performance metrics and variant-effect predictions without installing any software locally.
|
|
128
|
+
|
|
129
|
+
## 🧬 Model Checkpoints
|
|
130
|
+
Full DNABERT fine-tuned weights (acceptor, donor, and 700 TFBS models) will be deposited in Zenodo and made publicly available immediately upon journal acceptance.
|
|
131
|
+
In the meantime, researchers may request access by emailing pratik.dutta@stonybrook.edu and ramana.davuluri@stonybrookmedicine.edu with a brief statement of intended use.
|
|
132
|
+
|
|
133
|
+
## Citation
|
|
134
|
+
If you use DeepVRegulome in your research, please cite:
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
## License
|
|
139
|
+
MIT. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# DeepVRegulome
|
|
2
|
+

|
|
3
|
+
|
|
4
|
+
DeepVRegulome is an end‑to‑end framework for predicting the functional impact of small somatic variants in non‑coding regulatory regions (splice sites and transcription‑factor‑binding sites) using fine‑tuned DNABERT models.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## ✨ Key Features
|
|
9
|
+
|
|
10
|
+
- ✅ DNABERT-based classifiers for:
|
|
11
|
+
- Splice sites (acceptor, donor)
|
|
12
|
+
- ~700 TFBS models
|
|
13
|
+
- ✅ Region-aware scoring of somatic variants using Δp and log₂ odds
|
|
14
|
+
- ✅ Batch processing with multiprocessing and BED/VCF support
|
|
15
|
+
- ✅ Interactive Streamlit dashboard with:
|
|
16
|
+
- Variant tables, plots, and survival analysis
|
|
17
|
+
- Attention score visualizations
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## 📁 Repository Structure
|
|
22
|
+
```
|
|
23
|
+
DeepVRegulome/
|
|
24
|
+
├── .devcontainer/
|
|
25
|
+
├── .streamlit/
|
|
26
|
+
├── data/
|
|
27
|
+
│ └── Brain/
|
|
28
|
+
├── figures/ # Exported visualizations (e.g. attention maps)
|
|
29
|
+
│ └── attention/
|
|
30
|
+
│ ├── CTCFL/
|
|
31
|
+
│ └── ZNF384/
|
|
32
|
+
├── notebooks/ # Jupyter notebooks for key pipeline steps
|
|
33
|
+
│ ├── 01_parse_and_merge_vcfs.ipynb # Merge and parse VCFs
|
|
34
|
+
│ ├── 02_tfbs_intersection.ipynb # Intersect VCF with TFBS BEDs
|
|
35
|
+
│ ├── 03_dnabert_input_generation.ipynb # Generate sequences for DNABERT
|
|
36
|
+
│ ├── 04_scoring_candidate_variants.ipynb # Compute Δp / logOR & rank variants
|
|
37
|
+
│ └── 05_tfbs_attention_motif_visualization.ipynb # Plot attention scores & motifs
|
|
38
|
+
├── scripts/ # Shell scripts for batch inference
|
|
39
|
+
│ ├── run_prediction_tfbs.sh # Predict with TFBS models
|
|
40
|
+
│ └── run_prediction_splice_acceptor.sh # Predict with acceptor models
|
|
41
|
+
├── src/
|
|
42
|
+
│ └── deepvregulome/ # Core Python modules
|
|
43
|
+
│ ├── __init__.py
|
|
44
|
+
│ ├── dnabert_data_generation.py # Wild/mutated seq generation
|
|
45
|
+
│ ├── intersect.py # BED/VCF overlap engine
|
|
46
|
+
│ ├── vcf_loader.py # VCF parsing utilities
|
|
47
|
+
│ └── config.yaml # Centralized path config
|
|
48
|
+
├── streamlit_app/
|
|
49
|
+
│ └── app_variant_clinical_dashboard.py # Live clinical dashboard
|
|
50
|
+
├── LICENSE
|
|
51
|
+
├── README.md
|
|
52
|
+
├── requirements.txt
|
|
53
|
+
└── .gitignore
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
## 🧪 Installation
|
|
57
|
+
```bash
|
|
58
|
+
git clone https://github.com/DavuluriLab/DeepVRegulome.git
|
|
59
|
+
cd DeepVRegulome
|
|
60
|
+
python3 -m venv venv && source venv/bin/activate
|
|
61
|
+
pip install -r requirements.txt
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
## ⚙️ Typical Pipeline Flow
|
|
67
|
+
| Step | Description | Location |
|
|
68
|
+
|------|-------------|----------|
|
|
69
|
+
| 1️⃣ | Parse + merge somatic VCFs | `01_parse_and_merge_vcfs.ipynb` |
|
|
70
|
+
| 2️⃣ | Intersect variants with TFBS BEDs | `02_tfbs_intersection.ipynb` |
|
|
71
|
+
| 3️⃣ | Generate ref/mutated k-mers for DNABERT | `03_dnabert_input_generation.ipynb` |
|
|
72
|
+
| 4️⃣ | Predict with DNABERT models | `scripts/run_prediction_tfbs.sh` |
|
|
73
|
+
| 5️⃣ | Compute Δp, find candidate variants | `04_scoring_candidate_variants.ipynb` |
|
|
74
|
+
| 6️⃣ | Visualize attention scores and motifs | `05_tfbs_attention_motif_visualization.ipynb` |
|
|
75
|
+
| 7️⃣ | Browse results interactively | `streamlit_app/app_variant_clinical_dashboard.py` |
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
## 📊 Example Outputs
|
|
79
|
+
* Candidate variant count by TFBS
|
|
80
|
+
* DNABERT attention heatmaps
|
|
81
|
+
* High-impact motif shifts due to mutations
|
|
82
|
+
* Kaplan–Meier plots for clinical stratification
|
|
83
|
+
|
|
84
|
+
See figures/attention/ for examples like CTCFL.
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
## 🌐 Live Demo
|
|
88
|
+
|
|
89
|
+
An interactive instance of the DeepVRegulome dashboard is hosted here:
|
|
90
|
+
➡️ **https://davuluri-lab-brainved.streamlit.app/**
|
|
91
|
+
The deployed app lets you browse model performance metrics and variant-effect predictions without installing any software locally.
|
|
92
|
+
|
|
93
|
+
## 🧬 Model Checkpoints
|
|
94
|
+
Full DNABERT fine-tuned weights (acceptor, donor, and 700 TFBS models) will be deposited in Zenodo and made publicly available immediately upon journal acceptance.
|
|
95
|
+
In the meantime, researchers may request access by emailing pratik.dutta@stonybrook.edu and ramana.davuluri@stonybrookmedicine.edu with a brief statement of intended use.
|
|
96
|
+
|
|
97
|
+
## Citation
|
|
98
|
+
If you use DeepVRegulome in your research, please cite:
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
## License
|
|
103
|
+
MIT. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "deepvregulome"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "DNABERT-based framework for predicting the functional impact of regulatory variants"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "CC-BY-NC-4.0"}
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Pratik Dutta", email = "pratik.dutta@stonybrook.edu"},
|
|
14
|
+
{name = "Ramana V. Davuluri"},
|
|
15
|
+
]
|
|
16
|
+
keywords = [
|
|
17
|
+
"genomics", "variant-effect-prediction", "dnabert",
|
|
18
|
+
"regulome", "transcription-factors", "deep-learning",
|
|
19
|
+
]
|
|
20
|
+
classifiers = [
|
|
21
|
+
"Development Status :: 4 - Beta",
|
|
22
|
+
"Intended Audience :: Science/Research",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
24
|
+
"Programming Language :: Python :: 3",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
dependencies = [
|
|
28
|
+
"torch>=1.10",
|
|
29
|
+
"transformers>=4.20",
|
|
30
|
+
"huggingface-hub>=0.14",
|
|
31
|
+
"pandas>=1.3",
|
|
32
|
+
"numpy>=1.21",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.optional-dependencies]
|
|
36
|
+
genome = ["pysam>=0.20"]
|
|
37
|
+
viz = ["matplotlib>=3.5", "seaborn>=0.12"]
|
|
38
|
+
vcf = ["cyvcf2>=0.30"]
|
|
39
|
+
all = ["pysam>=0.20", "matplotlib>=3.5", "seaborn>=0.12", "cyvcf2>=0.30"]
|
|
40
|
+
|
|
41
|
+
[project.urls]
|
|
42
|
+
Homepage = "https://github.com/DavuluriLab/DeepVRegulome"
|
|
43
|
+
Paper = "https://arxiv.org/abs/2511.09026"
|
|
44
|
+
Models = "https://huggingface.co/duttaprat/DeepVRegulome"
|
|
45
|
+
WebApp = "https://deepvregulome.streamlit.app"
|
|
46
|
+
|
|
47
|
+
[project.scripts]
|
|
48
|
+
deepvregulome = "deepvregulome.cli:main"
|
|
49
|
+
|
|
50
|
+
[tool.setuptools.packages.find]
|
|
51
|
+
where = ["src"]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DeepVRegulome: DNABERT-based regulatory variant effect prediction.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
from deepvregulome import DVR
|
|
6
|
+
|
|
7
|
+
dvr = DVR(genome="/path/to/hg38.fa")
|
|
8
|
+
|
|
9
|
+
# Score a single variant
|
|
10
|
+
result = dvr.score_variant("chr1", 3456782, "A", "TA", models=["CTCF", "SP1"])
|
|
11
|
+
|
|
12
|
+
# Score from sequences directly
|
|
13
|
+
result = dvr.score_sequence(ref_seq, alt_seq, models=["CTCF"])
|
|
14
|
+
|
|
15
|
+
# Score a VCF file
|
|
16
|
+
results = dvr.score_vcf("variants.vcf", models=["CTCF", "SP1", "MYC"])
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
__version__ = "0.1.0"
|
|
20
|
+
|
|
21
|
+
from deepvregulome.registry import ModelRegistry
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def __getattr__(name):
    """Resolve ``DVR`` lazily on first attribute access (PEP 562 module hook).

    The import of ``deepvregulome.dvr`` is deferred until ``DVR`` is actually
    requested, so importing the package (and using ``ModelRegistry``) does not
    pull that module in.

    Args:
        name: Attribute name that was not found in the module namespace.

    Returns:
        The ``DVR`` class when ``name == "DVR"``.

    Raises:
        AttributeError: For every other name, using the same wording as
            Python's built-in module attribute errors.
    """
    if name == "DVR":
        from deepvregulome.dvr import DVR

        return DVR
    # Quote both names so the message matches the interpreter's own
    # "module 'x' has no attribute 'y'" format.
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
__all__ = ["DVR", "ModelRegistry", "__version__"]
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for DeepVRegulome.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
# Score a single variant
|
|
6
|
+
deepvregulome score --chrom chr1 --pos 3456782 --ref A --alt TA \
|
|
7
|
+
--models CTCFL SP1 MYC --genome hg38.fa
|
|
8
|
+
|
|
9
|
+
# Score a VCF file
|
|
10
|
+
deepvregulome score-vcf variants.vcf --models CTCFL SP1 --genome hg38.fa -o results.tsv
|
|
11
|
+
|
|
12
|
+
# Score from sequences
|
|
13
|
+
deepvregulome score-seq --ref ATCG... --alt ATCG... --models CTCFL
|
|
14
|
+
|
|
15
|
+
# List available models
|
|
16
|
+
deepvregulome list
|
|
17
|
+
deepvregulome list --type TF
|
|
18
|
+
deepvregulome search ZNF
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import argparse
|
|
22
|
+
import sys
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def cmd_score(args):
    """Run the ``score`` subcommand for a single variant.

    Builds a ``DVR`` scorer from ``args.genome`` / ``args.device``, scores the
    variant described by ``args.chrom``, ``args.pos``, ``args.ref`` and
    ``args.alt``, then either writes the result as a tab-separated file to
    ``args.output`` or prints it to stdout when no output path was given.

    Args:
        args: Parsed ``argparse`` namespace for the ``score`` subcommand.
    """
    from deepvregulome import DVR

    scorer = DVR(genome=args.genome, device=args.device)
    variant_kwargs = dict(
        chrom=args.chrom,
        pos=args.pos,
        ref=args.ref,
        alt=args.alt,
        models=args.models,
        model_type=args.type,
        return_attention=args.attention,
    )
    scores = scorer.score_variant(**variant_kwargs)

    # No destination file: dump the table to the terminal and stop.
    if not args.output:
        print(scores.to_string(index=False))
        return

    scores.to_csv(args.output, sep="\t", index=False)
    print(f"Results saved to {args.output}")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def cmd_score_seq(args):
    """Run the ``score-seq`` subcommand on pre-extracted sequences.

    Builds a ``DVR`` scorer without a genome (only ``args.device`` is passed),
    scores the ``args.ref`` / ``args.alt`` sequence pair, then writes the
    result as a tab-separated file to ``args.output`` or prints it to stdout
    when no output path was given.

    Args:
        args: Parsed ``argparse`` namespace for the ``score-seq`` subcommand.
    """
    from deepvregulome import DVR

    dvr = DVR(device=args.device)
    result = dvr.score_sequence(
        ref_seq=args.ref,
        alt_seq=args.alt,
        models=args.models,
        model_type=args.type,
        return_attention=args.attention,
    )

    if args.output:
        result.to_csv(args.output, sep="\t", index=False)
        # Same confirmation that `cmd_score` prints, so a successful save is
        # never silent.
        print(f"Results saved to {args.output}")
    else:
        print(result.to_string(index=False))
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def cmd_score_vcf(args):
    """Run the ``score-vcf`` subcommand over every variant in a VCF file.

    Scores ``args.vcf`` with a ``DVR`` built from ``args.genome`` /
    ``args.device``, writes the result as a tab-separated file and prints a
    short summary (row count and number of rows flagged in the ``disrupted``
    column, when that column exists).

    When ``args.output`` is not given, the output name is derived from the
    input file name: the last ``.vcf`` in the *file name* is replaced by
    ``_dvr_scores.tsv`` (so ``a.vcf`` -> ``a_dvr_scores.tsv`` and
    ``a.vcf.gz`` -> ``a_dvr_scores.tsv.gz``). If the file name contains no
    ``.vcf`` at all, the suffix is appended instead, so the derived path can
    never be the input file itself.

    Args:
        args: Parsed ``argparse`` namespace for the ``score-vcf`` subcommand.
    """
    import os  # local import: this module's top level only imports argparse/sys

    from deepvregulome import DVR

    dvr = DVR(genome=args.genome, device=args.device)
    result = dvr.score_vcf(
        vcf_path=args.vcf,
        models=args.models,
        model_type=args.type,
        return_attention=args.attention,
        max_variants=args.max_variants,
    )

    if args.output:
        output = args.output
    else:
        # Only touch the file name: a ".vcf" inside a directory name must not
        # be rewritten.
        directory, filename = os.path.split(args.vcf)
        stem, marker, tail = filename.rpartition(".vcf")
        if marker:
            derived = f"{stem}_dvr_scores.tsv{tail}"
        else:
            # No ".vcf" in the name: a plain replace would return the input
            # path unchanged and the scores would overwrite the input file.
            derived = f"{filename}_dvr_scores.tsv"
        output = os.path.join(directory, derived)

    result.to_csv(output, sep="\t", index=False)
    print(f"Scored {len(result)} variant×model combinations → {output}")
    disrupted = result["disrupted"].sum() if "disrupted" in result.columns else 0
    print(f"Disrupted: {disrupted}/{len(result)}")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def cmd_list(args):
    """Run the ``list`` subcommand: print a fixed-width table of models.

    Queries ``ModelRegistry().list`` with ``args.type`` as the type filter and
    prints one row per returned entry (name, type, accuracy, ROC-AUC, peak
    count), followed by the total number of entries.

    Args:
        args: Parsed ``argparse`` namespace for the ``list`` subcommand.
    """
    from deepvregulome import ModelRegistry

    entries = ModelRegistry().list(model_type=args.type)

    header = f"{'Name':<15} {'Type':<8} {'Acc':>6} {'ROC-AUC':>8} {'Peaks':>8}"
    print(header)
    print("-" * 50)
    for entry in entries:
        line = f"{entry.name:<15} {entry.model_type:<8} {entry.accuracy:>6.1f} {entry.roc_auc:>8.2f} {entry.peak_count:>8}"
        print(line)
    print(f"\nTotal: {len(entries)} models")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def cmd_search(args):
    """Run the ``search`` subcommand: look up model names by query string.

    Passes ``args.query`` to ``ModelRegistry().search`` and prints either the
    comma-separated matches or a "no models found" message.

    Args:
        args: Parsed ``argparse`` namespace for the ``search`` subcommand.
    """
    from deepvregulome import ModelRegistry

    hits = ModelRegistry().search(args.query)

    # Guard clause for the empty result keeps the happy path unindented.
    if not hits:
        print(f"No models found matching '{args.query}'")
        return
    print(f"Models matching '{args.query}': {', '.join(hits)}")
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def main(argv=None):
    """Entry point of the ``deepvregulome`` command-line interface.

    Builds the argument parser with the ``score``, ``score-seq``,
    ``score-vcf``, ``list`` and ``search`` subcommands and dispatches to the
    handler registered for the chosen subcommand.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. Defaults to ``None`` (use the process
            arguments), so calling ``main()`` behaves exactly as before; the
            parameter exists so the CLI can be driven programmatically.

    Raises:
        SystemExit: With status 1 after printing the help text when no
            subcommand was given; also raised by ``argparse`` itself on
            invalid arguments.
    """
    parser = argparse.ArgumentParser(
        prog="deepvregulome",
        description="DeepVRegulome: Regulatory variant effect prediction",
    )
    sub = parser.add_subparsers(dest="command")

    # --- score ---
    p_score = sub.add_parser("score", help="Score a single variant")
    p_score.add_argument("--chrom", required=True, help="Chromosome (e.g., chr1)")
    p_score.add_argument("--pos", type=int, required=True, help="1-based position")
    p_score.add_argument("--ref", required=True, help="Reference allele")
    p_score.add_argument("--alt", required=True, help="Alternate allele")
    p_score.add_argument("--models", nargs="+", help="Model names (e.g., CTCFL SP1)")
    p_score.add_argument("--type", choices=["TF", "HISTONE"], help="Score all models of this type")
    p_score.add_argument("--genome", required=True, help="Reference genome FASTA")
    p_score.add_argument("--attention", action="store_true", help="Include attention scores")
    p_score.add_argument("--device", default=None, help="cuda or cpu")
    p_score.add_argument("-o", "--output", help="Output TSV file")
    p_score.set_defaults(func=cmd_score)

    # --- score-seq ---
    p_seq = sub.add_parser("score-seq", help="Score from pre-extracted sequences")
    p_seq.add_argument("--ref", required=True, help="Reference sequence (301bp)")
    p_seq.add_argument("--alt", required=True, help="Alternate sequence")
    p_seq.add_argument("--models", nargs="+", help="Model names")
    p_seq.add_argument("--type", choices=["TF", "HISTONE"], help="Score all of type")
    p_seq.add_argument("--attention", action="store_true")
    p_seq.add_argument("--device", default=None)
    p_seq.add_argument("-o", "--output", help="Output TSV file")
    p_seq.set_defaults(func=cmd_score_seq)

    # --- score-vcf ---
    p_vcf = sub.add_parser("score-vcf", help="Score variants from a VCF file")
    p_vcf.add_argument("vcf", help="Input VCF file")
    p_vcf.add_argument("--models", nargs="+", help="Model names")
    p_vcf.add_argument("--type", choices=["TF", "HISTONE"], help="Score all of type")
    p_vcf.add_argument("--genome", required=True, help="Reference genome FASTA")
    p_vcf.add_argument("--attention", action="store_true")
    p_vcf.add_argument("--max-variants", type=int, help="Max variants to score")
    p_vcf.add_argument("--device", default=None)
    p_vcf.add_argument("-o", "--output", help="Output TSV file")
    p_vcf.set_defaults(func=cmd_score_vcf)

    # --- list ---
    p_list = sub.add_parser("list", help="List available models")
    p_list.add_argument("--type", choices=["TF", "HISTONE"], help="Filter by type")
    p_list.set_defaults(func=cmd_list)

    # --- search ---
    p_search = sub.add_parser("search", help="Search model names")
    p_search.add_argument("query", help="Search string (e.g., ZNF, GATA)")
    p_search.set_defaults(func=cmd_search)

    # argv=None makes argparse fall back to sys.argv[1:].
    args = parser.parse_args(argv)
    if not args.command:
        # Subcommands are optional at the parser level, so a bare invocation
        # lands here: show usage and signal failure.
        parser.print_help()
        sys.exit(1)

    args.func(args)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import os, glob
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import pysam
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pickle
|
|
6
|
+
from multiprocessing import Pool, cpu_count
|
|
7
|
+
|
|
8
|
+
# === Configurable paths ===
# NOTE(review): everything in this section runs at import time. The paths are
# absolute locations on one specific machine, and both `os.makedirs` and
# `pd.read_csv` below execute as soon as the module is imported, so importing
# this module elsewhere will try to create/read those locations.
cancer_type = "Brain"
# Directory holding one sub-folder per TFBS tag; `process_row` reads
# "<tag>/intersected_vcf_data.pkl" from it.
intersected_base_path = f"/data/private/pdutta_new/GDC_Cancer_Wise/New_data/{cancer_type}/Generated_files/Intersected_Data/Somatic/300bp_TFBS"
# FASTA opened by `initialize_worker` via pysam.
reference_genome_path = "/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa"
# Root directory for all generated files.
output_path = f"/data/private/pdutta_new/GDC_Cancer_Wise/New_data/{cancer_type}/Generated_files/DNABERT_Data/Somatic/TFBS_300bp_new"

os.makedirs(output_path, exist_ok=True)

# === Load top models list ===
top_models_df = pd.read_csv("/home/campus.stonybrook.edu/pdutta/Github/Postdoc/DNABERT_data_processing/TFBS/300bp_TFBS_accuracy_Stat.tsv", sep="\t")
# Keep rows with eval_acc >= 0.85, then take positional rows 100..199 of that
# filtered table only (presumably one batch of a manually sharded run — TODO confirm).
top_models_df = top_models_df[top_models_df['eval_acc'] >= 0.85].iloc[100:200]
|
|
19
|
+
|
|
20
|
+
# === Convert sequence to space-separated k-mers ===
|
|
21
|
+
def seq2kmer(seq, k=6):
    """Return the overlapping k-mers of *seq* joined by single spaces.

    A window of width ``k`` slides over ``seq`` one character at a time. When
    ``seq`` is shorter than ``k`` there is no full window and the result is
    the empty string.

    Args:
        seq: Input sequence string.
        k: Window width (default 6).

    Returns:
        Space-separated string of ``len(seq) - k + 1`` k-mers.
    """
    window_count = len(seq) - k + 1
    kmers = []
    for begin in range(window_count):
        kmers.append(seq[begin:begin + k])
    return " ".join(kmers)
|
|
23
|
+
|
|
24
|
+
# === Apply all mutations to a given reference sequence ===
|
|
25
|
+
def apply_mutations_absolute(ref_seq, chip_start, starts, ends, refs, alts):
    """Apply several variants, given in absolute coordinates, to one window.

    Each variant is located at ``start - chip_start`` inside ``ref_seq`` and
    is applied only if the bases found there equal its ``ref`` allele
    (compared case-insensitively); otherwise it is skipped.

    Variants are applied from the highest start position to the lowest.
    Replacing a slice only moves the bases to its right, so working
    right-to-left keeps the original coordinates of all still-pending
    variants valid even when an insertion or deletion changes the length.

    Args:
        ref_seq: Reference sequence of the window.
        chip_start: Absolute coordinate of ``ref_seq[0]``.
        starts: Absolute start coordinate of each variant.
        ends: Absolute end coordinate of each variant (accepted for interface
            compatibility; the replaced span is taken from ``len(ref)``).
        refs: Reference allele of each variant.
        alts: Alternate allele of each variant.

    Returns:
        The mutated sequence as a string; ``ref_seq`` unchanged when no
        variant applies.
    """
    seq_list = list(ref_seq)
    mutations = sorted(
        zip(starts, ends, refs, alts), key=lambda mutation: mutation[0], reverse=True
    )
    for start, _end, ref, alt in mutations:
        rel_pos = start - chip_start
        if rel_pos < 0:
            # Variant begins before the window; a negative index would wrap
            # around and edit the tail of the sequence instead.
            continue
        span = slice(rel_pos, rel_pos + len(ref))
        # Case-insensitive so lower-case (soft-masked) reference bases still
        # match an upper-case allele.
        if "".join(seq_list[span]).upper() == ref.upper():
            seq_list[span] = list(alt)
    return "".join(seq_list)
|
|
34
|
+
|
|
35
|
+
# === Load genome reference once per worker ===
|
|
36
|
+
def initialize_worker():
    """Open the reference FASTA and publish it as the module global.

    Passed as the ``initializer`` of the multiprocessing pool in the script
    section, so it runs once in each worker process. It stores a
    ``pysam.FastaFile`` for ``reference_genome_path`` in the global
    ``reference_fasta``, which ``get_sequences`` reads.
    """
    global reference_fasta
    fasta_handle = pysam.FastaFile(reference_genome_path)
    reference_fasta = fasta_handle
    print("##### Reference genome loaded in worker #####")
|
|
40
|
+
|
|
41
|
+
# === Process a dataframe of variants for a single TFBS-patient combo ===
|
|
42
|
+
def get_sequences(df):
    """Build reference/alternate sequences and DNABERT k-mer tables for one table of variants.

    For every row of ``df`` the region ``row[0]:row[1]-row[2]`` is fetched from
    the global ``reference_fasta`` and an alternate sequence is built by
    splicing ``ALT`` in at ``START_POS``/``END_POS`` (taken relative to the
    region start).

    Args:
        df: Table whose rows provide the region as items 0, 1, 2 plus the
            columns ``START_POS``, ``END_POS``, ``REF`` and ``ALT``.
            # assumes items 0-2 are chrom/start/end of the region and that all
            # coordinates are 0-based like pysam's fetch — TODO confirm

    Returns:
        A 4-tuple ``(new_df, df_kmer, grouped_df, df_kmer_region)``:
        per-variant records, their k-mer table (reference and alternate
        sequence interleaved), per-region records with a combined
        ``mutated_sequence``, and the k-mer table for those.

    Raises:
        Exception: Any error is printed and then re-raised unchanged.
    """
    # Read-only use of the handle installed by `initialize_worker`.
    global reference_fasta
    try:
        data = []
        for idx, row in df.iterrows():
            # NOTE(review): row[0]/row[1]/row[2] is label-based when the columns
            # are named 0, 1, 2 and positional otherwise; positional integer
            # access on a labelled Series is deprecated in recent pandas.
            chrom, ref_start, ref_end = row[0], row[1], row[2]
            variant_start, variant_end = row['START_POS'], row['END_POS']
            ref_nucleotide, alt = row['REF'], row['ALT']
            ref_seq = reference_fasta.fetch(chrom, ref_start, ref_end)
            # Variant coordinates relative to the start of the fetched region.
            variant_pos_start = variant_start - ref_start
            variant_pos_end = variant_end - ref_start

            if len(ref_nucleotide) < len(alt):  # Insertion
                # Despite its name, `delete_size` is the number of inserted
                # bases; the same number is trimmed from the right end of the
                # region.
                delete_size = len(alt) - len(ref_nucleotide)
                alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:len(ref_seq)-delete_size]
            elif len(ref_nucleotide) > len(alt):  # Deletion
                # Despite its name, `insert_size` is the number of deleted
                # bases; that many bases following the region are fetched and
                # appended.
                insert_size = len(ref_nucleotide) - len(alt)
                extra_bases = reference_fasta.fetch(chrom, ref_end, ref_end+insert_size)
                alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:] + extra_bases
            else:  # SNV
                alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:]

            # The misspelled keys ('varinat_start', '*_neucleotide') are column
            # names of the returned frames and are reused further down; they
            # must stay as they are.
            data.append({
                'chr': chrom, 'Chip_Seq_start': ref_start, 'Chip_Seq_end': ref_end,
                'varinat_start': variant_start, 'variant_end': variant_end,
                'ref_neucleotide': ref_nucleotide, 'alternative_neucleotide': alt,
                'reference_seq': ref_seq, 'alt_seq': alt_seq
            })

        # NOTE(review): an empty `df` yields a frame without columns, so the
        # column lookups below would raise KeyError.
        new_df = pd.DataFrame(data).drop_duplicates().reset_index(drop=True)

        # Variant-wise table: rows alternate reference, alternate, reference, ...
        merged_list = list(zip(new_df['reference_seq'], new_df['alt_seq']))
        merged_list = [item.upper() for tup in merged_list for item in tup]
        df_kmer = pd.DataFrame(list(map(seq2kmer, merged_list)), columns=['Sequence'])
        # Labels are drawn at random, so they differ between runs
        # (presumably placeholders for an inference-only file — TODO confirm).
        df_kmer['Label'] = np.random.choice([0, 1], size=len(df_kmer))

        # Region-wise view: collect all variants that share one region.
        grouped_df = new_df.groupby(['chr', 'Chip_Seq_start', 'Chip_Seq_end']).agg({
            'varinat_start': list, 'variant_end': list,
            'ref_neucleotide': list, 'alternative_neucleotide': list,
            'reference_seq': 'first', 'alt_seq': lambda x: list(set(x))
        }).reset_index()

        # One sequence per region carrying all of that region's variants.
        grouped_df['mutated_sequence'] = grouped_df.apply(lambda row: apply_mutations_absolute(
            row['reference_seq'], row['Chip_Seq_start'], row['varinat_start'],
            row['variant_end'], row['ref_neucleotide'], row['alternative_neucleotide']), axis=1)

        merged_regionwise = list(zip(grouped_df['reference_seq'], grouped_df['mutated_sequence']))
        merged_regionwise = [item.upper() for tup in merged_regionwise for item in tup]
        df_kmer_region = pd.DataFrame(list(map(seq2kmer, merged_regionwise)), columns=['Sequence'])
        df_kmer_region['Label'] = np.random.choice([0, 1], size=len(df_kmer_region))

        return new_df, df_kmer, grouped_df, df_kmer_region

    except Exception as e:
        # Log for the worker's stdout, then let the caller decide what to do.
        print(f"Error in get_sequences: {e}")
        raise
|
|
98
|
+
|
|
99
|
+
# === Main processing logic for each TFBS tag ===
|
|
100
|
+
def process_row(row):
    """Generate DNABERT input files for every entry of one TFBS tag.

    Loads ``<intersected_base_path>/<tag>/intersected_vcf_data.pkl``, runs
    ``get_sequences`` on each value of the loaded dictionary, writes the two
    k-mer tables as ``dev.tsv`` under ``Patient_wise/variant_wise/<key>`` and
    ``Patient_wise/region_wise/<key>``, and finally pickles the collected raw
    frames for the tag.

    Args:
        row: Record providing the tag name as ``row['tags']``.

    Returns:
        ``None`` on success, or ``row['tags']`` when anything failed (the
        caller collects these into ``missing_files.txt``).
    """
    global reference_fasta
    try:
        print(f"Processing {row['tags']}")
        intersected_data = f"{intersected_base_path}/{row['tags']}/intersected_vcf_data.pkl"

        # NOTE(review): pickle.load executes arbitrary code from the file; only
        # safe as long as these pickles are produced by a trusted pipeline.
        with open(intersected_data, "rb") as file:
            loaded_dictionary = pickle.load(file)

        dnabert_raw_data = {}
        dnabert_raw_data_regionwise = {}

        # presumably one dictionary entry per patient (the output folder is
        # called "Patient_wise") — TODO confirm
        for key, value in loaded_dictionary.items():
            try:
                print(f" → {key}")
                new_df, df_kmer, grouped_df, df_kmer_region = get_sequences(value)
                dnabert_raw_data[key] = new_df
                dnabert_raw_data_regionwise[key] = grouped_df

                patient_folder = f"{output_path}/{row['tags']}/Patient_wise"
                os.makedirs(f"{patient_folder}/variant_wise/{key}", exist_ok=True)
                os.makedirs(f"{patient_folder}/region_wise/{key}", exist_ok=True)
                df_kmer.to_csv(f"{patient_folder}/variant_wise/{key}/dev.tsv", sep="\t", index=False)
                df_kmer_region.to_csv(f"{patient_folder}/region_wise/{key}/dev.tsv", sep="\t", index=False)

            except Exception as e:
                # NOTE(review): the first failing entry aborts the whole tag:
                # remaining entries are not processed and the pickles below are
                # not written.
                print(f"Error: {e} in {row['tags']} → {key}")
                return row['tags']

        with open(f"{output_path}/{row['tags']}/variantwise_raw_vcf_data.pkl", "wb") as file:
            pickle.dump(dnabert_raw_data, file)
        with open(f"{output_path}/{row['tags']}/regionwise_raw_vcf_data.pkl", "wb") as file:
            pickle.dump(dnabert_raw_data_regionwise, file)

    except Exception as e:
        # Covers failures outside the per-entry loop, e.g. a missing input pickle.
        print(f"Global error in {row['tags']}: {e}")
        return row['tags']

    return None
|
|
139
|
+
|
|
140
|
+
# === Run multiprocessing ===
|
|
141
|
+
if __name__ == '__main__':
    print(f"Using {cpu_count()} CPUs for processing")
    # Leave 10 CPUs free for other work, but never request fewer than one
    # worker: Pool raises ValueError for a process count below 1, which is
    # what `cpu_count() - 10` produces on hosts with 10 or fewer CPUs.
    worker_count = max(1, cpu_count() - 10)
    with Pool(worker_count, initializer=initialize_worker) as pool:
        # process_row returns the tag on failure and None on success.
        missing_files = pool.map(process_row, [row for _, row in top_models_df.iterrows()])

    missing_files = [tag for tag in missing_files if tag is not None]
    with open(f"{output_path}/missing_files.txt", "w") as f:
        for tag in missing_files:
            f.write(f"{tag}\n")

    print(f"Missing files recorded at {output_path}/missing_files.txt")
|