PyBRAID 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pybraid-1.0.0/LICENSE +21 -0
- pybraid-1.0.0/PKG-INFO +227 -0
- pybraid-1.0.0/README.md +214 -0
- pybraid-1.0.0/pyproject.toml +26 -0
- pybraid-1.0.0/setup.cfg +4 -0
- pybraid-1.0.0/src/PyBRAID.egg-info/PKG-INFO +227 -0
- pybraid-1.0.0/src/PyBRAID.egg-info/SOURCES.txt +27 -0
- pybraid-1.0.0/src/PyBRAID.egg-info/dependency_links.txt +1 -0
- pybraid-1.0.0/src/PyBRAID.egg-info/entry_points.txt +2 -0
- pybraid-1.0.0/src/PyBRAID.egg-info/requires.txt +2 -0
- pybraid-1.0.0/src/PyBRAID.egg-info/top_level.txt +1 -0
- pybraid-1.0.0/src/braid/__init__.py +0 -0
- pybraid-1.0.0/src/braid/cli.py +156 -0
- pybraid-1.0.0/src/braid/data/test.fasta +3 -0
- pybraid-1.0.0/src/braid/data/test.fasta.fai +1 -0
- pybraid-1.0.0/src/braid/data/test.gff3 +15 -0
- pybraid-1.0.0/src/braid/data/test.vcf +8 -0
- pybraid-1.0.0/src/braid/data/test.vcf.gz +0 -0
- pybraid-1.0.0/src/braid/data/test.vcf.gz.csi +0 -0
- pybraid-1.0.0/src/braid/data/variant_analysis_output.alignment.txt +20 -0
- pybraid-1.0.0/src/braid/data/variant_analysis_output.log +191 -0
- pybraid-1.0.0/src/braid/data/variant_analysis_output.sample.txt +3 -0
- pybraid-1.0.0/src/braid/data/variant_analysis_output.tsv +5 -0
- pybraid-1.0.0/src/braid/genome.py +207 -0
- pybraid-1.0.0/src/braid/modifier.py +500 -0
- pybraid-1.0.0/src/braid/output.py +285 -0
- pybraid-1.0.0/src/braid/protein.py +141 -0
- pybraid-1.0.0/src/braid/utils.py +86 -0
- pybraid-1.0.0/src/braid/vcf.py +180 -0
pybraid-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Yuefan Huang
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pybraid-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: PyBRAID
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Block Resolution and Annotation of Integrated DNA
|
|
5
|
+
Author: Yuefan Huang
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: pysam>=0.17.0
|
|
11
|
+
Requires-Dist: biopython>=1.80
|
|
12
|
+
Dynamic: license-file
|
|
13
|
+
|
|
14
|
+
# BRAID
|
|
15
|
+
**Block Resolution and Annotation of Integrated DNA**
|
|
16
|
+
|
|
17
|
+

|
|
18
|
+

|
|
19
|
+

|
|
20
|
+
|
|
21
|
+
> **Analyze phased VCF data to predict variant effects on protein sequences for each haplotype.**
|
|
22
|
+
|
|
23
|
+
## Overview
|
|
24
|
+
|
|
25
|
+
**BRAID** is a bioinformatics tool designed to go beyond the annotation of isolated mutations. By utilizing **phased VCF data**, it reconstructs the combination of mutations present on each chromosome (haplotype) to predict the actual protein sequence produced.
|
|
26
|
+
|
|
27
|
+
This allows for the detection of complex effects, such as:
|
|
28
|
+
* **Compound Heterozygosity:** Understanding how multiple mutations on the same haplotype interact.
|
|
29
|
+
* **Haplotype-specific LOF:** Determining whether the combined effect of mutations leads to a Loss of Function.
|
|
30
|
+
* **Protein Structure Changes:** Visualizing the exact amino acid sequence changes.
|
|
31
|
+
|
|
32
|
+
<div align="center">
|
|
33
|
+
<img width="256.5" height="113.5" alt="Image" src="https://github.com/user-attachments/assets/55e71e73-1e83-44f2-9f62-187104f54523" />
|
|
34
|
+
</div>
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
## Installation & Requirements
|
|
38
|
+
|
|
39
|
+
BRAID is a software tool that requires standard bioinformatics libraries.
|
|
40
|
+
|
|
41
|
+
**Prerequisites:**
|
|
42
|
+
> * Python >= 3.8
|
|
43
|
+
> * `pysam`
|
|
44
|
+
> * `biopython`
|
|
45
|
+
|
|
46
|
+
**Installation**
|
|
47
|
+
|
|
48
|
+
**Download via pip**
|
|
49
|
+
> ```
|
|
50
|
+
> pip install pybraid
|
|
51
|
+
> ```
|
|
52
|
+
|
|
53
|
+
**Download via conda**
|
|
54
|
+
> ```
|
|
55
|
+
> conda install braid
|
|
56
|
+
> ```
|
|
57
|
+
|
|
58
|
+
**Download via wget**
|
|
59
|
+
> ```
|
|
60
|
+
> wget https://github.com/YuefanHuang1998/BRAID/archive/refs/tags/braid-v1.0.1.tar.gz
|
|
61
|
+
> tar -zxvf braid-v1.0.1.tar.gz
|
|
62
|
+
> cd BRAID-1.0.1/
|
|
63
|
+
> pip install .
|
|
64
|
+
> ```
|
|
65
|
+
|
|
66
|
+
**Download via git**
|
|
67
|
+
> ```
|
|
68
|
+
> git clone https://github.com/YuefanHuang1998/BRAID.git
|
|
69
|
+
> cd BRAID
|
|
70
|
+
> pip install .
|
|
71
|
+
> ```
|
|
72
|
+
|
|
73
|
+
**To verify the installation was successful, run:**
|
|
74
|
+
> braid test
|
|
75
|
+
|
|
76
|
+
## Usage
|
|
77
|
+
**Run the script from the command line by providing the GFF3 annotation, Reference Genome, and Phased VCF.**
|
|
78
|
+
|
|
79
|
+
`braid -r reference.fa -g annotation.gff3 -v phased_variants.vcf.gz`
|
|
80
|
+
|
|
81
|
+
### Example for test dataset
|
|
82
|
+
`braid -r test.fasta -g test.gff3 -v test.vcf.gz`
|
|
83
|
+
> ```
|
|
84
|
+
> You should have three output files and one log file:
|
|
85
|
+
> variant_analysis_output.tsv
|
|
86
|
+
> variant_analysis_output.alignment.txt
|
|
87
|
+
> variant_analysis_output.sample.txt
|
|
88
|
+
> variant_analysis_output.log
|
|
89
|
+
> ```
|
|
90
|
+
|
|
91
|
+
## Arguments Parameter Table
|
|
92
|
+
| **Short** | **Long** | **Description** | |
|
|
93
|
+
|:-----:|:------:|:-------------:|:-------------:|
|
|
94
|
+
| **-g** | **--gff** | Path to the GFF3 annotation file. | Required |
|
|
95
|
+
| **-r** | **--reference** | Path to the Reference Genome FASTA (must be indexed: .fai). | Required |
|
|
96
|
+
| **-v** | **--vcf** | Path to the Phased VCF file (must be indexed: .tbi/.csi). | Required |
|
|
97
|
+
| **-o** | **--output** | Output file name (default: variant_analysis_output.tsv). | Optional |
|
|
98
|
+
| **-s** | **--sample** | Path to file containing specific sample IDs to analyze (one per line, no header). | Optional |
|
|
99
|
+
| | **--gene** | Path to file with specific gene IDs to analyze (one per line, no header). | Optional |
|
|
100
|
+
| | **--lof-threshold** | Custom threshold for Loss-of-Function classification (e.g., 0.1; default: 0.3). | Optional |
|
|
101
|
+
| | **--force-unphased** | Skip phasing check (forces run on unphased VCF). | Optional |
|
|
102
|
+
| | **--ignore-intron** | Ignore variants marked strictly as intronic. | Optional |
|
|
103
|
+
|
|
104
|
+
## Output Files Explanation
|
|
105
|
+
|
|
106
|
+
**BRAID generates three main files to assist in your analysis.**
|
|
107
|
+
|
|
108
|
+
**1. Summary Table**
|
|
109
|
+
|
|
110
|
+
A comprehensive table detailing the protein changes for every haplotype (default: variant_analysis_output.tsv).
|
|
111
|
+
|
|
112
|
+
Example View:
|
|
113
|
+
| Gene_ID | Haplotype_ID | mRNA | Haplotype_Count | Frequency | Variant_Type | Protein_Changes | Haplotype_Mutations | Sample_Sources | Ref_Protein | Alt_Protein | Ref_CDS | Alt_CDS | Aligned_Ref | Comparison_String | Aligned_Alt |
|
|
114
|
+
|:-----|:------|:------|:-----|:------|:------|:-----|:------|:------|:-----|:------|:------|:-----|:------|:------|:-----|
|
|
115
|
+
| gene1 | transcript1:REF | transcript1 | . | . | NoLOF(non_identity_rate:0.00%,non_identical_AAs:0,total_ref_AAs:27) `\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|` | . | . | . | MSLASSANDMIDRSIDRSIDRSIDRS* | MSLASSANDMIDRSIDRSIDRSIDRS* | ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA | ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA | MSLASSANDMIDRSIDRSIDRSIDRS* | `\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|` | MSLASSANDMIDRSIDRSIDRSIDRS* |
|
|
116
|
+
| gene1 | transcript1:1 | transcript1 | 4 | 0.500000 | NoLOF(non_identity_rate:3.70%,non_identical_AAs:1,total_ref_AAs:27)`\|\|\|`deletion `\|\|\|\|\|\|\|\|\|\|\|\|\|\|` | Del(5)S | 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON] | sample1(Hap2);sample2(Hap1);sample3(Homo) | MSLASSANDMIDRSIDRSIDRSIDRS* | MSLASANDMIDRSIDRSIDRSIDRS* | ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA | ATGAGCCTAGCTTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA | MSLASSANDMIDRSIDRSIDRSIDRS* | `\|\|\|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|` | MSLA-SANDMIDRSIDRSIDRSIDRS* |
|
|
117
|
+
|
|
118
|
+
**Explanation for each column**
|
|
119
|
+
|
|
120
|
+
| **Column** | **Example** | **Explanation** |
|
|
121
|
+
|:-----|:-----|:-------------|
|
|
122
|
+
| **Gene_ID** | gene1 | The gene to which this haplotype belongs. |
|
|
123
|
+
| **Haplotype_ID** | transcript1:REF;transcript1:1 | transcript:REF indicates the reference haplotype; others (e.g. transcript1:1) are alternative haplotypes. |
|
|
124
|
+
| **mRNA** | transcript1 | The transcript to which this haplotype belongs. |
|
|
125
|
+
| **Haplotype_Count** | 4 | Number of samples carrying this haplotype. |
|
|
126
|
+
| **Frequency** | 0.500000 | Population frequency of this haplotype. |
|
|
127
|
+
| **Variant_Type** | LOF/NoLOF (Please see <br>the detailed information below.) | Functional classification of the haplotype, indicating its LOF status. |
|
|
128
|
+
| **Protein_Changes** | Del(5)S | Description of the protein-level consequence in HGVS format. |
|
|
129
|
+
| **Haplotype_Mutations** | 1:17_CTTAG>C[CDS,EXON];<br>1:27_T>TT[CDS,EXON] | List of variants defining the haplotype. |
|
|
130
|
+
| **Sample_Sources** | sample1(Hap2);sample2(Hap1);<br>sample3(Homo) | Samples carrying this haplotype, including haplotype phase (Hap1, Hap2) or homozygous status (Homo). |
|
|
131
|
+
| **Ref_Protein** | MSLASSANDIDRSIDRS* | Reference protein sequence. |
|
|
132
|
+
| **Alt_Protein** | MSLASANDIDRSIDRS* | Alternative protein sequence. |
|
|
133
|
+
| **Ref_CDS** | ATGAGCTTAGCTAGCTCAGCTAACGATATCG<br>ATCGATCGATCGATCGATCGTGA | Reference CDS sequence. |
|
|
134
|
+
| **Alt_CDS** | ATGAGCCTAGCTTCAGCTAACGATATCG<br>ATCGATCGATCGATCGATCGTGA | Haplotype-specific CDS sequence. |
|
|
135
|
+
| **Aligned_Ref** | MSLASSANDIDRSIDRS* | Aligned reference protein, protein alignment string for visualization: `-` → Gap (insertion or deletion). |
|
|
136
|
+
| **Comparison_String** | MSLASANDIDRSIDRS* | Alignment comparison symbols: `*` → Different amino acid, `\|` → Same amino acid. |
|
|
137
|
+
| **Aligned_Alt** | MSLASANDIDRSIDRS* | Aligned alternative protein, shows amino acid changes: `-` → Gap (insertion or deletion). |
|
|
138
|
+
|
|
139
|
+
**Detailed information for Variant_Type**
|
|
140
|
+
|
|
141
|
+
| **Column** | **Explanation** |
|
|
142
|
+
|:-----|:-------------|
|
|
143
|
+
| **LOF_Info** | LOF indicates haplotypes predicted to cause protein loss of function, whereas NoLOF indicates haplotypes without LOF effects. non_identity_rate represents the proportion of amino acid differences between ALT and REF proteins; non_identical_AAs is the corresponding count, and total_ref_AAs denotes the length of the REF protein. e.g., NoLOF(non_identity_rate:5.56%,non_identical_AAs:1,total_ref_AAs:18). |
|
|
144
|
+
| **missense** | A nucleotide variant that results in the substitution of one amino acid by another in the protein sequence. |
|
|
145
|
+
| **insertion** | An insertion of one or more nucleotides that alters the coding sequence and may affect the resulting protein sequence. |
|
|
146
|
+
| **deletion** | A deletion of one or more nucleotides from the coding sequence, potentially altering the protein sequence or reading frame. |
|
|
147
|
+
| **complex_indel** | A combined insertion and deletion event that cannot be represented as a simple insertion or deletion and may cause complex changes to the coding sequence. |
|
|
148
|
+
| **exon_skip** | A splicing event in which one or more exons are completely skipped in the transcript, leading to an altered mRNA and protein sequence. |
|
|
149
|
+
| **skipped_exons_detail** | Detailed information specifying which exon is skipped in the exon-skipping event, SITE_PRESERVED, SITE_SHIFT, SITE_DESTROYED. e.g., SkippedExon:[1:51-77`\|`SITE_SHIFT]. |
|
|
150
|
+
| **intron_retention** | A splicing event in which one or more introns are retained in the mature transcript, potentially disrupting the coding sequence. |
|
|
151
|
+
| **retained_introns_detail** | Detailed information specifying which intron(s) are retained in the intron-retention event, SITE_PRESERVED, SITE_SHIFT, SITE_DESTROYED. e.g., RetainedIntron:[1:78-90`\|`SITE_DESTROYED]. |
|
|
152
|
+
| **start_codon_loss** | A variant that disrupts the canonical start codon, potentially preventing translation initiation. |
|
|
153
|
+
| **stop_loss** | A variant that removes or alters the stop codon, resulting in translational read-through and an extended protein. |
|
|
154
|
+
| **No_start_codon_for_reference** | The reference transcript or protein sequence lacks an annotated start codon. |
|
|
155
|
+
| **same_as_other_transcript** | The variant effect on this protein is identical to that observed in another transcript's protein of the same gene. |
|
|
156
|
+
| **protein_loss** | The variant or variant combination results in the complete loss of the predicted protein product. |
|
|
157
|
+
| **ref_protein_empty** | The reference transcript does not produce a protein sequence (e.g., non-coding or incomplete annotation). |
|
|
158
|
+
| **alignment_failed** | The reference and alternative protein sequences could not be reliably aligned, preventing accurate variant effect annotation. |
|
|
159
|
+
|
|
160
|
+
**Log information for splice-site mutations**
|
|
161
|
+
|
|
162
|
+
**SITE_PRESERVED**: The splice site is unaltered, with both its position and sequence conserved; normal splicing is expected.
|
|
163
|
+
> ```
|
|
164
|
+
> [transcript1] (strand +) Splice site 'GT' at 39 PRESERVED by mutation 1:39_GTCG>G.
|
|
165
|
+
> - Original Window Seq : GATGTCGTTAAG - Mutated Window Seq : GATGTTAAG.
|
|
166
|
+
> ```
|
|
167
|
+
|
|
168
|
+
**SITE_SHIFT**: The splice site may move to a nearby position due to the variant, potentially altering the exon–intron boundary.
|
|
169
|
+
> ```
|
|
170
|
+
> WARNING - [transcript1] Splice site may SHIFT for mutation 1:44_TAAGTA>A.
|
|
171
|
+
> - Splice Site : 'AG' (Original genomic pos: 49)
|
|
172
|
+
> - Mutation : TAAGTA -> A (Genomic pos: 44)
|
|
173
|
+
> - Window : 1:42-53
|
|
174
|
+
> - Original Window Seq : GTTAAGTAGATG
|
|
175
|
+
> - Mutated Window Seq : GTAGATG
|
|
176
|
+
> ```
|
|
177
|
+
|
|
178
|
+
**SITE_DESTROYED**: The splice site is disrupted or abolished by the variant, likely preventing normal splicing at this site.
|
|
179
|
+
> ```
|
|
180
|
+
> [transcript1] (strand +) Splice site 'GT' at 78 DESTROYED by mutation 1:50_GATGATCGATCGATCGATCGATCGATCGG>GG. It was 'GT', became 'AT'.
|
|
181
|
+
> - Original Window Seq : TCGGTCGATCGA - Mutated Window Seq : TCGATCGA
|
|
182
|
+
> ```
|
|
183
|
+
|
|
184
|
+
**2. Alignment Visualization (.alignment.txt)**
|
|
185
|
+
|
|
186
|
+
A text file showing the pairwise alignment of the Reference vs. Alternative protein.
|
|
187
|
+
Use the Haplotype_ID to match the haplotype in the summary table.
|
|
188
|
+
|
|
189
|
+
Example View:
|
|
190
|
+
> ```
|
|
191
|
+
> Haplotype_ID: transcript1:1
|
|
192
|
+
> Gene: gene1 | mRNA: transcript1
|
|
193
|
+
> Haplotype_Mutations: 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON]
|
|
194
|
+
> Variant_Type: NoLOF(non_identity_rate:3.70%,non_identical_AAs:1,total_ref_AAs:27)|||deletion||||||||||||||
|
|
195
|
+
> Protein_Changes: Del(5)S
|
|
196
|
+
> Alignment:
|
|
197
|
+
> Ref: MSLASSANDMIDRSIDRSIDRSIDRS*
|
|
198
|
+
> |||| ||||||||||||||||||||||
|
|
199
|
+
> Alt: MSLA-SANDMIDRSIDRSIDRSIDRS*
|
|
200
|
+
> ```
|
|
201
|
+
|
|
202
|
+
**3. Sample Matrix (.sample.txt)**
|
|
203
|
+
|
|
204
|
+
A matrix format ideal for heatmaps or downstream programmatic analysis.
|
|
205
|
+
|
|
206
|
+
Example View:
|
|
207
|
+
| Gene_ID | mRNA_ID | Ref_ID | Alt_IDs | sample1 | sample2 | sample3 |
|
|
208
|
+
|:-----|:------|:------|:-----|:------|:------|:------|
|
|
209
|
+
| gene1 | transcript1 | transcript1:REF | transcript1:1 | 0`\|`1 | 1`\|`0 | 1`\|`1 |
|
|
210
|
+
|
|
211
|
+
>```
|
|
212
|
+
> Gene_ID mRNA_ID Ref_ID Alt_IDs sample1 sample2 sample3
|
|
213
|
+
> gene1 transcript1 transcript1:REF transcript1:1 0|1 1|0 1|1
|
|
214
|
+
>```
|
|
215
|
+
|
|
216
|
+
| **Column** | **Explanation** |
|
|
217
|
+
|:-----|:-------------|
|
|
218
|
+
| **Gene_ID** | The identifier of the gene being analyzed. |
|
|
219
|
+
| **mRNA_ID** | The identifier of the specific transcript of that gene. |
|
|
220
|
+
| **Ref_ID** | Reference haplotype ID, labeled as transcript:REF. |
|
|
221
|
+
| **Alt_IDs** | Alternative haplotype IDs observed in the samples, with numbering to distinguish multiple haplotypes. |
|
|
222
|
+
| **samples** | The haplotype presence/absence in each sample. Each entry corresponds to the Ref or Alt haplotype. `0` represents REF. |
|
|
223
|
+
|
|
224
|
+
## Citation
|
|
225
|
+
|
|
226
|
+
If you use BRAID in your research, please cite our paper:
|
|
227
|
+
> *
|
pybraid-1.0.0/README.md
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# BRAID
|
|
2
|
+
**Block Resolution and Annotation of Integrated DNA**
|
|
3
|
+
|
|
4
|
+

|
|
5
|
+

|
|
6
|
+

|
|
7
|
+
|
|
8
|
+
> **Analyze phased VCF data to predict variant effects on protein sequences for each haplotype.**
|
|
9
|
+
|
|
10
|
+
## Overview
|
|
11
|
+
|
|
12
|
+
**BRAID** is a bioinformatics tool designed to go beyond the annotation of isolated mutations. By utilizing **phased VCF data**, it reconstructs the combination of mutations present on each chromosome (haplotype) to predict the actual protein sequence produced.
|
|
13
|
+
|
|
14
|
+
This allows for the detection of complex effects, such as:
|
|
15
|
+
* **Compound Heterozygosity:** Understanding how multiple mutations on the same haplotype interact.
|
|
16
|
+
* **Haplotype-specific LOF:** Determining whether the combined effect of mutations leads to a Loss of Function.
|
|
17
|
+
* **Protein Structure Changes:** Visualizing the exact amino acid sequence changes.
|
|
18
|
+
|
|
19
|
+
<div align="center">
|
|
20
|
+
<img width="256.5" height="113.5" alt="Image" src="https://github.com/user-attachments/assets/55e71e73-1e83-44f2-9f62-187104f54523" />
|
|
21
|
+
</div>
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
## Installation & Requirements
|
|
25
|
+
|
|
26
|
+
BRAID is a software tool that requires standard bioinformatics libraries.
|
|
27
|
+
|
|
28
|
+
**Prerequisites:**
|
|
29
|
+
> * Python >= 3.8
|
|
30
|
+
> * `pysam`
|
|
31
|
+
> * `biopython`
|
|
32
|
+
|
|
33
|
+
**Installation**
|
|
34
|
+
|
|
35
|
+
**Download via pip**
|
|
36
|
+
> ```
|
|
37
|
+
> pip install pybraid
|
|
38
|
+
> ```
|
|
39
|
+
|
|
40
|
+
**Download via conda**
|
|
41
|
+
> ```
|
|
42
|
+
> conda install braid
|
|
43
|
+
> ```
|
|
44
|
+
|
|
45
|
+
**Download via wget**
|
|
46
|
+
> ```
|
|
47
|
+
> wget https://github.com/YuefanHuang1998/BRAID/archive/refs/tags/braid-v1.0.1.tar.gz
|
|
48
|
+
> tar -zxvf braid-v1.0.1.tar.gz
|
|
49
|
+
> cd BRAID-1.0.1/
|
|
50
|
+
> pip install .
|
|
51
|
+
> ```
|
|
52
|
+
|
|
53
|
+
**Download via git**
|
|
54
|
+
> ```
|
|
55
|
+
> git clone https://github.com/YuefanHuang1998/BRAID.git
|
|
56
|
+
> cd BRAID
|
|
57
|
+
> pip install .
|
|
58
|
+
> ```
|
|
59
|
+
|
|
60
|
+
**To verify the installation was successful, run:**
|
|
61
|
+
> braid test
|
|
62
|
+
|
|
63
|
+
## Usage
|
|
64
|
+
**Run the script from the command line by providing the GFF3 annotation, Reference Genome, and Phased VCF.**
|
|
65
|
+
|
|
66
|
+
`braid -r reference.fa -g annotation.gff3 -v phased_variants.vcf.gz`
|
|
67
|
+
|
|
68
|
+
### Example for test dataset
|
|
69
|
+
`braid -r test.fasta -g test.gff3 -v test.vcf.gz`
|
|
70
|
+
> ```
|
|
71
|
+
> You should have three output files and one log file:
|
|
72
|
+
> variant_analysis_output.tsv
|
|
73
|
+
> variant_analysis_output.alignment.txt
|
|
74
|
+
> variant_analysis_output.sample.txt
|
|
75
|
+
> variant_analysis_output.log
|
|
76
|
+
> ```
|
|
77
|
+
|
|
78
|
+
## Arguments Parameter Table
|
|
79
|
+
| **Short** | **Long** | **Description** | |
|
|
80
|
+
|:-----:|:------:|:-------------:|:-------------:|
|
|
81
|
+
| **-g** | **--gff** | Path to the GFF3 annotation file. | Required |
|
|
82
|
+
| **-r** | **--reference** | Path to the Reference Genome FASTA (must be indexed: .fai). | Required |
|
|
83
|
+
| **-v** | **--vcf** | Path to the Phased VCF file (must be indexed: .tbi/.csi). | Required |
|
|
84
|
+
| **-o** | **--output** | Output file name (default: variant_analysis_output.tsv). | Optional |
|
|
85
|
+
| **-s** | **--sample** | Path to file containing specific sample IDs to analyze (one per line, no header). | Optional |
|
|
86
|
+
| | **--gene** | Path to file with specific gene IDs to analyze (one per line, no header). | Optional |
|
|
87
|
+
| | **--lof-threshold** | Custom threshold for Loss-of-Function classification (e.g., 0.1; default: 0.3). | Optional |
|
|
88
|
+
| | **--force-unphased** | Skip phasing check (forces run on unphased VCF). | Optional |
|
|
89
|
+
| | **--ignore-intron** | Ignore variants marked strictly as intronic. | Optional |
|
|
90
|
+
|
|
91
|
+
## Output Files Explanation
|
|
92
|
+
|
|
93
|
+
**BRAID generates three main files to assist in your analysis.**
|
|
94
|
+
|
|
95
|
+
**1. Summary Table**
|
|
96
|
+
|
|
97
|
+
A comprehensive table detailing the protein changes for every haplotype (default: variant_analysis_output.tsv).
|
|
98
|
+
|
|
99
|
+
Example View:
|
|
100
|
+
| Gene_ID | Haplotype_ID | mRNA | Haplotype_Count | Frequency | Variant_Type | Protein_Changes | Haplotype_Mutations | Sample_Sources | Ref_Protein | Alt_Protein | Ref_CDS | Alt_CDS | Aligned_Ref | Comparison_String | Aligned_Alt |
|
|
101
|
+
|:-----|:------|:------|:-----|:------|:------|:-----|:------|:------|:-----|:------|:------|:-----|:------|:------|:-----|
|
|
102
|
+
| gene1 | transcript1:REF | transcript1 | . | . | NoLOF(non_identity_rate:0.00%,non_identical_AAs:0,total_ref_AAs:27) `\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|` | . | . | . | MSLASSANDMIDRSIDRSIDRSIDRS* | MSLASSANDMIDRSIDRSIDRSIDRS* | ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA | ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA | MSLASSANDMIDRSIDRSIDRSIDRS* | `\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|` | MSLASSANDMIDRSIDRSIDRSIDRS* |
|
|
103
|
+
| gene1 | transcript1:1 | transcript1 | 4 | 0.500000 | NoLOF(non_identity_rate:3.70%,non_identical_AAs:1,total_ref_AAs:27)`\|\|\|`deletion `\|\|\|\|\|\|\|\|\|\|\|\|\|\|` | Del(5)S | 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON] | sample1(Hap2);sample2(Hap1);sample3(Homo) | MSLASSANDMIDRSIDRSIDRSIDRS* | MSLASANDMIDRSIDRSIDRSIDRS* | ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA | ATGAGCCTAGCTTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA | MSLASSANDMIDRSIDRSIDRSIDRS* | `\|\|\|\| \|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|` | MSLA-SANDMIDRSIDRSIDRSIDRS* |
|
|
104
|
+
|
|
105
|
+
**Explanation for each column**
|
|
106
|
+
|
|
107
|
+
| **Column** | **Example** | **Explanation** |
|
|
108
|
+
|:-----|:-----|:-------------|
|
|
109
|
+
| **Gene_ID** | gene1 | The gene to which this haplotype belongs. |
|
|
110
|
+
| **Haplotype_ID** | transcript1:REF;transcript1:1 | transcript:REF indicates the reference haplotype; others (e.g. transcript1:1) are alternative haplotypes. |
|
|
111
|
+
| **mRNA** | transcript1 | The transcript to which this haplotype belongs. |
|
|
112
|
+
| **Haplotype_Count** | 4 | Number of samples carrying this haplotype. |
|
|
113
|
+
| **Frequency** | 0.500000 | Population frequency of this haplotype. |
|
|
114
|
+
| **Variant_Type** | LOF/NoLOF (Please see <br>the detailed information below.) | Functional classification of the haplotype, indicating its LOF status. |
|
|
115
|
+
| **Protein_Changes** | Del(5)S | Description of the protein-level consequence in HGVS format. |
|
|
116
|
+
| **Haplotype_Mutations** | 1:17_CTTAG>C[CDS,EXON];<br>1:27_T>TT[CDS,EXON] | List of variants defining the haplotype. |
|
|
117
|
+
| **Sample_Sources** | sample1(Hap2);sample2(Hap1);<br>sample3(Homo) | Samples carrying this haplotype, including haplotype phase (Hap1, Hap2) or homozygous status (Homo). |
|
|
118
|
+
| **Ref_Protein** | MSLASSANDIDRSIDRS* | Reference protein sequence. |
|
|
119
|
+
| **Alt_Protein** | MSLASANDIDRSIDRS* | Alternative protein sequence. |
|
|
120
|
+
| **Ref_CDS** | ATGAGCTTAGCTAGCTCAGCTAACGATATCG<br>ATCGATCGATCGATCGATCGTGA | Reference CDS sequence. |
|
|
121
|
+
| **Alt_CDS** | ATGAGCCTAGCTTCAGCTAACGATATCG<br>ATCGATCGATCGATCGATCGTGA | Haplotype-specific CDS sequence. |
|
|
122
|
+
| **Aligned_Ref** | MSLASSANDIDRSIDRS* | Aligned reference protein, protein alignment string for visualization: `-` → Gap (insertion or deletion). |
|
|
123
|
+
| **Comparison_String** | MSLASANDIDRSIDRS* | Alignment comparison symbols: `*` → Different amino acid, `\|` → Same amino acid. |
|
|
124
|
+
| **Aligned_Alt** | MSLASANDIDRSIDRS* | Aligned alternative protein, shows amino acid changes: `-` → Gap (insertion or deletion). |
|
|
125
|
+
|
|
126
|
+
**Detailed information for Variant_Type**
|
|
127
|
+
|
|
128
|
+
| **Column** | **Explanation** |
|
|
129
|
+
|:-----|:-------------|
|
|
130
|
+
| **LOF_Info** | LOF indicates haplotypes predicted to cause protein loss of function, whereas NoLOF indicates haplotypes without LOF effects. non_identity_rate represents the proportion of amino acid differences between ALT and REF proteins; non_identical_AAs is the corresponding count, and total_ref_AAs denotes the length of the REF protein. e.g., NoLOF(non_identity_rate:5.56%,non_identical_AAs:1,total_ref_AAs:18). |
|
|
131
|
+
| **missense** | A nucleotide variant that results in the substitution of one amino acid by another in the protein sequence. |
|
|
132
|
+
| **insertion** | An insertion of one or more nucleotides that alters the coding sequence and may affect the resulting protein sequence. |
|
|
133
|
+
| **deletion** | A deletion of one or more nucleotides from the coding sequence, potentially altering the protein sequence or reading frame. |
|
|
134
|
+
| **complex_indel** | A combined insertion and deletion event that cannot be represented as a simple insertion or deletion and may cause complex changes to the coding sequence. |
|
|
135
|
+
| **exon_skip** | A splicing event in which one or more exons are completely skipped in the transcript, leading to an altered mRNA and protein sequence. |
|
|
136
|
+
| **skipped_exons_detail** | Detailed information specifying which exon is skipped in the exon-skipping event, SITE_PRESERVED, SITE_SHIFT, SITE_DESTROYED. e.g., SkippedExon:[1:51-77`\|`SITE_SHIFT]. |
|
|
137
|
+
| **intron_retention** | A splicing event in which one or more introns are retained in the mature transcript, potentially disrupting the coding sequence. |
|
|
138
|
+
| **retained_introns_detail** | Detailed information specifying which intron(s) are retained in the intron-retention event, SITE_PRESERVED, SITE_SHIFT, SITE_DESTROYED. e.g., RetainedIntron:[1:78-90`\|`SITE_DESTROYED]. |
|
|
139
|
+
| **start_codon_loss** | A variant that disrupts the canonical start codon, potentially preventing translation initiation. |
|
|
140
|
+
| **stop_loss** | A variant that removes or alters the stop codon, resulting in translational read-through and an extended protein. |
|
|
141
|
+
| **No_start_codon_for_reference** | The reference transcript or protein sequence lacks an annotated start codon. |
|
|
142
|
+
| **same_as_other_transcript** | The variant effect on this protein is identical to that observed in another transcript's protein of the same gene. |
|
|
143
|
+
| **protein_loss** | The variant or variant combination results in the complete loss of the predicted protein product. |
|
|
144
|
+
| **ref_protein_empty** | The reference transcript does not produce a protein sequence (e.g., non-coding or incomplete annotation). |
|
|
145
|
+
| **alignment_failed** | The reference and alternative protein sequences could not be reliably aligned, preventing accurate variant effect annotation. |
|
|
146
|
+
|
|
147
|
+
**Log information for splice-site mutations**
|
|
148
|
+
|
|
149
|
+
**SITE_PRESERVED**: The splice site is unaltered, with both its position and sequence conserved; normal splicing is expected.
|
|
150
|
+
> ```
|
|
151
|
+
> [transcript1] (strand +) Splice site 'GT' at 39 PRESERVED by mutation 1:39_GTCG>G.
|
|
152
|
+
> - Original Window Seq : GATGTCGTTAAG - Mutated Window Seq : GATGTTAAG.
|
|
153
|
+
> ```
|
|
154
|
+
|
|
155
|
+
**SITE_SHIFT**: The splice site may move to a nearby position due to the variant, potentially altering the exon–intron boundary.
|
|
156
|
+
> ```
|
|
157
|
+
> WARNING - [transcript1] Splice site may SHIFT for mutation 1:44_TAAGTA>A.
|
|
158
|
+
> - Splice Site : 'AG' (Original genomic pos: 49)
|
|
159
|
+
> - Mutation : TAAGTA -> A (Genomic pos: 44)
|
|
160
|
+
> - Window : 1:42-53
|
|
161
|
+
> - Original Window Seq : GTTAAGTAGATG
|
|
162
|
+
> - Mutated Window Seq : GTAGATG
|
|
163
|
+
> ```
|
|
164
|
+
|
|
165
|
+
**SITE_DESTROYED**: The splice site is disrupted or abolished by the variant, likely preventing normal splicing at this site.
|
|
166
|
+
> ```
|
|
167
|
+
> [transcript1] (strand +) Splice site 'GT' at 78 DESTROYED by mutation 1:50_GATGATCGATCGATCGATCGATCGATCGG>GG. It was 'GT', became 'AT'.
|
|
168
|
+
> - Original Window Seq : TCGGTCGATCGA - Mutated Window Seq : TCGATCGA
|
|
169
|
+
> ```
|
|
170
|
+
|
|
171
|
+
**2. Alignment Visualization (.alignment.txt)**
|
|
172
|
+
|
|
173
|
+
A text file showing the pairwise alignment of the Reference vs. Alternative protein.
|
|
174
|
+
Use the Haplotype_ID to match the haplotype in the summary table.
|
|
175
|
+
|
|
176
|
+
Example View:
|
|
177
|
+
> ```
|
|
178
|
+
> Haplotype_ID: transcript1:1
|
|
179
|
+
> Gene: gene1 | mRNA: transcript1
|
|
180
|
+
> Haplotype_Mutations: 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON]
|
|
181
|
+
> Variant_Type: NoLOF(non_identity_rate:3.70%,non_identical_AAs:1,total_ref_AAs:27)|||deletion||||||||||||||
|
|
182
|
+
> Protein_Changes: Del(5)S
|
|
183
|
+
> Alignment:
|
|
184
|
+
> Ref: MSLASSANDMIDRSIDRSIDRSIDRS*
|
|
185
|
+
> |||| ||||||||||||||||||||||
|
|
186
|
+
> Alt: MSLA-SANDMIDRSIDRSIDRSIDRS*
|
|
187
|
+
> ```
|
|
188
|
+
|
|
189
|
+
**3. Sample Matrix (.sample.txt)**
|
|
190
|
+
|
|
191
|
+
A matrix format ideal for heatmaps or downstream programmatic analysis.
|
|
192
|
+
|
|
193
|
+
Example View:
|
|
194
|
+
| Gene_ID | mRNA_ID | Ref_ID | Alt_IDs | sample1 | sample2 | sample3 |
|
|
195
|
+
|:-----|:------|:------|:-----|:------|:------|:------|
|
|
196
|
+
| gene1 | transcript1 | transcript1:REF | transcript1:1 | 0`\|`1 | 1`\|`0 | 1`\|`1 |
|
|
197
|
+
|
|
198
|
+
>```
|
|
199
|
+
> Gene_ID mRNA_ID Ref_ID Alt_IDs sample1 sample2 sample3
|
|
200
|
+
> gene1 transcript1 transcript1:REF transcript1:1 0|1 1|0 1|1
|
|
201
|
+
>```
|
|
202
|
+
|
|
203
|
+
| **Column** | **Explanation** |
|
|
204
|
+
|:-----|:-------------|
|
|
205
|
+
| **Gene_ID** | The identifier of the gene being analyzed. |
|
|
206
|
+
| **mRNA_ID** | The identifier of the specific transcript of that gene. |
|
|
207
|
+
| **Ref_ID** | Reference haplotype ID, labeled as transcript:REF. |
|
|
208
|
+
| **Alt_IDs** | Alternative haplotype IDs observed in the samples, with numbering to distinguish multiple haplotypes. |
|
|
209
|
+
| **samples** | The haplotype presence/absence in each sample. Each entry corresponds to the Ref or Alt haplotype. `0` represents REF. |
|
|
210
|
+
|
|
211
|
+
## Citation
|
|
212
|
+
|
|
213
|
+
If you use BRAID in your research, please cite our paper:
|
|
214
|
+
> *
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "PyBRAID"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Block Resolution and Annotation of Integrated DNA"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "Yuefan Huang" },
|
|
11
|
+
]
|
|
12
|
+
license = { text = "MIT" }
|
|
13
|
+
readme = "README.md"
|
|
14
|
+
requires-python = ">=3.8"
|
|
15
|
+
dependencies = [
|
|
16
|
+
"pysam>=0.17.0",
|
|
17
|
+
"biopython>=1.80",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.scripts]
|
|
21
|
+
braid = "braid.cli:main"
|
|
22
|
+
|
|
23
|
+
[tool.setuptools.packages.find]
|
|
24
|
+
where = ["src"]
|
|
25
|
+
[tool.setuptools.package-data]
|
|
26
|
+
braid = ["data/*"]
|
pybraid-1.0.0/setup.cfg
ADDED