stralln 0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stralln-0.1/PKG-INFO +51 -0
- stralln-0.1/README.md +40 -0
- stralln-0.1/pyproject.toml +25 -0
- stralln-0.1/setup.cfg +4 -0
- stralln-0.1/setup.py +17 -0
- stralln-0.1/stralln.egg-info/PKG-INFO +51 -0
- stralln-0.1/stralln.egg-info/SOURCES.txt +14 -0
- stralln-0.1/stralln.egg-info/dependency_links.txt +1 -0
- stralln-0.1/stralln.egg-info/entry_points.txt +2 -0
- stralln-0.1/stralln.egg-info/requires.txt +3 -0
- stralln-0.1/stralln.egg-info/top_level.txt +1 -0
- stralln-0.1/stretcher_parser/__init__.py +0 -0
- stralln-0.1/stretcher_parser/__main__.py +120 -0
- stralln-0.1/stretcher_parser/parser.py +154 -0
- stralln-0.1/stretcher_parser/reformat.py +118 -0
- stralln-0.1/stretcher_parser/utils.py +51 -0
stralln-0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: stralln
|
|
3
|
+
Version: 0.1
|
|
4
|
+
Summary: A CLI tool for EMBOSS stretcher alignments parsing and VCF analysis
|
|
5
|
+
Author: nikolienka24
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: PyVCF3
|
|
9
|
+
Requires-Dist: pandas
|
|
10
|
+
Requires-Dist: python-Levenshtein
|
|
11
|
+
|
|
12
|
+
# straln: Stretcher Alignment & Alternative Mutation Finder
|
|
13
|
+
|
|
14
|
+
`straln` is a bioinformatics tool designed to parse `.aln` files in `markx0` format generated by
|
|
15
|
+
the **EMBOSS stretcher** tool.
|
|
16
|
+
It converts pairwise alignments into genomic formats (BEDPE/BED)
|
|
17
|
+
and can optionally intersect these results with VCF files to identify
|
|
18
|
+
alternative genomic variations nearby.
|
|
19
|
+
|
|
20
|
+
## Features
|
|
21
|
+
* **Automated Parsing:** Converts EMBOSS stretcher `.aln` files to one `BEDPE` file and two separate `BED` files.
|
|
22
|
+
* **Coordinate Mapping:** Automatically extracts starting offsets from alignment headers or allows manual overrides.
|
|
23
|
+
* **Mutation Analysis:** Compares alignment gaps/mismatches with a VCF file to find alternative mutations within a specified window.
|
|
24
|
+
* **Refined Output:** Joins consecutive alignment blocks for cleaner downstream analysis.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
Since the project uses `setuptools`, you can install it in editable mode:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# From the root of the project (where setup.py is located)
|
|
32
|
+
pip install -e .
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
# Getting Help
|
|
37
|
+
straln --help
|
|
38
|
+
# only parse the alignment
|
|
39
|
+
straln my_alignment.aln -o ./output_folder
|
|
40
|
+
# parse alignment and find alternative mutations from a VCF (when using -vcf parameter -c is mandatory)
|
|
41
|
+
straln my_alignment.aln -vcf variations.vcf -c 17 -d 150 -o ./output_folder
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Output files
|
|
45
|
+
The tool generates the following files in the specified output folder:
|
|
46
|
+
|
|
47
|
+
* **`parsed.bedpe`**: Raw alignment blocks in BEDPE format.
|
|
48
|
+
* **`parsed.joined.bedpe`**: Merged consecutive alignment blocks for easier visualization of longer indels.
|
|
49
|
+
* **`seq1.bed` / `seq2.bed`**: Separate tracks for each sequence.
|
|
50
|
+
* **`stats.txt`**: A summary file containing the alignment length, number of mismatches, and total gaps.
|
|
51
|
+
* **`alternative_mutations.csv`**: *(Generated only when using the `-vcf` flag)* A list of variants from your VCF file that were found near alignment discrepancies.
|
stralln-0.1/README.md
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# straln: Stretcher Alignment & Alternative Mutation Finder
|
|
2
|
+
|
|
3
|
+
`straln` is a bioinformatics tool designed to parse `.aln` files in `markx0` format generated by
|
|
4
|
+
the **EMBOSS stretcher** tool.
|
|
5
|
+
It converts pairwise alignments into genomic formats (BEDPE/BED)
|
|
6
|
+
and can optionally intersect these results with VCF files to identify
|
|
7
|
+
alternative genomic variations nearby.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
* **Automated Parsing:** Converts EMBOSS stretcher `.aln` files to one `BEDPE` file and two separate `BED` files.
|
|
11
|
+
* **Coordinate Mapping:** Automatically extracts starting offsets from alignment headers or allows manual overrides.
|
|
12
|
+
* **Mutation Analysis:** Compares alignment gaps/mismatches with a VCF file to find alternative mutations within a specified window.
|
|
13
|
+
* **Refined Output:** Joins consecutive alignment blocks for cleaner downstream analysis.
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
Since the project uses `setuptools`, you can install it in editable mode:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
# From the root of the project (where setup.py is located)
|
|
21
|
+
pip install -e .
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# Getting Help
|
|
26
|
+
straln --help
|
|
27
|
+
# only parse the alignment
|
|
28
|
+
straln my_alignment.aln -o ./output_folder
|
|
29
|
+
# parse alignment and find alternative mutations from a VCF (when using -vcf parameter -c is mandatory)
|
|
30
|
+
straln my_alignment.aln -vcf variations.vcf -c 17 -d 150 -o ./output_folder
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Output files
|
|
34
|
+
The tool generates the following files in the specified output folder:
|
|
35
|
+
|
|
36
|
+
* **`parsed.bedpe`**: Raw alignment blocks in BEDPE format.
|
|
37
|
+
* **`parsed.joined.bedpe`**: Merged consecutive alignment blocks for easier visualization of longer indels.
|
|
38
|
+
* **`seq1.bed` / `seq2.bed`**: Separate tracks for each sequence.
|
|
39
|
+
* **`stats.txt`**: A summary file containing the alignment length, number of mismatches, and total gaps.
|
|
40
|
+
* **`alternative_mutations.csv`**: *(Generated only when using the `-vcf` flag)* A list of variants from your VCF file that were found near alignment discrepancies.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "stralln"
|
|
7
|
+
version = "0.1"
|
|
8
|
+
description = "A CLI tool for EMBOSS stretcher alignments parsing and VCF analysis"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "nikolienka24" }
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
dependencies = [
|
|
16
|
+
"PyVCF3",
|
|
17
|
+
"pandas",
|
|
18
|
+
"python-Levenshtein",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[project.scripts]
|
|
22
|
+
straln = "stretcher_parser.__main__:main"
|
|
23
|
+
|
|
24
|
+
[tool.setuptools]
|
|
25
|
+
packages = ["stretcher_parser"]
|
stralln-0.1/setup.cfg
ADDED
stralln-0.1/setup.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
|
|
3
|
+
# Legacy setuptools entry point. The values below mirror the [project] table
# in pyproject.toml (distribution name and dependency names) so that both
# build paths describe the same package.
setup(
    name="stralln",
    version="0.1",
    packages=find_packages(),
    install_requires=[
        "PyVCF3",
        "pandas",
        "python-Levenshtein",
    ],
    entry_points={
        # Installs the `straln` command, which calls stretcher_parser.__main__.main().
        'console_scripts': [
            'straln=stretcher_parser.__main__:main',
        ],
    },
)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: stralln
|
|
3
|
+
Version: 0.1
|
|
4
|
+
Summary: A CLI tool for EMBOSS stretcher alignments parsing and VCF analysis
|
|
5
|
+
Author: nikolienka24
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: PyVCF3
|
|
9
|
+
Requires-Dist: pandas
|
|
10
|
+
Requires-Dist: python-Levenshtein
|
|
11
|
+
|
|
12
|
+
# straln: Stretcher Alignment & Alternative Mutation Finder
|
|
13
|
+
|
|
14
|
+
`straln` is a bioinformatics tool designed to parse `.aln` files in `markx0` format generated by
|
|
15
|
+
the **EMBOSS stretcher** tool.
|
|
16
|
+
It converts pairwise alignments into genomic formats (BEDPE/BED)
|
|
17
|
+
and can optionally intersect these results with VCF files to identify
|
|
18
|
+
alternative genomic variations nearby.
|
|
19
|
+
|
|
20
|
+
## Features
|
|
21
|
+
* **Automated Parsing:** Converts EMBOSS stretcher `.aln` files to one `BEDPE` file and two separate `BED` files.
|
|
22
|
+
* **Coordinate Mapping:** Automatically extracts starting offsets from alignment headers or allows manual overrides.
|
|
23
|
+
* **Mutation Analysis:** Compares alignment gaps/mismatches with a VCF file to find alternative mutations within a specified window.
|
|
24
|
+
* **Refined Output:** Joins consecutive alignment blocks for cleaner downstream analysis.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
Since the project uses `setuptools`, you can install it in editable mode:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# From the root of the project (where setup.py is located)
|
|
32
|
+
pip install -e .
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
# Getting Help
|
|
37
|
+
straln --help
|
|
38
|
+
# only parse the alignment
|
|
39
|
+
straln my_alignment.aln -o ./output_folder
|
|
40
|
+
# parse alignment and find alternative mutations from a VCF (when using -vcf parameter -c is mandatory)
|
|
41
|
+
straln my_alignment.aln -vcf variations.vcf -c 17 -d 150 -o ./output_folder
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Output files
|
|
45
|
+
The tool generates the following files in the specified output folder:
|
|
46
|
+
|
|
47
|
+
* **`parsed.bedpe`**: Raw alignment blocks in BEDPE format.
|
|
48
|
+
* **`parsed.joined.bedpe`**: Merged consecutive alignment blocks for easier visualization of longer indels.
|
|
49
|
+
* **`seq1.bed` / `seq2.bed`**: Separate tracks for each sequence.
|
|
50
|
+
* **`stats.txt`**: A summary file containing the alignment length, number of mismatches, and total gaps.
|
|
51
|
+
* **`alternative_mutations.csv`**: *(Generated only when using the `-vcf` flag)* A list of variants from your VCF file that were found near alignment discrepancies.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
setup.py
|
|
4
|
+
stralln.egg-info/PKG-INFO
|
|
5
|
+
stralln.egg-info/SOURCES.txt
|
|
6
|
+
stralln.egg-info/dependency_links.txt
|
|
7
|
+
stralln.egg-info/entry_points.txt
|
|
8
|
+
stralln.egg-info/requires.txt
|
|
9
|
+
stralln.egg-info/top_level.txt
|
|
10
|
+
stretcher_parser/__init__.py
|
|
11
|
+
stretcher_parser/__main__.py
|
|
12
|
+
stretcher_parser/parser.py
|
|
13
|
+
stretcher_parser/reformat.py
|
|
14
|
+
stretcher_parser/utils.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
stretcher_parser
|
|
File without changes
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import os
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
import vcf
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from . import parser, reformat
|
|
9
|
+
from .utils import save_statistics, get_offsets_from_file
|
|
10
|
+
from analysis import find_alternative_mutations
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def parse_arguments() -> argparse.Namespace:
    """
    Build the ``straln`` command-line interface and parse ``sys.argv``.

    Returns:
        The parsed namespace with the attributes ``aln_input_file``,
        ``output_folder``, ``vcf_input_file``, ``chromosome``, ``distance``,
        ``seq_name1``, ``seq_name2``, ``offset1`` and ``offset2``.

    Raises:
        SystemExit: raised by argparse on invalid arguments, including the
            case where ``-vcf`` is given without ``-c``.
    """
    parser_arg = argparse.ArgumentParser(
        description="""
straln: Stretcher Alignment & Alternative Mutation Finder
---------------------------------------------------------
Parses .aln files from EMBOSS stretcher into BEDPE/BED formats and optionally
compares them with VCF files to identify alternative genomic variations.
""",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    # ----- POSITIONAL ARGUMENTS -----
    parser_arg.add_argument("aln_input_file", help="path to the .aln alignment file from EMBOSS stretcher (.aln format)")

    # ----- OPTIONAL ARGUMENTS -----
    parser_arg.add_argument("-o", "--output_folder", type=str, default=None, help="path to output folder (default: current folder)")

    # ----- MUTATION ANALYSIS GROUP (OPTIONAL) -----
    mutation = parser_arg.add_argument_group('Alternative Mutation Analysis (Optional)')
    mutation.add_argument(
        "-vcf", "--vcf_input_file",
        help="input VCF file, providing this triggers the search for alternative mutations"
    )
    # Registered on the mutation group (not the top-level parser) so that --help
    # lists it next to the other options of the VCF analysis.
    mutation.add_argument(
        "-c", "--chromosome",
        help="target chromosome (1-22, X, Y)")
    mutation.add_argument(
        "-d", "--distance", type=int, default=100,
        help="window size (bp) for finding nearby alternative mutations (default: 100)"
    )

    # ----- ALIGNMENT METADATA GROUP -----
    metadata = parser_arg.add_argument_group('Alignment Metadata & Offsets (Optional)')
    metadata.add_argument("-s1", "--seq_name1", default="Seq1", help="label for the first sequence")
    metadata.add_argument("-s2", "--seq_name2", default="Seq2", help="label for the second sequence")
    metadata.add_argument("-off1", "--offset1", type=int, help="starting position for seq1 (if not provided, the tool automatically extracts it from the .aln header)")
    metadata.add_argument("-off2", "--offset2", type=int, help="starting position for seq2 (if not provided, the tool automatically extracts it from the .aln header)")

    args = parser_arg.parse_args()

    # The README documents -c as mandatory whenever -vcf is used; fail early with a
    # usage message instead of handing a missing chromosome to the analysis step.
    if args.vcf_input_file and not args.chromosome:
        parser_arg.error("-c/--chromosome is required when -vcf/--vcf_input_file is given")

    return args
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def main() -> None:
    """
    Run the straln pipeline for one alignment file.

    Steps, in order: parse the command line, resolve the sequence offsets,
    write ``parsed.bedpe`` and ``stats.txt``, join consecutive rows into
    ``parsed.joined.bedpe``, split that file into ``seq1.bed``/``seq2.bed`` and,
    when a VCF file was supplied, call ``find_alternative_mutations.find``.
    All files are written into the output folder (created if missing).
    """
    args = parse_arguments()

    # 0. parse input files =============================================================
    aln_input_file = args.aln_input_file

    # The chromosome is only forwarded when a VCF file was actually given.
    vcf_input_file = args.vcf_input_file or None
    chromosome = args.chromosome if vcf_input_file else None

    output_folder = args.output_folder or "."
    os.makedirs(output_folder, exist_ok=True)

    seq_name1 = args.seq_name1 or "seq1"
    seq_name2 = args.seq_name2 or "seq2"

    # Taken as-is: a truthiness test here would turn an explicit `-d 0` into None.
    distance_threshold = args.distance

    manual_offsets = args.offset1 is not None or args.offset2 is not None
    if args.offset1 is not None and args.offset2 is not None:
        offset1, offset2 = args.offset1, args.offset2
    else:
        # If one or both are missing, we fetch them from the alignment file
        offset1, offset2 = get_offsets_from_file(aln_input_file)

        # If the user provided one but not the other, the manual one takes precedence
        if args.offset1 is not None:
            offset1 = args.offset1
        if args.offset2 is not None:
            offset2 = args.offset2

    if manual_offsets:
        print(f"Using offsets (manual values take precedence over the alignment header): {offset1}, {offset2}")
    else:
        print(f"Using offsets extracted from alignment header: {offset1}, {offset2}")

    # 1. stretcher parser =================================================================
    parsed_file = os.path.join(output_folder, "parsed.bedpe")
    length, mismatches, gaps = parser.run(aln_input_file, parsed_file, seq_name1, seq_name2, offset1, offset2)

    output_stats = os.path.join(output_folder, "stats.txt")
    save_statistics(seq_name1, seq_name2, length, mismatches, gaps, output_stats)

    # 2. join consecutive rows ============================================================
    parsed_joined_file = os.path.join(output_folder, "parsed.joined.bedpe")
    reformat.join_consecutive_rows(parsed_file, parsed_joined_file)

    # 3. convert bedpe to two separate bed files ==========================================
    bed1 = os.path.join(output_folder, "seq1.bed")
    bed2 = os.path.join(output_folder, "seq2.bed")
    reformat.bedpe_to_bed(parsed_joined_file, bed1, bed2)

    # 4. find alternative mutations =======================================================
    if vcf_input_file:
        find_alternative_mutations.find(vcf_input_file, parsed_joined_file, output_folder, chromosome, distance_threshold)

    # 5. print final information ==========================================================
    print("Output files saved to " + output_folder)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
from typing import Tuple, List, Optional, Union
|
|
2
|
+
from stretcher_parser.utils import get_offsets_from_file
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def _check_sequences(sequence_name1: str, sequence_name2: str,
|
|
6
|
+
seq1_data: List[Union[str, int]], seq2_data: List[Union[str, int]],
|
|
7
|
+
found_start: bool, buffer: List[str],
|
|
8
|
+
prefix_a: Optional[str], prefix_b: Optional[str]) -> Tuple[str, int, int, int, bool, List[str], int, int, Optional[str], Optional[str]]:
|
|
9
|
+
"""
|
|
10
|
+
Looks at the alignment letter-by-letter to find where the two sequences don't match.
|
|
11
|
+
It figures out the exact positions for mutations and handles insertions or deletions.
|
|
12
|
+
"""
|
|
13
|
+
output = ""
|
|
14
|
+
p1, p2 = seq1_data[1], seq2_data[1]
|
|
15
|
+
seq1_chunk, seq2_chunk = seq1_data[0], seq2_data[0]
|
|
16
|
+
|
|
17
|
+
line_len, line_len_curr = 0, 0
|
|
18
|
+
line_mismatches, line_gaps = 0, 0
|
|
19
|
+
|
|
20
|
+
for a, b in zip(seq1_chunk, seq2_chunk):
|
|
21
|
+
if a != "-":
|
|
22
|
+
p1 += 1
|
|
23
|
+
if b != "-":
|
|
24
|
+
p2 += 1
|
|
25
|
+
|
|
26
|
+
if not found_start:
|
|
27
|
+
if a in ("A", "C", "G", "T") and b in ("A", "C", "G", "T"):
|
|
28
|
+
found_start = True
|
|
29
|
+
else:
|
|
30
|
+
continue
|
|
31
|
+
|
|
32
|
+
line_len_curr += 1
|
|
33
|
+
|
|
34
|
+
# 0-based BEDPE coordinates
|
|
35
|
+
pos1_start, pos1_end = p1 - 1, p1 - 1
|
|
36
|
+
pos2_start, pos2_end = p2 - 1, p2 - 1
|
|
37
|
+
|
|
38
|
+
if a != b:
|
|
39
|
+
if a != "-" and b == "-": # deletion in seq2
|
|
40
|
+
out1 = prefix_a + a # REF: prefix + deleted base(s)
|
|
41
|
+
pos1_end += 1
|
|
42
|
+
out2 = prefix_b # ALT: prefix only
|
|
43
|
+
elif a == "-" and b != "-": # deletion in seq1
|
|
44
|
+
out1 = prefix_a # REF: prefix only
|
|
45
|
+
out2 = prefix_b + b # ALT: prefix + inserted base(s)
|
|
46
|
+
pos2_end += 1
|
|
47
|
+
else:
|
|
48
|
+
# normal mismatch
|
|
49
|
+
out1 = a
|
|
50
|
+
out2 = b
|
|
51
|
+
|
|
52
|
+
buffer.append(
|
|
53
|
+
f"{sequence_name1}\t{pos1_start}\t{pos1_end}\t"
|
|
54
|
+
f"{sequence_name2}\t{pos2_start}\t{pos2_end}\t"
|
|
55
|
+
f"{out1}\t{out2}\n"
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
line_mismatches += 1
|
|
59
|
+
if a == "-" or b == "-":
|
|
60
|
+
line_gaps += 1
|
|
61
|
+
|
|
62
|
+
# Flush buffer on real aligned A/C/G/T pairs
|
|
63
|
+
if a in ("A", "C", "G", "T") and b in ("A", "C", "G", "T"):
|
|
64
|
+
output += "".join(buffer)
|
|
65
|
+
line_len += line_len_curr
|
|
66
|
+
line_len_curr = 0
|
|
67
|
+
buffer.clear()
|
|
68
|
+
|
|
69
|
+
if a in ("A", "C", "G", "T"):
|
|
70
|
+
prefix_a = a
|
|
71
|
+
if b in ("A", "C", "G", "T"):
|
|
72
|
+
prefix_b = b
|
|
73
|
+
|
|
74
|
+
return output, line_len, line_mismatches, line_gaps, found_start, buffer, p1, p2, prefix_a, prefix_b
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _parse(input_file: str, output_file: str,
           seq_name1: str, seq_name2:str,
           offset_seq1: int = 0, offset_seq2: int = 0) -> Tuple[int, int, int]:
    """
    Read a stretcher alignment file and write its differences as BEDPE rows.

    Args:
        input_file: path of the alignment file to read.
        output_file: path of the BEDPE file to create (overwritten).
        seq_name1, seq_name2: labels written into the chrom1/chrom2 columns.
        offset_seq1, offset_seq2: initial position counters handed to
            ``_check_sequences`` for the two sequences.

    Returns:
        ``(length, mismatches, gaps)`` summed over all alignment lines, as
        reported by ``_check_sequences``.

    The reader assumes two ``#`` header sections followed by blocks of six lines
    (position line, sequence 1, match line, sequence 2, position line, blank line).
    NOTE(review): this layout is inferred from the read order below; confirm
    against a real markx0 file.
    """

    with open(input_file) as infile, open(output_file, 'w') as outfile:
        # BEDPE header with 2 additional columns
        outfile.write("chrom1\tstart1\tend1\tchrom2\tstart2\tend2\tnucleotide1\tnucleotide2\n")

        found_start = False
        buffer = []
        seq_len, mismatches, gaps = 0, 0, 0

        # read header: skip the first run of '#' lines ...
        line = infile.readline()
        while line.startswith("#"):
            line = infile.readline()

        # ... and the second run of '#' lines
        line = infile.readline()
        while line.startswith("#"):
            line = infile.readline()

        infile.readline()   # skip blank line at the end of header
        # end read header

        curr1, curr2 = offset_seq1, offset_seq2
        prefix_a, prefix_b = None, None
        while True:
            if line.startswith("#") or not line:
                break

            seq1_line = infile.readline().strip().split()
            infile.readline()   # match line
            seq2_line = infile.readline().strip().split()
            infile.readline()   # lower pos
            infile.readline()   # blank line

            # End of file or a blank line where a sequence line was expected:
            # stop cleanly instead of failing on seq1_line[0] with an IndexError.
            if not seq1_line or not seq2_line:
                break

            if seq1_line[0].startswith("#") or seq2_line[0].startswith("#"):
                break

            # Each sequence line is "<name> <aligned text>".
            seq1_seq = seq1_line[1]
            seq2_seq = seq2_line[1]

            out, line_length, mm_add, gaps_add, found_start, buffer, curr1, curr2, prefix_a, prefix_b = _check_sequences(
                seq_name1, seq_name2,
                [seq1_seq, curr1],
                [seq2_seq, curr2],
                found_start, buffer,
                prefix_a, prefix_b
            )
            outfile.write(out)

            seq_len += line_length
            mismatches += mm_add
            gaps += gaps_add

            # Rows still held in `buffer` after the last block are never written.
            line = infile.readline()
            if not line:
                break

    return seq_len, mismatches, gaps
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def run(in_file: str, out_file: str,
        seq_name1: str, seq_name2: str,
        offset1: int, offset2: int) -> Tuple[int, int, int]:
    """
    Public entry point of the parser module.

    Forwards all arguments to ``_parse`` and hands back its
    ``(length, mismatches, gaps)`` summary unchanged.
    """
    summary = _parse(in_file, out_file, seq_name1, seq_name2, offset1, offset2)
    aligned_length, mismatch_count, gap_count = summary
    return aligned_length, mismatch_count, gap_count
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
from typing import List, TextIO
|
|
2
|
+
|
|
3
|
+
### FORMAT CONVERSIONS ===========================================================
|
|
4
|
+
def bedpe_to_bed(infile: str, out1: str, out2: str) -> None:
    """
    Split an 8-column BEDPE file into two 4-column BED files.

    Args:
        infile: tab-separated BEDPE file whose first line is a header.
        out1: output path for columns chrom1/start1/end1 plus the 7th column.
        out2: output path for columns chrom2/start2/end2 plus the 8th column.

    Both outputs always receive a ``chrom/start/end/seq`` header line. An empty
    input file and blank lines are tolerated.

    Raises:
        ValueError: if a non-blank data line does not have exactly 8 fields.
    """

    with open(infile) as f, open(out1, "w") as bed1, open(out2, "w") as bed2:
        # Discard the input header; the default keeps an empty file from
        # raising StopIteration.
        next(f, None)
        bed1.write("chrom\tstart\tend\tseq\n")
        bed2.write("chrom\tstart\tend\tseq\n")

        for line in f:
            if not line.strip():
                continue
            chrom1, start1, end1, chrom2, start2, end2, seq1, seq2 = line.rstrip().split("\t")
            bed1.write(f"{chrom1}\t{start1}\t{end1}\t{seq1}\n")
            bed2.write(f"{chrom2}\t{start2}\t{end2}\t{seq2}\n")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
### REFORMATTING =================================================================
|
|
22
|
+
def _flush_buffer(buf: List[List[str]], out_fh: TextIO) -> None:
|
|
23
|
+
"""
|
|
24
|
+
Merge consecutive positions in buffer and write a single BEDPE row.
|
|
25
|
+
- Coordinates: start = first position, end = last position
|
|
26
|
+
- Nucleotides: concatenate all nucleotides in the run, ignoring gaps
|
|
27
|
+
"""
|
|
28
|
+
if not buf:
|
|
29
|
+
return
|
|
30
|
+
|
|
31
|
+
# assume that chrom1 and chrom2 are in the whole file same
|
|
32
|
+
chrom1 = buf[0][0]
|
|
33
|
+
chrom2 = buf[0][3]
|
|
34
|
+
|
|
35
|
+
# Get start/end coordinates for ref and alt
|
|
36
|
+
seq1, seq2 = "", ""
|
|
37
|
+
start1, end1 = None, None
|
|
38
|
+
start2, end2 = None, None
|
|
39
|
+
for idx, line in enumerate(buf):
|
|
40
|
+
if idx == 0:
|
|
41
|
+
start1 = line[1]
|
|
42
|
+
start2 = line[4]
|
|
43
|
+
seq1, seq2 = line[6], line[7]
|
|
44
|
+
continue
|
|
45
|
+
|
|
46
|
+
if len(line[6]) == 1 and len(line[7]) == 1:
|
|
47
|
+
seq1 += line[6]
|
|
48
|
+
seq2 += line[7]
|
|
49
|
+
elif len(line[6]) == 2 and len(line[7]) == 1:
|
|
50
|
+
seq1 += line[6][1]
|
|
51
|
+
elif len(line[6]) == 1 and len(line[7]) == 2:
|
|
52
|
+
seq2 += line[7][1]
|
|
53
|
+
|
|
54
|
+
end1, end2 = buf[-1][2], buf[-1][5]
|
|
55
|
+
|
|
56
|
+
out_fh.write(f"{chrom1}\t{start1}\t{end1}\t{chrom2}\t{start2}\t{end2}\t{seq1}\t{seq2}\n")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _is_consecutive(prev: List[str], curr: List[str]) -> bool:
|
|
60
|
+
"""
|
|
61
|
+
Determine if two BEDPE rows are consecutive:
|
|
62
|
+
- consecutive in both sequences, or in ref-only or alt-only
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
prev1_start, prev1_end = prev[1], prev[2]
|
|
66
|
+
prev2_start, prev2_end = prev[4], prev[5]
|
|
67
|
+
curr1_start, curr1_end = curr[1], curr[2]
|
|
68
|
+
curr2_start, curr2_end = curr[4], curr[5]
|
|
69
|
+
|
|
70
|
+
# Both sequences advance
|
|
71
|
+
if int(prev1_end) == int(curr1_start) and int(prev2_end) + 1 == int(curr2_start) + 1:
|
|
72
|
+
return True
|
|
73
|
+
# Ref only
|
|
74
|
+
elif int(prev1_end) == int(curr1_start):
|
|
75
|
+
return True
|
|
76
|
+
# Alt only
|
|
77
|
+
elif int(prev2_end) == int(curr2_start):
|
|
78
|
+
return True
|
|
79
|
+
|
|
80
|
+
return False
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def join_consecutive_rows(input_file: str, output_file: str) -> None:
    """
    Merge runs of consecutive BEDPE rows from ``input_file`` into single rows.

    The first input line is always treated as a header and dropped; blank lines
    and lines with fewer than 8 tab-separated fields are skipped. Rows are grouped
    while ``_is_consecutive`` links each row to the previous one, and every group
    is written to ``output_file`` through ``_flush_buffer`` below a fresh header.
    """

    with open(input_file, 'r') as in_fh, open(output_file, 'w') as out_fh:
        out_fh.write("chrom1\tstart1\tend1\tchrom2\tstart2\tend2\tsequence1\tsequence2\n")

        # Drop the header line of the input, whatever it contains.
        next(in_fh, None)

        pending = []
        for raw_line in in_fh:
            text = raw_line.strip()
            if not text:
                continue

            fields = text.split("\t")
            if len(fields) < 8:
                continue

            # BEDPE row: chrom1, start1, end1, chrom2, start2, end2, nt1, nt2
            chrom1, start1, end1, chrom2, start2, end2, nt1, nt2 = fields
            record = [chrom1, start1, end1, chrom2, start2, end2, nt1, nt2]

            # A row that does not continue the current run closes that run first.
            if pending and not _is_consecutive(pending[-1], record):
                _flush_buffer(pending, out_fh)
                pending = []
            pending.append(record)

        # Write whatever run is still open at end of input.
        _flush_buffer(pending, out_fh)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from typing import Tuple, Optional
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def save_statistics(seq_name1: str, seq_name2: str,
                    length: int, mismatches: int, gaps: int,
                    output_file: str) -> None:
    """
    Write a plain-text summary of the alignment statistics to ``output_file``.

    Args:
        seq_name1, seq_name2: sequence labels used in the title line.
        length: number of aligned positions that were counted.
        mismatches: number of differing positions; this count already
            includes gap positions.
        gaps: number of gap positions.
        output_file: path of the report to create (overwritten).

    The two percentage lines are only written when ``length`` is positive.
    """

    with open(output_file, "w") as f:
        f.write(f"Alignment statistics between '{seq_name1}' and '{seq_name2}':\n")
        f.write(f" Total aligned positions (excluding ignored ends): {length}\n")
        f.write(f" Base mismatches (including gaps): {mismatches}\n")
        f.write(f" Gaps detected: {gaps}\n")
        if length > 0:
            # `mismatches` already contains the gap positions, so gaps must not
            # be added a second time here.
            mismatch_rate = mismatches / length * 100
            gap_rate = gaps / length * 100
            f.write(f" Mismatch rate (including gaps): {mismatch_rate:.8f}%\n")
            f.write(f" Gap rate: {gap_rate:.8f}%\n")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_offsets_from_file(aln_file_path: str) -> Tuple[int, int]:
    """
    Read the start positions of both sequences from the alignment header.

    The file is scanned for the lines starting with ``# 1:`` and ``# 2:``. The
    text between the colon and the first ``-`` is taken as the integer start
    position, e.g. ``# 1: 87413227-97757117`` yields ``87413227``. Scanning stops
    as soon as both values are known.

    Args:
        aln_file_path: path of the stretcher alignment file.

    Returns:
        ``(offset_seq1, offset_seq2)``.

    Raises:
        ValueError: if either header line is missing, or if its value does not
            start with an integer.
    """

    def _start_position(header_line: str) -> int:
        # Format: "# 1: 87413227-97757117"
        start_text = header_line.split(":")[1].strip().split("-")[0]
        try:
            return int(start_text)
        except ValueError as err:
            # Same exception type as before, but naming the file and the line.
            raise ValueError(
                f"Could not parse offsets from Stretcher file: {aln_file_path} "
                f"(header line {header_line!r} does not start with an integer position)"
            ) from err

    offset_seq1: Optional[int] = None
    offset_seq2: Optional[int] = None

    with open(aln_file_path, "r") as f:
        for line in f:
            line = line.strip()
            if line.startswith("# 1:"):
                offset_seq1 = _start_position(line)
            elif line.startswith("# 2:"):
                offset_seq2 = _start_position(line)

            if offset_seq1 is not None and offset_seq2 is not None:
                break

    if offset_seq1 is None or offset_seq2 is None:
        raise ValueError(f"Could not parse offsets from Stretcher file: {aln_file_path}")

    return offset_seq1, offset_seq2
|
|
51
|
+
|