stralln 0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
stralln-0.1/PKG-INFO ADDED
@@ -0,0 +1,51 @@
1
+ Metadata-Version: 2.4
2
+ Name: stralln
3
+ Version: 0.1
4
+ Summary: A CLI tool for EMBOSS stretcher alignments parsing and VCF analysis
5
+ Author: nikolienka24
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: PyVCF3
9
+ Requires-Dist: pandas
10
+ Requires-Dist: python-Levenshtein
11
+
12
+ # straln: Stretcher Alignment & Alternative Mutation Finder
13
+
14
+ `straln` is a bioinformatics tool designed to parse `.aln` files in `markx0` format generated by
15
+ the **EMBOSS stretcher** tool.
16
+ It converts pairwise alignments into genomic formats (BEDPE/BED)
17
+ and can optionally intersect these results with VCF files to identify
18
+ alternative genomic variations nearby.
19
+
20
+ ## Features
21
+ * **Automated Parsing:** Converts EMBOSS stretcher `.aln` format to `BEDPE` and two separate `BED` files.
22
+ * **Coordinate Mapping:** Automatically extracts starting offsets from alignment headers or allows manual overrides.
23
+ * **Mutation Analysis:** Compares alignment gaps/mismatches with a VCF file to find alternative mutations within a specified window.
24
+ * **Refined Output:** Joins consecutive alignment blocks for cleaner downstream analysis.
25
+
26
+ ## Installation
27
+
28
+ Since the project uses `setuptools`, you can install it in editable mode:
29
+
30
+ ```bash
31
+ # From the root of the project (where setup.py is located)
32
+ pip install -e .
33
+ ```
34
+
35
+ ```bash
36
+ # Getting Help
37
+ straln --help
38
+ # only parse the alignment
39
+ straln my_alignment.aln -o ./output_folder
40
+ # parse alignment and find alternative mutations from a VCF (when using -vcf parameter -c is mandatory)
41
+ straln my_alignment.aln -vcf variations.vcf -c 17 -d 150 -o ./output_folder
42
+ ```
43
+
44
+ ### Output files
45
+ The tool generates the following files in the specified output folder:
46
+
47
+ * **`parsed.bedpe`**: Raw alignment blocks in BEDPE format.
48
+ * **`parsed.joined.bedpe`**: Merged consecutive alignment blocks for easier visualization of longer indels.
49
+ * **`seq1.bed` / `seq2.bed`**: Separate tracks for each sequence.
50
+ * **`stats.txt`**: A summary file containing the alignment length, number of mismatches, and total gaps.
51
+ * **`alternative_mutations.csv`**: *(Generated only when using the `-vcf` flag)* A list of variants from your VCF file that were found near alignment discrepancies.
stralln-0.1/README.md ADDED
@@ -0,0 +1,40 @@
1
+ # straln: Stretcher Alignment & Alternative Mutation Finder
2
+
3
+ `straln` is a bioinformatics tool designed to parse `.aln` files in `markx0` format generated by
4
+ the **EMBOSS stretcher** tool.
5
+ It converts pairwise alignments into genomic formats (BEDPE/BED)
6
+ and can optionally intersect these results with VCF files to identify
7
+ alternative genomic variations nearby.
8
+
9
+ ## Features
10
+ * **Automated Parsing:** Converts EMBOSS stretcher `.aln` format to `BEDPE` and two separate `BED` files.
11
+ * **Coordinate Mapping:** Automatically extracts starting offsets from alignment headers or allows manual overrides.
12
+ * **Mutation Analysis:** Compares alignment gaps/mismatches with a VCF file to find alternative mutations within a specified window.
13
+ * **Refined Output:** Joins consecutive alignment blocks for cleaner downstream analysis.
14
+
15
+ ## Installation
16
+
17
+ Since the project uses `setuptools`, you can install it in editable mode:
18
+
19
+ ```bash
20
+ # From the root of the project (where setup.py is located)
21
+ pip install -e .
22
+ ```
23
+
24
+ ```bash
25
+ # Getting Help
26
+ straln --help
27
+ # only parse the alignment
28
+ straln my_alignment.aln -o ./output_folder
29
+ # parse alignment and find alternative mutations from a VCF (when using -vcf parameter -c is mandatory)
30
+ straln my_alignment.aln -vcf variations.vcf -c 17 -d 150 -o ./output_folder
31
+ ```
32
+
33
+ ### Output files
34
+ The tool generates the following files in the specified output folder:
35
+
36
+ * **`parsed.bedpe`**: Raw alignment blocks in BEDPE format.
37
+ * **`parsed.joined.bedpe`**: Merged consecutive alignment blocks for easier visualization of longer indels.
38
+ * **`seq1.bed` / `seq2.bed`**: Separate tracks for each sequence.
39
+ * **`stats.txt`**: A summary file containing the alignment length, number of mismatches, and total gaps.
40
+ * **`alternative_mutations.csv`**: *(Generated only when using the `-vcf` flag)* A list of variants from your VCF file that were found near alignment discrepancies.
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "stralln"
7
+ version = "0.1"
8
+ description = "A CLI tool for EMBOSS stretcher alignments parsing and VCF analysis"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ authors = [
12
+ { name = "nikolienka24" }
13
+ ]
14
+
15
+ dependencies = [
16
+ "PyVCF3",
17
+ "pandas",
18
+ "python-Levenshtein",
19
+ ]
20
+
21
+ [project.scripts]
22
+ straln = "stretcher_parser.__main__:main"
23
+
24
+ [tool.setuptools]
25
+ packages = ["stretcher_parser"]
stralln-0.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
stralln-0.1/setup.py ADDED
@@ -0,0 +1,17 @@
1
"""Legacy setuptools entry point for the distribution.

The values below mirror the ``[project]`` table in pyproject.toml so that both
build paths describe the same distribution.
"""
from setuptools import setup, find_packages

setup(
    # Distribution name as declared in pyproject.toml / PKG-INFO; the installed
    # console command is still called ``straln`` (see entry_points below).
    name="stralln",
    version="0.1",
    packages=find_packages(),
    install_requires=[
        "PyVCF3",
        "pandas",
        # Same requirement string as pyproject.toml and requires.txt.
        "python-Levenshtein",
    ],
    entry_points={
        'console_scripts': [
            'straln=stretcher_parser.__main__:main',
        ],
    },
)
@@ -0,0 +1,51 @@
1
+ Metadata-Version: 2.4
2
+ Name: stralln
3
+ Version: 0.1
4
+ Summary: A CLI tool for EMBOSS stretcher alignments parsing and VCF analysis
5
+ Author: nikolienka24
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: PyVCF3
9
+ Requires-Dist: pandas
10
+ Requires-Dist: python-Levenshtein
11
+
12
+ # straln: Stretcher Alignment & Alternative Mutation Finder
13
+
14
+ `straln` is a bioinformatics tool designed to parse `.aln` files in `markx0` format generated by
15
+ the **EMBOSS stretcher** tool.
16
+ It converts pairwise alignments into genomic formats (BEDPE/BED)
17
+ and can optionally intersect these results with VCF files to identify
18
+ alternative genomic variations nearby.
19
+
20
+ ## Features
21
+ * **Automated Parsing:** Converts EMBOSS stretcher `.aln` format to `BEDPE` and two separate `BED` files.
22
+ * **Coordinate Mapping:** Automatically extracts starting offsets from alignment headers or allows manual overrides.
23
+ * **Mutation Analysis:** Compares alignment gaps/mismatches with a VCF file to find alternative mutations within a specified window.
24
+ * **Refined Output:** Joins consecutive alignment blocks for cleaner downstream analysis.
25
+
26
+ ## Installation
27
+
28
+ Since the project uses `setuptools`, you can install it in editable mode:
29
+
30
+ ```bash
31
+ # From the root of the project (where setup.py is located)
32
+ pip install -e .
33
+ ```
34
+
35
+ ```bash
36
+ # Getting Help
37
+ straln --help
38
+ # only parse the alignment
39
+ straln my_alignment.aln -o ./output_folder
40
+ # parse alignment and find alternative mutations from a VCF (when using -vcf parameter -c is mandatory)
41
+ straln my_alignment.aln -vcf variations.vcf -c 17 -d 150 -o ./output_folder
42
+ ```
43
+
44
+ ### Output files
45
+ The tool generates the following files in the specified output folder:
46
+
47
+ * **`parsed.bedpe`**: Raw alignment blocks in BEDPE format.
48
+ * **`parsed.joined.bedpe`**: Merged consecutive alignment blocks for easier visualization of longer indels.
49
+ * **`seq1.bed` / `seq2.bed`**: Separate tracks for each sequence.
50
+ * **`stats.txt`**: A summary file containing the alignment length, number of mismatches, and total gaps.
51
+ * **`alternative_mutations.csv`**: *(Generated only when using the `-vcf` flag)* A list of variants from your VCF file that were found near alignment discrepancies.
@@ -0,0 +1,14 @@
1
+ README.md
2
+ pyproject.toml
3
+ setup.py
4
+ stralln.egg-info/PKG-INFO
5
+ stralln.egg-info/SOURCES.txt
6
+ stralln.egg-info/dependency_links.txt
7
+ stralln.egg-info/entry_points.txt
8
+ stralln.egg-info/requires.txt
9
+ stralln.egg-info/top_level.txt
10
+ stretcher_parser/__init__.py
11
+ stretcher_parser/__main__.py
12
+ stretcher_parser/parser.py
13
+ stretcher_parser/reformat.py
14
+ stretcher_parser/utils.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ straln = stretcher_parser.__main__:main
@@ -0,0 +1,3 @@
1
+ PyVCF3
2
+ pandas
3
+ python-Levenshtein
@@ -0,0 +1 @@
1
+ stretcher_parser
File without changes
@@ -0,0 +1,120 @@
1
+ import sys
2
+ import os
3
+ import argparse
4
+
5
+ import vcf
6
+ import pandas as pd
7
+
8
+ from . import parser, reformat
9
+ from .utils import save_statistics, get_offsets_from_file
10
+ from analysis import find_alternative_mutations
11
+
12
+
13
def parse_arguments() -> argparse.Namespace:
    """
    Build the command-line interface and parse ``sys.argv``.

    Returns:
        The parsed namespace with the attributes ``aln_input_file``,
        ``output_folder``, ``vcf_input_file``, ``chromosome``, ``distance``,
        ``seq_name1``, ``seq_name2``, ``offset1`` and ``offset2``.

    Raises:
        SystemExit: raised by argparse on invalid usage, including the case
            where ``-vcf`` is given without ``-c``.
    """
    parser_arg = argparse.ArgumentParser(
        description="""
        straln: Stretcher Alignment & Alternative Mutation Finder
        ---------------------------------------------------------
        Parses .aln files from EMBOSS stretcher into BEDPE/BED formats and optionally
        compares them with VCF files to identify alternative genomic variations.
        """,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    # ----- POSITIONAL ARGUMENTS -----
    parser_arg.add_argument("aln_input_file", help="path to the .aln alignment file from EMBOSS stretcher (.aln format)")

    # ----- OPTIONAL ARGUMENTS -----
    parser_arg.add_argument("-o", "--output_folder", type=str, default=None, help="path to output folder (default: current folder)")

    # ----- MUTATION ANALYSIS GROUP (OPTIONAL) -----
    mutation = parser_arg.add_argument_group('Alternative Mutation Analysis (Optional)')
    mutation.add_argument(
        "-vcf", "--vcf_input_file",
        help="input VCF file, providing this triggers the search for alternative mutations"
    )
    # Registered on the mutation group (not the root parser) so that --help
    # lists it next to the other VCF-related options.
    mutation.add_argument(
        "-c", "--chromosome",
        help="target chromosome (1-22, X, Y)")
    mutation.add_argument(
        "-d", "--distance", type=int, default=100,
        help="window size (bp) for finding nearby alternative mutations (default: 100)"
    )

    # ----- ALIGNMENT METADATA GROUP -----
    metadata = parser_arg.add_argument_group('Alignment Metadata & Offsets (Optional)')
    metadata.add_argument("-s1", "--seq_name1", default="Seq1", help="label for the first sequence")
    metadata.add_argument("-s2", "--seq_name2", default="Seq2", help="label for the second sequence")
    metadata.add_argument("-off1", "--offset1", type=int, help="starting position for seq1 (if not provided, the tool automatically extracts it from the .aln header)")
    metadata.add_argument("-off2", "--offset2", type=int, help="starting position for seq2 (if not provided, the tool automatically extracts it from the .aln header)")

    args = parser_arg.parse_args()

    # The usage notes state that -c is mandatory whenever -vcf is used; fail
    # here with a usage message instead of handing chromosome=None downstream.
    if args.vcf_input_file and args.chromosome is None:
        parser_arg.error("-c/--chromosome is required when -vcf/--vcf_input_file is given")

    return args
52
+
53
+
54
def main() -> None:
    """
    Command-line entry point: run the whole straln pipeline.

    Steps, in order:
      0. read the CLI arguments and resolve the two sequence offsets,
      1. parse the alignment into ``parsed.bedpe`` and write ``stats.txt``,
      2. merge consecutive rows into ``parsed.joined.bedpe``,
      3. split the merged file into ``seq1.bed`` and ``seq2.bed``,
      4. if a VCF file was given, search it for alternative mutations,
      5. print where the output files were written.
    """
    args = parse_arguments()

    # 0. parse input files =============================================================
    aln_input_file = args.aln_input_file

    vcf_input_file = None
    chromosome = None
    if args.vcf_input_file:
        vcf_input_file = args.vcf_input_file
        chromosome = args.chromosome

    output_folder = args.output_folder or "."
    os.makedirs(output_folder, exist_ok=True)

    # Fall back to generic labels when an empty label was passed explicitly.
    seq_name1 = args.seq_name1 or "seq1"
    seq_name2 = args.seq_name2 or "seq2"

    # Use the value as given. The earlier truthiness test (`if args.distance`)
    # silently replaced an explicit `-d 0` with None.
    distance_threshold = args.distance

    if args.offset1 is not None and args.offset2 is not None:
        offset1 = args.offset1
        offset2 = args.offset2
    else:
        # If one or both are missing, we fetch them from the alignment file
        offset1, offset2 = get_offsets_from_file(aln_input_file)

        # If the user provided one but not the other, the manual one takes precedence
        if args.offset1 is not None:
            offset1 = args.offset1
        if args.offset2 is not None:
            offset2 = args.offset2

        print(f"Using offsets extracted from alignment header: {offset1}, {offset2}")

    # 1. stretcher parser =================================================================
    parsed_file = os.path.join(output_folder, "parsed.bedpe")
    length, mismatches, gaps = parser.run(aln_input_file, parsed_file, seq_name1, seq_name2, offset1, offset2)

    output_stats = os.path.join(output_folder, "stats.txt")
    save_statistics(seq_name1, seq_name2, length, mismatches, gaps, output_stats)

    # 2. join consecutive rows ============================================================
    parsed_joined_file = os.path.join(output_folder, "parsed.joined.bedpe")
    reformat.join_consecutive_rows(parsed_file, parsed_joined_file)

    # 3. convert the joined BEDPE into two separate BED files ==============================
    bed1 = os.path.join(output_folder, "seq1.bed")
    bed2 = os.path.join(output_folder, "seq2.bed")
    reformat.bedpe_to_bed(parsed_joined_file, bed1, bed2)

    # 4. find alternative mutations =======================================================
    # NOTE(review): find_alternative_mutations is imported at module level from a
    # top-level `analysis` package; confirm that package ships with the distribution.
    if vcf_input_file:
        find_alternative_mutations.find(vcf_input_file, parsed_joined_file, output_folder, chromosome, distance_threshold)

    # 5. print final information =======================================================
    print("Output files saved to " + output_folder)


if __name__ == "__main__":
    main()
@@ -0,0 +1,154 @@
1
+ from typing import Tuple, List, Optional, Union
2
+ from stretcher_parser.utils import get_offsets_from_file
3
+
4
+
5
+ def _check_sequences(sequence_name1: str, sequence_name2: str,
6
+ seq1_data: List[Union[str, int]], seq2_data: List[Union[str, int]],
7
+ found_start: bool, buffer: List[str],
8
+ prefix_a: Optional[str], prefix_b: Optional[str]) -> Tuple[str, int, int, int, bool, List[str], int, int, Optional[str], Optional[str]]:
9
+ """
10
+ Looks at the alignment letter-by-letter to find where the two sequences don't match.
11
+ It figures out the exact positions for mutations and handles insertions or deletions.
12
+ """
13
+ output = ""
14
+ p1, p2 = seq1_data[1], seq2_data[1]
15
+ seq1_chunk, seq2_chunk = seq1_data[0], seq2_data[0]
16
+
17
+ line_len, line_len_curr = 0, 0
18
+ line_mismatches, line_gaps = 0, 0
19
+
20
+ for a, b in zip(seq1_chunk, seq2_chunk):
21
+ if a != "-":
22
+ p1 += 1
23
+ if b != "-":
24
+ p2 += 1
25
+
26
+ if not found_start:
27
+ if a in ("A", "C", "G", "T") and b in ("A", "C", "G", "T"):
28
+ found_start = True
29
+ else:
30
+ continue
31
+
32
+ line_len_curr += 1
33
+
34
+ # 0-based BEDPE coordinates
35
+ pos1_start, pos1_end = p1 - 1, p1 - 1
36
+ pos2_start, pos2_end = p2 - 1, p2 - 1
37
+
38
+ if a != b:
39
+ if a != "-" and b == "-": # deletion in seq2
40
+ out1 = prefix_a + a # REF: prefix + deleted base(s)
41
+ pos1_end += 1
42
+ out2 = prefix_b # ALT: prefix only
43
+ elif a == "-" and b != "-": # deletion in seq1
44
+ out1 = prefix_a # REF: prefix only
45
+ out2 = prefix_b + b # ALT: prefix + inserted base(s)
46
+ pos2_end += 1
47
+ else:
48
+ # normal mismatch
49
+ out1 = a
50
+ out2 = b
51
+
52
+ buffer.append(
53
+ f"{sequence_name1}\t{pos1_start}\t{pos1_end}\t"
54
+ f"{sequence_name2}\t{pos2_start}\t{pos2_end}\t"
55
+ f"{out1}\t{out2}\n"
56
+ )
57
+
58
+ line_mismatches += 1
59
+ if a == "-" or b == "-":
60
+ line_gaps += 1
61
+
62
+ # Flush buffer on real aligned A/C/G/T pairs
63
+ if a in ("A", "C", "G", "T") and b in ("A", "C", "G", "T"):
64
+ output += "".join(buffer)
65
+ line_len += line_len_curr
66
+ line_len_curr = 0
67
+ buffer.clear()
68
+
69
+ if a in ("A", "C", "G", "T"):
70
+ prefix_a = a
71
+ if b in ("A", "C", "G", "T"):
72
+ prefix_b = b
73
+
74
+ return output, line_len, line_mismatches, line_gaps, found_start, buffer, p1, p2, prefix_a, prefix_b
75
+
76
+
77
def _parse(input_file: str, output_file: str,
           seq_name1: str, seq_name2:str,
           offset_seq1: int = 0, offset_seq2: int = 0) -> Tuple[int, int, int]:
    """
    Read an alignment file and write every difference as a BEDPE-style row.

    The two leading blocks of '#' lines are skipped. The body is then consumed
    in groups: sequence-1 line, match line, sequence-2 line, lower position
    line, blank line, plus one further line read at the end of each round.
    Reading stops at a '#' line or at the end of the file.

    Args:
        input_file: path of the alignment file to read.
        output_file: path of the BEDPE file to create (overwritten).
        seq_name1, seq_name2: labels written into the chrom columns.
        offset_seq1, offset_seq2: starting positions of the two sequences.

    Returns:
        (seq_len, mismatches, gaps) summed over all chunks.

    Raises:
        ValueError: if a sequence line carries a label but no sequence column.
    """

    with open(input_file) as infile, open(output_file, 'w') as outfile:
        # BEDPE header with 2 additional columns
        outfile.write("chrom1\tstart1\tend1\tchrom2\tstart2\tend2\tnucleotide1\tnucleotide2\n")

        found_start = False
        buffer = []
        seq_len, mismatches, gaps = 0, 0, 0

        # read header: two consecutive runs of '#' lines
        line = infile.readline()
        while line.startswith("#"):
            line = infile.readline()

        line = infile.readline()
        while line.startswith("#"):
            line = infile.readline()

        infile.readline()  # skip one more line after the header
        # end read header

        curr1, curr2 = offset_seq1, offset_seq2
        prefix_a, prefix_b = None, None
        while True:
            if line.startswith("#") or not line:
                break

            seq1_line = infile.readline().strip().split()
            infile.readline()  # match line
            seq2_line = infile.readline().strip().split()
            infile.readline()  # lower pos
            infile.readline()  # blank line

            # End of file (or a blank line where a sequence was expected):
            # stop cleanly instead of failing with IndexError on [0].
            if not seq1_line or not seq2_line:
                break

            if seq1_line[0].startswith("#") or seq2_line[0].startswith("#"):
                break

            # A label without a sequence column means the file is malformed.
            if len(seq1_line) < 2 or len(seq2_line) < 2:
                raise ValueError(f"Malformed sequence line in alignment file: {input_file}")

            seq1_seq = seq1_line[1]
            seq2_seq = seq2_line[1]

            out, line_length, mm_add, gaps_add, found_start, buffer, curr1, curr2, prefix_a, prefix_b = _check_sequences(
                seq_name1, seq_name2,
                [seq1_seq, curr1],
                [seq2_seq, curr2],
                found_start, buffer,
                prefix_a, prefix_b
            )
            outfile.write(out)

            seq_len += line_length
            mismatches += mm_add
            gaps += gaps_add

            line = infile.readline()
            if not line:
                break

    return seq_len, mismatches, gaps
144
+
145
+
146
def run(in_file: str, out_file: str,
        seq_name1: str, seq_name2: str,
        offset1: int, offset2: int) -> Tuple[int, int, int]:
    """
    Public entry point of the parser module.

    Hands all arguments to ``_parse`` and returns its three counters as
    ``(length, mismatches, gaps)``.
    """
    summary = _parse(in_file, out_file, seq_name1, seq_name2, offset1, offset2)
    aligned_length, mismatch_count, gap_count = summary
    return aligned_length, mismatch_count, gap_count
@@ -0,0 +1,118 @@
1
+ from typing import List, TextIO
2
+
3
+ ### FORMAT CONVERSIONS ===========================================================
4
def bedpe_to_bed(infile: str, out1: str, out2: str) -> None:
    """
    Split an 8-column BEDPE file into two 4-column BED files.

    The first line of ``infile`` is treated as a header and dropped. Columns
    1-3 and 7 of every following row go to ``out1``; columns 4-6 and 8 go to
    ``out2``. Both outputs start with their own header line.

    Args:
        infile: path of the BEDPE file to read.
        out1: path of the BED file for the first sequence (overwritten).
        out2: path of the BED file for the second sequence (overwritten).

    Raises:
        ValueError: if a non-blank row does not have exactly 8 tab-separated fields.
    """
    bed_header = "chrom\tstart\tend\tseq\n"

    with open(infile) as f, open(out1, "w") as bed1, open(out2, "w") as bed2:
        # Drop the BEDPE header; the default keeps an empty input from
        # raising StopIteration.
        next(f, None)
        bed1.write(bed_header)
        bed2.write(bed_header)

        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file

            # Strip only the line terminator so that an empty last column
            # still counts as a field.
            chrom1, start1, end1, chrom2, start2, end2, seq1, seq2 = line.rstrip("\r\n").split("\t")
            bed1.write(f"{chrom1}\t{start1}\t{end1}\t{seq1}\n")
            bed2.write(f"{chrom2}\t{start2}\t{end2}\t{seq2}\n")
19
+
20
+
21
+ ### REFORMATTING =================================================================
22
+ def _flush_buffer(buf: List[List[str]], out_fh: TextIO) -> None:
23
+ """
24
+ Merge consecutive positions in buffer and write a single BEDPE row.
25
+ - Coordinates: start = first position, end = last position
26
+ - Nucleotides: concatenate all nucleotides in the run, ignoring gaps
27
+ """
28
+ if not buf:
29
+ return
30
+
31
+ # assume that chrom1 and chrom2 are in the whole file same
32
+ chrom1 = buf[0][0]
33
+ chrom2 = buf[0][3]
34
+
35
+ # Get start/end coordinates for ref and alt
36
+ seq1, seq2 = "", ""
37
+ start1, end1 = None, None
38
+ start2, end2 = None, None
39
+ for idx, line in enumerate(buf):
40
+ if idx == 0:
41
+ start1 = line[1]
42
+ start2 = line[4]
43
+ seq1, seq2 = line[6], line[7]
44
+ continue
45
+
46
+ if len(line[6]) == 1 and len(line[7]) == 1:
47
+ seq1 += line[6]
48
+ seq2 += line[7]
49
+ elif len(line[6]) == 2 and len(line[7]) == 1:
50
+ seq1 += line[6][1]
51
+ elif len(line[6]) == 1 and len(line[7]) == 2:
52
+ seq2 += line[7][1]
53
+
54
+ end1, end2 = buf[-1][2], buf[-1][5]
55
+
56
+ out_fh.write(f"{chrom1}\t{start1}\t{end1}\t{chrom2}\t{start2}\t{end2}\t{seq1}\t{seq2}\n")
57
+
58
+
59
+ def _is_consecutive(prev: List[str], curr: List[str]) -> bool:
60
+ """
61
+ Determine if two BEDPE rows are consecutive:
62
+ - consecutive in both sequences, or in ref-only or alt-only
63
+ """
64
+
65
+ prev1_start, prev1_end = prev[1], prev[2]
66
+ prev2_start, prev2_end = prev[4], prev[5]
67
+ curr1_start, curr1_end = curr[1], curr[2]
68
+ curr2_start, curr2_end = curr[4], curr[5]
69
+
70
+ # Both sequences advance
71
+ if int(prev1_end) == int(curr1_start) and int(prev2_end) + 1 == int(curr2_start) + 1:
72
+ return True
73
+ # Ref only
74
+ elif int(prev1_end) == int(curr1_start):
75
+ return True
76
+ # Alt only
77
+ elif int(prev2_end) == int(curr2_start):
78
+ return True
79
+
80
+ return False
81
+
82
+
83
def join_consecutive_rows(input_file: str, output_file: str) -> None:
    """
    Merge runs of touching BEDPE rows into single rows.

    The first line of ``input_file`` is skipped as a header, and blank lines
    and rows with fewer than 8 tab-separated fields are ignored. Rows are
    collected while ``_is_consecutive`` links each one to the previous row;
    every completed run is written to ``output_file`` through
    ``_flush_buffer``, below a fresh header line.
    """

    with open(input_file, 'r') as in_fh, open(output_file, 'w') as out_fh:
        out_fh.write("chrom1\tstart1\tend1\tchrom2\tstart2\tend2\tsequence1\tsequence2\n")

        pending = []
        header_skipped = False
        for raw_line in in_fh:
            # The very first line is the header, whatever it contains.
            if not header_skipped:
                header_skipped = True
                continue

            stripped = raw_line.strip()
            if not stripped:
                continue

            fields = stripped.split("\t")
            if len(fields) < 8:
                continue

            # BEDPE row: chrom1, start1, end1, chrom2, start2, end2, nt1, nt2
            c1, s1, e1, c2, s2, e2, n1, n2 = fields
            record = [c1, s1, e1, c2, s2, e2, n1, n2]

            if pending and not _is_consecutive(pending[-1], record):
                # The run ended: emit it and start a new one with this row.
                _flush_buffer(pending, out_fh)
                pending = [record]
            else:
                pending.append(record)

        # Emit whatever run is still open at the end of the file.
        _flush_buffer(pending, out_fh)
@@ -0,0 +1,51 @@
1
+ from typing import Tuple, Optional
2
+
3
+
4
def save_statistics(seq_name1: str, seq_name2: str,
                    length: int, mismatches: int, gaps: int,
                    output_file: str) -> None:
    """
    Write a plain-text summary of the alignment counters.

    Args:
        seq_name1, seq_name2: labels of the two sequences, quoted in the title line.
        length: number of aligned positions.
        mismatches: number of differing positions; as the report label states,
            this count already includes the gap positions.
        gaps: number of differing positions that involve a gap.
        output_file: path of the report to create (overwritten).

    The two percentage lines are only written when ``length`` is greater than
    zero, which avoids a division by zero.
    """

    with open(output_file, "w") as f:
        f.write(f"Alignment statistics between '{seq_name1}' and '{seq_name2}':\n")
        f.write(f" Total aligned positions (excluding ignored ends): {length}\n")
        f.write(f" Base mismatches (including gaps): {mismatches}\n")
        f.write(f" Gaps detected: {gaps}\n")
        if length > 0:
            # `mismatches` already contains the gaps, so adding `gaps` again
            # would count every gap twice.
            mismatch_rate = mismatches / length * 100
            gap_rate = gaps / length * 100
            f.write(f" Mismatch rate (including gaps): {mismatch_rate:.8f}%\n")
            f.write(f" Gap rate: {gap_rate:.8f}%\n")
22
+
23
+
24
def get_offsets_from_file(aln_file_path: str) -> Tuple[int, int]:
    """
    Extract the two starting positions from the header of an alignment file.

    The file is scanned for a line starting with ``# 1:`` and one starting
    with ``# 2:``, each expected in the form ``# 1: 87413227-97757117``. The
    integer before the dash is taken as the offset of that sequence. Scanning
    stops as soon as both offsets are known.

    Returns:
        (offset of sequence 1, offset of sequence 2)

    Raises:
        ValueError: if either header line is missing, or if the text before
            the dash is not an integer.
    """

    def _range_start(header_line: str) -> int:
        # Text between the first and second colon, e.g. " 87413227-97757117"
        coordinates = header_line.split(":")[1].strip()
        return int(coordinates.split("-")[0])

    first_offset: Optional[int] = None
    second_offset: Optional[int] = None

    with open(aln_file_path, "r") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text.startswith("# 1:"):
                first_offset = _range_start(text)
            elif text.startswith("# 2:"):
                second_offset = _range_start(text)

            if not (first_offset is None or second_offset is None):
                break

    if first_offset is None or second_offset is None:
        raise ValueError(f"Could not parse offsets from Stretcher file: {aln_file_path}")

    return first_offset, second_offset
51
+