phykit 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phykit/__init__.py +0 -0
- phykit/__main__.py +6 -0
- phykit/helpers/__init__.py +0 -0
- phykit/helpers/boolean_argument_parsing.py +12 -0
- phykit/helpers/caching.py +201 -0
- phykit/helpers/files.py +125 -0
- phykit/helpers/parallel.py +305 -0
- phykit/helpers/stats_summary.py +64 -0
- phykit/helpers/streaming.py +152 -0
- phykit/phykit.py +2862 -0
- phykit/services/__init__.py +0 -0
- phykit/services/alignment/__init__.py +17 -0
- phykit/services/alignment/alignment_length.py +16 -0
- phykit/services/alignment/alignment_length_no_gaps.py +69 -0
- phykit/services/alignment/alignment_recoding.py +89 -0
- phykit/services/alignment/base.py +103 -0
- phykit/services/alignment/column_score.py +66 -0
- phykit/services/alignment/compositional_bias_per_site.py +98 -0
- phykit/services/alignment/create_concatenation_matrix.py +254 -0
- phykit/services/alignment/dna_threader.py +145 -0
- phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
- phykit/services/alignment/faidx.py +21 -0
- phykit/services/alignment/gc_content.py +94 -0
- phykit/services/alignment/pairwise_identity.py +159 -0
- phykit/services/alignment/parsimony_informative_sites.py +81 -0
- phykit/services/alignment/rcv.py +14 -0
- phykit/services/alignment/rcvt.py +47 -0
- phykit/services/alignment/rename_fasta_entries.py +53 -0
- phykit/services/alignment/sum_of_pairs_score.py +157 -0
- phykit/services/alignment/variable_sites.py +54 -0
- phykit/services/base.py +9 -0
- phykit/services/tree/__init__.py +29 -0
- phykit/services/tree/base.py +178 -0
- phykit/services/tree/bipartition_support_stats.py +48 -0
- phykit/services/tree/branch_length_multiplier.py +37 -0
- phykit/services/tree/collapse_branches.py +27 -0
- phykit/services/tree/covarying_evolutionary_rates.py +272 -0
- phykit/services/tree/dvmc.py +37 -0
- phykit/services/tree/evolutionary_rate.py +17 -0
- phykit/services/tree/hidden_paralogy_check.py +128 -0
- phykit/services/tree/internal_branch_stats.py +77 -0
- phykit/services/tree/internode_labeler.py +33 -0
- phykit/services/tree/last_common_ancestor_subtree.py +35 -0
- phykit/services/tree/lb_score.py +196 -0
- phykit/services/tree/monophyly_check.py +106 -0
- phykit/services/tree/nearest_neighbor_interchange.py +140 -0
- phykit/services/tree/patristic_distances.py +113 -0
- phykit/services/tree/polytomy_test.py +546 -0
- phykit/services/tree/print_tree.py +28 -0
- phykit/services/tree/prune_tree.py +40 -0
- phykit/services/tree/rename_tree_tips.py +64 -0
- phykit/services/tree/rf_distance.py +136 -0
- phykit/services/tree/root_tree.py +35 -0
- phykit/services/tree/saturation.py +209 -0
- phykit/services/tree/spurious_sequence.py +75 -0
- phykit/services/tree/terminal_branch_stats.py +87 -0
- phykit/services/tree/tip_labels.py +18 -0
- phykit/services/tree/tip_to_tip_distance.py +41 -0
- phykit/services/tree/tip_to_tip_node_distance.py +41 -0
- phykit/services/tree/total_tree_length.py +25 -0
- phykit/services/tree/treeness.py +16 -0
- phykit/services/tree/treeness_over_rcv.py +40 -0
- phykit/version.py +1 -0
- phykit-2.1.0.dist-info/METADATA +150 -0
- phykit-2.1.0.dist-info/RECORD +69 -0
- phykit-2.1.0.dist-info/WHEEL +5 -0
- phykit-2.1.0.dist-info/entry_points.txt +121 -0
- phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
- phykit-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import os
|
|
3
|
+
from textwrap import dedent
|
|
4
|
+
from typing import Dict, List, Tuple
|
|
5
|
+
from Bio import SeqIO
|
|
6
|
+
from Bio.SeqRecord import SeqRecord
|
|
7
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
8
|
+
from functools import partial
|
|
9
|
+
import multiprocessing as mp
|
|
10
|
+
from collections import defaultdict
|
|
11
|
+
|
|
12
|
+
from .base import Alignment
|
|
13
|
+
from ...helpers.files import read_single_column_file_to_list
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CreateConcatenationMatrix(Alignment):
|
|
17
|
+
def __init__(self, args) -> None:
    """Initialise the service from parsed command-line arguments.

    The arguments are first mapped to keyword arguments by
    ``process_args`` and then forwarded to the parent initialiser.
    """
    parent_kwargs = self.process_args(args)
    super().__init__(**parent_kwargs)
|
|
19
|
+
|
|
20
|
+
def run(self) -> None:
    """Build the concatenation matrix for the configured list file and prefix."""
    list_path = self.alignment_list_path
    output_prefix = self.prefix
    self.create_concatenation_matrix(list_path, output_prefix)
|
|
25
|
+
|
|
26
|
+
def process_args(self, args) -> Dict[str, str]:
    """Translate parsed CLI arguments into constructor keyword arguments.

    Returns:
        A dict with ``alignment_list_path`` (from ``args.alignment_list``)
        and ``prefix`` (from ``args.prefix``).
    """
    return {
        "alignment_list_path": args.alignment_list,
        "prefix": args.prefix,
    }
|
|
28
|
+
|
|
29
|
+
def read_alignment_paths(self, alignment_list_path: str) -> List[str]:
    """Read the alignment list file and return its entries.

    Prints a message and exits with status 2 when the list file
    does not exist.
    """
    try:
        paths = read_single_column_file_to_list(alignment_list_path)
    except FileNotFoundError:
        print("Alignment list file (-a) is not found. Please check pathing.")
        sys.exit(2)
    return paths
|
|
35
|
+
|
|
36
|
+
@staticmethod
def _get_taxa_from_alignment(alignment_path: str) -> set:
    """Return the set of record identifiers found in one FASTA file."""
    identifiers = set()
    for entry in SeqIO.parse(alignment_path, "fasta"):
        identifiers.add(entry.id)
    return identifiers
|
|
40
|
+
|
|
41
|
+
def get_taxa_names(self, alignment_paths: List[str]) -> List[str]:
    """Return the sorted union of taxon identifiers across all alignments.

    Lists of more than 10 paths are scanned by a process pool sized to
    the smaller of the CPU count and the number of files; shorter lists
    are scanned in the current process.
    """
    n_files = len(alignment_paths)
    unique_taxa = set()

    if n_files <= 10:
        # Small input: scan each file in this process.
        for path in alignment_paths:
            unique_taxa.update(self._get_taxa_from_alignment(path))
        return sorted(unique_taxa)

    n_workers = min(mp.cpu_count(), n_files)
    with ProcessPoolExecutor(max_workers=n_workers) as pool:
        pending = [
            pool.submit(self._get_taxa_from_alignment, path)
            for path in alignment_paths
        ]
        # Completion order is irrelevant because results are merged into a set.
        for finished in as_completed(pending):
            unique_taxa.update(finished.result())

    return sorted(unique_taxa)
|
|
57
|
+
|
|
58
|
+
def print_start_message(
    self,
    taxa: List[str],
    alignment_paths: List[str],
    file_partition: str,
    fasta_output: str,
    file_occupancy: str,
) -> None:
    """Print a banner with the input counts and the three output file names."""
    n_taxa = len(taxa)
    n_alignments = len(alignment_paths)
    # dedent strips the common leading whitespace, so the banner is flush-left.
    banner = dedent(f"""
        --------------------
        | General features |
        --------------------
        Total number of taxa: {n_taxa}
        Total number of alignments: {n_alignments}


        ----------------
        | Output files |
        ----------------
        Partition file output: {file_partition}
        Concatenated fasta output: {fasta_output}
        Occupancy report: {file_occupancy}
        """)
    print(banner)
|
|
82
|
+
|
|
83
|
+
def get_list_of_taxa_and_records(
    self, alignment_path: str
) -> Tuple[set, List[SeqRecord]]:
    """Parse one FASTA file and return ``(record ids, records)``."""
    parsed = [entry for entry in SeqIO.parse(alignment_path, "fasta")]
    identifiers = set()
    for entry in parsed:
        identifiers.add(entry.id)
    return identifiers, parsed
|
|
89
|
+
|
|
90
|
+
def create_missing_seq_str(self, records: List[SeqRecord]) -> Tuple[str, int]:
    """Build the '?' placeholder used for taxa absent from an alignment.

    The placeholder length is taken from the first record's sequence.
    Prints a message and exits with status 2 when ``records`` is empty.

    Returns:
        ``(placeholder, alignment_length)``.
    """
    if len(records) == 0:
        print("No sequence records found. Exiting...")
        sys.exit(2)

    alignment_length = len(records[0].seq)
    return "?" * alignment_length, alignment_length
|
|
99
|
+
|
|
100
|
+
def process_taxa_sequences(
    self,
    records: List[SeqRecord],
    taxa: List[str],
    concatenated_seqs: Dict[str, List[str]],
    missing_seq: str,
) -> None:
    """Append one alignment's sequences to the running per-taxon lists.

    Every record contributes its sequence under its own id; every taxon in
    ``taxa`` that has no record in this alignment receives ``missing_seq``.
    ``concatenated_seqs`` is mutated in place.

    Args:
        records: Records parsed from the current alignment.
        taxa: All taxon names expected in the final matrix.
        concatenated_seqs: Mapping of taxon name to list of sequence chunks.
        missing_seq: Placeholder appended for absent taxa.
    """
    present_taxa = {record.id for record in records}

    # Add sequences for present taxa
    for record in records:
        concatenated_seqs[record.id].append(str(record.seq))

    # Walk ``taxa`` in the caller's order (de-duplicated) rather than
    # iterating a set difference: set iteration order for strings varies
    # between interpreter runs, which made the key insertion order of
    # ``concatenated_seqs`` nondeterministic.
    for taxon in dict.fromkeys(taxa):
        if taxon not in present_taxa:
            concatenated_seqs[taxon].append(missing_seq)
|
|
117
|
+
|
|
118
|
+
def add_to_partition_info(
    self,
    partition_info: List[str],
    og_len: int,
    field_one: str,
    fasta: str,
    first_len: int,
    second_len: int,
) -> Tuple[List[str], int, int]:
    """Append one partition line and advance the running coordinates.

    The new partition spans ``first_len`` to ``second_len + og_len``.

    Returns:
        ``(partition_info, next_start, end)`` where ``end`` is the last
        column of the partition just added and ``next_start`` is ``end + 1``.
    """
    end_column = second_len + og_len
    entry = f"{field_one}, {fasta}={first_len}-{end_column}\n"
    partition_info.append(entry)
    next_start = end_column + 1
    return partition_info, next_start, end_column
|
|
130
|
+
|
|
131
|
+
def add_to_occupancy_info(
    self,
    occupancy_info: List[str],
    present_taxa: set,
    taxa: List[str],
    fasta: str,
) -> List[str]:
    """Append one tab-separated occupancy row for an alignment.

    Columns: alignment name, number of taxa present, number missing,
    fraction present (four decimals), and the sorted missing taxa joined
    with ';'.
    """
    absent = sorted(set(taxa).difference(present_taxa))
    n_present = len(present_taxa)
    n_absent = len(absent)
    occupancy = n_present / len(taxa)
    columns = (
        f"{fasta}",
        f"{n_present}",
        f"{n_absent}",
        f"{occupancy:.4f}",
        ";".join(absent),
    )
    occupancy_info.append("\t".join(columns) + "\n")
    return occupancy_info
|
|
144
|
+
|
|
145
|
+
def fasta_file_write(self, fasta_output: str, concatenated_seqs: Dict[str, List[str]]) -> None:
    """Write each taxon's joined sequence chunks as one FASTA record.

    Records are written in the iteration order of ``concatenated_seqs``.
    """
    with open(fasta_output, "w", buffering=8192) as handle:
        for name in concatenated_seqs:
            # Join the per-alignment chunks once, then emit header + sequence.
            full_sequence = "".join(concatenated_seqs[name])
            handle.write(f">{name}\n")
            handle.write(f"{full_sequence}\n")
|
|
153
|
+
|
|
154
|
+
def write_occupancy_or_partition_file(self, info: List[str], output_file_name: str) -> None:
    """Write the prepared lines to ``output_file_name``, overwriting it.

    Each entry of ``info`` is written as-is; no newline is added.
    """
    with open(output_file_name, "w") as handle:
        for line in info:
            handle.write(line)
|
|
157
|
+
|
|
158
|
+
@staticmethod
def _process_alignment_file(alignment_path: str, taxa: List[str]) -> Tuple[str, Dict[str, str], set, int]:
    """Parse one alignment and return its per-taxon sequences.

    Args:
        alignment_path: Path of the FASTA alignment to read.
        taxa: All taxon names expected in the final matrix.

    Returns:
        ``(alignment_path, seq_dict, present_taxa, og_len)``. ``seq_dict``
        maps every name in ``taxa`` to its sequence in this file, or to a
        run of '?' of length ``og_len`` when the taxon is absent.
        ``og_len`` is the length of the first record's sequence. For a file
        with no records the result is ``(alignment_path, {}, set(), 0)``.
    """
    records = list(SeqIO.parse(alignment_path, "fasta"))

    if not records:
        return alignment_path, {}, set(), 0

    og_len = len(records[0].seq)
    missing_seq = '?' * og_len

    # Index the records once so each taxon is resolved with a single dict
    # lookup instead of a scan over every record. Only the first record for
    # an id is kept, matching the earlier first-match scan.
    seq_by_id = {}
    for record in records:
        if record.id not in seq_by_id:
            seq_by_id[record.id] = str(record.seq)

    seq_dict = {taxon: seq_by_id.get(taxon, missing_seq) for taxon in taxa}

    return alignment_path, seq_dict, set(seq_by_id), og_len
|
|
183
|
+
|
|
184
|
+
def create_concatenation_matrix(self, alignment_list_path: str, prefix: str) -> None:
    """Concatenate the listed alignments and write the three output files.

    Writes ``<prefix>.fa`` (concatenated FASTA), ``<prefix>.partition``
    (one line per alignment) and ``<prefix>.occupancy`` (one row per
    alignment). More than two alignments are parsed by a process pool;
    otherwise they are parsed in the current process. Exits with status 2
    if any alignment contains no records.

    Args:
        alignment_list_path: File listing one alignment path per line.
        prefix: Output path prefix; its directory is created if missing.
    """
    alignment_paths = self.read_alignment_paths(alignment_list_path)
    taxa = self.get_taxa_names(alignment_paths)

    # Create output directory if needed
    output_dir = os.path.dirname(prefix)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    # Assign output file names
    file_partition = f"{prefix}.partition"
    fasta_output = f"{prefix}.fa"
    file_occupancy = f"{prefix}.occupancy"

    self.print_start_message(taxa, alignment_paths, file_partition, fasta_output, file_occupancy)

    # Running partition coordinates and accumulated report lines
    first_len, second_len = 1, 0
    partition_info, occupancy_info = [], []

    # Seed every taxon up front, in the order of ``taxa``, so the FASTA
    # record order is the same whether the parallel or the sequential
    # branch fills the lists.
    concatenated_seqs = defaultdict(list, ((taxon, []) for taxon in taxa))

    if len(alignment_paths) > 2:
        with ProcessPoolExecutor(max_workers=min(mp.cpu_count(), 8)) as executor:
            process_func = partial(self._process_alignment_file, taxa=taxa)
            # Keep results indexed by path so they can be replayed in list order
            futures = {executor.submit(process_func, path): path for path in alignment_paths}
            results = {}
            for future in as_completed(futures):
                results[futures[future]] = future.result()

        # Process results in original order
        for alignment_path in alignment_paths:
            _, seq_dict, present_taxa, og_len = results[alignment_path]

            # An alignment without records yields an empty seq_dict; exit
            # with the same message the sequential branch prints instead of
            # failing with a KeyError on the lookup below.
            if not present_taxa:
                print("No sequence records found. Exiting...")
                sys.exit(2)

            for taxon in taxa:
                concatenated_seqs[taxon].append(seq_dict[taxon])

            partition_info, first_len, second_len = self.add_to_partition_info(
                partition_info, og_len, "AUTO", alignment_path, first_len, second_len
            )
            occupancy_info = self.add_to_occupancy_info(occupancy_info, present_taxa, taxa, alignment_path)
    else:
        # Process sequentially for small datasets
        for alignment_path in alignment_paths:
            present_taxa, records = self.get_list_of_taxa_and_records(alignment_path)
            missing_seq, og_len = self.create_missing_seq_str(records)

            self.process_taxa_sequences(records, taxa, concatenated_seqs, missing_seq)

            partition_info, first_len, second_len = self.add_to_partition_info(
                partition_info, og_len, "AUTO", alignment_path, first_len, second_len
            )
            occupancy_info = self.add_to_occupancy_info(occupancy_info, present_taxa, taxa, alignment_path)

    # Write output files (plain dict: no further keys should be auto-created)
    self.fasta_file_write(fasta_output, dict(concatenated_seqs))
    self.write_occupancy_or_partition_file(occupancy_info, file_occupancy)
    self.write_occupancy_or_partition_file(partition_info, file_partition)

    print("Complete!\n")
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from typing import Dict, List
|
|
3
|
+
import numpy as np
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
from Bio.Seq import Seq
|
|
6
|
+
from .base import Alignment
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DNAThreader(Alignment):
|
|
10
|
+
"""
|
|
11
|
+
Threads DNA on top of protein alignment
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
def __init__(self, args) -> None:
    """Store the command-line options on the instance via ``process_args``."""
    self.process_args(args)
|
|
16
|
+
|
|
17
|
+
def process_args(self, args):
    """Copy the relevant parsed CLI options onto instance attributes.

    Sets ``protein_file_path``, ``nucleotide_file_path``,
    ``clipkit_log_file`` and ``remove_stop_codon`` (from ``args.stop``).
    """
    # Input files
    self.protein_file_path = args.protein
    self.nucleotide_file_path = args.nucleotide
    # Optional trimming log and stop-codon switch
    self.clipkit_log_file = args.clipkit_log_file
    self.remove_stop_codon = args.stop
|
|
22
|
+
|
|
23
|
+
@property
def clipkit_log_data(self) -> List[List[str]]:
    """Rows of the ClipKIT log file, each split on single spaces.

    The file is read on every access. Returns ``None`` when no log file
    was configured.
    """
    log_path = self.clipkit_log_file
    if not log_path:
        return None

    with open(log_path) as handle:
        return [row.rstrip("\n").split(" ") for row in handle]
|
|
29
|
+
|
|
30
|
+
def run(self) -> None:
    """Thread the nucleotides onto the protein alignment and print FASTA to stdout."""
    protein_records = SeqIO.parse(self.protein_file_path, "fasta")
    threaded = self.thread(protein_records)

    for gene_id in threaded:
        # Header line followed by the threaded sequence line.
        print(f">{gene_id}\n{threaded[gene_id]}")
|
|
37
|
+
|
|
38
|
+
def create_mask(self, length: int) -> List[bool]:
    """Build the per-nucleotide keep/remove mask.

    Args:
        length: Mask length to use when no ClipKIT log data is available.

    Returns:
        ``[True] * length`` when there is no log data. Otherwise one
        boolean per log row, repeated three times (one per codon
        position), ``True`` where the row's second field is ``"keep"``.
    """
    # ``clipkit_log_data`` is a property; bind it once so it is evaluated
    # a single time rather than once for the test and once for the loop.
    log_rows = self.clipkit_log_data
    if not log_rows:
        return [True] * length

    # replicate the 'keep' / 'remove' status 3 times for each amino acid
    return [
        row[1] == "keep"
        for row in log_rows
        for _ in range(3)
    ]
|
|
48
|
+
|
|
49
|
+
def normalize_p_seq(self, p_seq: Seq) -> str:
    """Return ``p_seq`` with every symbol repeated three times in a row."""
    tripled = []
    for symbol in p_seq:
        tripled.append(symbol * 3)
    return "".join(tripled)
|
|
52
|
+
|
|
53
|
+
def normalize_n_seq(self, n_seq: Seq, p_seq: Seq) -> str:
    """Lay the codons of ``n_seq`` out against the symbols of ``p_seq``.

    ``n_seq`` is cut into consecutive 3-character chunks. For each symbol
    of ``p_seq``: one of '-', '?', '*', 'X', 'x' emits ``"---"`` without
    consuming a chunk; any other symbol emits the next chunk, or ``"---"``
    once the chunks are exhausted.
    """
    skip_symbols = {'-', '?', '*', 'X', 'x'}
    # Lazily yields the 3-character chunks in order.
    codon_iter = (str(n_seq[start:start + 3]) for start in range(0, len(n_seq), 3))

    pieces = []
    for symbol in p_seq:
        if symbol in skip_symbols:
            pieces.append("---")
            continue
        # Falls back to a gap codon when the nucleotide sequence runs out.
        pieces.append(next(codon_iter, "---"))

    return "".join(pieces)
|
|
71
|
+
|
|
72
|
+
def thread(self, prot_records) -> Dict[str, str]:
    """Thread nucleotide sequences onto the given protein records.

    Args:
        prot_records: Iterable of protein records, passed to
            ``SeqIO.to_dict``.

    Returns:
        Dict mapping each protein record id to its threaded nucleotide
        string, restricted to the positions selected by the keep mask.

    Exits with status 2 when no protein records are present or when a
    protein id has no matching record in the nucleotide file.
    """
    pal2nal = dict()
    prot_dict = SeqIO.to_dict(prot_records)

    if not prot_dict:
        print("Protein file is empty or incorrectly formatted.")
        sys.exit(2)

    # Pre-load nucleotide sequences only for proteins we have
    nucl_records = {}
    for record in SeqIO.parse(self.nucleotide_file_path, "fasta"):
        if record.id in prot_dict:
            nucl_records[record.id] = record

    # The mask length is derived from the first protein record only.
    length = len(next(iter(prot_dict.values())).seq)
    keep_mask = self.create_mask(length * 3)

    # Convert keep_mask to numpy array for faster operations
    keep_mask_arr = np.array(keep_mask)
    gap_chars = {'-', '?', '*', 'X', 'x'}

    for gene_id, protein_seq_record in prot_dict.items():
        try:
            if gene_id not in nucl_records:
                print(f"Nucleotide sequence for {gene_id} not found.")
                sys.exit(2)

            p_seq = protein_seq_record.seq
            n_seq = nucl_records[gene_id].seq

            # Get normalized sequences
            normalized_p_seq = self.normalize_p_seq(p_seq)
            # NOTE(review): normalize_n_seq is given the already-triplicated
            # protein string, and it emits three characters per input
            # symbol, so its output is 9x the protein length before the
            # trimming below. Please confirm that gapped protein alignments
            # still place codons at the expected columns.
            normalized_n_seq = self.normalize_n_seq(n_seq, normalized_p_seq)

            # Convert to numpy arrays for faster operations
            p_arr = np.array(list(normalized_p_seq), dtype='U1')
            n_arr = np.array(list(normalized_n_seq), dtype='U1')

            # Create mask for non-gap positions in protein
            non_gap_mask_protein = ~np.isin(p_arr, list(gap_chars))

            # Expand protein mask to nucleotide positions (each AA = 3 nucleotides)
            # NOTE(review): p_arr is built from the triplicated string, so
            # this repeat yields 9 entries per amino acid; only the first
            # min_len entries survive the trim -- verify this is intended.
            non_gap_mask = np.repeat(non_gap_mask_protein, 3)

            # Ensure masks have same shape
            min_len = min(len(non_gap_mask), len(keep_mask_arr), len(n_arr))
            non_gap_mask = non_gap_mask[:min_len]
            keep_mask_arr_trimmed = keep_mask_arr[:min_len]
            n_arr = n_arr[:min_len]

            # Combine masks
            final_mask = keep_mask_arr_trimmed & non_gap_mask

            # Apply masks and build result: positions outside the mask become '-'
            result = np.where(final_mask, n_arr, '-')

            # Handle stop codon if needed
            # p_seq[-1] is evaluated only when remove_stop_codon is truthy.
            if self.remove_stop_codon and p_seq[-1] == "*":
                # Find the last 3 positions that were kept
                kept_indices = np.where(keep_mask_arr_trimmed)[0]
                if len(kept_indices) >= 3:
                    last_3_indices = kept_indices[-3:]
                    # Copies the normalized nucleotides back over the
                    # masked values at those three positions.
                    for idx in last_3_indices:
                        if idx < len(result) and idx < len(n_arr):
                            result[idx] = n_arr[idx]

            # Only keep positions marked in keep_mask
            pal2nal[gene_id] = ''.join(result[keep_mask_arr_trimmed[:len(result)]])

        # NOTE(review): the membership test at the top of the loop already
        # exits for ids missing from nucl_records; this handler appears to
        # be a fallback for any other KeyError raised in the body.
        except KeyError:
            print(f"Nucleotide sequence for {gene_id} not found.")
            sys.exit(2)

    return pal2nal
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from collections import Counter
|
|
2
|
+
from typing import List, Dict
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from Bio.Align import MultipleSeqAlignment
|
|
6
|
+
|
|
7
|
+
from .base import Alignment
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class EvolutionaryRatePerSite(Alignment):
|
|
11
|
+
def __init__(self, args) -> None:
    """Initialise the service from parsed command-line arguments.

    The arguments are mapped to keyword arguments by ``process_args``
    and forwarded to the parent initialiser.
    """
    parent_kwargs = self.process_args(args)
    super().__init__(**parent_kwargs)
|
|
13
|
+
|
|
14
|
+
def run(self):
    """Print one ``<1-based site index>\\t<value rounded to 4 places>`` line per site."""
    loaded = self.get_alignment_and_format()
    alignment = loaded[0]
    site_values = self.calculate_evolutionary_rate_per_site(alignment)

    for position, site_value in enumerate(site_values, start=1):
        print(f"{position}\t{round(site_value, 4)}")
|
|
20
|
+
|
|
21
|
+
def process_args(self, args):
    """Translate parsed CLI arguments into constructor keyword arguments.

    Returns:
        A dict with ``alignment_file_path`` taken from ``args.alignment``.
    """
    return {"alignment_file_path": args.alignment}
|
|
23
|
+
|
|
24
|
+
def remove_gap_characters(self, seq: str, gap_chars: List[str]) -> str:
    """Drop every character found in ``gap_chars`` and upper-case the rest.

    Membership is tested before upper-casing, so it is case-sensitive.
    """
    kept = []
    for symbol in seq:
        if symbol in gap_chars:
            continue
        kept.append(symbol)
    return "".join(kept).upper()
|
|
26
|
+
|
|
27
|
+
def get_number_of_occurrences_per_character(
    self,
    alignment: MultipleSeqAlignment,
    idx: int,
    gap_chars: List[str]
) -> Dict[str, int]:
    """Count the non-gap characters in column ``idx`` of the alignment.

    The column is cleaned with ``remove_gap_characters`` before counting.
    """
    column = alignment[:, idx]
    return Counter(self.remove_gap_characters(column, gap_chars))
|
|
37
|
+
|
|
38
|
+
def calculate_pic(
    self,
    num_occurrences: Dict[str, int],
) -> float:
    """Return one minus the sum of squared character frequencies.

    Args:
        num_occurrences: Mapping of character to its count at one site.

    Returns:
        ``1 - sum((count / total) ** 2)``. When the counts sum to zero
        (for example an all-gap column), 0 is returned, matching what
        ``calculate_evolutionary_rate_per_site`` reports for such columns.
    """
    total_frequencies = sum(num_occurrences.values())
    # Guard against dividing by zero when nothing was counted.
    if total_frequencies == 0:
        return 0

    sum_of_frequencies = sum(
        (frequency / total_frequencies) ** 2
        for frequency in num_occurrences.values()
    )
    return 1 - sum_of_frequencies
|
|
48
|
+
|
|
49
|
+
def calculate_evolutionary_rate_per_site(
    self,
    alignment: MultipleSeqAlignment,
) -> List[float]:
    """Compute one value per alignment column.

    For each column the gap characters (from ``get_gap_chars``) are
    discarded and the value is one minus the sum of squared frequencies of
    the remaining upper-cased characters. Columns with no remaining
    characters get 0.
    """
    n_sites = alignment.get_alignment_length()
    gap_list = list(set(self.get_gap_chars()))

    # One row per record, one upper-cased character per cell.
    rows = [[symbol.upper() for symbol in str(entry.seq)] for entry in alignment]
    matrix = np.array(rows, dtype='U1')

    site_values = []
    for site in range(n_sites):
        site_column = matrix[:, site]
        observed = site_column[~np.isin(site_column, gap_list)]
        n_observed = len(observed)

        if n_observed == 0:
            # Nothing but gaps in this column.
            site_values.append(0)
            continue

        _, tallies = np.unique(observed, return_counts=True)
        site_values.append(1 - np.sum((tallies / n_observed) ** 2))

    return site_values
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from Bio import SeqIO
|
|
4
|
+
|
|
5
|
+
from .base import Alignment
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Faidx(Alignment):
|
|
9
|
+
def __init__(self, args) -> None:
    """Initialise the service from parsed command-line arguments.

    The arguments are mapped to keyword arguments by ``process_args``
    and forwarded to the parent initialiser.
    """
    parent_kwargs = self.process_args(args)
    super().__init__(**parent_kwargs)
|
|
11
|
+
|
|
12
|
+
def run(self) -> None:
    """Print the requested FASTA entries to stdout.

    ``self.entry`` is a comma-separated list of record ids; surrounding
    whitespace around each id is ignored. Each record is printed as a
    ``>name`` header followed by its sequence. A missing id raises
    ``KeyError`` from the index lookup.
    """
    record_dict = SeqIO.index(self.fasta, "fasta")
    try:
        # Split entries and iterate
        for e in map(str.strip, self.entry.split(",")):
            record = record_dict[e]
            print(f">{record.name}\n{record.seq}")
    finally:
        # Release the file handle held by the index, even if a lookup
        # fails part-way through.
        record_dict.close()
|
|
19
|
+
|
|
20
|
+
def process_args(self, args) -> Dict[str, str]:
    """Translate parsed CLI arguments into constructor keyword arguments.

    Returns:
        A dict with ``fasta`` and ``entry`` taken from the same-named
        attributes of ``args``.
    """
    return {
        "fasta": args.fasta,
        "entry": args.entry,
    }
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
import re
|
|
3
|
+
import sys
|
|
4
|
+
from typing import Dict, Tuple
|
|
5
|
+
from collections import Counter
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from Bio.Align import MultipleSeqAlignment
|
|
9
|
+
|
|
10
|
+
from .base import Alignment
|
|
11
|
+
from ...helpers.files import get_alignment_and_format
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FileFormat(Enum):
    """Alignment file format names; each member's value is the format string."""

    # Member names are valid identifiers; values are the format names.
    fasta = "fasta"
    clustal = "clustal"
    maf = "maf"
    mauve = "mauve"
    phylip = "phylip"
    # The value uses a hyphen, which is not allowed in the member name.
    phylip_seq = "phylip-sequential"
    stockholm = "stockholm"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class GCContent(Alignment):
|
|
25
|
+
def __init__(self, args) -> None:
    """Initialise the service from parsed command-line arguments.

    The arguments are mapped to keyword arguments by ``process_args``
    and forwarded to the parent initialiser.
    """
    parent_kwargs = self.process_args(args)
    super().__init__(**parent_kwargs)
|
|
27
|
+
|
|
28
|
+
def run(self):
    """Report GC content for the configured file.

    Exits with status 2 for protein input. With ``self.verbose`` set the
    per-sequence report is produced, otherwise the overall value.
    """
    records, _, is_protein = get_alignment_and_format(self.fasta)

    if is_protein:
        print("GC content can't be calculated for protein sequences")
        sys.exit(2)

    # Pick the reporting routine, then run it on the parsed records.
    report = self.calculate_gc_per_sequence if self.verbose else self.calculate_gc_total
    report(records)
|
|
39
|
+
|
|
40
|
+
def process_args(self, args) -> Dict[str, str]:
    """Translate parsed CLI arguments into constructor keyword arguments.

    Returns:
        A dict with ``fasta`` and ``verbose`` taken from the same-named
        attributes of ``args``.
    """
    return {
        "fasta": args.fasta,
        "verbose": args.verbose,
    }
|
|
42
|
+
|
|
43
|
+
def calculate_gc_per_sequence(self, records: MultipleSeqAlignment) -> None:
    """Print ``<record id>\\t<GC fraction rounded to 4 places>`` per record.

    Gap characters (from ``get_gap_chars``) are excluded from both the
    count and the denominator; a record with nothing left reports 0.
    A ``BrokenPipeError`` while printing is ignored.
    """
    gap_list = list(set(self.get_gap_chars()))

    for entry in records:
        bases = np.array(list(str(entry.seq).upper()), dtype='U1')
        ungapped = bases[~np.isin(bases, gap_list)]
        n_sites = len(ungapped)

        if n_sites == 0:
            gc_fraction = 0
        else:
            gc_fraction = np.sum((ungapped == 'G') | (ungapped == 'C')) / n_sites

        try:
            print(f"{entry.id}\t{round(gc_fraction, 4)}")
        except BrokenPipeError:
            # Output was closed by the reader (e.g. piped into `head`).
            pass
|
|
65
|
+
|
|
66
|
+
def calculate_gc_total(self, records: MultipleSeqAlignment) -> None:
    """Print the GC fraction over all records combined, rounded to 4 places.

    Gap characters (from ``get_gap_chars``) are excluded from both the
    count and the denominator. When no non-gap characters remain,
    including when ``records`` is empty, an error message is printed and
    the process exits with status 2.
    """
    gap_chars = set(self.get_gap_chars())

    # Join the sequences as text first: with no records this gives an empty
    # array and reaches the error branch below, whereas np.concatenate
    # raises ValueError when handed an empty list.
    combined = "".join(str(record.seq).upper() for record in records)
    combined_arr = np.array(list(combined), dtype='U1')

    # Filter out gaps
    non_gap_mask = ~np.isin(combined_arr, list(gap_chars))
    cleaned_seq = combined_arr[non_gap_mask]

    if len(cleaned_seq) == 0:
        print(
            "Input file has an unacceptable format. Please check input file argument."
        )
        sys.exit(2)

    # Count G and C
    gc_count = np.sum((cleaned_seq == 'G') | (cleaned_seq == 'C'))
    gc_content = round(gc_count / len(cleaned_seq), 4)
    print(gc_content)
|
|
87
|
+
|
|
88
|
+
def remove_gaps_and_count_gc(self, seq: str) -> Tuple[str, float]:
    """Strip gap characters from ``seq`` and count its G and C characters.

    Gap removal is case-sensitive; the G/C count is case-insensitive.

    Returns:
        ``(cleaned_seq, gc_count)`` where ``cleaned_seq`` keeps the
        original letter case and ``gc_count`` is the number of G or C
        characters in it.
    """
    gap_chars = self.get_gap_chars()
    char_class = "".join(re.escape(char) for char in gap_chars)

    # An empty character class ("[]") is not a valid pattern, so skip the
    # substitution entirely when there is nothing to strip.
    cleaned_seq = re.sub("[" + char_class + "]", "", seq) if char_class else seq

    # Upper-case once and count directly instead of building two Counters.
    upper_seq = cleaned_seq.upper()
    gc_count = upper_seq.count("G") + upper_seq.count("C")

    return cleaned_seq, gc_count
|