phykit 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phykit/__init__.py +0 -0
- phykit/__main__.py +6 -0
- phykit/helpers/__init__.py +0 -0
- phykit/helpers/boolean_argument_parsing.py +12 -0
- phykit/helpers/caching.py +201 -0
- phykit/helpers/files.py +125 -0
- phykit/helpers/parallel.py +305 -0
- phykit/helpers/stats_summary.py +64 -0
- phykit/helpers/streaming.py +152 -0
- phykit/phykit.py +2862 -0
- phykit/services/__init__.py +0 -0
- phykit/services/alignment/__init__.py +17 -0
- phykit/services/alignment/alignment_length.py +16 -0
- phykit/services/alignment/alignment_length_no_gaps.py +69 -0
- phykit/services/alignment/alignment_recoding.py +89 -0
- phykit/services/alignment/base.py +103 -0
- phykit/services/alignment/column_score.py +66 -0
- phykit/services/alignment/compositional_bias_per_site.py +98 -0
- phykit/services/alignment/create_concatenation_matrix.py +254 -0
- phykit/services/alignment/dna_threader.py +145 -0
- phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
- phykit/services/alignment/faidx.py +21 -0
- phykit/services/alignment/gc_content.py +94 -0
- phykit/services/alignment/pairwise_identity.py +159 -0
- phykit/services/alignment/parsimony_informative_sites.py +81 -0
- phykit/services/alignment/rcv.py +14 -0
- phykit/services/alignment/rcvt.py +47 -0
- phykit/services/alignment/rename_fasta_entries.py +53 -0
- phykit/services/alignment/sum_of_pairs_score.py +157 -0
- phykit/services/alignment/variable_sites.py +54 -0
- phykit/services/base.py +9 -0
- phykit/services/tree/__init__.py +29 -0
- phykit/services/tree/base.py +178 -0
- phykit/services/tree/bipartition_support_stats.py +48 -0
- phykit/services/tree/branch_length_multiplier.py +37 -0
- phykit/services/tree/collapse_branches.py +27 -0
- phykit/services/tree/covarying_evolutionary_rates.py +272 -0
- phykit/services/tree/dvmc.py +37 -0
- phykit/services/tree/evolutionary_rate.py +17 -0
- phykit/services/tree/hidden_paralogy_check.py +128 -0
- phykit/services/tree/internal_branch_stats.py +77 -0
- phykit/services/tree/internode_labeler.py +33 -0
- phykit/services/tree/last_common_ancestor_subtree.py +35 -0
- phykit/services/tree/lb_score.py +196 -0
- phykit/services/tree/monophyly_check.py +106 -0
- phykit/services/tree/nearest_neighbor_interchange.py +140 -0
- phykit/services/tree/patristic_distances.py +113 -0
- phykit/services/tree/polytomy_test.py +546 -0
- phykit/services/tree/print_tree.py +28 -0
- phykit/services/tree/prune_tree.py +40 -0
- phykit/services/tree/rename_tree_tips.py +64 -0
- phykit/services/tree/rf_distance.py +136 -0
- phykit/services/tree/root_tree.py +35 -0
- phykit/services/tree/saturation.py +209 -0
- phykit/services/tree/spurious_sequence.py +75 -0
- phykit/services/tree/terminal_branch_stats.py +87 -0
- phykit/services/tree/tip_labels.py +18 -0
- phykit/services/tree/tip_to_tip_distance.py +41 -0
- phykit/services/tree/tip_to_tip_node_distance.py +41 -0
- phykit/services/tree/total_tree_length.py +25 -0
- phykit/services/tree/treeness.py +16 -0
- phykit/services/tree/treeness_over_rcv.py +40 -0
- phykit/version.py +1 -0
- phykit-2.1.0.dist-info/METADATA +150 -0
- phykit-2.1.0.dist-info/RECORD +69 -0
- phykit-2.1.0.dist-info/WHEEL +5 -0
- phykit-2.1.0.dist-info/entry_points.txt +121 -0
- phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
- phykit-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
from typing import Dict, List, Tuple
|
|
3
|
+
import numpy as np
|
|
4
|
+
import multiprocessing as mp
|
|
5
|
+
from functools import partial
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from Bio.Align import MultipleSeqAlignment
|
|
9
|
+
try:
|
|
10
|
+
from tqdm import tqdm
|
|
11
|
+
except ImportError:
|
|
12
|
+
# Fallback if tqdm is not installed
|
|
13
|
+
def tqdm(iterable, *args, **kwargs):
|
|
14
|
+
return iterable
|
|
15
|
+
|
|
16
|
+
from .base import Alignment
|
|
17
|
+
from ...helpers.stats_summary import (
|
|
18
|
+
calculate_summary_statistics_from_dict,
|
|
19
|
+
print_summary_statistics,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PairwiseIdentity(Alignment):
    """Report sequence identity for every pair of records in an alignment.

    Identity of a pair is the number of matching columns divided by the full
    sequence length. With ``verbose`` set, one tab-separated line is printed
    per pair; otherwise summary statistics over all pairs are printed.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Load the alignment, score all pairs, and print the report."""
        alignment, _, _ = self.get_alignment_and_format()

        pair_ids, pairwise_identities, stats = \
            self.calculate_pairwise_identities(alignment, self.exclude_gaps)

        if not self.verbose:
            print_summary_statistics(stats)
            return

        try:
            for pair in pair_ids:
                # Look the value up by key instead of zipping against
                # dict.values(): when two pairs join to the same key the
                # dict is shorter than pair_ids and a positional zip would
                # attribute identities to the wrong pairs.
                identity = pairwise_identities["-".join(pair)]
                print(f"{pair[0]}\t{pair[1]}\t{round(identity, 4)}")
        except BrokenPipeError:
            # stdout was closed before all lines were written; stop quietly.
            pass

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments onto the constructor keyword arguments."""
        return dict(
            alignment_file_path=args.alignment,
            verbose=args.verbose,
            exclude_gaps=args.exclude_gaps,
        )

    def _calculate_identity_vectorized(self, seq_arr1, seq_arr2, gap_mask=None, exclude_gaps=False):
        """Return the fraction of positions at which two sequences match.

        Args:
            seq_arr1: array of single characters for the first sequence.
            seq_arr2: array of single characters for the second sequence.
            gap_mask: optional pair of boolean arrays marking gap positions
                in ``seq_arr1`` and ``seq_arr2`` respectively.
            exclude_gaps: when true (and ``gap_mask`` is given), a matching
                position only counts if at least one side is not a gap.

        Returns:
            Matches divided by ``len(seq_arr1)``, or ``0`` for empty input.
        """
        matches = (seq_arr1 == seq_arr2)

        if exclude_gaps and gap_mask is not None:
            # A match where both residues are gaps is not an identity.
            valid_for_identity = ~gap_mask[0] | ~gap_mask[1]
            identities = np.sum(matches & valid_for_identity)
        else:
            identities = np.sum(matches)

        # The denominator is always the full length, gaps included.
        total_compared = len(seq_arr1)

        return identities / total_compared if total_compared > 0 else 0

    def _process_pair_batch(self, alignment_data, pair_indices, exclude_gaps, gap_chars):
        """Score a batch of sequence pairs.

        Args:
            alignment_data: list of ``{'id': ..., 'seq': ndarray}`` entries.
            pair_indices: iterable of ``(idx1, idx2)`` indices into
                ``alignment_data``.
            exclude_gaps: whether gap-gap matches are ignored.
            gap_chars: collection of characters treated as gaps.

        Returns:
            A list of ``{'pair_id': [id1, id2], 'identity': value}`` dicts in
            the order of ``pair_indices``.
        """
        # Built once per batch rather than twice per pair.
        gap_list = list(gap_chars) if exclude_gaps else None

        results = []
        for idx1, idx2 in pair_indices:
            first = alignment_data[idx1]
            second = alignment_data[idx2]

            if exclude_gaps:
                gap_masks = (
                    np.isin(first['seq'], gap_list),
                    np.isin(second['seq'], gap_list),
                )
                identity = self._calculate_identity_vectorized(
                    first['seq'], second['seq'], gap_masks, exclude_gaps
                )
            else:
                identity = self._calculate_identity_vectorized(
                    first['seq'], second['seq']
                )

            results.append({
                'pair_id': [first['id'], second['id']],
                'identity': identity,
            })
        return results

    def _process_pairs_in_parallel(self, alignment_data, all_pairs, exclude_gaps, gap_chars):
        """Score ``all_pairs`` in a process pool; return one result list per chunk."""
        num_workers = min(mp.cpu_count(), 8)
        # Aim for roughly four chunks per worker.
        chunk_size = max(1, len(all_pairs) // (num_workers * 4))
        pair_chunks = [
            all_pairs[i:i + chunk_size]
            for i in range(0, len(all_pairs), chunk_size)
        ]

        process_func = partial(
            self._process_pair_batch,
            alignment_data,
            exclude_gaps=exclude_gaps,
            gap_chars=gap_chars
        )

        with mp.Pool(processes=num_workers) as pool:
            # Only show a progress bar if stderr is a tty (not redirected).
            if sys.stderr.isatty():
                return list(tqdm(
                    pool.imap(process_func, pair_chunks),
                    total=len(pair_chunks),
                    desc="Calculating pairwise identities",
                    unit="batch"
                ))
            return pool.map(process_func, pair_chunks)

    def calculate_pairwise_identities(
        self,
        alignment: MultipleSeqAlignment,
        exclude_gaps: bool,
    ) -> Tuple[List[List[str]], Dict[str, float], Dict[str, float]]:
        """Compute the identity of every pair of records in ``alignment``.

        Args:
            alignment: the alignment whose records are compared.
            exclude_gaps: whether gap-gap matches are ignored.

        Returns:
            ``(pair_ids, pairwise_identities, stats)`` where ``pair_ids`` is a
            list of ``[id1, id2]``, ``pairwise_identities`` maps
            ``"id1-id2"`` to the identity value, and ``stats`` is the result
            of ``calculate_summary_statistics_from_dict`` on that mapping.
        """
        gap_chars = self.get_gap_chars()

        # Upper-cased character arrays allow element-wise comparison.
        alignment_data = [
            {
                'id': record.id,
                'seq': np.array([c.upper() for c in str(record.seq)], dtype='U1'),
            }
            for record in alignment
        ]

        all_pairs = list(itertools.combinations(range(len(alignment)), 2))

        if len(all_pairs) < 50:
            # Too few pairs to be worth starting worker processes.
            chunk_results = [
                self._process_pair_batch(
                    alignment_data, all_pairs, exclude_gaps, gap_chars
                )
            ]
        else:
            chunk_results = self._process_pairs_in_parallel(
                alignment_data, all_pairs, exclude_gaps, gap_chars
            )

        pair_ids = []
        pairwise_identities = {}
        for result in itertools.chain.from_iterable(chunk_results):
            pair_id = result['pair_id']
            pair_ids.append(pair_id)
            pairwise_identities["-".join(pair_id)] = result['identity']

        stats = calculate_summary_statistics_from_dict(pairwise_identities)

        return pair_ids, pairwise_identities, stats
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from collections import Counter
|
|
2
|
+
from typing import Dict, Tuple
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from Bio.Align import MultipleSeqAlignment
|
|
6
|
+
|
|
7
|
+
from .base import Alignment
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ParsimonyInformative(Alignment):
    """Count the parsimony-informative sites of an alignment.

    A site is parsimony informative when at least two distinct non-gap
    characters each occur at least twice in that column.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Print the site count, alignment length, and percentage, tab-separated."""
        alignment, _, _ = self.get_alignment_and_format()
        pi_sites, aln_len, pi_sites_per = \
            self.calculate_parsimony_informative_sites(alignment)

        print(f"{pi_sites}\t{aln_len}\t{round(pi_sites_per, 4)}")

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments onto the constructor keyword arguments."""
        return dict(alignment_file_path=args.alignment)

    def get_number_of_occurrences_per_character(
        self,
        alignment: MultipleSeqAlignment,
        idx: int
    ) -> Counter:
        """Count the upper-cased, non-gap characters in column ``idx``."""
        gap_chars = self.get_gap_chars()
        seq_at_position = alignment[:, idx].upper()

        return Counter(c for c in seq_at_position if c not in gap_chars)

    def is_parsimony_informative(
        self,
        num_occurrences: Counter,
    ) -> bool:
        """
        Check if a site is parsimony informative.
        That is, the site has two characters that appear at least twice.
        """
        informative_char_count = sum(
            1 for count in num_occurrences.values() if count >= 2
        )
        return informative_char_count >= 2

    def calculate_parsimony_informative_sites(
        self,
        alignment: MultipleSeqAlignment,
    ) -> Tuple[int, int, float]:
        """Count parsimony-informative columns in ``alignment``.

        Returns:
            ``(pi_sites, aln_len, pi_sites_per)``: the number of informative
            sites, the alignment length, and the percentage of informative
            sites (``0.0`` for a zero-length alignment).
        """
        aln_len = alignment.get_alignment_length()
        # Converted once; np.isin is called for every column.
        gap_list = list(self.get_gap_chars())

        # Rows are records, columns are sites; characters are upper-cased.
        alignment_array = np.array([
            [c.upper() for c in str(record.seq)]
            for record in alignment
        ], dtype='U1')

        pi_sites = 0

        for col_idx in range(aln_len):
            column = alignment_array[:, col_idx]
            filtered_column = column[~np.isin(column, gap_list)]

            if len(filtered_column) == 0:
                # Column consists of gap characters only.
                continue

            _, counts = np.unique(filtered_column, return_counts=True)

            # Informative: at least two characters each seen at least twice.
            if np.sum(counts >= 2) >= 2:
                pi_sites += 1

        # Guard the percentage against a zero-length alignment.
        pi_sites_per = (pi_sites / aln_len) * 100 if aln_len else 0.0

        return pi_sites, aln_len, pi_sites_per
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from .base import Alignment
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class RelativeCompositionVariability(Alignment):
    """Print the relative composition variability (RCV) of an alignment.

    The value itself comes from ``calculate_rcv``, which is not defined in
    this class (presumably inherited from ``Alignment`` - confirm).
    """

    def __init__(self, args) -> None:
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def run(self):
        """Compute the RCV and print it rounded to four decimal places."""
        print(round(self.calculate_rcv(), 4))

    def process_args(self, args):
        """Map parsed CLI arguments onto the constructor keyword arguments."""
        return {"alignment_file_path": args.alignment}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
from .base import Alignment
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class RelativeCompositionVariabilityTaxon(Alignment):
    """Print the relative composition variability of each taxon (RCVT).

    For every record, the absolute deviations of its per-character counts
    from the across-record average counts are summed and divided by
    ``number_of_records * alignment_length``.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Compute the per-record value and print ``id<TAB>value`` lines."""
        alignment, _, _ = self.get_alignment_and_format()
        aln_len = alignment.get_alignment_length()
        num_records = len(alignment)

        # Rows are records, columns are sites (characters kept as-is).
        alignment_array = np.array([
            list(str(record.seq)) for record in alignment
        ], dtype='U1')

        unique_chars = np.unique(alignment_array)

        # count_matrix[r, c] = occurrences of unique_chars[c] in record r.
        # Counting the characters directly avoids an intermediate int8 index
        # matrix, which would wrap around past 127 distinct symbols; float64
        # keeps the counts and averages exact enough for the final rounding.
        count_matrix = np.zeros((num_records, len(unique_chars)), dtype=np.float64)
        for col, char in enumerate(unique_chars):
            count_matrix[:, col] = np.sum(alignment_array == char, axis=1)

        # Average count of each character per record.
        average_counts = np.sum(count_matrix, axis=0) / num_records

        # One value per record, computed for all records at once.
        deviations = np.abs(count_matrix - average_counts)
        rcv_values = np.sum(deviations, axis=1) / (num_records * aln_len)

        for i, record in enumerate(alignment):
            # Plain Python float so round() and printing behave uniformly.
            rcv_val = float(rcv_values[i])
            print(f"{record.id}\t{round(rcv_val, 4)}")

    def process_args(self, args):
        """Map parsed CLI arguments onto the constructor keyword arguments."""
        return dict(alignment_file_path=args.alignment)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from typing import Dict
|
|
3
|
+
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
from Bio.SeqIO.FastaIO import FastaIterator
|
|
6
|
+
|
|
7
|
+
from .base import Alignment
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RenameFastaEntries(Alignment):
    """Rewrite a FASTA file, replacing record ids according to an id map.

    The id map is a whitespace-separated two-column text file: current id,
    then replacement id. Records whose id is not in the map are written
    unchanged.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Parse the FASTA file, load the id map, and write the renamed file."""
        try:
            records = SeqIO.parse(self.fasta, "fasta")
        except FileNotFoundError:
            print("FASTA file path corresponds to no such file. Please check the path.")
            sys.exit(2)

        idmap = self.load_idmap(self.idmap)

        self.replace_ids_and_write(self.output_file_path, records, idmap)

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments onto the constructor keyword arguments.

        The output path is ``args.output`` (or ``args.fasta`` when no output
        was given) with ``.renamed.fa`` appended.
        """
        output_file_path = f"{args.output or args.fasta}.renamed.fa"
        return dict(
            fasta=args.fasta,
            idmap=args.idmap,
            output_file_path=output_file_path,
        )

    def load_idmap(self, idmap_file: str) -> Dict[str, str]:
        """Read the two-column id map into a ``{old_id: new_id}`` dict.

        Blank lines are skipped. Prints a message and exits with status 2
        when the file does not exist.
        """
        try:
            with open(idmap_file) as f:
                # Skip blank lines (e.g. a trailing newline at the end of the
                # file), which would otherwise make dict() raise ValueError.
                return dict(line.split() for line in f if line.strip())
        except FileNotFoundError:
            print("Idmap path corresponds to no such file. Please check the path.")
            sys.exit(2)

    def replace_ids_and_write(
        self,
        output_file_path: str,
        records: FastaIterator,
        idmap: Dict[str, str]
    ) -> None:
        """Write ``records`` to ``output_file_path`` in FASTA format.

        A record whose id is a key of ``idmap`` gets the mapped id and an
        empty description; all other records are written as parsed.
        """
        with open(output_file_path, "w") as output_file:
            for record in records:
                if record.id in idmap:
                    record.id = idmap[record.id]
                    # Clear the description so the header is the new id only.
                    record.description = ""
                SeqIO.write(record, output_file, "fasta")
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
from typing import Dict, List, Tuple
|
|
3
|
+
import numpy as np
|
|
4
|
+
import multiprocessing as mp
|
|
5
|
+
from functools import partial
|
|
6
|
+
|
|
7
|
+
from Bio import SeqIO
|
|
8
|
+
from Bio.SeqRecord import SeqRecord
|
|
9
|
+
|
|
10
|
+
from .base import Alignment
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SumOfPairsScore(Alignment):
    """Score a query alignment against a reference alignment.

    For every pair of reference record ids, a position counts as a match when
    both records carry the same character in the query as in the reference.
    The printed score is matches divided by positions compared.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Read both FASTA files and print the score rounded to four places."""
        import sys

        query_records = SeqIO.to_dict(SeqIO.parse(self.fasta, "fasta"))
        reference_records = SeqIO.to_dict(SeqIO.parse(self.reference, "fasta"))

        record_id_pairs = list(
            itertools.combinations(reference_records.keys(), 2)
        )

        number_of_matches, number_of_total_pairs = \
            self.determine_number_of_matches_and_total_pairs(
                record_id_pairs, reference_records, query_records
            )

        if number_of_total_pairs == 0:
            # Fewer than two reference records, or only empty sequences:
            # the ratio is undefined, so report it instead of dividing by 0.
            print(
                "No comparable positions were found. Please check that the "
                "reference contains at least two non-empty sequences."
            )
            sys.exit(2)

        print(round(number_of_matches / number_of_total_pairs, 4))

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments onto the constructor keyword arguments."""
        return dict(fasta=args.fasta, reference=args.reference)

    @staticmethod
    def _process_pair_batch(
        pair_batch: List[Tuple[str, str]],
        reference_records: Dict[str, SeqRecord],
        query_records: Dict[str, SeqRecord],
    ) -> Tuple[int, int]:
        """Count matching and compared positions for a batch of id pairs.

        Args:
            pair_batch: ``(id1, id2)`` pairs to score.
            reference_records: records of the reference alignment by id.
            query_records: records of the query alignment by id.

        Returns:
            ``(matches, total)`` summed over the batch. Each pair is compared
            over the shortest of its four sequences.

        Raises:
            KeyError: if an id of the batch is missing from either mapping.
        """
        # agreement[seq_id][i] is True where reference and query carry the
        # same character at position i. Computed once per sequence, since the
        # same sequence takes part in many pairs.
        agreement = {}
        for seq_id in itertools.chain.from_iterable(pair_batch):
            if seq_id in agreement:
                continue
            ref_seq = str(reference_records[seq_id].seq)
            query_seq = str(query_records[seq_id].seq)
            common = min(len(ref_seq), len(query_seq))
            agreement[seq_id] = (
                np.array(list(ref_seq[:common]), dtype='U1')
                == np.array(list(query_seq[:common]), dtype='U1')
            )

        batch_matches = 0
        batch_total = 0
        for first_in_pair, second_in_pair in pair_batch:
            first = agreement[first_in_pair]
            second = agreement[second_in_pair]
            # Trim to the shorter mask, i.e. the shortest of the four
            # sequences; a no-op when all lengths are equal.
            compared = min(len(first), len(second))
            batch_matches += int(
                np.count_nonzero(first[:compared] & second[:compared])
            )
            batch_total += compared

        return batch_matches, batch_total

    def determine_number_of_matches_and_total_pairs(
        self,
        record_id_pairs: List[Tuple[str, str]],
        reference_records: Dict[str, SeqRecord],
        query_records: Dict[str, SeqRecord],
    ) -> Tuple[int, int]:
        """Return ``(matches, total)`` over all ``record_id_pairs``.

        Fewer than 50 pairs are scored in-process; larger inputs are split
        into batches and scored by a process pool.
        """
        if len(record_id_pairs) < 50:
            # Too few pairs to be worth starting worker processes.
            return self._process_pair_batch(
                record_id_pairs, reference_records, query_records
            )

        num_workers = min(mp.cpu_count(), 8)
        # Roughly four batches per worker, but never fewer than 10 pairs each.
        batch_size = max(10, len(record_id_pairs) // (num_workers * 4))

        pair_batches = [
            record_id_pairs[i:i + batch_size]
            for i in range(0, len(record_id_pairs), batch_size)
        ]

        process_func = partial(
            self._process_pair_batch,
            reference_records=reference_records,
            query_records=query_records,
        )

        with mp.Pool(processes=num_workers) as pool:
            batch_results = pool.map(process_func, pair_batches)

        total_matches = sum(matches for matches, _ in batch_results)
        total_pairs = sum(pairs for _, pairs in batch_results)

        return int(total_matches), total_pairs
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from typing import Dict, Tuple
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
from Bio.Align import MultipleSeqAlignment
|
|
5
|
+
|
|
6
|
+
from .base import Alignment
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class VariableSites(Alignment):
    """Count the variable sites of an alignment.

    A site is variable when its column holds more than one distinct non-gap
    character (compared case-insensitively).
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Print the site count, alignment length, and percentage, tab-separated."""
        alignment, _, _ = self.get_alignment_and_format()
        var_sites, aln_len, var_sites_per = \
            self.calculate_variable_sites(alignment)

        print(f"{var_sites}\t{aln_len}\t{round(var_sites_per, 4)}")

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments onto the constructor keyword arguments."""
        return dict(alignment_file_path=args.alignment)

    def calculate_variable_sites(
        self,
        alignment: MultipleSeqAlignment
    ) -> Tuple[int, int, float]:
        """Count variable columns in ``alignment``.

        Returns:
            ``(var_sites, aln_len, var_sites_per)``: the number of variable
            sites, the alignment length, and the percentage of variable sites
            (``0.0`` for a zero-length alignment).
        """
        aln_len = alignment.get_alignment_length()
        # Converted once; np.isin is called for every column.
        gap_list = list(self.get_gap_chars())

        # Rows are records, columns are sites; characters are upper-cased.
        alignment_array = np.array([
            [c.upper() for c in str(record.seq)]
            for record in alignment
        ], dtype='U1')

        var_sites = 0

        for col in range(aln_len):
            column = alignment_array[:, col]
            filtered_column = column[~np.isin(column, gap_list)]

            # Variable: more than one distinct character once gaps are
            # dropped (an all-gap column has none and is skipped).
            if len(filtered_column) > 0 and len(np.unique(filtered_column)) > 1:
                var_sites += 1

        # Guard the percentage against a zero-length alignment.
        var_sites_per = (var_sites / aln_len) * 100 if aln_len else 0.0

        return var_sites, aln_len, var_sites_per
|
phykit/services/base.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from .bipartition_support_stats import BipartitionSupportStats
|
|
2
|
+
from .branch_length_multiplier import BranchLengthMultiplier
|
|
3
|
+
from .covarying_evolutionary_rates import CovaryingEvolutionaryRates
|
|
4
|
+
from .dvmc import DVMC
|
|
5
|
+
from .evolutionary_rate import EvolutionaryRate
|
|
6
|
+
from .hidden_paralogy_check import HiddenParalogyCheck
|
|
7
|
+
from .internal_branch_stats import InternalBranchStats
|
|
8
|
+
from .internode_labeler import InternodeLabeler
|
|
9
|
+
from .last_common_ancestor_subtree import LastCommonAncestorSubtree
|
|
10
|
+
from .lb_score import LBScore
|
|
11
|
+
from .monophyly_check import MonophylyCheck
|
|
12
|
+
from .nearest_neighbor_interchange import NearestNeighborInterchange
|
|
13
|
+
from .patristic_distances import PatristicDistances
|
|
14
|
+
from .polytomy_test import PolytomyTest
|
|
15
|
+
from .print_tree import PrintTree
|
|
16
|
+
from .prune_tree import PruneTree
|
|
17
|
+
from .rename_tree_tips import RenameTreeTips
|
|
18
|
+
from .root_tree import RootTree
|
|
19
|
+
from .rf_distance import RobinsonFouldsDistance
|
|
20
|
+
from .saturation import Saturation
|
|
21
|
+
from .spurious_sequence import SpuriousSequence
|
|
22
|
+
from .treeness import Treeness
|
|
23
|
+
from .treeness_over_rcv import TreenessOverRCV
|
|
24
|
+
from .terminal_branch_stats import TerminalBranchStats
|
|
25
|
+
from .tip_labels import TipLabels
|
|
26
|
+
from .tip_to_tip_distance import TipToTipDistance
|
|
27
|
+
from .tip_to_tip_node_distance import TipToTipNodeDistance
|
|
28
|
+
from .total_tree_length import TotalTreeLength
|
|
29
|
+
from .collapse_branches import CollapseBranches
|