phykit 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. phykit/__init__.py +0 -0
  2. phykit/__main__.py +6 -0
  3. phykit/helpers/__init__.py +0 -0
  4. phykit/helpers/boolean_argument_parsing.py +12 -0
  5. phykit/helpers/caching.py +201 -0
  6. phykit/helpers/files.py +125 -0
  7. phykit/helpers/parallel.py +305 -0
  8. phykit/helpers/stats_summary.py +64 -0
  9. phykit/helpers/streaming.py +152 -0
  10. phykit/phykit.py +2862 -0
  11. phykit/services/__init__.py +0 -0
  12. phykit/services/alignment/__init__.py +17 -0
  13. phykit/services/alignment/alignment_length.py +16 -0
  14. phykit/services/alignment/alignment_length_no_gaps.py +69 -0
  15. phykit/services/alignment/alignment_recoding.py +89 -0
  16. phykit/services/alignment/base.py +103 -0
  17. phykit/services/alignment/column_score.py +66 -0
  18. phykit/services/alignment/compositional_bias_per_site.py +98 -0
  19. phykit/services/alignment/create_concatenation_matrix.py +254 -0
  20. phykit/services/alignment/dna_threader.py +145 -0
  21. phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
  22. phykit/services/alignment/faidx.py +21 -0
  23. phykit/services/alignment/gc_content.py +94 -0
  24. phykit/services/alignment/pairwise_identity.py +159 -0
  25. phykit/services/alignment/parsimony_informative_sites.py +81 -0
  26. phykit/services/alignment/rcv.py +14 -0
  27. phykit/services/alignment/rcvt.py +47 -0
  28. phykit/services/alignment/rename_fasta_entries.py +53 -0
  29. phykit/services/alignment/sum_of_pairs_score.py +157 -0
  30. phykit/services/alignment/variable_sites.py +54 -0
  31. phykit/services/base.py +9 -0
  32. phykit/services/tree/__init__.py +29 -0
  33. phykit/services/tree/base.py +178 -0
  34. phykit/services/tree/bipartition_support_stats.py +48 -0
  35. phykit/services/tree/branch_length_multiplier.py +37 -0
  36. phykit/services/tree/collapse_branches.py +27 -0
  37. phykit/services/tree/covarying_evolutionary_rates.py +272 -0
  38. phykit/services/tree/dvmc.py +37 -0
  39. phykit/services/tree/evolutionary_rate.py +17 -0
  40. phykit/services/tree/hidden_paralogy_check.py +128 -0
  41. phykit/services/tree/internal_branch_stats.py +77 -0
  42. phykit/services/tree/internode_labeler.py +33 -0
  43. phykit/services/tree/last_common_ancestor_subtree.py +35 -0
  44. phykit/services/tree/lb_score.py +196 -0
  45. phykit/services/tree/monophyly_check.py +106 -0
  46. phykit/services/tree/nearest_neighbor_interchange.py +140 -0
  47. phykit/services/tree/patristic_distances.py +113 -0
  48. phykit/services/tree/polytomy_test.py +546 -0
  49. phykit/services/tree/print_tree.py +28 -0
  50. phykit/services/tree/prune_tree.py +40 -0
  51. phykit/services/tree/rename_tree_tips.py +64 -0
  52. phykit/services/tree/rf_distance.py +136 -0
  53. phykit/services/tree/root_tree.py +35 -0
  54. phykit/services/tree/saturation.py +209 -0
  55. phykit/services/tree/spurious_sequence.py +75 -0
  56. phykit/services/tree/terminal_branch_stats.py +87 -0
  57. phykit/services/tree/tip_labels.py +18 -0
  58. phykit/services/tree/tip_to_tip_distance.py +41 -0
  59. phykit/services/tree/tip_to_tip_node_distance.py +41 -0
  60. phykit/services/tree/total_tree_length.py +25 -0
  61. phykit/services/tree/treeness.py +16 -0
  62. phykit/services/tree/treeness_over_rcv.py +40 -0
  63. phykit/version.py +1 -0
  64. phykit-2.1.0.dist-info/METADATA +150 -0
  65. phykit-2.1.0.dist-info/RECORD +69 -0
  66. phykit-2.1.0.dist-info/WHEEL +5 -0
  67. phykit-2.1.0.dist-info/entry_points.txt +121 -0
  68. phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
  69. phykit-2.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,159 @@
1
+ import itertools
2
+ from typing import Dict, List, Tuple
3
+ import numpy as np
4
+ import multiprocessing as mp
5
+ from functools import partial
6
+ import sys
7
+
8
+ from Bio.Align import MultipleSeqAlignment
9
+ try:
10
+ from tqdm import tqdm
11
+ except ImportError:
12
+ # Fallback if tqdm is not installed
13
+ def tqdm(iterable, *args, **kwargs):
14
+ return iterable
15
+
16
+ from .base import Alignment
17
+ from ...helpers.stats_summary import (
18
+ calculate_summary_statistics_from_dict,
19
+ print_summary_statistics,
20
+ )
21
+
22
+
23
class PairwiseIdentity(Alignment):
    """Compute the sequence identity of every pair of records in an alignment.

    Depending on ``verbose`` the command prints one line per pair or the
    summary statistics of all pairwise identities.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Load the alignment, compute pairwise identities and print them."""
        alignment, _, _ = self.get_alignment_and_format()

        pair_ids, pairwise_identities, stats = \
            self.calculate_pairwise_identities(
                alignment, self.exclude_gaps
            )

        if not self.verbose:
            print_summary_statistics(stats)
            return

        try:
            for pair in pair_ids:
                # Look the value up by its key rather than zipping against
                # dict.values(): if two pairs ever produce the same joined
                # key the dict holds fewer entries than pair_ids, and a zip
                # would shift every later value and drop the trailing pairs.
                identity = pairwise_identities["-".join(pair)]
                print(f"{pair[0]}\t{pair[1]}\t{round(identity, 4)}")
        except BrokenPipeError:
            # Output was piped into a consumer that closed early; stop quietly.
            pass

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments onto the constructor keyword arguments."""
        return dict(
            alignment_file_path=args.alignment,
            verbose=args.verbose,
            exclude_gaps=args.exclude_gaps,
        )

    def _calculate_identity_vectorized(self, seq_arr1, seq_arr2, gap_mask=None, exclude_gaps=False):
        """Return the fraction of positions at which two sequences agree.

        ``gap_mask`` is a pair of boolean arrays (one per sequence) marking
        gap positions. When ``exclude_gaps`` is set, a matching position is
        only counted if at least one of the two sequences is not a gap there.
        The denominator is always the full length of ``seq_arr1``; an empty
        sequence yields 0.
        """
        matches = (seq_arr1 == seq_arr2)

        if exclude_gaps and gap_mask is not None:
            # Positions where both sequences carry a gap never count.
            valid_for_identity = ~gap_mask[0] | ~gap_mask[1]
            identities = np.sum(matches & valid_for_identity)
        else:
            identities = np.sum(matches)

        total_compared = len(seq_arr1)

        return identities / total_compared if total_compared > 0 else 0

    def _process_pair_batch(self, alignment_data, pair_indices, exclude_gaps, gap_chars):
        """Compute identities for a batch of ``(idx1, idx2)`` index pairs.

        ``alignment_data`` is a list of ``{'id': ..., 'seq': ndarray}`` dicts.
        Returns a list of ``{'pair_id': [id1, id2], 'identity': value}`` dicts
        in the same order as ``pair_indices``.
        """
        gap_list = list(gap_chars)
        # A sequence takes part in many pairs, so its gap mask is computed
        # once per batch instead of once per pair.
        gap_masks = {}

        def gap_mask_for(idx):
            mask = gap_masks.get(idx)
            if mask is None:
                mask = np.isin(alignment_data[idx]['seq'], gap_list)
                gap_masks[idx] = mask
            return mask

        results = []
        for idx1, idx2 in pair_indices:
            seq_one = alignment_data[idx1]['seq']
            seq_two = alignment_data[idx2]['seq']

            if exclude_gaps:
                identity = self._calculate_identity_vectorized(
                    seq_one,
                    seq_two,
                    (gap_mask_for(idx1), gap_mask_for(idx2)),
                    exclude_gaps,
                )
            else:
                identity = self._calculate_identity_vectorized(seq_one, seq_two)

            results.append({
                'pair_id': [alignment_data[idx1]['id'], alignment_data[idx2]['id']],
                'identity': identity
            })
        return results

    def calculate_pairwise_identities(
        self,
        alignment: MultipleSeqAlignment,
        exclude_gaps: bool,
    ) -> Tuple[List[List[str]], Dict[str, float], Dict[str, float]]:
        """Compute the identity of every pair of records in ``alignment``.

        Returns ``(pair_ids, pairwise_identities, stats)`` where ``pair_ids``
        lists the ``[id1, id2]`` pairs in combination order,
        ``pairwise_identities`` maps ``"id1-id2"`` to the identity value and
        ``stats`` is the summary computed from that mapping.
        """
        gap_chars = self.get_gap_chars()

        # Upper-cased character arrays allow element-wise comparison.
        alignment_data = [
            {
                'id': record.id,
                'seq': np.array([c.upper() for c in str(record.seq)], dtype='U1'),
            }
            for record in alignment
        ]

        all_pairs = list(itertools.combinations(range(len(alignment)), 2))

        if len(all_pairs) < 50:
            # Too few pairs to be worth starting worker processes.
            chunk_results = [
                self._process_pair_batch(alignment_data, all_pairs, exclude_gaps, gap_chars)
            ]
        else:
            num_workers = min(mp.cpu_count(), 8)
            chunk_size = max(1, len(all_pairs) // (num_workers * 4))
            pair_chunks = [all_pairs[i:i + chunk_size] for i in range(0, len(all_pairs), chunk_size)]

            process_func = partial(
                self._process_pair_batch,
                alignment_data,
                exclude_gaps=exclude_gaps,
                gap_chars=gap_chars
            )

            with mp.Pool(processes=num_workers) as pool:
                # Only show a progress bar if stderr is a tty (not redirected).
                if sys.stderr.isatty():
                    chunk_results = list(tqdm(
                        pool.imap(process_func, pair_chunks),
                        total=len(pair_chunks),
                        desc="Calculating pairwise identities",
                        unit="batch"
                    ))
                else:
                    chunk_results = pool.map(process_func, pair_chunks)

        pairwise_identities = {}
        pair_ids = []
        for chunk_result in chunk_results:
            for result in chunk_result:
                pair_id = result['pair_id']
                pair_ids.append(pair_id)
                pairwise_identities["-".join(pair_id)] = result['identity']

        stats = calculate_summary_statistics_from_dict(pairwise_identities)

        return pair_ids, pairwise_identities, stats
@@ -0,0 +1,81 @@
1
+ from collections import Counter
2
+ from typing import Dict, Tuple
3
+ import numpy as np
4
+
5
+ from Bio.Align import MultipleSeqAlignment
6
+
7
+ from .base import Alignment
8
+
9
+
10
class ParsimonyInformative(Alignment):
    """Count the parsimony-informative sites of an alignment."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Print the site count, alignment length and percentage, tab-separated."""
        alignment, _, _ = self.get_alignment_and_format()
        pi_sites, aln_len, pi_sites_per = self.calculate_parsimony_informative_sites(
            alignment
        )

        print(f"{pi_sites}\t{aln_len}\t{round(pi_sites_per, 4)}")

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments onto the constructor keyword arguments."""
        return dict(alignment_file_path=args.alignment)

    def get_number_of_occurrences_per_character(
        self,
        alignment: MultipleSeqAlignment,
        idx: int
    ) -> Counter:
        """Count each upper-cased, non-gap character in column ``idx``."""
        gap_chars = self.get_gap_chars()
        seq_at_position = alignment[:, idx].upper()

        return Counter(c for c in seq_at_position if c not in gap_chars)

    def is_parsimony_informative(
        self,
        num_occurrences: Counter,
    ) -> bool:
        """
        Check if a site is parsimony informative.
        That is, the site has two characters that appear at least twice.
        """
        informative_char_count = sum(1 for count in num_occurrences.values() if count >= 2)
        return informative_char_count >= 2

    def calculate_parsimony_informative_sites(
        self,
        alignment: MultipleSeqAlignment,
    ) -> Tuple[int, int, float]:
        """Return ``(informative site count, alignment length, percentage)``.

        A column is informative when at least two distinct non-gap characters
        each occur at least twice. Characters are compared upper-cased. A
        zero-length alignment reports a percentage of 0.0 instead of raising
        ``ZeroDivisionError``.
        """
        aln_len = alignment.get_alignment_length()
        # The gap list does not change per column, so build it once.
        gap_list = list(self.get_gap_chars())

        alignment_array = np.array([
            [c.upper() for c in str(record.seq)]
            for record in alignment
        ], dtype='U1')

        pi_sites = 0
        for col_idx in range(aln_len):
            column = alignment_array[:, col_idx]
            filtered_column = column[~np.isin(column, gap_list)]
            if filtered_column.size == 0:
                # Column consists of gaps only.
                continue

            _, counts = np.unique(filtered_column, return_counts=True)
            if np.count_nonzero(counts >= 2) >= 2:
                pi_sites += 1

        pi_sites_per = (pi_sites / aln_len) * 100 if aln_len else 0.0

        return pi_sites, aln_len, pi_sites_per
@@ -0,0 +1,14 @@
1
+ from .base import Alignment
2
+
3
+
4
class RelativeCompositionVariability(Alignment):
    """Print the relative composition variability (RCV) of an alignment."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Compute the RCV via ``calculate_rcv`` and print it to 4 decimals."""
        print(round(self.calculate_rcv(), 4))

    def process_args(self, args):
        """Map parsed CLI arguments onto the constructor keyword arguments."""
        return {"alignment_file_path": args.alignment}
@@ -0,0 +1,47 @@
1
+ import numpy as np
2
+
3
+ from .base import Alignment
4
+
5
+
6
class RelativeCompositionVariabilityTaxon(Alignment):
    """Print the relative composition variability of each alignment record."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Compute the per-record RCV and print ``id<TAB>value`` lines.

        For each record, the absolute deviations of its character counts from
        the average counts across records are summed and divided by
        ``number of records * alignment length``.
        """
        alignment, _, _ = self.get_alignment_and_format()
        aln_len = alignment.get_alignment_length()
        num_records = len(alignment)

        # One row per record, one single-character cell per column.
        alignment_array = np.array([
            list(str(record.seq)) for record in alignment
        ], dtype='U1')

        unique_chars = np.unique(alignment_array)

        # Count straight from the character array. The previous intermediate
        # int8 index encoding could not hold indices above 127 and cost an
        # extra full pass; the resulting counts are identical.
        count_matrix = np.zeros((num_records, len(unique_chars)), dtype=np.float32)
        for char_idx, char in enumerate(unique_chars):
            count_matrix[:, char_idx] = np.sum(alignment_array == char, axis=1)

        # Average count of each character per record.
        average_counts = np.sum(count_matrix, axis=0) / num_records

        deviations = np.abs(count_matrix - average_counts)
        rcv_values = np.sum(deviations, axis=1) / (num_records * aln_len)

        # Convert to a Python float before rounding for consistent output.
        for record, rcv_val in zip(alignment, rcv_values):
            print(f"{record.id}\t{round(float(rcv_val), 4)}")

    def process_args(self, args):
        """Map parsed CLI arguments onto the constructor keyword arguments."""
        return dict(alignment_file_path=args.alignment)
@@ -0,0 +1,53 @@
1
+ import sys
2
+ from typing import Dict
3
+
4
+ from Bio import SeqIO
5
+ from Bio.SeqIO.FastaIO import FastaIterator
6
+
7
+ from .base import Alignment
8
+
9
+
10
class RenameFastaEntries(Alignment):
    """Rewrite a FASTA file with record IDs replaced according to an ID map."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Parse the FASTA file, load the ID map and write the renamed copy."""
        try:
            records = SeqIO.parse(self.fasta, "fasta")
        except FileNotFoundError:
            print("FASTA file path corresponds to no such file. Please check the path.")
            sys.exit(2)

        idmap = self.load_idmap(self.idmap)

        self.replace_ids_and_write(self.output_file_path, records, idmap)

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments onto the constructor keyword arguments."""
        # NOTE(review): ".renamed.fa" is appended to args.output as well as to
        # the args.fasta fallback; confirm this is intended for an explicitly
        # supplied output name.
        output_file_path = f"{args.output or args.fasta}.renamed.fa"
        return dict(
            fasta=args.fasta,
            idmap=args.idmap,
            output_file_path=output_file_path,
        )

    def load_idmap(self, idmap_file: str) -> Dict[str, str]:
        """Read a whitespace-separated two-column file into an old->new ID dict.

        Blank lines are skipped. Prints a message and exits with status 2 when
        the file does not exist.
        """
        try:
            with open(idmap_file) as f:
                # A blank (or trailing empty) line splits into zero fields and
                # would make dict() raise ValueError, so filter those out.
                return dict(line.split() for line in f if line.strip())
        except FileNotFoundError:
            print("Idmap path corresponds to no such file. Please check the path.")
            sys.exit(2)

    def replace_ids_and_write(
        self,
        output_file_path: str,
        records: FastaIterator,
        idmap: Dict[str, str]
    ) -> None:
        """Write ``records`` to ``output_file_path`` in FASTA format.

        Records whose ID is a key of ``idmap`` get the mapped ID and an empty
        description; all other records are written unchanged.
        """
        with open(output_file_path, "w") as output_file:
            for record in records:
                if record.id in idmap:
                    record.id = idmap[record.id]
                    record.description = ""
                SeqIO.write(record, output_file, "fasta")
@@ -0,0 +1,157 @@
1
+ import itertools
2
+ from typing import Dict, List, Tuple
3
+ import numpy as np
4
+ import multiprocessing as mp
5
+ from functools import partial
6
+
7
+ from Bio import SeqIO
8
+ from Bio.SeqRecord import SeqRecord
9
+
10
+ from .base import Alignment
11
+
12
+
13
class SumOfPairsScore(Alignment):
    """Score a query alignment against a reference alignment by sum of pairs."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Print matches / total compared positions, rounded to 4 decimals."""
        query_records = SeqIO.to_dict(SeqIO.parse(self.fasta, "fasta"))
        reference_records = SeqIO.to_dict(SeqIO.parse(self.reference, "fasta"))

        record_id_pairs = list(
            itertools.combinations(reference_records.keys(), 2)
        )

        number_of_matches, number_of_total_pairs = \
            self.determine_number_of_matches_and_total_pairs(
                record_id_pairs, reference_records, query_records
            )

        print(round(number_of_matches / number_of_total_pairs, 4))

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments onto the constructor keyword arguments."""
        return dict(fasta=args.fasta, reference=args.reference)

    @staticmethod
    def _process_pair_batch(
        pair_batch: List[Tuple[str, str]],
        reference_records: Dict[str, SeqRecord],
        query_records: Dict[str, SeqRecord],
    ) -> Tuple[int, int]:
        """Return ``(matches, compared positions)`` for a batch of ID pairs.

        A position of a pair counts as a match when, for both members of the
        pair, the query character equals the reference character. Each pair is
        compared over the shortest of its four sequences.

        Raises ``KeyError`` if an ID is missing from either record dict.
        """
        # Whether query equals reference at a position depends only on the
        # sequence, not on its partner, so compute that boolean vector once
        # per sequence and reuse it for every pair in the batch.
        agreement = {}
        for pair in pair_batch:
            for seq_id in pair:
                if seq_id in agreement:
                    continue
                ref_seq = str(reference_records[seq_id].seq)
                query_seq = str(query_records[seq_id].seq)
                shared_len = min(len(ref_seq), len(query_seq))
                agreement[seq_id] = (
                    np.array(list(ref_seq[:shared_len]), dtype='U1')
                    == np.array(list(query_seq[:shared_len]), dtype='U1')
                )

        batch_matches = 0
        batch_total = 0
        for first_in_pair, second_in_pair in pair_batch:
            first_agrees = agreement[first_in_pair]
            second_agrees = agreement[second_in_pair]

            # Equals the minimum length of the pair's four sequences; for
            # equal-length input this is simply the alignment length.
            min_len = min(len(first_agrees), len(second_agrees))

            batch_matches += int(np.count_nonzero(
                first_agrees[:min_len] & second_agrees[:min_len]
            ))
            batch_total += min_len

        return batch_matches, batch_total

    def determine_number_of_matches_and_total_pairs(
        self,
        record_id_pairs: List[Tuple[str, str]],
        reference_records: Dict[str, SeqRecord],
        query_records: Dict[str, SeqRecord],
    ) -> Tuple[int, int]:
        """Return total ``(matches, compared positions)`` over all ID pairs.

        Fewer than 50 pairs are processed in this process; larger inputs are
        split into batches and distributed over a multiprocessing pool.
        """
        if len(record_id_pairs) < 50:
            # Same computation as the parallel path, without pool start-up.
            return self._process_pair_batch(
                record_id_pairs, reference_records, query_records
            )

        num_workers = min(mp.cpu_count(), 8)
        batch_size = max(10, len(record_id_pairs) // (num_workers * 4))

        pair_batches = [record_id_pairs[i:i + batch_size]
                        for i in range(0, len(record_id_pairs), batch_size)]

        process_func = partial(self._process_pair_batch,
                               reference_records=reference_records,
                               query_records=query_records)

        with mp.Pool(processes=num_workers) as pool:
            batch_results = pool.map(process_func, pair_batches)

        total_matches = sum(matches for matches, _ in batch_results)
        total_pairs = sum(pairs for _, pairs in batch_results)

        return int(total_matches), total_pairs
@@ -0,0 +1,54 @@
1
+ from typing import Dict, Tuple
2
+ import numpy as np
3
+
4
+ from Bio.Align import MultipleSeqAlignment
5
+
6
+ from .base import Alignment
7
+
8
+
9
class VariableSites(Alignment):
    """Count the variable sites of an alignment."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Print the site count, alignment length and percentage, tab-separated."""
        alignment, _, _ = self.get_alignment_and_format()
        var_sites, aln_len, var_sites_per = \
            self.calculate_variable_sites(alignment)

        print(f"{var_sites}\t{aln_len}\t{round(var_sites_per, 4)}")

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments onto the constructor keyword arguments."""
        return dict(alignment_file_path=args.alignment)

    def calculate_variable_sites(
        self,
        alignment: MultipleSeqAlignment
    ) -> Tuple[int, int, float]:
        """Return ``(variable site count, alignment length, percentage)``.

        A column is variable when it holds more than one distinct non-gap
        character. Characters are compared upper-cased. A zero-length
        alignment reports a percentage of 0.0 instead of raising
        ``ZeroDivisionError``.
        """
        aln_len = alignment.get_alignment_length()
        # The gap list does not change per column, so build it once.
        gap_list = list(self.get_gap_chars())

        alignment_array = np.array([
            [c.upper() for c in str(record.seq)]
            for record in alignment
        ], dtype='U1')

        var_sites = 0
        for col in range(aln_len):
            column = alignment_array[:, col]
            filtered_column = column[~np.isin(column, gap_list)]

            # An all-gap column yields zero unique characters and is skipped.
            if len(np.unique(filtered_column)) > 1:
                var_sites += 1

        var_sites_per = (var_sites / aln_len) * 100 if aln_len else 0.0

        return var_sites, aln_len, var_sites_per
@@ -0,0 +1,9 @@
1
class BaseService:
    """Root class for services; subclasses override both hook methods."""

    def __init__(self):
        """Create the service; no state is set up here."""

    def process_args(self, args):
        """Translate parsed arguments; must be overridden by subclasses."""
        raise NotImplementedError

    def run(self):
        """Execute the service; must be overridden by subclasses."""
        raise NotImplementedError
@@ -0,0 +1,29 @@
1
+ from .bipartition_support_stats import BipartitionSupportStats
2
+ from .branch_length_multiplier import BranchLengthMultiplier
3
+ from .covarying_evolutionary_rates import CovaryingEvolutionaryRates
4
+ from .dvmc import DVMC
5
+ from .evolutionary_rate import EvolutionaryRate
6
+ from .hidden_paralogy_check import HiddenParalogyCheck
7
+ from .internal_branch_stats import InternalBranchStats
8
+ from .internode_labeler import InternodeLabeler
9
+ from .last_common_ancestor_subtree import LastCommonAncestorSubtree
10
+ from .lb_score import LBScore
11
+ from .monophyly_check import MonophylyCheck
12
+ from .nearest_neighbor_interchange import NearestNeighborInterchange
13
+ from .patristic_distances import PatristicDistances
14
+ from .polytomy_test import PolytomyTest
15
+ from .print_tree import PrintTree
16
+ from .prune_tree import PruneTree
17
+ from .rename_tree_tips import RenameTreeTips
18
+ from .root_tree import RootTree
19
+ from .rf_distance import RobinsonFouldsDistance
20
+ from .saturation import Saturation
21
+ from .spurious_sequence import SpuriousSequence
22
+ from .treeness import Treeness
23
+ from .treeness_over_rcv import TreenessOverRCV
24
+ from .terminal_branch_stats import TerminalBranchStats
25
+ from .tip_labels import TipLabels
26
+ from .tip_to_tip_distance import TipToTipDistance
27
+ from .tip_to_tip_node_distance import TipToTipNodeDistance
28
+ from .total_tree_length import TotalTreeLength
29
+ from .collapse_branches import CollapseBranches