phykit 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phykit/__init__.py +0 -0
- phykit/__main__.py +6 -0
- phykit/helpers/__init__.py +0 -0
- phykit/helpers/boolean_argument_parsing.py +12 -0
- phykit/helpers/caching.py +201 -0
- phykit/helpers/files.py +125 -0
- phykit/helpers/parallel.py +305 -0
- phykit/helpers/stats_summary.py +64 -0
- phykit/helpers/streaming.py +152 -0
- phykit/phykit.py +2862 -0
- phykit/services/__init__.py +0 -0
- phykit/services/alignment/__init__.py +17 -0
- phykit/services/alignment/alignment_length.py +16 -0
- phykit/services/alignment/alignment_length_no_gaps.py +69 -0
- phykit/services/alignment/alignment_recoding.py +89 -0
- phykit/services/alignment/base.py +103 -0
- phykit/services/alignment/column_score.py +66 -0
- phykit/services/alignment/compositional_bias_per_site.py +98 -0
- phykit/services/alignment/create_concatenation_matrix.py +254 -0
- phykit/services/alignment/dna_threader.py +145 -0
- phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
- phykit/services/alignment/faidx.py +21 -0
- phykit/services/alignment/gc_content.py +94 -0
- phykit/services/alignment/pairwise_identity.py +159 -0
- phykit/services/alignment/parsimony_informative_sites.py +81 -0
- phykit/services/alignment/rcv.py +14 -0
- phykit/services/alignment/rcvt.py +47 -0
- phykit/services/alignment/rename_fasta_entries.py +53 -0
- phykit/services/alignment/sum_of_pairs_score.py +157 -0
- phykit/services/alignment/variable_sites.py +54 -0
- phykit/services/base.py +9 -0
- phykit/services/tree/__init__.py +29 -0
- phykit/services/tree/base.py +178 -0
- phykit/services/tree/bipartition_support_stats.py +48 -0
- phykit/services/tree/branch_length_multiplier.py +37 -0
- phykit/services/tree/collapse_branches.py +27 -0
- phykit/services/tree/covarying_evolutionary_rates.py +272 -0
- phykit/services/tree/dvmc.py +37 -0
- phykit/services/tree/evolutionary_rate.py +17 -0
- phykit/services/tree/hidden_paralogy_check.py +128 -0
- phykit/services/tree/internal_branch_stats.py +77 -0
- phykit/services/tree/internode_labeler.py +33 -0
- phykit/services/tree/last_common_ancestor_subtree.py +35 -0
- phykit/services/tree/lb_score.py +196 -0
- phykit/services/tree/monophyly_check.py +106 -0
- phykit/services/tree/nearest_neighbor_interchange.py +140 -0
- phykit/services/tree/patristic_distances.py +113 -0
- phykit/services/tree/polytomy_test.py +546 -0
- phykit/services/tree/print_tree.py +28 -0
- phykit/services/tree/prune_tree.py +40 -0
- phykit/services/tree/rename_tree_tips.py +64 -0
- phykit/services/tree/rf_distance.py +136 -0
- phykit/services/tree/root_tree.py +35 -0
- phykit/services/tree/saturation.py +209 -0
- phykit/services/tree/spurious_sequence.py +75 -0
- phykit/services/tree/terminal_branch_stats.py +87 -0
- phykit/services/tree/tip_labels.py +18 -0
- phykit/services/tree/tip_to_tip_distance.py +41 -0
- phykit/services/tree/tip_to_tip_node_distance.py +41 -0
- phykit/services/tree/total_tree_length.py +25 -0
- phykit/services/tree/treeness.py +16 -0
- phykit/services/tree/treeness_over_rcv.py +40 -0
- phykit/version.py +1 -0
- phykit-2.1.0.dist-info/METADATA +150 -0
- phykit-2.1.0.dist-info/RECORD +69 -0
- phykit-2.1.0.dist-info/WHEEL +5 -0
- phykit-2.1.0.dist-info/entry_points.txt +121 -0
- phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
- phykit-2.1.0.dist-info/top_level.txt +1 -0
|
File without changes
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from .alignment_length import AlignmentLength
|
|
2
|
+
from .alignment_length_no_gaps import AlignmentLengthNoGaps
|
|
3
|
+
from .alignment_recoding import AlignmentRecoding
|
|
4
|
+
from .column_score import ColumnScore
|
|
5
|
+
from .compositional_bias_per_site import CompositionalBiasPerSite
|
|
6
|
+
from .create_concatenation_matrix import CreateConcatenationMatrix
|
|
7
|
+
from .dna_threader import DNAThreader
|
|
8
|
+
from .evolutionary_rate_per_site import EvolutionaryRatePerSite
|
|
9
|
+
from .faidx import Faidx
|
|
10
|
+
from .gc_content import GCContent
|
|
11
|
+
from .pairwise_identity import PairwiseIdentity
|
|
12
|
+
from .parsimony_informative_sites import ParsimonyInformative
|
|
13
|
+
from .rcv import RelativeCompositionVariability
|
|
14
|
+
from .rcvt import RelativeCompositionVariabilityTaxon
|
|
15
|
+
from .rename_fasta_entries import RenameFastaEntries
|
|
16
|
+
from .sum_of_pairs_score import SumOfPairsScore
|
|
17
|
+
from .variable_sites import VariableSites
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from .base import Alignment
|
|
2
|
+
|
|
3
|
+
from typing import Dict
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class AlignmentLength(Alignment):
    """Service that prints the number of columns in an alignment."""

    def __init__(self, args) -> None:
        # Translate the parsed CLI namespace into the keyword arguments
        # understood by the Alignment base class.
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def run(self) -> None:
        """Load the alignment and print its length to stdout."""
        # Only the first of the three returned values is needed here.
        alignment, _fmt, _is_protein = self.get_alignment_and_format()
        print(alignment.get_alignment_length())

    def process_args(self, args) -> Dict[str, str]:
        """Map ``args.alignment`` onto the ``alignment_file_path`` keyword."""
        return {"alignment_file_path": args.alignment}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from argparse import Namespace
|
|
2
|
+
from typing import Dict, Tuple
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from Bio.Align import MultipleSeqAlignment
|
|
6
|
+
|
|
7
|
+
from .base import Alignment
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AlignmentLengthNoGaps(Alignment):
    """Service that reports how many alignment columns contain no gap characters."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Print gap-free column count, total column count and the percentage.

        The three values are tab-separated; the percentage is rounded to
        four decimal places.
        """
        alignment, _, is_protein = self.get_alignment_and_format()
        (
            aln_len_no_gaps,
            aln_len,
            aln_len_no_gaps_per,
        ) = self.calculate_alignment_length_no_gaps(alignment, is_protein)
        print(f"{aln_len_no_gaps}\t{aln_len}\t{round(aln_len_no_gaps_per, 4)}")

    def process_args(
        self,
        args: Namespace,
    ) -> Dict[str, str]:
        """Map ``args.alignment`` onto the ``alignment_file_path`` keyword."""
        return dict(alignment_file_path=args.alignment)

    def calculate_alignment_length_no_gaps(
        self,
        alignment: MultipleSeqAlignment,
        is_protein: bool,
    ) -> Tuple[int, int, float]:
        """Return ``(gap-free columns, total columns, gap-free percentage)``.

        Raises ``ZeroDivisionError`` when the alignment has zero columns.
        """
        aln_len = alignment.get_alignment_length()
        aln_len_no_gaps = self.get_sites_no_gaps_count(
            alignment,
            aln_len,
            is_protein
        )

        aln_len_no_gaps_per = (aln_len_no_gaps / aln_len) * 100

        return aln_len_no_gaps, aln_len, aln_len_no_gaps_per

    def get_sites_no_gaps_count(
        self,
        alignment: MultipleSeqAlignment,
        aln_len: int,
        is_protein: bool,
    ) -> int:
        """
        Count sites in the alignment with no gaps

        Only the first ``aln_len`` columns are inspected. ``is_protein`` is
        accepted for interface compatibility but is not used here: the gap
        characters come from ``self.get_gap_chars()`` called without
        arguments.
        """
        if aln_len <= 0:
            # Nothing to inspect; avoids slicing an array with no columns.
            return 0

        gap_chars = list(set(self.get_gap_chars()))

        # One character per cell: rows are records, columns are sites.
        alignment_array = np.array([
            list(str(record.seq)) for record in alignment
        ], dtype='U1')

        # Flag every gap cell in a single pass, then collapse over records so
        # that each column is marked if any record has a gap there.
        column_has_gap = np.isin(
            alignment_array[:, :aln_len], gap_chars
        ).any(axis=0)

        return int(np.count_nonzero(~column_has_gap))
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from os import path
|
|
2
|
+
import sys
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
from Bio.Align import MultipleSeqAlignment
|
|
6
|
+
|
|
7
|
+
from .base import Alignment
|
|
8
|
+
|
|
9
|
+
here = path.dirname(__file__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class AlignmentRecoding(Alignment):
    """Service that recodes alignment characters and prints the result as FASTA."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Recode every sequence and print ``>id`` / sequence pairs to stdout."""
        alignment, _, is_protein = self.get_alignment_and_format()

        # Alignment.__init__ stores ``code`` as a 1-tuple, hence the [0].
        recoding_table = self.read_recoding_table(self.code[0])

        recoded_alignment = self.recode_alignment(
            alignment, recoding_table, is_protein
        )

        for k, v in recoded_alignment.items():
            print(f">{k}\n{''.join(v)}")

    def recode_alignment(
        self,
        alignment: MultipleSeqAlignment,
        recoding_table: Dict[str, str],
        is_protein: bool,
    ) -> Dict[str, List[str]]:
        """Return ``{record.id: [recoded characters]}`` for the alignment.

        Gap characters (from ``self.get_gap_chars()``) are copied through
        unchanged. Any other character is looked up upper-cased in
        ``recoding_table``; characters missing from the table are kept as-is.
        ``is_protein`` is accepted but not used by this method.
        """
        gap_chars = self.get_gap_chars()
        recoded_alignment = {}

        for record in alignment:
            recoded_sequence = [
                recoding_table.get(base.upper(), base)
                if base not in gap_chars else base
                for base in record.seq
            ]
            recoded_alignment[record.id] = recoded_sequence

        return recoded_alignment

    def read_recoding_table(
        self,
        recoding: str
    ) -> Dict[str, str]:
        """
        Return the recoding table as a dict.

        Each line of the table file is split on whitespace; the second field
        (upper-cased) is the original character and becomes the key, the first
        field (upper-cased) is the recoded character and becomes the value.
        Lines with fewer than two fields (e.g. blank lines) are skipped.

        ``recoding`` is either one of the built-in scheme names below or a
        path to a custom table. Exits with status 2 when ``recoding`` is None
        or the file cannot be found.
        """
        if recoding is None:
            print("Please specify a recoding table")
            sys.exit(2)

        recoding_paths = {
            "RY-nucleotide": "../../recoding_tables/RY-nucleotide.txt",
            "SandR-6": "../../recoding_tables/S_and_R-6.txt",
            "KGB-6": "../../recoding_tables/KGB-6.txt",
            "Dayhoff-6": "../../recoding_tables/Dayhoff-6.txt",
            "Dayhoff-9": "../../recoding_tables/Dayhoff-9.txt",
            "Dayhoff-12": "../../recoding_tables/Dayhoff-12.txt",
            "Dayhoff-15": "../../recoding_tables/Dayhoff-15.txt",
            "Dayhoff-18": "../../recoding_tables/Dayhoff-18.txt",
        }
        pathing = recoding_paths.get(recoding, str(recoding))

        recoding_table = {}

        # NOTE(review): the path is joined onto this module's directory, so a
        # custom *relative* path is resolved against the package, not the
        # current working directory (absolute paths pass through unchanged
        # because os.path.join discards earlier components) -- confirm intent.
        try:
            with open(path.join(here, pathing)) as code:
                for line in code:
                    parts = line.split()
                    if len(parts) < 2:
                        # Blank or incomplete line: nothing to map.
                        continue
                    recoding_table[parts[1].upper()] = parts[0].upper()
        except FileNotFoundError:
            print(f"Recoding table file '{pathing}' not found.")
            sys.exit(2)

        return recoding_table

    def process_args(self, args):
        """Map the CLI namespace onto ``alignment_file_path`` and ``code``."""
        return dict(
            alignment_file_path=args.alignment,
            code=args.code
        )
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
from collections import Counter
|
|
2
|
+
import sys
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from typing import List
|
|
6
|
+
|
|
7
|
+
from ..base import BaseService
|
|
8
|
+
from ...helpers.files import (
|
|
9
|
+
get_alignment_and_format as get_alignment_and_format_helper
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Alignment(BaseService):
    """Base class for the alignment services.

    Stores the keyword options passed by subclasses as attributes and provides
    alignment loading, the relative composition variability (RCV) calculation
    and the list of characters treated as gaps.
    """

    def __init__(
        self,
        *args,
        alignment_file_path=None,
        code=None,
        fasta=None,
        output_file_path=None,
        protein_file_path=None,
        nucleotide_file_path=None,
        alignment_list_path=None,
        prefix=None,
        idmap=None,
        reference=None,
        verbose=None,
        entry=None,
        exclude_gaps=None,
    ):
        self.alignment_file_path = alignment_file_path
        # Stored as a 1-tuple on purpose: AlignmentRecoding reads it back as
        # ``self.code[0]``.
        self.code = (code,)
        self.output_file_path = output_file_path
        # Also a 1-tuple. NOTE(review): presumably consumers index it with
        # [0] as well -- confirm before changing.
        self.protein_file_path = (protein_file_path,)
        self.nucleotide_file_path = nucleotide_file_path
        self.alignment_list_path = alignment_list_path
        self.prefix = prefix
        self.fasta = fasta
        self.idmap = idmap
        self.reference = reference
        self.verbose = verbose
        self.entry = entry
        self.exclude_gaps = exclude_gaps

    def get_alignment_and_format(self):
        """
        automatic file type determination

        Delegates to the ``helpers.files`` loader for
        ``self.alignment_file_path`` and returns its result unchanged (callers
        in this package unpack three values). Prints a message and exits with
        status 2 if the file does not exist.
        """
        try:
            return get_alignment_and_format_helper(self.alignment_file_path)
        except FileNotFoundError:
            print("Input corresponds to no such file or directory.")
            print("Please double check pathing and filenames")
            sys.exit(2)

    def calculate_rcv(self) -> float:
        """Return the relative composition variability of the alignment.

        For every record and every distinct character, the absolute difference
        between that record's character count and the mean count across
        records is taken; the differences are summed and divided by
        ``num_records * aln_len``.
        """
        alignment, _, _ = self.get_alignment_and_format()
        aln_len = alignment.get_alignment_length()
        num_records = len(alignment)

        # One character per cell: rows are records, columns are sites.
        alignment_array = np.array([
            list(str(record.seq)) for record in alignment
        ], dtype='U1')

        # Get all unique characters in the alignment
        unique_chars = np.unique(alignment_array)

        # count_matrix[r, c] = occurrences of unique_chars[c] in record r.
        count_matrix = np.zeros((num_records, len(unique_chars)), dtype=np.int32)

        # One whole-array comparison per character fills a full column of the
        # matrix, instead of looping over every (record, character) pair.
        for char_idx, char in enumerate(unique_chars):
            count_matrix[:, char_idx] = np.sum(alignment_array == char, axis=1)

        # Mean count of each character across records.
        total_counts = np.sum(count_matrix, axis=0)
        average_counts = total_counts / num_records

        # Absolute deviation of every record from the mean composition.
        abs_diffs = np.abs(count_matrix - average_counts)

        # Sum across characters for each sequence
        seq_rcv_sums = np.sum(abs_diffs, axis=1)

        # Normalize and sum
        indiv_rcv_values = seq_rcv_sums / (num_records * aln_len)
        relative_composition_variability = np.sum(indiv_rcv_values)

        return float(relative_composition_variability)

    # NOTE(review): declared without ``self``. When invoked as
    # ``self.get_gap_chars()`` the instance itself is bound to ``is_protein``
    # and, being truthy, selects the protein list -- so "N"/"n" are not
    # treated as gaps for such callers. Likely unintended; left unchanged
    # because fixing it requires updating callers outside this class.
    def get_gap_chars(is_protein: bool) -> List[str]:
        """Return the characters treated as gaps/ambiguity for the data type."""
        if is_protein:
            return ["-", "?", "*", "X", "x"]
        else:
            return ["-", "?", "*", "X", "x", "N", "n"]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from typing import Dict, List, Tuple
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
from Bio import AlignIO
|
|
5
|
+
from Bio.Align import MultipleSeqAlignment
|
|
6
|
+
|
|
7
|
+
from .base import Alignment
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ColumnScore(Alignment):
    """Service that scores a query alignment against a reference by shared columns."""

    def __init__(self, args) -> None:
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def run(self) -> None:
        """Read both FASTA alignments and print the column score (4 decimals)."""
        query_records = AlignIO.read(self.fasta, "fasta")
        reference_records = AlignIO.read(self.reference, "fasta")

        # Each alignment becomes a list with one string per column.
        ref_columns, query_columns = self.get_columns_from_alignments(
            reference_records, query_records
        )

        # Shared distinct columns over the number of query columns.
        matched, total = self.calculate_matches_between_ref_and_query_columns(
            ref_columns, query_columns
        )

        print(round(matched / total, 4))

    def process_args(self, args) -> Dict[str, str]:
        """Map the CLI namespace onto the ``fasta`` and ``reference`` keywords."""
        return {"fasta": args.fasta, "reference": args.reference}

    def get_columns_from_alignments(
        self,
        reference_records: MultipleSeqAlignment,
        query_records: MultipleSeqAlignment,
    ) -> Tuple[List[str], List[str]]:
        """Return ``(reference columns, query columns)`` as upper-cased strings."""

        def columns_of(records) -> List[str]:
            # Rows are records, columns are sites; every character is
            # upper-cased individually before being stored.
            matrix = np.array(
                [list(map(str.upper, str(record.seq))) for record in records],
                dtype='U1',
            )
            n_cols = matrix.shape[1]
            return [''.join(matrix[:, j]) for j in range(n_cols)]

        ref_columns = columns_of(reference_records)
        query_columns = columns_of(query_records)
        return ref_columns, query_columns

    def calculate_matches_between_ref_and_query_columns(
        self,
        ref_columns: List[str],
        query_columns: List[str],
    ) -> Tuple[int, int]:
        """Return ``(distinct columns present in both, number of query columns)``."""
        shared = set(ref_columns) & set(query_columns)
        return len(shared), len(query_columns)
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
from typing import Dict, List, Tuple, Union
|
|
2
|
+
from collections import Counter
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from scipy.stats import chisquare, false_discovery_control
|
|
6
|
+
from scipy.stats._stats_py import Power_divergenceResult
|
|
7
|
+
from Bio.Align import MultipleSeqAlignment
|
|
8
|
+
|
|
9
|
+
from .base import Alignment
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CompositionalBiasPerSite(Alignment):
    """Service that runs a per-site chi-square test on character composition."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Print one tab-separated row per site.

        Columns are: 1-based site index, chi-square statistic, corrected
        p-value (or ``nan``) and raw p-value; numbers are rounded to four
        decimal places.
        """
        alignment, _, _ = self.get_alignment_and_format()

        stat_res, p_vals_corrected = \
            self.calculate_compositional_bias_per_site(alignment)

        for idx, (stat_info, pval_cor) in enumerate(
            zip(stat_res, p_vals_corrected), start=1
        ):
            pval_cor_str = "nan" if isinstance(pval_cor, str) else round(pval_cor, 4)
            print(f"{idx}\t{round(stat_info.statistic, 4)}\t{pval_cor_str}\t{round(stat_info.pvalue, 4)}")

    def process_args(self, args) -> Dict[str, str]:
        """Map ``args.alignment`` onto the ``alignment_file_path`` keyword."""
        return dict(alignment_file_path=args.alignment)

    def get_number_of_occurrences_per_character(
        self,
        alignment: MultipleSeqAlignment,
        idx: int,
    ) -> List[int]:
        """Return the counts of each distinct non-gap character in column ``idx``.

        The column is upper-cased before gap characters are removed.
        """
        gap_chars = self.get_gap_chars()
        seq_at_position = alignment[:, idx].upper()
        filtered_seq = "".join([char for char in seq_at_position if char not in gap_chars])

        return list(Counter(filtered_seq).values())

    def calculate_compositional_bias_per_site(
        self,
        alignment: MultipleSeqAlignment,
    ) -> Tuple[
        List[Power_divergenceResult],
        List[Union[float, str]],
    ]:
        """Chi-square test every site and FDR-correct the resulting p-values.

        Returns two lists with one entry per alignment column:

        * the ``chisquare`` result for the column's non-gap character counts
          (a placeholder ``chisquare([1])`` result for columns that contain
          only gap characters);
        * the p-value after ``false_discovery_control``, or the string
          ``"nan"`` for columns whose raw p-value is NaN or that contain only
          gap characters. Such columns are excluded from the correction.
        """
        aln_len = alignment.get_alignment_length()
        # Built once; the list form is what np.isin consumes in the loop.
        gap_chars = list(set(self.get_gap_chars()))

        # Upper-cased character matrix: rows are records, columns are sites.
        alignment_array = np.array([
            [c.upper() for c in str(record.seq)]
            for record in alignment
        ], dtype='U1')

        stat_res = []
        p_vals = []
        nan_idx = set()

        for col_idx in range(aln_len):
            column = alignment_array[:, col_idx]
            filtered_column = column[~np.isin(column, gap_chars)]

            if len(filtered_column) > 0:
                _, counts = np.unique(filtered_column, return_counts=True)

                chisquare_res = chisquare(counts)
                stat_res.append(chisquare_res)

                if not np.isnan(chisquare_res.pvalue):
                    p_vals.append(chisquare_res.pvalue)
                else:
                    nan_idx.add(col_idx)
            else:
                # Column holds only gap characters: keep the output aligned
                # with a placeholder result and no corrected p-value.
                stat_res.append(chisquare([1]))
                nan_idx.add(col_idx)

        # Apply FDR correction to the usable p-values only.
        corrected = iter(false_discovery_control(p_vals)) if p_vals else iter(())

        # Rebuild a full-length list by position. Re-inserting placeholders
        # into the compacted list from the highest index down misplaces them
        # whenever two or more precede a valid entry; walking the columns in
        # order keeps every corrected p-value on its own site.
        p_vals_corrected = [
            "nan" if col_idx in nan_idx else next(corrected)
            for col_idx in range(aln_len)
        ]

        return stat_res, p_vals_corrected
|