phykit 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. phykit/__init__.py +0 -0
  2. phykit/__main__.py +6 -0
  3. phykit/helpers/__init__.py +0 -0
  4. phykit/helpers/boolean_argument_parsing.py +12 -0
  5. phykit/helpers/caching.py +201 -0
  6. phykit/helpers/files.py +125 -0
  7. phykit/helpers/parallel.py +305 -0
  8. phykit/helpers/stats_summary.py +64 -0
  9. phykit/helpers/streaming.py +152 -0
  10. phykit/phykit.py +2862 -0
  11. phykit/services/__init__.py +0 -0
  12. phykit/services/alignment/__init__.py +17 -0
  13. phykit/services/alignment/alignment_length.py +16 -0
  14. phykit/services/alignment/alignment_length_no_gaps.py +69 -0
  15. phykit/services/alignment/alignment_recoding.py +89 -0
  16. phykit/services/alignment/base.py +103 -0
  17. phykit/services/alignment/column_score.py +66 -0
  18. phykit/services/alignment/compositional_bias_per_site.py +98 -0
  19. phykit/services/alignment/create_concatenation_matrix.py +254 -0
  20. phykit/services/alignment/dna_threader.py +145 -0
  21. phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
  22. phykit/services/alignment/faidx.py +21 -0
  23. phykit/services/alignment/gc_content.py +94 -0
  24. phykit/services/alignment/pairwise_identity.py +159 -0
  25. phykit/services/alignment/parsimony_informative_sites.py +81 -0
  26. phykit/services/alignment/rcv.py +14 -0
  27. phykit/services/alignment/rcvt.py +47 -0
  28. phykit/services/alignment/rename_fasta_entries.py +53 -0
  29. phykit/services/alignment/sum_of_pairs_score.py +157 -0
  30. phykit/services/alignment/variable_sites.py +54 -0
  31. phykit/services/base.py +9 -0
  32. phykit/services/tree/__init__.py +29 -0
  33. phykit/services/tree/base.py +178 -0
  34. phykit/services/tree/bipartition_support_stats.py +48 -0
  35. phykit/services/tree/branch_length_multiplier.py +37 -0
  36. phykit/services/tree/collapse_branches.py +27 -0
  37. phykit/services/tree/covarying_evolutionary_rates.py +272 -0
  38. phykit/services/tree/dvmc.py +37 -0
  39. phykit/services/tree/evolutionary_rate.py +17 -0
  40. phykit/services/tree/hidden_paralogy_check.py +128 -0
  41. phykit/services/tree/internal_branch_stats.py +77 -0
  42. phykit/services/tree/internode_labeler.py +33 -0
  43. phykit/services/tree/last_common_ancestor_subtree.py +35 -0
  44. phykit/services/tree/lb_score.py +196 -0
  45. phykit/services/tree/monophyly_check.py +106 -0
  46. phykit/services/tree/nearest_neighbor_interchange.py +140 -0
  47. phykit/services/tree/patristic_distances.py +113 -0
  48. phykit/services/tree/polytomy_test.py +546 -0
  49. phykit/services/tree/print_tree.py +28 -0
  50. phykit/services/tree/prune_tree.py +40 -0
  51. phykit/services/tree/rename_tree_tips.py +64 -0
  52. phykit/services/tree/rf_distance.py +136 -0
  53. phykit/services/tree/root_tree.py +35 -0
  54. phykit/services/tree/saturation.py +209 -0
  55. phykit/services/tree/spurious_sequence.py +75 -0
  56. phykit/services/tree/terminal_branch_stats.py +87 -0
  57. phykit/services/tree/tip_labels.py +18 -0
  58. phykit/services/tree/tip_to_tip_distance.py +41 -0
  59. phykit/services/tree/tip_to_tip_node_distance.py +41 -0
  60. phykit/services/tree/total_tree_length.py +25 -0
  61. phykit/services/tree/treeness.py +16 -0
  62. phykit/services/tree/treeness_over_rcv.py +40 -0
  63. phykit/version.py +1 -0
  64. phykit-2.1.0.dist-info/METADATA +150 -0
  65. phykit-2.1.0.dist-info/RECORD +69 -0
  66. phykit-2.1.0.dist-info/WHEEL +5 -0
  67. phykit-2.1.0.dist-info/entry_points.txt +121 -0
  68. phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
  69. phykit-2.1.0.dist-info/top_level.txt +1 -0
File without changes
@@ -0,0 +1,17 @@
1
+ from .alignment_length import AlignmentLength
2
+ from .alignment_length_no_gaps import AlignmentLengthNoGaps
3
+ from .alignment_recoding import AlignmentRecoding
4
+ from .column_score import ColumnScore
5
+ from .compositional_bias_per_site import CompositionalBiasPerSite
6
+ from .create_concatenation_matrix import CreateConcatenationMatrix
7
+ from .dna_threader import DNAThreader
8
+ from .evolutionary_rate_per_site import EvolutionaryRatePerSite
9
+ from .faidx import Faidx
10
+ from .gc_content import GCContent
11
+ from .pairwise_identity import PairwiseIdentity
12
+ from .parsimony_informative_sites import ParsimonyInformative
13
+ from .rcv import RelativeCompositionVariability
14
+ from .rcvt import RelativeCompositionVariabilityTaxon
15
+ from .rename_fasta_entries import RenameFastaEntries
16
+ from .sum_of_pairs_score import SumOfPairsScore
17
+ from .variable_sites import VariableSites
@@ -0,0 +1,16 @@
1
+ from .base import Alignment
2
+
3
+ from typing import Dict
4
+
5
+
6
class AlignmentLength(Alignment):
    """Print the number of columns in the alignment given on the command line."""

    def __init__(self, args) -> None:
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def process_args(self, args) -> Dict[str, str]:
        """Translate the parsed CLI namespace into ``Alignment`` keyword arguments."""
        return {"alignment_file_path": args.alignment}

    def run(self) -> None:
        """Load the alignment and print its length."""
        # Only the first of the three returned values is needed here.
        alignment, _fmt, _is_protein = self.get_alignment_and_format()
        print(alignment.get_alignment_length())
@@ -0,0 +1,69 @@
1
+ from argparse import Namespace
2
+ from typing import Dict, Tuple
3
+ import numpy as np
4
+
5
+ from Bio.Align import MultipleSeqAlignment
6
+
7
+ from .base import Alignment
8
+
9
+
10
class AlignmentLengthNoGaps(Alignment):
    """Report how many alignment columns are free of gap characters.

    ``run`` prints three tab-separated values: the number of gap-free
    columns, the total number of columns, and the gap-free percentage
    rounded to four decimals.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Load the alignment, compute the statistics and print them."""
        alignment, _, is_protein = self.get_alignment_and_format()
        (
            aln_len_no_gaps,
            aln_len,
            aln_len_no_gaps_per,
        ) = self.calculate_alignment_length_no_gaps(alignment, is_protein)
        print(f"{aln_len_no_gaps}\t{aln_len}\t{round(aln_len_no_gaps_per, 4)}")

    def process_args(
        self,
        args: Namespace,
    ) -> Dict[str, str]:
        """Translate the parsed CLI namespace into ``Alignment`` keyword arguments."""
        return dict(alignment_file_path=args.alignment)

    def calculate_alignment_length_no_gaps(
        self,
        alignment: MultipleSeqAlignment,
        is_protein: bool,
    ) -> Tuple[int, int, float]:
        """Return ``(gap_free_columns, total_columns, gap_free_percentage)``.

        The percentage is ``0.0`` for a zero-length alignment instead of
        raising ``ZeroDivisionError``.
        """
        aln_len = alignment.get_alignment_length()
        aln_len_no_gaps = self.get_sites_no_gaps_count(
            alignment,
            aln_len,
            is_protein,
        )

        if aln_len == 0:
            # Nothing to divide by; an empty alignment has no gap-free sites.
            aln_len_no_gaps_per = 0.0
        else:
            aln_len_no_gaps_per = (aln_len_no_gaps / aln_len) * 100

        return aln_len_no_gaps, aln_len, aln_len_no_gaps_per

    def get_sites_no_gaps_count(
        self,
        alignment: MultipleSeqAlignment,
        aln_len: int,
        is_protein: bool,
    ) -> int:
        """Count the columns among the first ``aln_len`` that hold no gap character.

        NOTE(review): ``is_protein`` is accepted but not forwarded;
        ``get_gap_chars()`` is called without arguments exactly as before,
        so the gap alphabet does not depend on the sequence type here.
        """
        # Built once; np.isin wants a sequence rather than a set.
        gap_chars = list(set(self.get_gap_chars()))

        # One row per record, one single-character cell per site.
        alignment_array = np.array(
            [list(str(record.seq)) for record in alignment],
            dtype="U1",
        )

        # No columns requested, or no records (array is not 2-D): nothing to count.
        if aln_len <= 0 or alignment_array.ndim < 2:
            return 0

        # A column is "gapped" when any of its cells is a gap character;
        # evaluating the whole matrix at once avoids a Python loop per column.
        column_has_gap = np.isin(alignment_array[:, :aln_len], gap_chars).any(axis=0)

        return int(np.count_nonzero(~column_has_gap))
@@ -0,0 +1,89 @@
1
+ from os import path
2
+ import sys
3
+ from typing import Dict, List
4
+
5
+ from Bio.Align import MultipleSeqAlignment
6
+
7
+ from .base import Alignment
8
+
9
# Directory of this module; bundled recoding tables are located relative to it.
here = path.dirname(__file__)


class AlignmentRecoding(Alignment):
    """Recode every sequence of an alignment with a character recoding table.

    ``run`` prints the recoded alignment in FASTA layout (``>id`` followed
    by the recoded sequence).
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Load the alignment and the recoding table, then print the result."""
        alignment, _, is_protein = self.get_alignment_and_format()

        # The base ``Alignment`` constructor stores ``code`` wrapped in a
        # one-element tuple, hence the ``[0]``.
        recoding_table = self.read_recoding_table(self.code[0])

        recoded_alignment = self.recode_alignment(
            alignment, recoding_table, is_protein
        )

        for k, v in recoded_alignment.items():
            print(f">{k}\n{''.join(v)}")

    def recode_alignment(
        self,
        alignment: MultipleSeqAlignment,
        recoding_table: Dict[str, str],
        is_protein: bool,
    ) -> Dict[str, List[str]]:
        """Return ``{record id: list of recoded characters}``.

        Gap characters are copied through unchanged. Any other character is
        looked up in ``recoding_table`` by its upper-case form and kept
        as-is when the table has no entry for it.

        NOTE(review): ``is_protein`` is accepted but not used;
        ``get_gap_chars()`` is called without arguments as before.
        """
        # A set gives the same membership answers as the list, with O(1) lookups.
        gap_chars = set(self.get_gap_chars())
        recoded_alignment = dict()

        for record in alignment:
            recoded_sequence = [
                base if base in gap_chars
                else recoding_table.get(base.upper(), base)
                for base in record.seq
            ]
            recoded_alignment[record.id] = recoded_sequence

        return recoded_alignment

    def read_recoding_table(
        self,
        recoding: str
    ) -> Dict[str, str]:
        """Load a recoding table as ``{original character: recoded symbol}``.

        ``recoding`` is either the name of a bundled scheme (for example
        ``"Dayhoff-6"``) or the path of a custom table file. Each line of the
        file holds whitespace-separated fields: the first is the recoded
        symbol and the second is the character it replaces. Both are
        upper-cased. Lines with fewer than two fields (such as blank lines)
        are ignored.

        Exits with status 2 when ``recoding`` is ``None`` or the table file
        cannot be found.
        """
        recoding_table = dict()

        if recoding is None:
            print("Please specify a recoding table")
            sys.exit(2)

        recoding_paths = {
            "RY-nucleotide": "../../recoding_tables/RY-nucleotide.txt",
            "SandR-6": "../../recoding_tables/S_and_R-6.txt",
            "KGB-6": "../../recoding_tables/KGB-6.txt",
            "Dayhoff-6": "../../recoding_tables/Dayhoff-6.txt",
            "Dayhoff-9": "../../recoding_tables/Dayhoff-9.txt",
            "Dayhoff-12": "../../recoding_tables/Dayhoff-12.txt",
            "Dayhoff-15": "../../recoding_tables/Dayhoff-15.txt",
            "Dayhoff-18": "../../recoding_tables/Dayhoff-18.txt",
        }
        pathing = recoding_paths.get(recoding, str(recoding))

        if recoding not in recoding_paths and path.isfile(pathing):
            # Custom table given as a path that exists as written (absolute
            # or relative to the working directory): use it directly.
            table_path = pathing
        else:
            # Bundled scheme, or the previous package-relative lookup for
            # custom paths that do not exist as written.
            table_path = path.join(here, pathing)

        try:
            with open(table_path) as handle:
                for line in handle:
                    parts = line.split()
                    if len(parts) < 2:
                        # Blank or malformed line: nothing to map.
                        continue
                    recoding_table[parts[1].upper()] = parts[0].upper()
        except FileNotFoundError:
            print(f"Recoding table file '{pathing}' not found.")
            sys.exit(2)

        return recoding_table

    def process_args(self, args):
        """Translate the parsed CLI namespace into ``Alignment`` keyword arguments."""
        return dict(
            alignment_file_path=args.alignment,
            code=args.code
        )
@@ -0,0 +1,103 @@
1
+ from collections import Counter
2
+ import sys
3
+ import numpy as np
4
+
5
+ from typing import List
6
+
7
+ from ..base import BaseService
8
+ from ...helpers.files import (
9
+ get_alignment_and_format as get_alignment_and_format_helper
10
+ )
11
+
12
+
13
class Alignment(BaseService):
    """Base class for alignment services.

    Stores the command-line options shared by the alignment commands and
    provides helpers for loading the alignment, computing relative
    composition variability and listing gap characters.
    """

    def __init__(
        self,
        *args,
        alignment_file_path=None,
        code=None,
        fasta=None,
        output_file_path=None,
        protein_file_path=None,
        nucleotide_file_path=None,
        alignment_list_path=None,
        prefix=None,
        idmap=None,
        reference=None,
        verbose=None,
        entry=None,
        exclude_gaps=None,
    ):
        """Store every option as an attribute; positional ``args`` are ignored."""
        self.alignment_file_path = alignment_file_path
        # Kept as a one-element tuple: code in this package reads ``self.code[0]``.
        self.code = (code,)
        self.output_file_path = output_file_path
        # Also stored as a one-element tuple, as before.
        # NOTE(review): confirm whether consumers rely on the tuple form.
        self.protein_file_path = (protein_file_path,)
        self.nucleotide_file_path = nucleotide_file_path
        self.alignment_list_path = alignment_list_path
        self.prefix = prefix
        self.fasta = fasta
        self.idmap = idmap
        self.reference = reference
        self.verbose = verbose
        self.entry = entry
        self.exclude_gaps = exclude_gaps

    def get_alignment_and_format(self):
        """Load ``self.alignment_file_path`` via the shared file helper.

        Returns whatever the helper returns (callers unpack three values,
        the first being the alignment). Exits with status 2 when the file
        does not exist.
        """
        try:
            return get_alignment_and_format_helper(self.alignment_file_path)
        except FileNotFoundError:
            print("Input corresponds to no such file or directory.")
            print("Please double check pathing and filenames")
            sys.exit(2)

    def calculate_rcv(self) -> float:
        """Return the relative composition variability of the alignment.

        For every sequence, the absolute differences between its character
        counts and the per-character average across sequences are summed,
        divided by ``number of sequences * alignment length``, and the
        per-sequence values are added up.
        """
        alignment, _, _ = self.get_alignment_and_format()
        aln_len = alignment.get_alignment_length()
        num_records = len(alignment)

        # One row per record, one single-character cell per site.
        alignment_array = np.array([
            list(str(record.seq)) for record in alignment
        ], dtype='U1')

        unique_chars = np.unique(alignment_array)

        # count_matrix[i, j] = occurrences of unique_chars[j] in sequence i.
        count_matrix = np.zeros((num_records, len(unique_chars)), dtype=np.int32)
        for char_idx, char in enumerate(unique_chars):
            # Count this character in every sequence at once.
            count_matrix[:, char_idx] = np.sum(alignment_array == char, axis=1)

        # Average count of each character across sequences.
        total_counts = np.sum(count_matrix, axis=0)
        average_counts = total_counts / num_records

        # Per-sequence deviation from the average composition.
        abs_diffs = np.abs(count_matrix - average_counts)
        seq_rcv_sums = np.sum(abs_diffs, axis=1)

        # Normalize and sum
        indiv_rcv_values = seq_rcv_sums / (num_records * aln_len)
        relative_composition_variability = np.sum(indiv_rcv_values)

        return float(relative_composition_variability)

    @staticmethod
    def get_gap_chars(is_protein: bool = True) -> List[str]:
        """Return the characters treated as gaps or ambiguous symbols.

        The nucleotide list additionally contains ``N`` and ``n``.

        This is a static method so that both ``self.get_gap_chars()`` and
        ``self.get_gap_chars(is_protein)`` work. The default of ``True``
        keeps the result that no-argument calls on an instance have always
        produced: without ``self`` in the signature, the (truthy) instance
        itself used to be bound to ``is_protein``.
        """
        if is_protein:
            return ["-", "?", "*", "X", "x"]
        else:
            return ["-", "?", "*", "X", "x", "N", "n"]
@@ -0,0 +1,66 @@
1
+ from typing import Dict, List, Tuple
2
+ import numpy as np
3
+
4
+ from Bio import AlignIO
5
+ from Bio.Align import MultipleSeqAlignment
6
+
7
+ from .base import Alignment
8
+
9
+
10
class ColumnScore(Alignment):
    """Compare the columns of a query alignment with those of a reference.

    The printed score is the number of distinct column strings shared by
    both alignments divided by the number of query columns, rounded to four
    decimals.
    """

    def __init__(self, args) -> None:
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def process_args(self, args) -> Dict[str, str]:
        """Translate the parsed CLI namespace into ``Alignment`` keyword arguments."""
        return {"fasta": args.fasta, "reference": args.reference}

    def run(self) -> None:
        """Read both FASTA alignments and print the column score."""
        query_alignment = AlignIO.read(self.fasta, "fasta")
        reference_alignment = AlignIO.read(self.reference, "fasta")

        # Every column becomes one upper-cased string, read top to bottom.
        ref_columns, query_columns = self.get_columns_from_alignments(
            reference_alignment, query_alignment
        )

        matched, total = self.calculate_matches_between_ref_and_query_columns(
            ref_columns, query_columns
        )

        score = matched / total
        print(round(score, 4))

    def get_columns_from_alignments(
        self,
        reference_records: MultipleSeqAlignment,
        query_records: MultipleSeqAlignment,
    ) -> Tuple[List[str], List[str]]:
        """Return ``(reference columns, query columns)`` as lists of strings."""

        def columns_of(records) -> List[str]:
            # Rows are records, cells are single upper-cased characters.
            matrix = np.array(
                [[residue.upper() for residue in str(rec.seq)] for rec in records],
                dtype='U1',
            )
            return [''.join(matrix[:, col]) for col in range(matrix.shape[1])]

        ref_columns = columns_of(reference_records)
        query_columns = columns_of(query_records)
        return ref_columns, query_columns

    def calculate_matches_between_ref_and_query_columns(
        self,
        ref_columns: List[str],
        query_columns: List[str],
    ) -> Tuple[int, int]:
        """Return ``(shared distinct columns, number of query columns)``.

        Columns are compared as sets, so a column string that occurs several
        times counts once among the matches.
        """
        shared = set(ref_columns) & set(query_columns)
        return len(shared), len(query_columns)
@@ -0,0 +1,98 @@
1
+ from typing import Dict, List, Tuple, Union
2
+ from collections import Counter
3
+ import numpy as np
4
+
5
+ from scipy.stats import chisquare, false_discovery_control
6
+ from scipy.stats._stats_py import Power_divergenceResult
7
+ from Bio.Align import MultipleSeqAlignment
8
+
9
+ from .base import Alignment
10
+
11
+
12
class CompositionalBiasPerSite(Alignment):
    """Chi-square test of character composition for every alignment column.

    ``run`` prints one tab-separated line per column: the 1-based column
    number, the chi-square statistic, the multiple-testing-corrected
    p-value (or ``nan``) and the uncorrected p-value.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Load the alignment, test every column and print the results."""
        alignment, _, is_protein = self.get_alignment_and_format()

        stat_res, p_vals_corrected = \
            self.calculate_compositional_bias_per_site(alignment)

        for idx, (stat_info, pval_cor) in enumerate(
            zip(stat_res, p_vals_corrected), start=1
        ):
            # Columns without a usable p-value carry the string "nan".
            pval_cor_str = "nan" if isinstance(pval_cor, str) else round(pval_cor, 4)
            print(f"{idx}\t{round(stat_info.statistic, 4)}\t{pval_cor_str}\t{round(stat_info.pvalue, 4)}")

    def process_args(self, args) -> Dict[str, str]:
        """Translate the parsed CLI namespace into ``Alignment`` keyword arguments."""
        return dict(alignment_file_path=args.alignment)

    def get_number_of_occurrences_per_character(
        self,
        alignment: MultipleSeqAlignment,
        idx: int,
    ) -> List[int]:
        """Return the count of each distinct non-gap character in column ``idx``.

        Characters are upper-cased before counting.
        """
        gap_chars = self.get_gap_chars()
        seq_at_position = alignment[:, idx].upper()
        filtered_seq = "".join([char for char in seq_at_position if char not in gap_chars])

        return list(Counter(filtered_seq).values())

    def calculate_compositional_bias_per_site(
        self,
        alignment: MultipleSeqAlignment,
    ) -> Tuple[
        List[Power_divergenceResult],
        List[Union[float, str]],
    ]:
        """Run a chi-square test on every column and correct the p-values.

        Returns two lists with one entry per column, in column order: the
        chi-square results, and the corrected p-values. A column whose
        p-value is NaN, or that holds only gap characters, gets the string
        ``"nan"`` in the second list and is left out of the correction.
        """
        aln_len = alignment.get_alignment_length()
        # Built once; np.isin wants a sequence rather than a set.
        gap_chars = list(set(self.get_gap_chars()))

        # One row per record, one upper-cased single-character cell per site.
        alignment_array = np.array([
            [c.upper() for c in str(record.seq)]
            for record in alignment
        ], dtype='U1')

        stat_res = []
        p_vals = []
        nan_idx = []  # column indices without a usable p-value, ascending

        for col_idx in range(aln_len):
            column = alignment_array[:, col_idx]

            # Drop gap characters before counting.
            filtered_column = column[~np.isin(column, gap_chars)]

            if len(filtered_column) > 0:
                _, counts = np.unique(filtered_column, return_counts=True)

                chisquare_res = chisquare(counts)
                stat_res.append(chisquare_res)

                if not np.isnan(chisquare_res.pvalue):
                    p_vals.append(chisquare_res.pvalue)
                else:
                    nan_idx.append(col_idx)
            else:
                # Column holds only gap characters: keep a placeholder result
                # so both lists stay aligned with the columns.
                stat_res.append(chisquare([1]))
                nan_idx.append(col_idx)

        # Correct only the usable p-values.
        if p_vals:
            p_vals_corrected = list(false_discovery_control(p_vals))
        else:
            p_vals_corrected = []

        # Put the placeholders back at their column positions. This must go
        # in ascending order: when index i is inserted, every earlier
        # position is already final, so the placeholder lands exactly at
        # column i. Descending order would insert into a list that is still
        # too short and shift the entries.
        for idx in nan_idx:
            p_vals_corrected.insert(idx, "nan")

        return stat_res, p_vals_corrected
98
+ return stat_res, p_vals_corrected