phykit 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phykit/__init__.py +0 -0
- phykit/__main__.py +6 -0
- phykit/helpers/__init__.py +0 -0
- phykit/helpers/boolean_argument_parsing.py +12 -0
- phykit/helpers/caching.py +201 -0
- phykit/helpers/files.py +125 -0
- phykit/helpers/parallel.py +305 -0
- phykit/helpers/stats_summary.py +64 -0
- phykit/helpers/streaming.py +152 -0
- phykit/phykit.py +2862 -0
- phykit/services/__init__.py +0 -0
- phykit/services/alignment/__init__.py +17 -0
- phykit/services/alignment/alignment_length.py +16 -0
- phykit/services/alignment/alignment_length_no_gaps.py +69 -0
- phykit/services/alignment/alignment_recoding.py +89 -0
- phykit/services/alignment/base.py +103 -0
- phykit/services/alignment/column_score.py +66 -0
- phykit/services/alignment/compositional_bias_per_site.py +98 -0
- phykit/services/alignment/create_concatenation_matrix.py +254 -0
- phykit/services/alignment/dna_threader.py +145 -0
- phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
- phykit/services/alignment/faidx.py +21 -0
- phykit/services/alignment/gc_content.py +94 -0
- phykit/services/alignment/pairwise_identity.py +159 -0
- phykit/services/alignment/parsimony_informative_sites.py +81 -0
- phykit/services/alignment/rcv.py +14 -0
- phykit/services/alignment/rcvt.py +47 -0
- phykit/services/alignment/rename_fasta_entries.py +53 -0
- phykit/services/alignment/sum_of_pairs_score.py +157 -0
- phykit/services/alignment/variable_sites.py +54 -0
- phykit/services/base.py +9 -0
- phykit/services/tree/__init__.py +29 -0
- phykit/services/tree/base.py +178 -0
- phykit/services/tree/bipartition_support_stats.py +48 -0
- phykit/services/tree/branch_length_multiplier.py +37 -0
- phykit/services/tree/collapse_branches.py +27 -0
- phykit/services/tree/covarying_evolutionary_rates.py +272 -0
- phykit/services/tree/dvmc.py +37 -0
- phykit/services/tree/evolutionary_rate.py +17 -0
- phykit/services/tree/hidden_paralogy_check.py +128 -0
- phykit/services/tree/internal_branch_stats.py +77 -0
- phykit/services/tree/internode_labeler.py +33 -0
- phykit/services/tree/last_common_ancestor_subtree.py +35 -0
- phykit/services/tree/lb_score.py +196 -0
- phykit/services/tree/monophyly_check.py +106 -0
- phykit/services/tree/nearest_neighbor_interchange.py +140 -0
- phykit/services/tree/patristic_distances.py +113 -0
- phykit/services/tree/polytomy_test.py +546 -0
- phykit/services/tree/print_tree.py +28 -0
- phykit/services/tree/prune_tree.py +40 -0
- phykit/services/tree/rename_tree_tips.py +64 -0
- phykit/services/tree/rf_distance.py +136 -0
- phykit/services/tree/root_tree.py +35 -0
- phykit/services/tree/saturation.py +209 -0
- phykit/services/tree/spurious_sequence.py +75 -0
- phykit/services/tree/terminal_branch_stats.py +87 -0
- phykit/services/tree/tip_labels.py +18 -0
- phykit/services/tree/tip_to_tip_distance.py +41 -0
- phykit/services/tree/tip_to_tip_node_distance.py +41 -0
- phykit/services/tree/total_tree_length.py +25 -0
- phykit/services/tree/treeness.py +16 -0
- phykit/services/tree/treeness_over_rcv.py +40 -0
- phykit/version.py +1 -0
- phykit-2.1.0.dist-info/METADATA +150 -0
- phykit-2.1.0.dist-info/RECORD +69 -0
- phykit-2.1.0.dist-info/WHEEL +5 -0
- phykit-2.1.0.dist-info/entry_points.txt +121 -0
- phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
- phykit-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
from typing import Dict, List, Set, Tuple
|
|
2
|
+
from functools import lru_cache
|
|
3
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
4
|
+
import pickle
|
|
5
|
+
|
|
6
|
+
from Bio.Phylo import Newick
|
|
7
|
+
|
|
8
|
+
from .base import Tree
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class RobinsonFouldsDistance(Tree):
    """Compute the Robinson-Foulds (RF) distance between two phylogenies.

    ``run`` restricts both trees to the tips they share, roots them on the
    same tip, and prints the plain and the normalized RF distance.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Print the plain and normalized RF distance, tab-separated."""
        tree_zero = self.read_tree_file()
        tree_one = self.read_tree1_file()

        # Only tips present in both trees can be compared.
        tree_zero_tips = set(self.get_tip_names_from_tree(tree_zero))
        tree_one_tips = set(self.get_tip_names_from_tree(tree_one))
        shared_tree_tips = tree_zero_tips & tree_one_tips

        # Prune each tree down to the shared tip set.
        tree_zero_tips_to_prune = list(tree_zero_tips - shared_tree_tips)
        tree_one_tips_to_prune = list(tree_one_tips - shared_tree_tips)

        if tree_zero_tips_to_prune:
            tree_zero = self.prune_tree_using_taxa_list(
                tree_zero, tree_zero_tips_to_prune
            )
        if tree_one_tips_to_prune:
            tree_one = self.prune_tree_using_taxa_list(
                tree_one, tree_one_tips_to_prune
            )

        # Root both trees on the same tip so that the clade-by-clade
        # comparison below looks at equivalent splits.
        tip_for_rooting = tree_zero.get_terminals()[0].name
        tree_zero.root_with_outgroup(tip_for_rooting)
        tree_one.root_with_outgroup(tip_for_rooting)

        plain_rf, normalized_rf = self.calculate_robinson_foulds_distance(
            tree_zero, tree_one
        )

        print(f"{plain_rf}\t{round(normalized_rf, 4)}")

    def process_args(self, args) -> Dict[str, str]:
        """Map the parsed CLI arguments onto the base-class keyword arguments."""
        return dict(
            tree_file_path=args.tree_zero,
            tree1_file_path=args.tree_one,
        )

    @staticmethod
    def _normalize_rf(plain_rf: int, tip_count: int) -> float:
        """Divide *plain_rf* by ``2 * (tip_count - 3)``.

        With three or fewer tips that denominator is zero or negative, so
        the plain division would raise ``ZeroDivisionError`` (3 tips) or
        yield ``-0.0`` (fewer). 0.0 is returned in that case instead.
        """
        max_rf = 2 * (tip_count - 3)
        if max_rf <= 0:
            return 0.0
        return plain_rf / max_rf

    def calculate_robinson_foulds_distance(self, tree_zero, tree_one):
        """Return ``(plain_rf, normalized_rf)`` for two trees.

        The plain distance counts, in both directions, the non-root internal
        clades of one tree whose tip set is not recovered in the other.
        """
        plain_rf = 0
        plain_rf = self.compare_trees_optimized(plain_rf, tree_zero, tree_one)
        plain_rf = self.compare_trees_optimized(plain_rf, tree_one, tree_zero)

        normalized_rf = self._normalize_rf(plain_rf, tree_zero.count_terminals())

        return plain_rf, normalized_rf

    def compare_trees_optimized(
        self,
        plain_rf: int,
        tree_zero: Newick.Tree,
        tree_one: Newick.Tree
    ) -> int:
        """Add to *plain_rf* the clades of *tree_zero* missing from *tree_one*.

        Returns the updated counter.
        """
        # Tip sets keyed by id(clade); the clades stay alive for the whole
        # call, so the ids cannot be reused while the cache is in use.
        tip_names_cache = {}

        def get_cached_tips(clade):
            clade_id = id(clade)
            if clade_id not in tip_names_cache:
                tip_names_cache[clade_id] = frozenset(
                    self.get_tip_names_from_tree(clade)
                )
            return tip_names_cache[clade_id]

        # [1:] skips the first non-terminal returned (the root clade).
        for clade_zero in tree_zero.get_nonterminals()[1:]:
            tip_names_zero = get_cached_tips(clade_zero)
            # Smallest clade of tree_one that contains all of those tips.
            clade_one = tree_one.common_ancestor(list(tip_names_zero))
            tip_names_one = get_cached_tips(clade_one)
            # A differing tip set means this clade is not present in tree_one.
            if tip_names_zero != tip_names_one:
                plain_rf += 1

        return plain_rf

    @staticmethod
    def _calculate_rf_batch(tree_pairs_pickle):
        """Calculate RF distances for one pickled batch of tree pairs.

        *tree_pairs_pickle* is the ``pickle.dumps`` payload produced by
        ``calculate_multiple_rf_distances``. It is unpickled here, so it must
        never come from an untrusted source.

        Returns a list of ``(plain_rf, normalized_rf)`` tuples, one per pair.
        """
        tree_pairs = pickle.loads(tree_pairs_pickle)
        results = []

        for tree_zero, tree_one in tree_pairs:
            # Bare instance: __init__ is bypassed on purpose so that no file
            # paths are needed inside the worker.
            rf_calc = RobinsonFouldsDistance.__new__(RobinsonFouldsDistance)
            rf_calc.__dict__.update({'tree_format': 'newick'})

            # NOTE(review): get_all_bipartitions is not defined in this
            # class; presumably it is inherited from Tree -- confirm. This
            # path also uses a different algorithm from
            # calculate_robinson_foulds_distance, so verify both agree.
            bipartitions_zero = rf_calc.get_all_bipartitions(tree_zero)
            bipartitions_one = rf_calc.get_all_bipartitions(tree_one)

            # Bipartitions found in exactly one of the two trees.
            plain_rf = len(bipartitions_zero ^ bipartitions_one)
            normalized_rf = RobinsonFouldsDistance._normalize_rf(
                plain_rf, tree_zero.count_terminals()
            )

            results.append((plain_rf, normalized_rf))

        return results

    def calculate_multiple_rf_distances(self, tree_pairs: List[Tuple]) -> List[Tuple[int, float]]:
        """Calculate RF distances for multiple tree pairs.

        Fewer than five pairs are processed sequentially; larger inputs are
        split into batches and sent to at most four worker processes.
        Results are returned in the order of *tree_pairs*.
        """
        if len(tree_pairs) < 5:
            # Sequential for small datasets
            return [
                self.calculate_robinson_foulds_distance(tree_zero, tree_one)
                for tree_zero, tree_one in tree_pairs
            ]

        # Parallel processing for larger datasets
        batch_size = max(2, len(tree_pairs) // 4)
        batches = [
            tree_pairs[i:i + batch_size]
            for i in range(0, len(tree_pairs), batch_size)
        ]

        with ProcessPoolExecutor(max_workers=min(4, len(batches))) as executor:
            futures = [
                executor.submit(self._calculate_rf_batch, pickle.dumps(batch))
                for batch in batches
            ]

            # Collect in submission order so results line up with the input.
            all_results = []
            for future in futures:
                all_results.extend(future.result())

        return all_results
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
from Bio import Phylo
|
|
3
|
+
|
|
4
|
+
from .base import Tree
|
|
5
|
+
|
|
6
|
+
from ...helpers.files import read_single_column_file_to_list
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class RootTree(Tree):
    """Root a phylogeny on an outgroup and write the result to a file."""

    def __init__(self, args) -> None:
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def run(self):
        """Root a copy of the input tree on the outgroup taxa and save it."""
        # Work on a deep copy to avoid modifying the cached tree.
        rooted = copy.deepcopy(self.read_tree_file())

        # The outgroup taxa are read from a single-column file.
        outgroup_taxa = read_single_column_file_to_list(
            self.outgroup_taxa_file_path
        )

        Phylo.BaseTree.Tree.root_with_outgroup(rooted, outgroup_taxa)

        self.write_tree_file(rooted, self.output_file_path)

    def process_args(self, args):
        """Build the constructor keyword arguments from the CLI arguments.

        When no output path is given, ``<tree path>.rooted`` is used.
        """
        tree_path = args.tree
        destination = args.output or f"{tree_path}.rooted"

        return {
            "tree_file_path": tree_path,
            "outgroup_taxa_file_path": args.root,
            "output_file_path": destination,
        }
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
import itertools
|
|
3
|
+
import sys
|
|
4
|
+
from typing import Dict, List, Tuple
|
|
5
|
+
import multiprocessing as mp
|
|
6
|
+
from functools import partial
|
|
7
|
+
|
|
8
|
+
from Bio import Align
|
|
9
|
+
from Bio.Phylo import Newick
|
|
10
|
+
import numpy as np
|
|
11
|
+
from sklearn.linear_model import LinearRegression
|
|
12
|
+
|
|
13
|
+
from .base import Tree
|
|
14
|
+
from ...helpers.files import (
|
|
15
|
+
get_alignment_and_format as get_alignment_and_format_helper
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class FileFormat(Enum):
    """Names of alignment file formats.

    Each member's value is the format name as a plain string.

    NOTE(review): the values look like Biopython ``AlignIO`` format
    identifiers -- confirm against the code that consumes this enum; it is
    not referenced by the other code visible in this module.
    """

    fasta = "fasta"
    clustal = "clustal"
    maf = "maf"
    mauve = "mauve"
    phylip = "phylip"
    # The next two member names are abbreviated; the values keep the
    # hyphenated spelling.
    phylip_seq = "phylip-sequential"
    phylip_rel = "phylip-relaxed"
    stockholm = "stockholm"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Saturation(Tree):
    """Estimate saturation from patristic and uncorrected pairwise distances.

    For every pair of tips the patristic distance (from the tree) and the
    uncorrected distance (from the alignment) are computed, and the
    uncorrected distances are regressed on the patristic ones with the
    intercept fixed at zero.
    """

    # Below this number of tip pairs everything is computed in-process.
    _PARALLEL_MIN_COMBOS = 50
    # Upper bound on the number of worker processes.
    _MAX_WORKERS = 8

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Compute all pairwise distances, fit the slope and print the result."""
        alignment, _, _ = get_alignment_and_format_helper(
            self.alignment_file_path
        )

        tree = self.read_tree_file()

        tips = self.get_tip_names_from_tree(tree)
        combos = list(itertools.combinations(tips, 2))

        (
            patristic_distances,
            uncorrected_distances,
        ) = self.loop_through_combos_and_calculate_pds_and_pis(
            combos, alignment, tree, self.exclude_gaps
        )

        # A pair with no comparable site has a NaN uncorrected distance.
        # The regression rejects NaN input, so such pairs are left out of
        # the fit (they are still reported in verbose output).
        x = np.asarray(patristic_distances, dtype=float)
        y = np.asarray(uncorrected_distances, dtype=float)
        usable = np.isfinite(x) & np.isfinite(y)

        # calculate slope and fit the y-intercept to zero
        # Fitting the y-intercept to zero follows Jeffroy et al.
        # See fig 2 https://www.cell.com/trends/genetics/fulltext/S0168-9525(06)00051-5
        model = LinearRegression(fit_intercept=False)
        model.fit(x[usable].reshape(-1, 1), y[usable])

        self.print_res(
            self.verbose, combos, uncorrected_distances, patristic_distances, model.coef_[0]
        )

    def process_args(self, args) -> Dict[str, str]:
        """Map the parsed CLI arguments onto the constructor keyword arguments."""
        return dict(
            tree_file_path=args.tree,
            alignment_file_path=args.alignment,
            exclude_gaps=args.exclude_gaps,
            verbose=args.verbose,
        )

    def _process_combo_batch(self, tree, seq_arrays, gap_mask, exclude_gaps, combo_batch):
        """Compute ``(patristic, uncorrected)`` distances for a batch of pairs.

        Args:
            tree: tree used for the patristic distances.
            seq_arrays: mapping of taxon name to its per-character array.
            gap_mask: mapping of taxon name to a boolean gap mask; only
                read when *exclude_gaps* is true.
            exclude_gaps: ignore positions where either sequence has a gap.
            combo_batch: iterable of ``(name_a, name_b)`` pairs.

        Returns:
            One ``(patristic_distance, uncorrected_distance)`` tuple per
            pair, in input order. The uncorrected distance is NaN when
            gaps are excluded and no position is left to compare.
        """
        results = []
        for combo in combo_batch:
            # Patristic distance: path length between the two tips.
            pd = tree.distance(combo[0], combo[1])

            seq1_arr = seq_arrays[combo[0]]
            seq2_arr = seq_arrays[combo[1]]

            if exclude_gaps:
                # Keep only the positions that are gap-free in both sequences.
                valid_positions = ~(gap_mask[combo[0]] | gap_mask[combo[1]])

                if np.any(valid_positions):
                    matches = seq1_arr[valid_positions] == seq2_arr[valid_positions]
                    identities = np.sum(matches)
                    adjusted_len = np.sum(valid_positions)
                    ud = 1 - (identities / adjusted_len)
                else:
                    ud = float('nan')
            else:
                matches = seq1_arr == seq2_arr
                identities = np.sum(matches)
                ud = 1 - (identities / len(seq1_arr))

            results.append((pd, ud))
        return results

    def loop_through_combos_and_calculate_pds_and_pis(
        self,
        combos: List[Tuple[str, str]],
        alignment: Align.MultipleSeqAlignment,
        tree: Newick.Tree,
        exclude_gaps: bool,
    ) -> Tuple[
        List[float],
        List[float]
    ]:
        """
        loop through all taxon combinations and determine
        their patristic distance and pairwise identity

        Returns two lists, patristic and uncorrected distances, each in
        the order of *combos*.
        """
        gap_chars = list(self.get_gap_chars())

        # Convert sequences to upper-cased per-character numpy arrays for
        # vectorized comparison. The arrays are keyed by record.name, which
        # therefore has to match the tip names used in *combos*.
        seq_arrays = {}
        gap_mask = {}
        for record in alignment:
            seq_arr = np.array([c.upper() for c in str(record.seq)], dtype='U1')
            seq_arrays[record.name] = seq_arr
            if exclude_gaps:
                gap_mask[record.name] = np.isin(seq_arr, gap_chars)

        if len(combos) < self._PARALLEL_MIN_COMBOS:
            # Small input: a single in-process batch, same code as the workers.
            pairs = self._process_combo_batch(
                tree, seq_arrays, gap_mask, exclude_gaps, combos
            )
        else:
            # Use multiprocessing for larger datasets
            num_workers = min(mp.cpu_count(), self._MAX_WORKERS)
            chunk_size = max(1, len(combos) // (num_workers * 4))
            combo_chunks = [
                combos[i:i + chunk_size]
                for i in range(0, len(combos), chunk_size)
            ]

            # Bind everything except the chunk itself.
            process_func = partial(
                self._process_combo_batch,
                tree,
                seq_arrays,
                gap_mask,
                exclude_gaps
            )

            with mp.Pool(processes=num_workers) as pool:
                chunk_results = pool.map(process_func, combo_chunks)

            # pool.map keeps chunk order, so flattening keeps combo order.
            pairs = [pair for chunk in chunk_results for pair in chunk]

        patristic_distances = [pd for pd, _ in pairs]
        uncorrected_distances = [ud for _, ud in pairs]

        return patristic_distances, uncorrected_distances

    def print_res(
        self,
        verbose: bool,
        combos: List[Tuple[str, str]],
        uncorrected_distances: List[float],
        patristic_distances: List[float],
        slope: float,
    ) -> None:
        """
        print results to stdout

        Verbose mode prints one tab-separated line per pair (both names,
        uncorrected distance, patristic distance). Otherwise a single line
        with the slope and ``|1 - slope|`` is printed. All values are
        rounded to four decimals.
        """
        try:
            if verbose:
                for cbo, dist, pd in zip(
                    combos, uncorrected_distances, patristic_distances
                ):
                    print(
                        f"{cbo[0]}\t{cbo[1]}\t{round(dist,4)}\t{round(pd, 4)}"
                    )
            else:
                print(f"{round(slope, 4)}\t{abs(round(1-slope, 4))}")
        except BrokenPipeError:
            # Deliberately ignored: a closed stdout is not treated as an error.
            pass
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import statistics as stat
|
|
2
|
+
from typing import Dict, List, Tuple
|
|
3
|
+
|
|
4
|
+
from Bio.Phylo import Newick
|
|
5
|
+
|
|
6
|
+
from .base import Tree
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SpuriousSequence(Tree):
    """Flag tips whose terminal branch is unusually long.

    A tip is reported when its terminal branch length is at least
    ``factor`` times the median terminal branch length.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Print one line per flagged tip, or ``None`` when nothing is flagged.

        Each line holds the tip name, its branch length, the threshold and
        the median, tab-separated and rounded to four decimals. Exits with
        status 2 when the tree has no terminal branch lengths.
        """
        tree = self.read_tree_file()
        try:
            name_and_branch_len, threshold, median = \
                self.identify_spurious_sequence(
                    tree, self.factor
                )
        except stat.StatisticsError:
            # The median of an empty list is undefined; this happens when no
            # terminal has a branch length. Report it instead of a traceback.
            print(
                "Identifying spurious sequences requires a phylogeny with branch lengths."
            )
            raise SystemExit(2) from None

        flagged = [
            (name, length)
            for name, length in name_and_branch_len.items()
            if length >= threshold
        ]

        if not flagged:
            print("None")
            return

        try:
            for name, length in flagged:
                print(
                    f"{name}\t{round(length, 4)}\t{round(threshold, 4)}\t{round(median, 4)}"
                )
        except BrokenPipeError:
            # Deliberately ignored: a closed stdout is not treated as an error.
            pass

    def process_args(self, args) -> Dict[str, str]:
        """Map the CLI arguments onto constructor keyword arguments.

        A missing (or falsy) factor falls back to 20.
        """
        return dict(
            tree_file_path=args.tree,
            factor=args.factor or 20
        )

    def identify_spurious_sequence(
        self,
        tree: Newick.Tree,
        factor: float,
    ) -> Tuple[
        Dict[str, float],
        float,
        float
    ]:
        """Return ``(name -> branch length, threshold, median)``.

        The threshold is the median terminal branch length times *factor*.

        Raises:
            statistics.StatisticsError: if no terminal has a branch length.
        """
        branch_lengths, name_and_branch_len = \
            self.get_branch_lengths_and_their_names(tree)

        median = stat.median(branch_lengths)

        threshold = median * factor

        return name_and_branch_len, threshold, median

    def get_branch_lengths_and_their_names(
        self,
        tree: Newick.Tree,
    ) -> Tuple[
        List[float],
        Dict[str, float],
    ]:
        """Collect the terminal branch lengths of *tree*.

        Returns the list of lengths and a mapping of tip name to length.
        Terminals without a branch length are skipped.
        """
        branch_lengths = []
        name_and_branch_len = {}

        # collect terminal branch lengths only for spurious sequence detection
        # (internal branches are not considered for spurious sequence detection)
        for terminal in tree.get_terminals():
            length = terminal.branch_length
            if length is None:
                continue
            branch_lengths.append(length)
            name_and_branch_len[terminal.name] = length

        return branch_lengths, name_and_branch_len
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from typing import Dict, List, Tuple, Union
|
|
3
|
+
|
|
4
|
+
from Bio.Phylo import Newick
|
|
5
|
+
|
|
6
|
+
from .base import Tree
|
|
7
|
+
|
|
8
|
+
from ...helpers.stats_summary import (
|
|
9
|
+
calculate_summary_statistics_from_arr,
|
|
10
|
+
print_summary_statistics,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TerminalBranchStats(Tree):
    """Summary statistics of the terminal branch lengths of a phylogeny."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Print every terminal branch length (verbose) or their summary statistics."""
        tree = self.read_tree_file()
        _, stats, lengths_and_names = \
            self.calculate_terminal_branch_stats(tree)

        if not self.verbose:
            print_summary_statistics(stats)
            return

        try:
            for length, name in lengths_and_names:
                print(round(length, 4), name)
        except BrokenPipeError:
            # Deliberately ignored: a closed stdout is not treated as an error.
            pass

    def process_args(self, args) -> Dict[str, str]:
        """Map the parsed CLI arguments onto the constructor keyword arguments."""
        return dict(tree_file_path=args.tree, verbose=args.verbose)

    def get_terminal_branch_lengths(
        self,
        tree: Newick.Tree,
    ) -> Tuple[
        List[float],
        List[List[Union[float, str]]],
    ]:
        """
        loop through tree and get all terminal branch lengths

        Returns the list of lengths and a parallel list of
        ``[length, tip name]`` pairs. Terminals without a branch length are
        skipped.
        """
        terminal_branch_lengths = []
        lengths_and_names = []
        for terminal_branch in tree.get_terminals():
            length = terminal_branch.branch_length
            if length is None:
                continue
            terminal_branch_lengths.append(length)
            lengths_and_names.append([length, terminal_branch.name])

        return terminal_branch_lengths, lengths_and_names

    def check_tree_has_branch_lengths(
        self,
        terminal_branch_lengths: List[float],
    ) -> None:
        """
        if tree has no branch lengths, exit

        Prints an explanatory message and exits with status 2.
        """
        if len(terminal_branch_lengths) == 0:
            print(
                "Calculating terminal branch statistics requires a phylogeny with branch lengths."
            )
            sys.exit(2)

    def calculate_terminal_branch_stats(
        self,
        tree: Newick.Tree,
    ) -> Tuple[
        List[float],
        Dict[str, float],
        List[List[Union[float, str]]],
    ]:
        """Return ``(lengths, summary statistics, [length, name] pairs)``.

        Exits with status 2 when the tree has no terminal branch lengths.
        """
        terminal_branch_lengths, lengths_and_names = \
            self.get_terminal_branch_lengths(tree)

        # Summary statistics are only computed for a non-empty list.
        self.check_tree_has_branch_lengths(terminal_branch_lengths)

        stats = calculate_summary_statistics_from_arr(terminal_branch_lengths)

        return terminal_branch_lengths, stats, lengths_and_names
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from .base import Tree
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TipLabels(Tree):
    """Print the tip labels of a phylogeny."""

    def __init__(self, args) -> None:
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def run(self) -> None:
        """Write the name of every terminal to stdout, one per line."""
        terminals = self.read_tree_file().get_terminals()
        try:
            print("\n".join(terminal.name for terminal in terminals))
        except BrokenPipeError:
            # Deliberately ignored: a closed stdout is not treated as an error.
            pass

    def process_args(self, args) -> Dict[str, str]:
        """Map the parsed CLI arguments onto the constructor keyword arguments."""
        return {"tree_file_path": args.tree}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from typing import Dict
|
|
3
|
+
|
|
4
|
+
from Bio.Phylo.BaseTree import TreeMixin
|
|
5
|
+
from Bio.Phylo import Newick
|
|
6
|
+
|
|
7
|
+
from .base import Tree
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TipToTipDistance(Tree):
    """Print the distance between two tips of a phylogeny."""

    def __init__(self, args) -> None:
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def run(self):
        """Validate both tip names, then print their distance (4 decimals)."""
        tree_zero = self.read_tree_file()

        self.check_leaves(tree_zero, self.tip_1, self.tip_2)

        distance = TreeMixin.distance(tree_zero, self.tip_1, self.tip_2)
        print(round(distance, 4))

    def check_leaves(
        self,
        tree_zero: Newick.Tree,
        tip_1: str,
        tip_2: str,
    ) -> None:
        """Exit with status 2 if either name is not found on the tree.

        The names are checked in order, so only the first missing one is
        reported.
        """
        for tip in (tip_1, tip_2):
            if TreeMixin.find_any(tree_zero, tip):
                continue
            print(tip, "not on tree\nExiting...")
            sys.exit(2)

    def process_args(self, args) -> Dict[str, str]:
        """Map the parsed CLI arguments onto the constructor keyword arguments."""
        return {
            "tree_file_path": args.tree_zero,
            "tip_1": args.tip_1,
            "tip_2": args.tip_2,
        }
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from typing import Dict
|
|
3
|
+
|
|
4
|
+
from Bio.Phylo.BaseTree import TreeMixin
|
|
5
|
+
from Bio.Phylo import Newick
|
|
6
|
+
|
|
7
|
+
from .base import Tree
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TipToTipNodeDistance(Tree):
    """Print the number of clades on the path between two tips."""

    def __init__(self, args) -> None:
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def run(self) -> None:
        """Validate both tip names, then print the length of their trace.

        The printed value is the length of the clade list returned by
        ``TreeMixin.trace`` for the two tips.
        """
        tree_zero = self.read_tree_file()

        self.check_leaves(tree_zero, self.tip_1, self.tip_2)

        path = TreeMixin.trace(tree_zero, self.tip_1, self.tip_2)
        print(len(path))

    def check_leaves(
        self,
        tree_zero: Newick.Tree,
        tip_1: str,
        tip_2: str,
    ) -> None:
        """Exit with status 2 if either name is not found on the tree.

        The names are checked in order, so only the first missing one is
        reported.
        """
        for tip in (tip_1, tip_2):
            if TreeMixin.find_any(tree_zero, tip):
                continue
            print(tip, "not on tree\nExiting...")
            sys.exit(2)

    def process_args(self, args) -> Dict[str, str]:
        """Map the parsed CLI arguments onto the constructor keyword arguments."""
        return {
            "tree_file_path": args.tree_zero,
            "tip_1": args.tip_1,
            "tip_2": args.tip_2,
        }
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from typing import Dict, Union
|
|
2
|
+
|
|
3
|
+
from Bio.Phylo import Newick
|
|
4
|
+
from .base import Tree
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TotalTreeLength(Tree):
    """Print the total branch length of a phylogeny."""

    def __init__(self, args) -> None:
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def run(self) -> None:
        """Print the total tree length rounded to four decimals."""
        total = self.calculate_total_tree_length(self.read_tree_file())
        print(round(total, 4))

    def process_args(self, args) -> Dict[str, str]:
        """Map the parsed CLI arguments onto the constructor keyword arguments."""
        return {"tree_file_path": args.tree}

    def calculate_total_tree_length(
        self,
        tree: Newick.Tree
    ) -> Union[int, float]:
        """Return ``tree.total_branch_length()`` when it is an int or float.

        Any other result type yields ``None``.
        """
        length = tree.total_branch_length()

        if not isinstance(length, (int, float)):
            return None
        return length
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from .base import Tree
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Treeness(Tree):
    """Print the treeness of a phylogeny."""

    def __init__(self, args) -> None:
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def run(self) -> None:
        """Print the value of ``calculate_treeness`` rounded to four decimals."""
        value = self.calculate_treeness(self.read_tree_file())
        print(round(value, 4))

    def process_args(self, args) -> Dict[str, str]:
        """Map the parsed CLI arguments onto the constructor keyword arguments."""
        return {"tree_file_path": args.tree}
|