phykit 2.1.40__tar.gz → 2.1.41__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {phykit-2.1.40 → phykit-2.1.41}/PKG-INFO +1 -1
- {phykit-2.1.40 → phykit-2.1.41}/phykit/cli_registry.py +2 -0
- phykit-2.1.41/phykit/helpers/discrete_models.py +299 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/phykit.py +66 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/service_factories.py +1 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/__init__.py +1 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/ancestral_reconstruction.py +16 -121
- phykit-2.1.41/phykit/services/tree/fit_discrete.py +177 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/stochastic_character_map.py +16 -214
- phykit-2.1.41/phykit/version.py +1 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit.egg-info/PKG-INFO +1 -1
- {phykit-2.1.40 → phykit-2.1.41}/phykit.egg-info/SOURCES.txt +2 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit.egg-info/entry_points.txt +3 -0
- {phykit-2.1.40 → phykit-2.1.41}/setup.py +3 -0
- phykit-2.1.40/phykit/version.py +0 -1
- {phykit-2.1.40 → phykit-2.1.41}/LICENSE.md +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/README.md +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/__init__.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/__main__.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/errors.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/__init__.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/boolean_argument_parsing.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/caching.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/files.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/json_output.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/parallel.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/plot_config.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/stats_summary.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/streaming.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/__init__.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/__init__.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/alignment_entropy.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/alignment_length.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/alignment_length_no_gaps.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/alignment_outlier_taxa.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/alignment_recoding.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/base.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/column_score.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/composition_per_taxon.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/compositional_bias_per_site.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/create_concatenation_matrix.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/dna_threader.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/evolutionary_rate_per_site.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/faidx.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/gc_content.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/mask_alignment.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/occupancy_per_taxon.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/pairwise_identity.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/parsimony_informative_sites.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/plot_alignment_qc.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/rcv.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/rcvt.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/rename_fasta_entries.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/sum_of_pairs_score.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/variable_sites.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/base.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/base.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/bipartition_support_stats.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/branch_length_multiplier.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/collapse_branches.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/concordance_asr.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/consensus_network.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/consensus_tree.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/cont_map.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/cophylo.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/covarying_evolutionary_rates.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/density_map.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/discordance_asymmetry.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/dvmc.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/evo_tempo_map.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/evolutionary_rate.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/fit_continuous.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/hidden_paralogy_check.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/internal_branch_stats.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/internode_labeler.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/kf_distance.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/last_common_ancestor_subtree.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/lb_score.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/ltt.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/monophyly_check.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/nearest_neighbor_interchange.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/network_signal.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/ou_shift_detection.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/ouwie.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/patristic_distances.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/phenogram.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/phylogenetic_glm.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/phylogenetic_ordination.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/phylogenetic_regression.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/phylogenetic_signal.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/phylomorphospace.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/polytomy_test.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/print_tree.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/prune_tree.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/quartet_network.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/rate_heterogeneity.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/relative_rate_test.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/rename_tree_tips.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/rf_distance.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/root_tree.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/saturation.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/spectral_discordance.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/spurious_sequence.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/terminal_branch_stats.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/threshold_model.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/tip_labels.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/tip_to_tip_distance.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/tip_to_tip_node_distance.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/total_tree_length.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/treeness.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/treeness_over_rcv.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/vcv_utils.py +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit.egg-info/dependency_links.txt +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit.egg-info/requires.txt +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/phykit.egg-info/top_level.txt +0 -0
- {phykit-2.1.40 → phykit-2.1.41}/setup.cfg +0 -0
|
@@ -104,6 +104,8 @@ ALIAS_TO_HANDLER: Dict[str, str] = {
|
|
|
104
104
|
"rh": "rate_heterogeneity",
|
|
105
105
|
"fitcontinuous": "fit_continuous",
|
|
106
106
|
"fc": "fit_continuous",
|
|
107
|
+
"fitdiscrete": "fit_discrete",
|
|
108
|
+
"fd": "fit_discrete",
|
|
107
109
|
"ouwie": "ouwie",
|
|
108
110
|
"fit_ouwie": "ouwie",
|
|
109
111
|
"multi_regime_ou": "ouwie",
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared utilities for discrete trait evolution models.
|
|
3
|
+
|
|
4
|
+
Provides Q-matrix construction, Felsenstein pruning (likelihood computation),
|
|
5
|
+
maximum-likelihood Q-matrix fitting, and discrete trait data parsing. Used by
|
|
6
|
+
stochastic_character_map, ancestral_reconstruction, and fit_discrete.
|
|
7
|
+
"""
|
|
8
|
+
import sys
|
|
9
|
+
from typing import Dict, List, Tuple
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from scipy.linalg import expm
|
|
13
|
+
from scipy.optimize import minimize
|
|
14
|
+
|
|
15
|
+
from ..errors import PhykitUserError
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
VALID_DISCRETE_MODELS = frozenset(["ER", "SYM", "ARD"])
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def count_params(k: int, model: str) -> int:
    """Return the number of free rate parameters for a discrete model.

    ``k`` is the number of character states.  ``ER`` has a single shared
    rate, ``SYM`` has one rate per unordered pair of states, and ``ARD``
    has one rate per ordered pair of distinct states.

    Raises PhykitUserError (code 2) for any other model name.
    """
    if model == "ER":
        return 1
    if model == "SYM":
        return k * (k - 1) // 2
    if model == "ARD":
        return k * (k - 1)
    raise PhykitUserError(
        [f"Unknown model '{model}'. Use ER, SYM, or ARD."], code=2,
    )
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def build_q_matrix(params: np.ndarray, k: int, model: str) -> np.ndarray:
    """Build a k-by-k Q-matrix from parameters for ER, SYM, or ARD models.

    Parameter layout:
      * ER  - ``params[0]`` is used for every off-diagonal entry.
      * SYM - one parameter per pair (i, j) with i < j, consumed in
        row-major order and mirrored to (j, i).
      * ARD - one parameter per off-diagonal entry, consumed in row-major
        order.

    Rows sum to zero (standard continuous-time Markov chain convention):
    each diagonal entry is the negated sum of the off-diagonal entries in
    its row.

    Raises PhykitUserError (code 2) for an unknown model instead of
    silently returning an all-zero matrix.
    """
    Q = np.zeros((k, k))
    if model == "ER":
        Q[:] = params[0]
        np.fill_diagonal(Q, 0.0)
    elif model == "SYM":
        idx = 0
        for i in range(k):
            for j in range(i + 1, k):
                Q[i, j] = params[idx]
                Q[j, i] = params[idx]
                idx += 1
    elif model == "ARD":
        idx = 0
        for i in range(k):
            for j in range(k):
                if i != j:
                    Q[i, j] = params[idx]
                    idx += 1
    else:
        raise PhykitUserError(
            [f"Unknown model '{model}'. Use ER, SYM, or ARD."], code=2,
        )

    # The diagonal is still zero here, so each row sum is the total rate of
    # leaving that state.
    Q[np.diag_indices(k)] = -Q.sum(axis=1)
    return Q
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def matrix_exp(Q: np.ndarray, t: float) -> np.ndarray:
    """Compute the matrix exponential P = exp(Q * t).

    ``Q`` is scaled by ``t`` and passed to ``scipy.linalg.expm``.
    """
    return expm(Q * t)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def felsenstein_pruning(
    tree, tip_states: Dict[str, str], Q: np.ndarray,
    pi: np.ndarray, states: List[str]
) -> Tuple[Dict, float]:
    """Postorder traversal computing conditional likelihoods and log-likelihood.

    ``tree`` must provide ``find_clades(order="postorder")`` and ``root``;
    each clade must provide ``is_terminal()``, ``name``, ``branch_length``
    and ``clades``.  ``tip_states`` maps tip names to a member of
    ``states``; ``pi`` holds the weights applied to the root vector.

    Returns (cond_liks, loglik) where cond_liks maps ``id(clade)`` to a
    k-length likelihood vector.  A non-positive total likelihood is
    reported as -1e20 rather than taking the log.

    A tip that has no entry in ``tip_states`` is treated as missing data
    (likelihood 1 for every state).  An all-zero vector there would force
    the likelihood of the whole tree to zero.
    """
    k = len(states)
    state_idx = {s: i for i, s in enumerate(states)}
    cond_liks = {}

    for clade in tree.find_clades(order="postorder"):
        if clade.is_terminal():
            if clade.name in tip_states:
                lik = np.zeros(k)
                lik[state_idx[tip_states[clade.name]]] = 1.0
            else:
                # Unobserved tip: every state is equally compatible.
                lik = np.ones(k)
        else:
            lik = np.ones(k)
            for child in clade.clades:
                # Missing or zero branch lengths fall back to a tiny
                # positive length.
                t = child.branch_length if child.branch_length else 1e-8
                # Same computation as matrix_exp(Q, t).
                P = expm(Q * t)
                lik *= P @ cond_liks[id(child)]
        cond_liks[id(clade)] = lik

    total_lik = np.sum(pi * cond_liks[id(tree.root)])
    if total_lik <= 0:
        loglik = -1e20
    else:
        loglik = np.log(total_lik)

    return cond_liks, loglik
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def fit_q_matrix(
    tree, tip_states: Dict[str, str],
    states: List[str], model: str
) -> Tuple[np.ndarray, float]:
    """Fit Q-matrix parameters via maximum likelihood.

    Uses multi-start optimization: every starting value is tried with both
    L-BFGS-B (bounded to [1e-8, 100]) and Nelder-Mead, and the best result
    is then refined with a tight-tolerance Nelder-Mead run.  Parameters are
    passed through ``np.abs`` before building Q, so the unbounded
    Nelder-Mead runs cannot produce negative rates.  The root weights are
    uniform over the states.

    Optimizer runs that raise ValueError or LinAlgError are skipped.

    Returns (Q_matrix, log_likelihood), with the log-likelihood recomputed
    from the final Q.
    """
    k = len(states)
    n_params = count_params(k, model)
    pi = np.ones(k) / k

    def neg_loglik(params):
        Q = build_q_matrix(np.abs(params), k, model)
        _, ll = felsenstein_pruning(tree, tip_states, Q, pi, states)
        return -ll

    bounds = [(1e-8, 100.0)] * n_params

    starting_values = [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
    best_negll = np.inf
    best_params = np.ones(n_params) * 0.1

    for sv in starting_values:
        x0 = np.ones(n_params) * sv
        for method in ["L-BFGS-B", "Nelder-Mead"]:
            # Only L-BFGS-B receives the box constraints.
            kwargs = {"method": method}
            if method == "L-BFGS-B":
                kwargs["bounds"] = bounds
            try:
                result = minimize(neg_loglik, x0, **kwargs)
            except (ValueError, np.linalg.LinAlgError):
                continue
            if result.fun < best_negll:
                best_negll = result.fun
                best_params = np.abs(result.x)

    # Refine best result
    try:
        result = minimize(
            neg_loglik, best_params, method="Nelder-Mead",
            options={"maxiter": 10000, "xatol": 1e-10, "fatol": 1e-10},
        )
    except (ValueError, np.linalg.LinAlgError):
        pass
    else:
        if result.fun < best_negll:
            best_params = np.abs(result.x)

    Q = build_q_matrix(best_params, k, model)
    _, loglik = felsenstein_pruning(tree, tip_states, Q, pi, states)

    return Q, loglik
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def parse_discrete_traits(
    path: str, tree_tips: List[str], trait_column: str = None
) -> Dict[str, str]:
    """Parse discrete trait data from a TSV file.

    If trait_column is None: expects 2-column format (taxon<tab>state),
    no header row.
    If trait_column is given: expects multi-column with header row,
    extracts the named column.

    Blank lines and lines starting with ``#`` are dropped before parsing,
    and surrounding whitespace is stripped from every remaining line.
    Line numbers in error messages therefore count the remaining data
    lines, not physical lines of the file.

    Returns {taxon: state} dict for the intersection of file taxa and
    tree_tips.  Requires at least 3 shared taxa.

    Raises PhykitUserError (code 2) when the file does not exist.
    """
    try:
        with open(path) as f:
            lines = f.readlines()
    except FileNotFoundError:
        raise PhykitUserError(
            [
                f"{path} corresponds to no such file or directory.",
                "Please check filename and pathing",
            ],
            code=2,
        )

    data_lines = [
        stripped
        for stripped in (line.strip() for line in lines)
        if stripped and not stripped.startswith("#")
    ]

    if trait_column is not None:
        return _parse_multi_column(data_lines, path, tree_tips, trait_column)
    return _parse_two_column(data_lines, path, tree_tips)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _parse_multi_column(
    data_lines: List[str], path: str, tree_tips: List[str], trait_column: str
) -> Dict[str, str]:
    """Extract one named trait column from header + tab-separated rows.

    ``data_lines[0]`` is the header; its first cell labels the taxon
    column and the remaining cells are trait names.  Every following row
    must have exactly as many cells as the header.  ``path`` is accepted
    for signature symmetry and is not used here.

    Both the taxon cell and the trait cell are stripped of surrounding
    whitespace, so ``"A \\t1"`` is read as taxon ``A``.

    Raises PhykitUserError (code 2) for a missing header or data row, an
    unknown column, a row of the wrong width, or an empty trait value.
    Returns the result of ``_validate_shared_taxa``.
    """
    if len(data_lines) < 2:
        raise PhykitUserError(
            ["Trait file must have a header row and at least one data row."],
            code=2,
        )

    header_parts = data_lines[0].split("\t")
    if len(header_parts) < 2:
        raise PhykitUserError(
            ["Header must have at least 2 columns (taxon + at least 1 trait)."],
            code=2,
        )

    trait_names = header_parts[1:]
    if trait_column not in trait_names:
        raise PhykitUserError(
            [
                f"Column '{trait_column}' not found in trait file.",
                f"Available columns: {', '.join(trait_names)}",
            ],
            code=2,
        )

    # +1 skips the taxon column that precedes the trait columns.
    value_idx = 1 + trait_names.index(trait_column)
    n_cols = len(header_parts)

    traits = {}
    # Rows are numbered from 2 because the header is line 1.
    for line_idx, line in enumerate(data_lines[1:], 2):
        parts = line.split("\t")
        if len(parts) != n_cols:
            raise PhykitUserError(
                [f"Line {line_idx} has {len(parts)} columns; expected {n_cols}."],
                code=2,
            )
        taxon = parts[0].strip()
        value = parts[value_idx].strip()
        if not value:
            raise PhykitUserError(
                [f"Missing trait value for taxon '{taxon}' on line {line_idx}."],
                code=2,
            )
        traits[taxon] = value

    return _validate_shared_taxa(traits, tree_tips)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _parse_two_column(
    data_lines: List[str], path: str, tree_tips: List[str]
) -> Dict[str, str]:
    """Parse header-less ``taxon<tab>state`` rows.

    Every row must split into exactly two tab-separated cells.  Both cells
    are stripped of surrounding whitespace.  A taxon listed more than once
    keeps the state from its last row.  ``path`` is accepted for signature
    symmetry and is not used here.

    Raises PhykitUserError (code 2) for a row that does not have two
    cells.  Returns the result of ``_validate_shared_taxa``.
    """
    traits = {}
    for line_idx, line in enumerate(data_lines, 1):
        parts = line.split("\t")
        if len(parts) != 2:
            raise PhykitUserError(
                [f"Line {line_idx} has {len(parts)} columns; expected 2 (taxon, state)."],
                code=2,
            )
        taxon, state = parts
        traits[taxon.strip()] = state.strip()

    return _validate_shared_taxa(traits, tree_tips)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _validate_shared_taxa(
    traits: Dict[str, str], tree_tips: List[str]
) -> Dict[str, str]:
    """Restrict ``traits`` to taxa that are also tips of the tree.

    Prints a warning to stderr listing taxa found only in the tree and
    another listing taxa found only in the trait file.

    Raises PhykitUserError (code 2) when fewer than 3 taxa are shared.

    Returns {taxon: state} for the shared taxa, with keys in sorted order
    so the result does not depend on set iteration order.
    """
    tree_tip_set = set(tree_tips)
    trait_taxa_set = set(traits)
    shared = tree_tip_set & trait_taxa_set

    tree_only = tree_tip_set - trait_taxa_set
    trait_only = trait_taxa_set - tree_tip_set

    if tree_only:
        print(
            f"Warning: {len(tree_only)} taxa in tree but not in trait file: "
            f"{', '.join(sorted(tree_only))}",
            file=sys.stderr,
        )
    if trait_only:
        print(
            f"Warning: {len(trait_only)} taxa in trait file but not in tree: "
            f"{', '.join(sorted(trait_only))}",
            file=sys.stderr,
        )

    if len(shared) < 3:
        raise PhykitUserError(
            [
                f"Only {len(shared)} shared taxa between tree and trait file.",
                "At least 3 shared taxa are required.",
            ],
            code=2,
        )

    return {taxon: traits[taxon] for taxon in sorted(shared)}
|
|
@@ -230,6 +230,9 @@ class Phykit:
|
|
|
230
230
|
fit_continuous (alias: fitcontinuous; fc)
|
|
231
231
|
- compare continuous trait evolution models
|
|
232
232
|
(BM, OU, EB, Lambda, Delta, Kappa, White)
|
|
233
|
+
fit_discrete (alias: fitdiscrete; fd)
|
|
234
|
+
- compare discrete trait evolution models
|
|
235
|
+
(ER, SYM, ARD)
|
|
233
236
|
ouwie (alias: fit_ouwie; multi_regime_ou)
|
|
234
237
|
- fit multi-regime OU models (BM1, BMS, OU1,
|
|
235
238
|
OUM, OUMV, OUMA, OUMVA; Beaulieu et al. 2012)
|
|
@@ -3987,6 +3990,65 @@ class Phykit:
|
|
|
3987
3990
|
_add_json_argument(parser)
|
|
3988
3991
|
_run_service(parser, argv, RateHeterogeneity)
|
|
3989
3992
|
|
|
3993
|
+
@staticmethod
|
|
3994
|
+
def fit_discrete(argv):
|
|
3995
|
+
parser = _new_parser(
|
|
3996
|
+
description=textwrap.dedent(
|
|
3997
|
+
f"""\
|
|
3998
|
+
{help_header}
|
|
3999
|
+
|
|
4000
|
+
Compare models of discrete trait evolution on a phylogeny.
|
|
4001
|
+
|
|
4002
|
+
Fits ER (Equal Rates), SYM (Symmetric), and ARD (All Rates
|
|
4003
|
+
Different) Mk models of discrete character evolution via
|
|
4004
|
+
maximum likelihood. Compares models using AIC and BIC.
|
|
4005
|
+
|
|
4006
|
+
Analogous to R's geiger::fitDiscrete().
|
|
4007
|
+
|
|
4008
|
+
Aliases:
|
|
4009
|
+
fit_discrete, fitdiscrete, fd
|
|
4010
|
+
Command line interfaces:
|
|
4011
|
+
pk_fit_discrete, pk_fitdiscrete, pk_fd
|
|
4012
|
+
|
|
4013
|
+
Usage:
|
|
4014
|
+
phykit fit_discrete -t <tree> -d <trait_data> -c <trait>
|
|
4015
|
+
[--models ER,SYM,ARD] [--json]
|
|
4016
|
+
|
|
4017
|
+
Options
|
|
4018
|
+
=====================================================
|
|
4019
|
+
-t/--tree tree file (required)
|
|
4020
|
+
|
|
4021
|
+
-d/--trait_data trait data file in TSV format
|
|
4022
|
+
(required)
|
|
4023
|
+
|
|
4024
|
+
-c/--trait column name for the discrete
|
|
4025
|
+
trait in the data file
|
|
4026
|
+
(required)
|
|
4027
|
+
|
|
4028
|
+
--models comma-separated list of models
|
|
4029
|
+
to fit (default: ER,SYM,ARD)
|
|
4030
|
+
|
|
4031
|
+
--json optional argument to output
|
|
4032
|
+
results as JSON
|
|
4033
|
+
"""
|
|
4034
|
+
),
|
|
4035
|
+
)
|
|
4036
|
+
parser.add_argument(
|
|
4037
|
+
"-t", "--tree", type=str, required=True, help=SUPPRESS, metavar=""
|
|
4038
|
+
)
|
|
4039
|
+
parser.add_argument(
|
|
4040
|
+
"-d", "--trait_data", type=str, required=True, help=SUPPRESS, metavar=""
|
|
4041
|
+
)
|
|
4042
|
+
parser.add_argument(
|
|
4043
|
+
"-c", "--trait", type=str, required=True, help=SUPPRESS, metavar=""
|
|
4044
|
+
)
|
|
4045
|
+
parser.add_argument(
|
|
4046
|
+
"--models", type=str, required=False, default=None,
|
|
4047
|
+
help=SUPPRESS, metavar=""
|
|
4048
|
+
)
|
|
4049
|
+
_add_json_argument(parser)
|
|
4050
|
+
_run_service(parser, argv, FitDiscrete)
|
|
4051
|
+
|
|
3990
4052
|
@staticmethod
|
|
3991
4053
|
def fit_continuous(argv):
|
|
3992
4054
|
parser = _new_parser(
|
|
@@ -6530,6 +6592,10 @@ def fit_continuous(argv=None):
|
|
|
6530
6592
|
Phykit.fit_continuous(sys.argv[1:])
|
|
6531
6593
|
|
|
6532
6594
|
|
|
6595
|
+
def fit_discrete(argv=None):
    """Entry point that forwards command-line arguments to Phykit.fit_discrete.

    When ``argv`` is None the arguments are taken from ``sys.argv[1:]``;
    an explicitly supplied ``argv`` is forwarded as given instead of being
    ignored.
    """
    Phykit.fit_discrete(sys.argv[1:] if argv is None else argv)
|
|
6597
|
+
|
|
6598
|
+
|
|
6533
6599
|
def ouwie(argv=None):
|
|
6534
6600
|
Phykit.ouwie(sys.argv[1:])
|
|
6535
6601
|
|
|
@@ -85,6 +85,7 @@ PolytomyTest = _LazyServiceFactory("phykit.services.tree.polytomy_test", "Polyto
|
|
|
85
85
|
PrintTree = _LazyServiceFactory("phykit.services.tree.print_tree", "PrintTree")
|
|
86
86
|
PruneTree = _LazyServiceFactory("phykit.services.tree.prune_tree", "PruneTree")
|
|
87
87
|
RenameTreeTips = _LazyServiceFactory("phykit.services.tree.rename_tree_tips", "RenameTreeTips")
|
|
88
|
+
FitDiscrete = _LazyServiceFactory("phykit.services.tree.fit_discrete", "FitDiscrete")
|
|
88
89
|
KuhnerFelsensteinDistance = _LazyServiceFactory("phykit.services.tree.kf_distance", "KuhnerFelsensteinDistance")
|
|
89
90
|
RobinsonFouldsDistance = _LazyServiceFactory("phykit.services.tree.rf_distance", "RobinsonFouldsDistance")
|
|
90
91
|
RootTree = _LazyServiceFactory("phykit.services.tree.root_tree", "RootTree")
|
|
@@ -12,6 +12,7 @@ _EXPORTS = {
|
|
|
12
12
|
"DiscordanceAsymmetry": "discordance_asymmetry",
|
|
13
13
|
"EvolutionaryRate": "evolutionary_rate",
|
|
14
14
|
"EvoTempoMap": "evo_tempo_map",
|
|
15
|
+
"FitDiscrete": "fit_discrete",
|
|
15
16
|
"HiddenParalogyCheck": "hidden_paralogy_check",
|
|
16
17
|
"InternalBranchStats": "internal_branch_stats",
|
|
17
18
|
"InternodeLabeler": "internode_labeler",
|
|
@@ -10,6 +10,13 @@ from scipy.optimize import minimize
|
|
|
10
10
|
from .base import Tree
|
|
11
11
|
from ...helpers.json_output import print_json
|
|
12
12
|
from ...helpers.plot_config import PlotConfig
|
|
13
|
+
from ...helpers.discrete_models import (
|
|
14
|
+
build_q_matrix,
|
|
15
|
+
matrix_exp,
|
|
16
|
+
felsenstein_pruning,
|
|
17
|
+
fit_q_matrix,
|
|
18
|
+
parse_discrete_traits,
|
|
19
|
+
)
|
|
13
20
|
from ...errors import PhykitUserError
|
|
14
21
|
|
|
15
22
|
|
|
@@ -1087,132 +1094,20 @@ class AncestralReconstruction(Tree):
|
|
|
1087
1094
|
return {taxon: traits[taxon] for taxon in shared}
|
|
1088
1095
|
|
|
1089
1096
|
# ------------------------------------------------------------------
|
|
1090
|
-
# Mk model primitives (
|
|
1097
|
+
# Mk model primitives (delegated to phykit.helpers.discrete_models)
|
|
1091
1098
|
# ------------------------------------------------------------------
|
|
1092
1099
|
|
|
1093
|
-
def _build_q_matrix(
|
|
1094
|
-
|
|
1095
|
-
) -> np.ndarray:
|
|
1096
|
-
Q = np.zeros((k, k))
|
|
1097
|
-
if model == "ER":
|
|
1098
|
-
rate = params[0]
|
|
1099
|
-
Q[:] = rate
|
|
1100
|
-
np.fill_diagonal(Q, 0.0)
|
|
1101
|
-
elif model == "SYM":
|
|
1102
|
-
idx = 0
|
|
1103
|
-
for i in range(k):
|
|
1104
|
-
for j in range(i + 1, k):
|
|
1105
|
-
Q[i, j] = params[idx]
|
|
1106
|
-
Q[j, i] = params[idx]
|
|
1107
|
-
idx += 1
|
|
1108
|
-
elif model == "ARD":
|
|
1109
|
-
idx = 0
|
|
1110
|
-
for i in range(k):
|
|
1111
|
-
for j in range(k):
|
|
1112
|
-
if i != j:
|
|
1113
|
-
Q[i, j] = params[idx]
|
|
1114
|
-
idx += 1
|
|
1115
|
-
# Set diagonal
|
|
1116
|
-
for i in range(k):
|
|
1117
|
-
Q[i, i] = -np.sum(Q[i, :])
|
|
1118
|
-
return Q
|
|
1119
|
-
|
|
1120
|
-
def _matrix_exp(self, Q: np.ndarray, t: float) -> np.ndarray:
|
|
1121
|
-
return expm(Q * t)
|
|
1122
|
-
|
|
1123
|
-
def _felsenstein_pruning(
|
|
1124
|
-
self, tree, tip_states: Dict[str, str], Q: np.ndarray,
|
|
1125
|
-
pi: np.ndarray, states: List[str]
|
|
1126
|
-
) -> Tuple[Dict, float]:
|
|
1127
|
-
k = len(states)
|
|
1128
|
-
state_idx = {s: i for i, s in enumerate(states)}
|
|
1129
|
-
cond_liks: Dict[int, np.ndarray] = {}
|
|
1130
|
-
|
|
1131
|
-
for clade in tree.find_clades(order="postorder"):
|
|
1132
|
-
if clade.is_terminal():
|
|
1133
|
-
lik = np.zeros(k)
|
|
1134
|
-
if clade.name in tip_states:
|
|
1135
|
-
lik[state_idx[tip_states[clade.name]]] = 1.0
|
|
1136
|
-
cond_liks[id(clade)] = lik
|
|
1137
|
-
else:
|
|
1138
|
-
lik = np.ones(k)
|
|
1139
|
-
for child in clade.clades:
|
|
1140
|
-
t = child.branch_length if child.branch_length else 1e-8
|
|
1141
|
-
P = self._matrix_exp(Q, t)
|
|
1142
|
-
child_lik = cond_liks[id(child)]
|
|
1143
|
-
lik *= P @ child_lik
|
|
1144
|
-
cond_liks[id(clade)] = lik
|
|
1145
|
-
|
|
1146
|
-
root_lik = cond_liks[id(tree.root)]
|
|
1147
|
-
total_lik = np.sum(pi * root_lik)
|
|
1148
|
-
if total_lik <= 0:
|
|
1149
|
-
loglik = -1e20
|
|
1150
|
-
else:
|
|
1151
|
-
loglik = np.log(total_lik)
|
|
1152
|
-
|
|
1153
|
-
return cond_liks, loglik
|
|
1100
|
+
def _build_q_matrix(self, params, k, model):
    """Delegate to the shared ``build_q_matrix`` helper; ``self`` is unused."""
    return build_q_matrix(params, k, model)
|
|
1154
1102
|
|
|
1155
|
-
def
|
|
1156
|
-
|
|
1157
|
-
states: List[str], model: str
|
|
1158
|
-
) -> Tuple[np.ndarray, float]:
|
|
1159
|
-
k = len(states)
|
|
1160
|
-
|
|
1161
|
-
if model == "ER":
|
|
1162
|
-
n_params = 1
|
|
1163
|
-
elif model == "SYM":
|
|
1164
|
-
n_params = k * (k - 1) // 2
|
|
1165
|
-
elif model == "ARD":
|
|
1166
|
-
n_params = k * (k - 1)
|
|
1167
|
-
else:
|
|
1168
|
-
raise PhykitUserError(
|
|
1169
|
-
[f"Unknown model '{model}'. Use ER, SYM, or ARD."],
|
|
1170
|
-
code=2,
|
|
1171
|
-
)
|
|
1172
|
-
|
|
1173
|
-
pi = np.ones(k) / k
|
|
1174
|
-
|
|
1175
|
-
def neg_loglik(params):
|
|
1176
|
-
Q = self._build_q_matrix(np.abs(params), k, model)
|
|
1177
|
-
_, ll = self._felsenstein_pruning(tree, tip_states, Q, pi, states)
|
|
1178
|
-
return -ll
|
|
1179
|
-
|
|
1180
|
-
bounds = [(1e-8, 100.0)] * n_params
|
|
1181
|
-
|
|
1182
|
-
# Multi-start optimization for robustness
|
|
1183
|
-
starting_values = [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
|
|
1184
|
-
best_negll = np.inf
|
|
1185
|
-
best_params = np.ones(n_params) * 0.1
|
|
1186
|
-
|
|
1187
|
-
for sv in starting_values:
|
|
1188
|
-
x0 = np.ones(n_params) * sv
|
|
1189
|
-
for opt_method in ["L-BFGS-B", "Nelder-Mead"]:
|
|
1190
|
-
try:
|
|
1191
|
-
kwargs = {"method": opt_method}
|
|
1192
|
-
if opt_method == "L-BFGS-B":
|
|
1193
|
-
kwargs["bounds"] = bounds
|
|
1194
|
-
result = minimize(neg_loglik, x0, **kwargs)
|
|
1195
|
-
if result.fun < best_negll:
|
|
1196
|
-
best_negll = result.fun
|
|
1197
|
-
best_params = np.abs(result.x)
|
|
1198
|
-
except (ValueError, np.linalg.LinAlgError):
|
|
1199
|
-
continue
|
|
1200
|
-
|
|
1201
|
-
# Refine best result with Nelder-Mead
|
|
1202
|
-
try:
|
|
1203
|
-
result = minimize(
|
|
1204
|
-
neg_loglik, best_params, method="Nelder-Mead",
|
|
1205
|
-
options={"maxiter": 10000, "xatol": 1e-10, "fatol": 1e-10},
|
|
1206
|
-
)
|
|
1207
|
-
if result.fun < best_negll:
|
|
1208
|
-
best_params = np.abs(result.x)
|
|
1209
|
-
except (ValueError, np.linalg.LinAlgError):
|
|
1210
|
-
pass
|
|
1103
|
+
def _matrix_exp(self, Q, t):
    """Delegate to the shared ``matrix_exp`` helper; ``self`` is unused."""
    return matrix_exp(Q, t)
|
|
1211
1105
|
|
|
1212
|
-
|
|
1213
|
-
|
|
1106
|
+
def _felsenstein_pruning(self, tree, tip_states, Q, pi, states):
    """Delegate to the shared ``felsenstein_pruning`` helper; ``self`` is unused."""
    return felsenstein_pruning(tree, tip_states, Q, pi, states)
|
|
1214
1108
|
|
|
1215
|
-
|
|
1109
|
+
def _fit_q_matrix(self, tree, tip_states, states, model):
    """Delegate to the shared ``fit_q_matrix`` helper; ``self`` is unused."""
    return fit_q_matrix(tree, tip_states, states, model)
|
|
1216
1111
|
|
|
1217
1112
|
# ------------------------------------------------------------------
|
|
1218
1113
|
# Discrete marginal posteriors (upward-downward belief propagation)
|