phykit 2.1.39__tar.gz → 2.1.41__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. {phykit-2.1.39 → phykit-2.1.41}/PKG-INFO +1 -1
  2. {phykit-2.1.39 → phykit-2.1.41}/phykit/cli_registry.py +5 -0
  3. phykit-2.1.41/phykit/helpers/discrete_models.py +299 -0
  4. {phykit-2.1.39 → phykit-2.1.41}/phykit/phykit.py +127 -0
  5. {phykit-2.1.39 → phykit-2.1.41}/phykit/service_factories.py +2 -0
  6. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/__init__.py +2 -0
  7. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/ancestral_reconstruction.py +16 -121
  8. phykit-2.1.41/phykit/services/tree/fit_discrete.py +177 -0
  9. phykit-2.1.41/phykit/services/tree/kf_distance.py +101 -0
  10. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/stochastic_character_map.py +16 -214
  11. phykit-2.1.41/phykit/version.py +1 -0
  12. {phykit-2.1.39 → phykit-2.1.41}/phykit.egg-info/PKG-INFO +1 -1
  13. {phykit-2.1.39 → phykit-2.1.41}/phykit.egg-info/SOURCES.txt +3 -0
  14. {phykit-2.1.39 → phykit-2.1.41}/phykit.egg-info/entry_points.txt +7 -0
  15. {phykit-2.1.39 → phykit-2.1.41}/setup.py +7 -0
  16. phykit-2.1.39/phykit/version.py +0 -1
  17. {phykit-2.1.39 → phykit-2.1.41}/LICENSE.md +0 -0
  18. {phykit-2.1.39 → phykit-2.1.41}/README.md +0 -0
  19. {phykit-2.1.39 → phykit-2.1.41}/phykit/__init__.py +0 -0
  20. {phykit-2.1.39 → phykit-2.1.41}/phykit/__main__.py +0 -0
  21. {phykit-2.1.39 → phykit-2.1.41}/phykit/errors.py +0 -0
  22. {phykit-2.1.39 → phykit-2.1.41}/phykit/helpers/__init__.py +0 -0
  23. {phykit-2.1.39 → phykit-2.1.41}/phykit/helpers/boolean_argument_parsing.py +0 -0
  24. {phykit-2.1.39 → phykit-2.1.41}/phykit/helpers/caching.py +0 -0
  25. {phykit-2.1.39 → phykit-2.1.41}/phykit/helpers/files.py +0 -0
  26. {phykit-2.1.39 → phykit-2.1.41}/phykit/helpers/json_output.py +0 -0
  27. {phykit-2.1.39 → phykit-2.1.41}/phykit/helpers/parallel.py +0 -0
  28. {phykit-2.1.39 → phykit-2.1.41}/phykit/helpers/plot_config.py +0 -0
  29. {phykit-2.1.39 → phykit-2.1.41}/phykit/helpers/stats_summary.py +0 -0
  30. {phykit-2.1.39 → phykit-2.1.41}/phykit/helpers/streaming.py +0 -0
  31. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/__init__.py +0 -0
  32. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/__init__.py +0 -0
  33. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/alignment_entropy.py +0 -0
  34. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/alignment_length.py +0 -0
  35. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/alignment_length_no_gaps.py +0 -0
  36. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/alignment_outlier_taxa.py +0 -0
  37. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/alignment_recoding.py +0 -0
  38. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/base.py +0 -0
  39. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/column_score.py +0 -0
  40. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/composition_per_taxon.py +0 -0
  41. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/compositional_bias_per_site.py +0 -0
  42. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/create_concatenation_matrix.py +0 -0
  43. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/dna_threader.py +0 -0
  44. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/evolutionary_rate_per_site.py +0 -0
  45. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/faidx.py +0 -0
  46. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/gc_content.py +0 -0
  47. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/mask_alignment.py +0 -0
  48. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/occupancy_per_taxon.py +0 -0
  49. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/pairwise_identity.py +0 -0
  50. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/parsimony_informative_sites.py +0 -0
  51. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/plot_alignment_qc.py +0 -0
  52. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/rcv.py +0 -0
  53. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/rcvt.py +0 -0
  54. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/rename_fasta_entries.py +0 -0
  55. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/sum_of_pairs_score.py +0 -0
  56. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/alignment/variable_sites.py +0 -0
  57. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/base.py +0 -0
  58. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/base.py +0 -0
  59. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/bipartition_support_stats.py +0 -0
  60. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/branch_length_multiplier.py +0 -0
  61. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/collapse_branches.py +0 -0
  62. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/concordance_asr.py +0 -0
  63. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/consensus_network.py +0 -0
  64. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/consensus_tree.py +0 -0
  65. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/cont_map.py +0 -0
  66. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/cophylo.py +0 -0
  67. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/covarying_evolutionary_rates.py +0 -0
  68. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/density_map.py +0 -0
  69. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/discordance_asymmetry.py +0 -0
  70. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/dvmc.py +0 -0
  71. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/evo_tempo_map.py +0 -0
  72. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/evolutionary_rate.py +0 -0
  73. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/fit_continuous.py +0 -0
  74. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/hidden_paralogy_check.py +0 -0
  75. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/internal_branch_stats.py +0 -0
  76. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/internode_labeler.py +0 -0
  77. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/last_common_ancestor_subtree.py +0 -0
  78. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/lb_score.py +0 -0
  79. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/ltt.py +0 -0
  80. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/monophyly_check.py +0 -0
  81. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/nearest_neighbor_interchange.py +0 -0
  82. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/network_signal.py +0 -0
  83. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/ou_shift_detection.py +0 -0
  84. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/ouwie.py +0 -0
  85. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/patristic_distances.py +0 -0
  86. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/phenogram.py +0 -0
  87. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/phylogenetic_glm.py +0 -0
  88. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/phylogenetic_ordination.py +0 -0
  89. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/phylogenetic_regression.py +0 -0
  90. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/phylogenetic_signal.py +0 -0
  91. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/phylomorphospace.py +0 -0
  92. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/polytomy_test.py +0 -0
  93. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/print_tree.py +0 -0
  94. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/prune_tree.py +0 -0
  95. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/quartet_network.py +0 -0
  96. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/rate_heterogeneity.py +0 -0
  97. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/relative_rate_test.py +0 -0
  98. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/rename_tree_tips.py +0 -0
  99. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/rf_distance.py +0 -0
  100. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/root_tree.py +0 -0
  101. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/saturation.py +0 -0
  102. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/spectral_discordance.py +0 -0
  103. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/spurious_sequence.py +0 -0
  104. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/terminal_branch_stats.py +0 -0
  105. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/threshold_model.py +0 -0
  106. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/tip_labels.py +0 -0
  107. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/tip_to_tip_distance.py +0 -0
  108. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/tip_to_tip_node_distance.py +0 -0
  109. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/total_tree_length.py +0 -0
  110. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/treeness.py +0 -0
  111. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/treeness_over_rcv.py +0 -0
  112. {phykit-2.1.39 → phykit-2.1.41}/phykit/services/tree/vcv_utils.py +0 -0
  113. {phykit-2.1.39 → phykit-2.1.41}/phykit.egg-info/dependency_links.txt +0 -0
  114. {phykit-2.1.39 → phykit-2.1.41}/phykit.egg-info/requires.txt +0 -0
  115. {phykit-2.1.39 → phykit-2.1.41}/phykit.egg-info/top_level.txt +0 -0
  116. {phykit-2.1.39 → phykit-2.1.41}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: phykit
3
- Version: 2.1.39
3
+ Version: 2.1.41
4
4
  Home-page: https://github.com/jlsteenwyk/phykit
5
5
  Author: Jacob L. Steenwyk
6
6
  Author-email: jlsteenwyk@gmail.com
@@ -104,6 +104,8 @@ ALIAS_TO_HANDLER: Dict[str, str] = {
104
104
  "rh": "rate_heterogeneity",
105
105
  "fitcontinuous": "fit_continuous",
106
106
  "fc": "fit_continuous",
107
+ "fitdiscrete": "fit_discrete",
108
+ "fd": "fit_discrete",
107
109
  "ouwie": "ouwie",
108
110
  "fit_ouwie": "ouwie",
109
111
  "multi_regime_ou": "ouwie",
@@ -129,6 +131,9 @@ ALIAS_TO_HANDLER: Dict[str, str] = {
129
131
  "prune": "prune_tree",
130
132
  "rename_tree": "rename_tree_tips",
131
133
  "rename_tips": "rename_tree_tips",
134
+ "kuhner_felsenstein_distance": "kf_distance",
135
+ "kf_dist": "kf_distance",
136
+ "kf": "kf_distance",
132
137
  "robinson_foulds_distance": "rf_distance",
133
138
  "rf_dist": "rf_distance",
134
139
  "rf": "rf_distance",
@@ -0,0 +1,299 @@
1
+ """
2
+ Shared utilities for discrete trait evolution models.
3
+
4
+ Provides Q-matrix construction, Felsenstein pruning (likelihood computation),
5
+ maximum-likelihood Q-matrix fitting, and discrete trait data parsing. Used by
6
+ stochastic_character_map, ancestral_reconstruction, and fit_discrete.
7
+ """
8
+ import sys
9
+ from typing import Dict, List, Tuple
10
+
11
+ import numpy as np
12
+ from scipy.linalg import expm
13
+ from scipy.optimize import minimize
14
+
15
+ from ..errors import PhykitUserError
16
+
17
+
18
+ VALID_DISCRETE_MODELS = frozenset(["ER", "SYM", "ARD"])
19
+
20
+
21
def count_params(k: int, model: str) -> int:
    """Report how many independent transition rates ``model`` estimates.

    ER uses a single shared rate, SYM uses one rate per unordered pair of
    the ``k`` states, and ARD uses one rate per ordered pair.

    Raises:
        PhykitUserError: if ``model`` is not one of ER, SYM, or ARD.
    """
    if model == "ER":
        return 1

    ordered_pairs = k * (k - 1)
    if model == "ARD":
        return ordered_pairs
    if model == "SYM":
        return ordered_pairs // 2

    raise PhykitUserError(
        [f"Unknown model '{model}'. Use ER, SYM, or ARD."], code=2,
    )
33
+
34
+
35
def build_q_matrix(params: np.ndarray, k: int, model: str) -> np.ndarray:
    """Build a k x k instantaneous rate matrix (Q) for an Mk model.

    Parameter layout:
      * ER  -- ``params[0]`` is used for every off-diagonal entry.
      * SYM -- one rate per unordered state pair, listed in row-major order
        of the upper triangle; each rate is mirrored below the diagonal.
      * ARD -- one rate per ordered state pair, listed in row-major order
        of the off-diagonal entries.

    The diagonal is set so that every row sums to zero (standard
    continuous-time Markov chain convention).

    Args:
        params: rate parameters; entries beyond those the model needs are
            ignored.
        k: number of character states.
        model: "ER", "SYM", or "ARD".

    Returns:
        The (k, k) float rate matrix.

    Raises:
        IndexError: if ``params`` holds fewer rates than the model needs.
        PhykitUserError: if ``model`` is not ER, SYM, or ARD (previously an
            all-zero matrix was returned silently, unlike count_params).
    """
    rates = np.asarray(params, dtype=float).ravel()
    Q = np.zeros((k, k))
    off_diagonal = ~np.eye(k, dtype=bool)

    if model == "ER":
        Q[off_diagonal] = rates[0]
    elif model == "SYM":
        rows, cols = np.triu_indices(k, 1)
        needed = rows.size
        # Explicit check: NumPy would otherwise broadcast a single rate.
        if rates.size < needed:
            raise IndexError(
                f"SYM model with {k} states needs {needed} rates; "
                f"got {rates.size}."
            )
        Q[rows, cols] = rates[:needed]
        Q[cols, rows] = rates[:needed]
    elif model == "ARD":
        needed = k * (k - 1)
        if rates.size < needed:
            raise IndexError(
                f"ARD model with {k} states needs {needed} rates; "
                f"got {rates.size}."
            )
        # Boolean-mask assignment fills in row-major order, matching the
        # (i, j) with i != j enumeration of the parameters.
        Q[off_diagonal] = rates[:needed]
    else:
        raise PhykitUserError(
            [f"Unknown model '{model}'. Use ER, SYM, or ARD."], code=2,
        )

    # The diagonal is still zero here, so each row sum is the total exit rate.
    Q[np.diag_indices(k)] = -Q.sum(axis=1)
    return Q
62
+
63
+
64
def matrix_exp(Q: np.ndarray, t: float) -> np.ndarray:
    """Return P = exp(Q * t) for rate matrix ``Q`` and elapsed time ``t``.

    The scaled matrix is handed to ``scipy.linalg.expm``.
    """
    scaled_rates = Q * t
    return expm(scaled_rates)
67
+
68
+
69
def felsenstein_pruning(
    tree, tip_states: Dict[str, str], Q: np.ndarray,
    pi: np.ndarray, states: List[str]
) -> Tuple[Dict, float]:
    """Compute conditional likelihoods and the tree log-likelihood.

    Clades are visited in postorder. A tip with an observed state gets an
    indicator vector; a tip absent from ``tip_states`` is treated as missing
    data (all ones) so it does not force the whole likelihood to zero.
    An internal node's vector is the element-wise product over its children
    of ``exp(Q * t) @ child_vector``, where ``t`` is the child's branch
    length (missing or zero lengths are replaced by 1e-8).

    The log-likelihood is accumulated from per-node rescaled vectors, so a
    tree whose raw root likelihood underflows to zero still yields a finite
    value. The vectors returned in ``cond_liks`` are NOT rescaled.

    Args:
        tree: object exposing ``find_clades(order="postorder")`` and
            ``root``; clades expose ``is_terminal()``, ``name``, ``clades``
            and ``branch_length``.
        tip_states: tip name -> observed state label.
        Q: (k, k) rate matrix.
        pi: length-k prior over states at the root.
        states: state labels; position defines the vector index.

    Returns:
        ``(cond_liks, loglik)`` where ``cond_liks`` maps ``id(clade)`` to a
        length-k likelihood vector. ``loglik`` is -1e20 when the data have
        zero likelihood under ``Q``.

    Raises:
        KeyError: if a tip's state label is not present in ``states``.
    """
    k = len(states)
    state_idx = {s: i for i, s in enumerate(states)}
    cond_liks = {}

    # Rescaled vectors and their accumulated log scale factors; used only
    # for the log-likelihood so that cond_liks keeps its raw values.
    scaled = {}
    log_scale = {}

    for clade in tree.find_clades(order="postorder"):
        key = id(clade)

        if clade.is_terminal():
            if clade.name in tip_states:
                lik = np.zeros(k)
                lik[state_idx[tip_states[clade.name]]] = 1.0
            else:
                # Unobserved tip: every state is equally compatible.
                lik = np.ones(k)
            cond_liks[key] = lik
            scaled[key] = lik
            log_scale[key] = 0.0
            continue

        lik = np.ones(k)
        partial = np.ones(k)
        offset = 0.0
        for child in clade.clades:
            t = child.branch_length if child.branch_length else 1e-8
            P = expm(Q * t)
            child_key = id(child)

            lik *= P @ cond_liks[child_key]

            # Each child has one parent, so its scaled entries can be
            # released as soon as they are consumed.
            partial = partial * (P @ scaled.pop(child_key))
            offset += log_scale.pop(child_key)

            # Rescale after every child so that nodes with many children
            # cannot underflow inside the product either.
            peak = partial.max() if k else 0.0
            if peak > 0:
                partial = partial / peak
                offset += np.log(peak)

        cond_liks[key] = lik
        scaled[key] = partial
        log_scale[key] = offset

    root_key = id(tree.root)
    root_total = np.sum(pi * scaled[root_key])
    if root_total <= 0:
        loglik = -1e20
    else:
        loglik = np.log(root_total) + log_scale[root_key]

    return cond_liks, loglik
105
+
106
+
107
def fit_q_matrix(
    tree, tip_states: Dict[str, str],
    states: List[str], model: str
) -> Tuple[np.ndarray, float]:
    """Estimate the rate matrix of an Mk model by maximum likelihood.

    The negative log-likelihood (uniform root prior, absolute value of the
    parameters used as rates) is minimised from eight constant starting
    vectors, each tried with bounded L-BFGS-B and with Nelder-Mead. The best
    parameter vector found is then polished with one tight-tolerance
    Nelder-Mead run. Optimiser runs that raise ValueError or LinAlgError are
    skipped.

    Returns:
        ``(Q_matrix, log_likelihood)`` for the best parameters found.

    Raises:
        PhykitUserError: propagated from count_params for an unknown model.
    """
    k = len(states)
    n_params = count_params(k, model)
    root_prior = np.ones(k) / k

    def objective(rates):
        q = build_q_matrix(np.abs(rates), k, model)
        return -felsenstein_pruning(tree, tip_states, q, root_prior, states)[1]

    box = [(1e-8, 100.0)] * n_params
    # Optimiser name -> extra keyword arguments; only L-BFGS-B is bounded.
    optimisers = {"L-BFGS-B": {"bounds": box}, "Nelder-Mead": {}}

    lowest = np.inf
    winner = np.ones(n_params) * 0.1

    for start in [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]:
        x0 = np.ones(n_params) * start
        for name, extra in optimisers.items():
            try:
                attempt = minimize(objective, x0, method=name, **extra)
            except (ValueError, np.linalg.LinAlgError):
                continue
            if attempt.fun < lowest:
                lowest = attempt.fun
                winner = np.abs(attempt.x)

    # Polish the best multi-start result.
    try:
        polished = minimize(
            objective, winner, method="Nelder-Mead",
            options={"maxiter": 10000, "xatol": 1e-10, "fatol": 1e-10},
        )
    except (ValueError, np.linalg.LinAlgError):
        polished = None
    if polished is not None and polished.fun < lowest:
        winner = np.abs(polished.x)

    fitted_q = build_q_matrix(winner, k, model)
    fitted_ll = felsenstein_pruning(
        tree, tip_states, fitted_q, root_prior, states
    )[1]

    return fitted_q, fitted_ll
162
+
163
+
164
def parse_discrete_traits(
    path: str, tree_tips: List[str], trait_column: str = None
) -> Dict[str, str]:
    """Read discrete trait states for the tips of a tree from a TSV file.

    Blank lines and lines starting with ``#`` are ignored; every remaining
    line is stripped of surrounding whitespace before parsing.

    * ``trait_column`` is None: each line is ``taxon<tab>state`` with no
      header row.
    * ``trait_column`` is given: the first line is a header and the named
      column is extracted.

    Returns:
        ``{taxon: state}`` restricted to taxa present in both the file and
        ``tree_tips``.

    Raises:
        PhykitUserError: if the file does not exist, is malformed, or shares
            fewer than 3 taxa with the tree.
    """
    try:
        with open(path) as handle:
            raw_lines = handle.readlines()
    except FileNotFoundError:
        raise PhykitUserError(
            [
                f"{path} corresponds to no such file or directory.",
                "Please check filename and pathing",
            ],
            code=2,
        )

    data_lines = [
        text
        for text in (raw.strip() for raw in raw_lines)
        if text and not text.startswith("#")
    ]

    if trait_column is None:
        return _parse_two_column(data_lines, path, tree_tips)
    return _parse_multi_column(data_lines, path, tree_tips, trait_column)
200
+
201
+
202
def _parse_multi_column(
    data_lines: List[str], path: str, tree_tips: List[str], trait_column: str
) -> Dict[str, str]:
    """Extract one named trait column from header-plus-rows TSV lines.

    The first entry of ``data_lines`` is the header; its first field labels
    the taxon column and the rest are trait names. Line numbers in error
    messages count entries of ``data_lines`` starting at 1 for the header.

    Raises:
        PhykitUserError: if there is no data row, the header has fewer than
            two columns, ``trait_column`` is absent, a row has the wrong
            number of columns, or a trait value is empty.
    """
    if len(data_lines) < 2:
        raise PhykitUserError(
            ["Trait file must have a header row and at least one data row."],
            code=2,
        )

    columns = data_lines[0].split("\t")
    if len(columns) < 2:
        raise PhykitUserError(
            ["Header must have at least 2 columns (taxon + at least 1 trait)."],
            code=2,
        )

    trait_names = columns[1:]
    if trait_column not in trait_names:
        raise PhykitUserError(
            [
                f"Column '{trait_column}' not found in trait file.",
                f"Available columns: {', '.join(trait_names)}",
            ],
            code=2,
        )

    # Field position of the requested trait within a full row (taxon is 0).
    value_pos = 1 + trait_names.index(trait_column)
    expected = len(columns)

    traits = {}
    for line_no, row in enumerate(data_lines[1:], start=2):
        fields = row.split("\t")
        found = len(fields)
        if found != expected:
            raise PhykitUserError(
                [f"Line {line_no} has {found} columns; expected {expected}."],
                code=2,
            )

        taxon = fields[0]
        value = fields[value_pos].strip()
        if not value:
            raise PhykitUserError(
                [f"Missing trait value for taxon '{taxon}' on line {line_no}."],
                code=2,
            )
        traits[taxon] = value

    return _validate_shared_taxa(traits, tree_tips)
249
+
250
+
251
def _parse_two_column(
    data_lines: List[str], path: str, tree_tips: List[str]
) -> Dict[str, str]:
    """Parse headerless ``taxon<tab>state`` lines into a trait mapping.

    Line numbers in error messages count entries of ``data_lines`` from 1.

    Raises:
        PhykitUserError: if a line does not have exactly two tab-separated
            fields.
    """
    traits = {}
    for line_no, row in enumerate(data_lines, start=1):
        fields = row.split("\t")
        if len(fields) != 2:
            raise PhykitUserError(
                [f"Line {line_no} has {len(fields)} columns; expected 2 (taxon, state)."],
                code=2,
            )
        taxon, state = fields
        traits[taxon] = state.strip()

    return _validate_shared_taxa(traits, tree_tips)
265
+
266
+
267
+ def _validate_shared_taxa(
268
+ traits: Dict[str, str], tree_tips: List[str]
269
+ ) -> Dict[str, str]:
270
+ tree_tip_set = set(tree_tips)
271
+ trait_taxa_set = set(traits.keys())
272
+ shared = tree_tip_set & trait_taxa_set
273
+
274
+ tree_only = tree_tip_set - trait_taxa_set
275
+ trait_only = trait_taxa_set - tree_tip_set
276
+
277
+ if tree_only:
278
+ print(
279
+ f"Warning: {len(tree_only)} taxa in tree but not in trait file: "
280
+ f"{', '.join(sorted(tree_only))}",
281
+ file=sys.stderr,
282
+ )
283
+ if trait_only:
284
+ print(
285
+ f"Warning: {len(trait_only)} taxa in trait file but not in tree: "
286
+ f"{', '.join(sorted(trait_only))}",
287
+ file=sys.stderr,
288
+ )
289
+
290
+ if len(shared) < 3:
291
+ raise PhykitUserError(
292
+ [
293
+ f"Only {len(shared)} shared taxa between tree and trait file.",
294
+ "At least 3 shared taxa are required.",
295
+ ],
296
+ code=2,
297
+ )
298
+
299
+ return {taxon: traits[taxon] for taxon in shared}
@@ -230,6 +230,9 @@ class Phykit:
230
230
  fit_continuous (alias: fitcontinuous; fc)
231
231
  - compare continuous trait evolution models
232
232
  (BM, OU, EB, Lambda, Delta, Kappa, White)
233
+ fit_discrete (alias: fitdiscrete; fd)
234
+ - compare discrete trait evolution models
235
+ (ER, SYM, ARD)
233
236
  ouwie (alias: fit_ouwie; multi_regime_ou)
234
237
  - fit multi-regime OU models (BM1, BMS, OU1,
235
238
  OUM, OUMV, OUMA, OUMVA; Beaulieu et al. 2012)
@@ -260,6 +263,9 @@ class Phykit:
260
263
  rename_tree_tips (alias: rename_tree; rename_tips)
261
264
  - renames tips in a phylogeny according to a file with
262
265
  the desired new tip names
266
+ kuhner_felsenstein_distance (alias: kf_distance; kf_dist; kf)
267
+ - calculates Kuhner-Felsenstein (branch score) distance
268
+ between two trees
263
269
  robinson_foulds_distance (alias: rf_distance; rf_dist; rf)
264
270
  - calculates Robinson-Foulds distance between two trees
265
271
  root_tree (alias: root; rt)
@@ -3984,6 +3990,65 @@ class Phykit:
3984
3990
  _add_json_argument(parser)
3985
3991
  _run_service(parser, argv, RateHeterogeneity)
3986
3992
 
3993
+ @staticmethod
3994
+ def fit_discrete(argv):
3995
+ parser = _new_parser(
3996
+ description=textwrap.dedent(
3997
+ f"""\
3998
+ {help_header}
3999
+
4000
+ Compare models of discrete trait evolution on a phylogeny.
4001
+
4002
+ Fits ER (Equal Rates), SYM (Symmetric), and ARD (All Rates
4003
+ Different) Mk models of discrete character evolution via
4004
+ maximum likelihood. Compares models using AIC and BIC.
4005
+
4006
+ Analogous to R's geiger::fitDiscrete().
4007
+
4008
+ Aliases:
4009
+ fit_discrete, fitdiscrete, fd
4010
+ Command line interfaces:
4011
+ pk_fit_discrete, pk_fitdiscrete, pk_fd
4012
+
4013
+ Usage:
4014
+ phykit fit_discrete -t <tree> -d <trait_data> -c <trait>
4015
+ [--models ER,SYM,ARD] [--json]
4016
+
4017
+ Options
4018
+ =====================================================
4019
+ -t/--tree tree file (required)
4020
+
4021
+ -d/--trait_data trait data file in TSV format
4022
+ (required)
4023
+
4024
+ -c/--trait column name for the discrete
4025
+ trait in the data file
4026
+ (required)
4027
+
4028
+ --models comma-separated list of models
4029
+ to fit (default: ER,SYM,ARD)
4030
+
4031
+ --json optional argument to output
4032
+ results as JSON
4033
+ """
4034
+ ),
4035
+ )
4036
+ parser.add_argument(
4037
+ "-t", "--tree", type=str, required=True, help=SUPPRESS, metavar=""
4038
+ )
4039
+ parser.add_argument(
4040
+ "-d", "--trait_data", type=str, required=True, help=SUPPRESS, metavar=""
4041
+ )
4042
+ parser.add_argument(
4043
+ "-c", "--trait", type=str, required=True, help=SUPPRESS, metavar=""
4044
+ )
4045
+ parser.add_argument(
4046
+ "--models", type=str, required=False, default=None,
4047
+ help=SUPPRESS, metavar=""
4048
+ )
4049
+ _add_json_argument(parser)
4050
+ _run_service(parser, argv, FitDiscrete)
4051
+
3987
4052
  @staticmethod
3988
4053
  def fit_continuous(argv):
3989
4054
  parser = _new_parser(
@@ -5128,6 +5193,60 @@ class Phykit:
5128
5193
  _add_json_argument(parser)
5129
5194
  _run_service(parser, argv, RenameTreeTips)
5130
5195
 
5196
+ @staticmethod
5197
+ def kf_distance(argv):
5198
+ parser = _new_parser(
5199
+ description=textwrap.dedent(
5200
+ f"""\
5201
+ {help_header}
5202
+
5203
+ Calculate Kuhner-Felsenstein (KF) branch score distance
5204
+ between two trees.
5205
+
5206
+ Unlike Robinson-Foulds distance which only considers topology,
5207
+ KF distance incorporates both topology and branch length
5208
+ differences. The KF distance is calculated as:
5209
+ KF = sqrt( sum_over_all_splits( (b1_i - b2_i)^2 ) )
5210
+ where b1_i and b2_i are branch lengths for split i in each
5211
+ tree. Splits absent from one tree use branch length 0.
5212
+
5213
+ PhyKIT will print out
5214
+ col 1: the plain KF distance and
5215
+ col 2: the normalized KF distance.
5216
+
5217
+ KF distances are calculated following Kuhner & Felsenstein,
5218
+ Journal of Computational Biology (1994),
5219
+ doi: 10.1089/cmb.1994.1.183.
5220
+
5221
+ Aliases:
5222
+ kuhner_felsenstein_distance, kf_distance, kf_dist, kf
5223
+ Command line interfaces:
5224
+ pk_kuhner_felsenstein_distance, pk_kf_distance, pk_kf_dist,
5225
+ pk_kf
5226
+
5227
+ Usage:
5228
+ phykit kf_distance <tree_file_zero> <tree_file_one> [--json]
5229
+
5230
+ Options
5231
+ =====================================================
5232
+ <tree_file_zero> first argument after
5233
+ function name should be
5234
+ a tree file
5235
+
5236
+ <tree_file_one> second argument after
5237
+ function name should be
5238
+ a tree file
5239
+
5240
+ --json optional argument to output
5241
+ results as JSON
5242
+ """
5243
+ ),
5244
+ )
5245
+ parser.add_argument("tree_zero", type=str, help=SUPPRESS)
5246
+ parser.add_argument("tree_one", type=str, help=SUPPRESS)
5247
+ _add_json_argument(parser)
5248
+ _run_service(parser, argv, KuhnerFelsensteinDistance)
5249
+
5131
5250
  @staticmethod
5132
5251
  def rf_distance(argv):
5133
5252
  parser = _new_parser(
@@ -6473,6 +6592,10 @@ def fit_continuous(argv=None):
6473
6592
  Phykit.fit_continuous(sys.argv[1:])
6474
6593
 
6475
6594
 
6595
+ def fit_discrete(argv=None):
6596
+ Phykit.fit_discrete(sys.argv[1:])
6597
+
6598
+
6476
6599
  def ouwie(argv=None):
6477
6600
  Phykit.ouwie(sys.argv[1:])
6478
6601
 
@@ -6525,6 +6648,10 @@ def rename_tree_tips(argv=None):
6525
6648
  Phykit.rename_tree_tips(sys.argv[1:])
6526
6649
 
6527
6650
 
6651
+ def kf_distance(argv=None):
6652
+ Phykit.kf_distance(sys.argv[1:])
6653
+
6654
+
6528
6655
  def rf_distance(argv=None):
6529
6656
  Phykit.rf_distance(sys.argv[1:])
6530
6657
 
@@ -85,6 +85,8 @@ PolytomyTest = _LazyServiceFactory("phykit.services.tree.polytomy_test", "Polyto
85
85
  PrintTree = _LazyServiceFactory("phykit.services.tree.print_tree", "PrintTree")
86
86
  PruneTree = _LazyServiceFactory("phykit.services.tree.prune_tree", "PruneTree")
87
87
  RenameTreeTips = _LazyServiceFactory("phykit.services.tree.rename_tree_tips", "RenameTreeTips")
88
+ FitDiscrete = _LazyServiceFactory("phykit.services.tree.fit_discrete", "FitDiscrete")
89
+ KuhnerFelsensteinDistance = _LazyServiceFactory("phykit.services.tree.kf_distance", "KuhnerFelsensteinDistance")
88
90
  RobinsonFouldsDistance = _LazyServiceFactory("phykit.services.tree.rf_distance", "RobinsonFouldsDistance")
89
91
  RootTree = _LazyServiceFactory("phykit.services.tree.root_tree", "RootTree")
90
92
  Saturation = _LazyServiceFactory("phykit.services.tree.saturation", "Saturation")
@@ -12,6 +12,7 @@ _EXPORTS = {
12
12
  "DiscordanceAsymmetry": "discordance_asymmetry",
13
13
  "EvolutionaryRate": "evolutionary_rate",
14
14
  "EvoTempoMap": "evo_tempo_map",
15
+ "FitDiscrete": "fit_discrete",
15
16
  "HiddenParalogyCheck": "hidden_paralogy_check",
16
17
  "InternalBranchStats": "internal_branch_stats",
17
18
  "InternodeLabeler": "internode_labeler",
@@ -28,6 +29,7 @@ _EXPORTS = {
28
29
  "PrintTree": "print_tree",
29
30
  "PruneTree": "prune_tree",
30
31
  "RenameTreeTips": "rename_tree_tips",
32
+ "KuhnerFelsensteinDistance": "kf_distance",
31
33
  "RobinsonFouldsDistance": "rf_distance",
32
34
  "RootTree": "root_tree",
33
35
  "Saturation": "saturation",
@@ -10,6 +10,13 @@ from scipy.optimize import minimize
10
10
  from .base import Tree
11
11
  from ...helpers.json_output import print_json
12
12
  from ...helpers.plot_config import PlotConfig
13
+ from ...helpers.discrete_models import (
14
+ build_q_matrix,
15
+ matrix_exp,
16
+ felsenstein_pruning,
17
+ fit_q_matrix,
18
+ parse_discrete_traits,
19
+ )
13
20
  from ...errors import PhykitUserError
14
21
 
15
22
 
@@ -1087,132 +1094,20 @@ class AncestralReconstruction(Tree):
1087
1094
  return {taxon: traits[taxon] for taxon in shared}
1088
1095
 
1089
1096
  # ------------------------------------------------------------------
1090
- # Mk model primitives (shared with StochasticCharacterMap)
1097
+ # Mk model primitives (delegated to phykit.helpers.discrete_models)
1091
1098
  # ------------------------------------------------------------------
1092
1099
 
1093
- def _build_q_matrix(
1094
- self, params: np.ndarray, k: int, model: str
1095
- ) -> np.ndarray:
1096
- Q = np.zeros((k, k))
1097
- if model == "ER":
1098
- rate = params[0]
1099
- Q[:] = rate
1100
- np.fill_diagonal(Q, 0.0)
1101
- elif model == "SYM":
1102
- idx = 0
1103
- for i in range(k):
1104
- for j in range(i + 1, k):
1105
- Q[i, j] = params[idx]
1106
- Q[j, i] = params[idx]
1107
- idx += 1
1108
- elif model == "ARD":
1109
- idx = 0
1110
- for i in range(k):
1111
- for j in range(k):
1112
- if i != j:
1113
- Q[i, j] = params[idx]
1114
- idx += 1
1115
- # Set diagonal
1116
- for i in range(k):
1117
- Q[i, i] = -np.sum(Q[i, :])
1118
- return Q
1119
-
1120
- def _matrix_exp(self, Q: np.ndarray, t: float) -> np.ndarray:
1121
- return expm(Q * t)
1122
-
1123
- def _felsenstein_pruning(
1124
- self, tree, tip_states: Dict[str, str], Q: np.ndarray,
1125
- pi: np.ndarray, states: List[str]
1126
- ) -> Tuple[Dict, float]:
1127
- k = len(states)
1128
- state_idx = {s: i for i, s in enumerate(states)}
1129
- cond_liks: Dict[int, np.ndarray] = {}
1130
-
1131
- for clade in tree.find_clades(order="postorder"):
1132
- if clade.is_terminal():
1133
- lik = np.zeros(k)
1134
- if clade.name in tip_states:
1135
- lik[state_idx[tip_states[clade.name]]] = 1.0
1136
- cond_liks[id(clade)] = lik
1137
- else:
1138
- lik = np.ones(k)
1139
- for child in clade.clades:
1140
- t = child.branch_length if child.branch_length else 1e-8
1141
- P = self._matrix_exp(Q, t)
1142
- child_lik = cond_liks[id(child)]
1143
- lik *= P @ child_lik
1144
- cond_liks[id(clade)] = lik
1145
-
1146
- root_lik = cond_liks[id(tree.root)]
1147
- total_lik = np.sum(pi * root_lik)
1148
- if total_lik <= 0:
1149
- loglik = -1e20
1150
- else:
1151
- loglik = np.log(total_lik)
1152
-
1153
- return cond_liks, loglik
1100
+ def _build_q_matrix(self, params, k, model):
1101
+ return build_q_matrix(params, k, model)
1154
1102
 
1155
- def _fit_q_matrix(
1156
- self, tree, tip_states: Dict[str, str],
1157
- states: List[str], model: str
1158
- ) -> Tuple[np.ndarray, float]:
1159
- k = len(states)
1160
-
1161
- if model == "ER":
1162
- n_params = 1
1163
- elif model == "SYM":
1164
- n_params = k * (k - 1) // 2
1165
- elif model == "ARD":
1166
- n_params = k * (k - 1)
1167
- else:
1168
- raise PhykitUserError(
1169
- [f"Unknown model '{model}'. Use ER, SYM, or ARD."],
1170
- code=2,
1171
- )
1172
-
1173
- pi = np.ones(k) / k
1174
-
1175
- def neg_loglik(params):
1176
- Q = self._build_q_matrix(np.abs(params), k, model)
1177
- _, ll = self._felsenstein_pruning(tree, tip_states, Q, pi, states)
1178
- return -ll
1179
-
1180
- bounds = [(1e-8, 100.0)] * n_params
1181
-
1182
- # Multi-start optimization for robustness
1183
- starting_values = [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
1184
- best_negll = np.inf
1185
- best_params = np.ones(n_params) * 0.1
1186
-
1187
- for sv in starting_values:
1188
- x0 = np.ones(n_params) * sv
1189
- for opt_method in ["L-BFGS-B", "Nelder-Mead"]:
1190
- try:
1191
- kwargs = {"method": opt_method}
1192
- if opt_method == "L-BFGS-B":
1193
- kwargs["bounds"] = bounds
1194
- result = minimize(neg_loglik, x0, **kwargs)
1195
- if result.fun < best_negll:
1196
- best_negll = result.fun
1197
- best_params = np.abs(result.x)
1198
- except (ValueError, np.linalg.LinAlgError):
1199
- continue
1200
-
1201
- # Refine best result with Nelder-Mead
1202
- try:
1203
- result = minimize(
1204
- neg_loglik, best_params, method="Nelder-Mead",
1205
- options={"maxiter": 10000, "xatol": 1e-10, "fatol": 1e-10},
1206
- )
1207
- if result.fun < best_negll:
1208
- best_params = np.abs(result.x)
1209
- except (ValueError, np.linalg.LinAlgError):
1210
- pass
1103
+ def _matrix_exp(self, Q, t):
1104
+ return matrix_exp(Q, t)
1211
1105
 
1212
- Q = self._build_q_matrix(best_params, k, model)
1213
- _, loglik = self._felsenstein_pruning(tree, tip_states, Q, pi, states)
1106
+ def _felsenstein_pruning(self, tree, tip_states, Q, pi, states):
1107
+ return felsenstein_pruning(tree, tip_states, Q, pi, states)
1214
1108
 
1215
- return Q, loglik
1109
+ def _fit_q_matrix(self, tree, tip_states, states, model):
1110
+ return fit_q_matrix(tree, tip_states, states, model)
1216
1111
 
1217
1112
  # ------------------------------------------------------------------
1218
1113
  # Discrete marginal posteriors (upward-downward belief propagation)