phykit 2.1.40__tar.gz → 2.1.41__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. {phykit-2.1.40 → phykit-2.1.41}/PKG-INFO +1 -1
  2. {phykit-2.1.40 → phykit-2.1.41}/phykit/cli_registry.py +2 -0
  3. phykit-2.1.41/phykit/helpers/discrete_models.py +299 -0
  4. {phykit-2.1.40 → phykit-2.1.41}/phykit/phykit.py +66 -0
  5. {phykit-2.1.40 → phykit-2.1.41}/phykit/service_factories.py +1 -0
  6. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/__init__.py +1 -0
  7. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/ancestral_reconstruction.py +16 -121
  8. phykit-2.1.41/phykit/services/tree/fit_discrete.py +177 -0
  9. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/stochastic_character_map.py +16 -214
  10. phykit-2.1.41/phykit/version.py +1 -0
  11. {phykit-2.1.40 → phykit-2.1.41}/phykit.egg-info/PKG-INFO +1 -1
  12. {phykit-2.1.40 → phykit-2.1.41}/phykit.egg-info/SOURCES.txt +2 -0
  13. {phykit-2.1.40 → phykit-2.1.41}/phykit.egg-info/entry_points.txt +3 -0
  14. {phykit-2.1.40 → phykit-2.1.41}/setup.py +3 -0
  15. phykit-2.1.40/phykit/version.py +0 -1
  16. {phykit-2.1.40 → phykit-2.1.41}/LICENSE.md +0 -0
  17. {phykit-2.1.40 → phykit-2.1.41}/README.md +0 -0
  18. {phykit-2.1.40 → phykit-2.1.41}/phykit/__init__.py +0 -0
  19. {phykit-2.1.40 → phykit-2.1.41}/phykit/__main__.py +0 -0
  20. {phykit-2.1.40 → phykit-2.1.41}/phykit/errors.py +0 -0
  21. {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/__init__.py +0 -0
  22. {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/boolean_argument_parsing.py +0 -0
  23. {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/caching.py +0 -0
  24. {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/files.py +0 -0
  25. {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/json_output.py +0 -0
  26. {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/parallel.py +0 -0
  27. {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/plot_config.py +0 -0
  28. {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/stats_summary.py +0 -0
  29. {phykit-2.1.40 → phykit-2.1.41}/phykit/helpers/streaming.py +0 -0
  30. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/__init__.py +0 -0
  31. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/__init__.py +0 -0
  32. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/alignment_entropy.py +0 -0
  33. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/alignment_length.py +0 -0
  34. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/alignment_length_no_gaps.py +0 -0
  35. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/alignment_outlier_taxa.py +0 -0
  36. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/alignment_recoding.py +0 -0
  37. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/base.py +0 -0
  38. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/column_score.py +0 -0
  39. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/composition_per_taxon.py +0 -0
  40. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/compositional_bias_per_site.py +0 -0
  41. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/create_concatenation_matrix.py +0 -0
  42. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/dna_threader.py +0 -0
  43. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/evolutionary_rate_per_site.py +0 -0
  44. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/faidx.py +0 -0
  45. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/gc_content.py +0 -0
  46. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/mask_alignment.py +0 -0
  47. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/occupancy_per_taxon.py +0 -0
  48. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/pairwise_identity.py +0 -0
  49. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/parsimony_informative_sites.py +0 -0
  50. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/plot_alignment_qc.py +0 -0
  51. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/rcv.py +0 -0
  52. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/rcvt.py +0 -0
  53. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/rename_fasta_entries.py +0 -0
  54. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/sum_of_pairs_score.py +0 -0
  55. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/alignment/variable_sites.py +0 -0
  56. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/base.py +0 -0
  57. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/base.py +0 -0
  58. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/bipartition_support_stats.py +0 -0
  59. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/branch_length_multiplier.py +0 -0
  60. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/collapse_branches.py +0 -0
  61. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/concordance_asr.py +0 -0
  62. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/consensus_network.py +0 -0
  63. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/consensus_tree.py +0 -0
  64. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/cont_map.py +0 -0
  65. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/cophylo.py +0 -0
  66. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/covarying_evolutionary_rates.py +0 -0
  67. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/density_map.py +0 -0
  68. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/discordance_asymmetry.py +0 -0
  69. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/dvmc.py +0 -0
  70. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/evo_tempo_map.py +0 -0
  71. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/evolutionary_rate.py +0 -0
  72. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/fit_continuous.py +0 -0
  73. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/hidden_paralogy_check.py +0 -0
  74. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/internal_branch_stats.py +0 -0
  75. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/internode_labeler.py +0 -0
  76. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/kf_distance.py +0 -0
  77. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/last_common_ancestor_subtree.py +0 -0
  78. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/lb_score.py +0 -0
  79. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/ltt.py +0 -0
  80. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/monophyly_check.py +0 -0
  81. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/nearest_neighbor_interchange.py +0 -0
  82. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/network_signal.py +0 -0
  83. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/ou_shift_detection.py +0 -0
  84. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/ouwie.py +0 -0
  85. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/patristic_distances.py +0 -0
  86. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/phenogram.py +0 -0
  87. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/phylogenetic_glm.py +0 -0
  88. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/phylogenetic_ordination.py +0 -0
  89. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/phylogenetic_regression.py +0 -0
  90. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/phylogenetic_signal.py +0 -0
  91. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/phylomorphospace.py +0 -0
  92. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/polytomy_test.py +0 -0
  93. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/print_tree.py +0 -0
  94. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/prune_tree.py +0 -0
  95. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/quartet_network.py +0 -0
  96. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/rate_heterogeneity.py +0 -0
  97. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/relative_rate_test.py +0 -0
  98. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/rename_tree_tips.py +0 -0
  99. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/rf_distance.py +0 -0
  100. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/root_tree.py +0 -0
  101. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/saturation.py +0 -0
  102. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/spectral_discordance.py +0 -0
  103. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/spurious_sequence.py +0 -0
  104. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/terminal_branch_stats.py +0 -0
  105. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/threshold_model.py +0 -0
  106. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/tip_labels.py +0 -0
  107. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/tip_to_tip_distance.py +0 -0
  108. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/tip_to_tip_node_distance.py +0 -0
  109. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/total_tree_length.py +0 -0
  110. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/treeness.py +0 -0
  111. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/treeness_over_rcv.py +0 -0
  112. {phykit-2.1.40 → phykit-2.1.41}/phykit/services/tree/vcv_utils.py +0 -0
  113. {phykit-2.1.40 → phykit-2.1.41}/phykit.egg-info/dependency_links.txt +0 -0
  114. {phykit-2.1.40 → phykit-2.1.41}/phykit.egg-info/requires.txt +0 -0
  115. {phykit-2.1.40 → phykit-2.1.41}/phykit.egg-info/top_level.txt +0 -0
  116. {phykit-2.1.40 → phykit-2.1.41}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: phykit
3
- Version: 2.1.40
3
+ Version: 2.1.41
4
4
  Home-page: https://github.com/jlsteenwyk/phykit
5
5
  Author: Jacob L. Steenwyk
6
6
  Author-email: jlsteenwyk@gmail.com
@@ -104,6 +104,8 @@ ALIAS_TO_HANDLER: Dict[str, str] = {
104
104
  "rh": "rate_heterogeneity",
105
105
  "fitcontinuous": "fit_continuous",
106
106
  "fc": "fit_continuous",
107
+ "fitdiscrete": "fit_discrete",
108
+ "fd": "fit_discrete",
107
109
  "ouwie": "ouwie",
108
110
  "fit_ouwie": "ouwie",
109
111
  "multi_regime_ou": "ouwie",
@@ -0,0 +1,299 @@
1
+ """
2
+ Shared utilities for discrete trait evolution models.
3
+
4
+ Provides Q-matrix construction, Felsenstein pruning (likelihood computation),
5
+ maximum-likelihood Q-matrix fitting, and discrete trait data parsing. Used by
6
+ stochastic_character_map, ancestral_reconstruction, and fit_discrete.
7
+ """
8
+ import sys
9
+ from typing import Dict, List, Tuple
10
+
11
+ import numpy as np
12
+ from scipy.linalg import expm
13
+ from scipy.optimize import minimize
14
+
15
+ from ..errors import PhykitUserError
16
+
17
+
18
+ VALID_DISCRETE_MODELS = frozenset(["ER", "SYM", "ARD"])
19
+
20
+
21
def count_params(k: int, model: str) -> int:
    """Return how many free transition-rate parameters ``model`` has.

    ER shares a single rate across all transitions, SYM has one rate per
    unordered pair of the ``k`` states, and ARD has one rate per ordered
    pair of distinct states.

    Raises:
        PhykitUserError: if ``model`` is not one of ER, SYM, or ARD.
    """
    if model == "ER":
        return 1
    if model == "SYM":
        return k * (k - 1) // 2
    if model == "ARD":
        return k * (k - 1)
    raise PhykitUserError(
        [f"Unknown model '{model}'. Use ER, SYM, or ARD."], code=2,
    )
33
+
34
+
35
def build_q_matrix(params: np.ndarray, k: int, model: str) -> np.ndarray:
    """Build the ``k`` x ``k`` rate matrix Q for an ER, SYM, or ARD model.

    Off-diagonal entries are filled from ``params``:

    * ER  -- every off-diagonal entry is ``params[0]``.
    * SYM -- one value per unordered pair ``(i, j)`` with ``i < j``, taken in
      row-major order and mirrored across the diagonal.
    * ARD -- one value per ordered pair ``(i, j)`` with ``i != j``, taken in
      row-major order.

    Each diagonal entry is then set to the negated sum of the rest of its
    row, so rows sum to zero (standard continuous-time Markov chain
    convention).

    Raises:
        PhykitUserError: if ``model`` is not one of ER, SYM, or ARD. An
            unknown model previously produced an all-zero matrix silently.
    """
    Q = np.zeros((k, k))
    if model == "ER":
        Q[:] = params[0]
        np.fill_diagonal(Q, 0.0)
    elif model == "SYM":
        idx = 0
        for i in range(k):
            for j in range(i + 1, k):
                Q[i, j] = Q[j, i] = params[idx]
                idx += 1
    elif model == "ARD":
        idx = 0
        for i in range(k):
            for j in range(k):
                if i != j:
                    Q[i, j] = params[idx]
                    idx += 1
    else:
        # Same message as count_params so both entry points agree.
        raise PhykitUserError(
            [f"Unknown model '{model}'. Use ER, SYM, or ARD."], code=2,
        )
    # The diagonal is still zero here, so each row sum is the total exit rate.
    np.fill_diagonal(Q, -Q.sum(axis=1))
    return Q
62
+
63
+
64
def matrix_exp(Q: np.ndarray, t: float) -> np.ndarray:
    """Return the matrix exponential P = exp(Q * t) of ``Q`` scaled by ``t``."""
    scaled_rates = Q * t
    return expm(scaled_rates)
67
+
68
+
69
def felsenstein_pruning(
    tree, tip_states: Dict[str, str], Q: np.ndarray,
    pi: np.ndarray, states: List[str]
) -> Tuple[Dict, float]:
    """Compute conditional likelihoods and the tree log-likelihood.

    Clades are visited in postorder. A tip whose name is in ``tip_states``
    gets an indicator vector for its observed state. A tip that is not in
    ``tip_states`` is treated as missing data and gets a vector of ones.
    The earlier all-zero vector forced the likelihood of the whole tree to
    zero. Each internal clade's vector is the element-wise product, over its
    children, of ``exp(Q * t) @ child_vector`` where ``t`` is the child's
    branch length.

    Args:
        tree: object exposing ``find_clades(order="postorder")`` and
            ``root``; clades expose ``is_terminal()``, ``name``, ``clades``
            and ``branch_length``.
        tip_states: observed state label per tip name.
        Q: rate matrix with rows/columns ordered like ``states``.
        pi: weights applied to the root's conditional likelihoods.
        states: state labels; their order defines the vector indices.

    Returns:
        ``(cond_liks, loglik)`` where ``cond_liks`` maps ``id(clade)`` to a
        k-length likelihood vector. ``loglik`` is ``-1e20`` when the total
        likelihood is not positive.
    """
    k = len(states)
    state_idx = {s: i for i, s in enumerate(states)}
    cond_liks = {}

    for clade in tree.find_clades(order="postorder"):
        if clade.is_terminal():
            if clade.name in tip_states:
                lik = np.zeros(k)
                lik[state_idx[tip_states[clade.name]]] = 1.0
            else:
                # Missing observation: every state is compatible with it.
                lik = np.ones(k)
        else:
            lik = np.ones(k)
            for child in clade.clades:
                # A missing or zero branch length falls back to a tiny
                # positive value.
                t = child.branch_length if child.branch_length else 1e-8
                # Same computation as matrix_exp(Q, t).
                lik *= expm(Q * t) @ cond_liks[id(child)]
        cond_liks[id(clade)] = lik

    total_lik = np.sum(pi * cond_liks[id(tree.root)])
    if total_lik <= 0:
        # Large negative sentinel instead of log(0).
        loglik = -1e20
    else:
        loglik = np.log(total_lik)

    return cond_liks, loglik
105
+
106
+
107
def fit_q_matrix(
    tree, tip_states: Dict[str, str],
    states: List[str], model: str
) -> Tuple[np.ndarray, float]:
    """Estimate the Q-matrix for ``model`` by maximum likelihood.

    The negative log-likelihood is minimised from several uniform starting
    rates, each with L-BFGS-B (bounded) and Nelder-Mead. The best parameter
    vector found is then polished with one more Nelder-Mead run. Absolute
    values of the optimiser's parameters are used as rates throughout. The
    root weights are uniform over the states.

    Returns:
        ``(Q_matrix, log_likelihood)`` evaluated at the best parameters.
    """
    n_states = len(states)
    n_free = count_params(n_states, model)
    root_weights = np.ones(n_states) / n_states

    def objective(rates):
        q_candidate = build_q_matrix(np.abs(rates), n_states, model)
        pruned = felsenstein_pruning(
            tree, tip_states, q_candidate, root_weights, states
        )
        return -pruned[1]

    rate_bounds = [(1e-8, 100.0)] * n_free

    lowest_negll = np.inf
    best_rates = np.ones(n_free) * 0.1

    for start in (0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0):
        initial = np.ones(n_free) * start
        for optimizer in ("L-BFGS-B", "Nelder-Mead"):
            # Only L-BFGS-B is given bounds.
            options = {"method": optimizer}
            if optimizer == "L-BFGS-B":
                options["bounds"] = rate_bounds
            try:
                fit = minimize(objective, initial, **options)
            except (ValueError, np.linalg.LinAlgError):
                continue
            if fit.fun < lowest_negll:
                lowest_negll = fit.fun
                best_rates = np.abs(fit.x)

    # Polish the best candidate with tight Nelder-Mead tolerances.
    try:
        polished = minimize(
            objective, best_rates, method="Nelder-Mead",
            options={"maxiter": 10000, "xatol": 1e-10, "fatol": 1e-10},
        )
    except (ValueError, np.linalg.LinAlgError):
        polished = None
    if polished is not None and polished.fun < lowest_negll:
        best_rates = np.abs(polished.x)

    Q = build_q_matrix(best_rates, n_states, model)
    loglik = felsenstein_pruning(tree, tip_states, Q, root_weights, states)[1]

    return Q, loglik
162
+
163
+
164
def parse_discrete_traits(
    path: str, tree_tips: List[str], trait_column: str = None
) -> Dict[str, str]:
    """Read discrete trait states for tree tips from a tab-separated file.

    Blank lines and lines starting with ``#`` are ignored. With
    ``trait_column=None`` the file is read as headerless two-column data
    (taxon<tab>state). Otherwise the first remaining line is a header and
    the named column is extracted.

    Returns:
        ``{taxon: state}`` for taxa present in both the file and
        ``tree_tips``. At least 3 shared taxa are required.

    Raises:
        PhykitUserError: if ``path`` does not exist, or if the selected
            parser rejects the file contents.
    """
    try:
        with open(path) as handle:
            raw_lines = handle.readlines()
    except FileNotFoundError:
        raise PhykitUserError(
            [
                f"{path} corresponds to no such file or directory.",
                "Please check filename and pathing",
            ],
            code=2,
        )

    trimmed = (raw.strip() for raw in raw_lines)
    data_lines = [row for row in trimmed if row and not row.startswith("#")]

    if trait_column is None:
        return _parse_two_column(data_lines, path, tree_tips)
    return _parse_multi_column(data_lines, path, tree_tips, trait_column)
200
+
201
+
202
def _parse_multi_column(
    data_lines: List[str], path: str, tree_tips: List[str], trait_column: str
) -> Dict[str, str]:
    """Extract one named trait column from header-bearing TSV lines.

    ``data_lines[0]`` is the header (taxon column, then trait names). Every
    later line must have the same number of tab-separated fields as the
    header and a non-empty value in the requested column. Line numbers in
    error messages count entries of ``data_lines``, starting at 1 for the
    header.

    Raises:
        PhykitUserError: on a missing header or data row, an unknown
            column, a field-count mismatch, or an empty trait value.
    """
    if len(data_lines) < 2:
        raise PhykitUserError(
            ["Trait file must have a header row and at least one data row."],
            code=2,
        )

    header, *rows = data_lines
    columns = header.split("\t")
    expected_fields = len(columns)
    if expected_fields < 2:
        raise PhykitUserError(
            ["Header must have at least 2 columns (taxon + at least 1 trait)."],
            code=2,
        )

    trait_names = columns[1:]
    if trait_column not in trait_names:
        raise PhykitUserError(
            [
                f"Column '{trait_column}' not found in trait file.",
                f"Available columns: {', '.join(trait_names)}",
            ],
            code=2,
        )

    # Offset by one because field 0 holds the taxon name.
    value_pos = 1 + trait_names.index(trait_column)

    traits = {}
    for line_no, row in enumerate(rows, start=2):
        fields = row.split("\t")
        if len(fields) != expected_fields:
            raise PhykitUserError(
                [f"Line {line_no} has {len(fields)} columns; expected {expected_fields}."],
                code=2,
            )
        taxon = fields[0]
        state = fields[value_pos].strip()
        if not state:
            raise PhykitUserError(
                [f"Missing trait value for taxon '{taxon}' on line {line_no}."],
                code=2,
            )
        traits[taxon] = state

    return _validate_shared_taxa(traits, tree_tips)
249
+
250
+
251
def _parse_two_column(
    data_lines: List[str], path: str, tree_tips: List[str]
) -> Dict[str, str]:
    """Parse headerless ``taxon<tab>state`` lines into a trait mapping.

    Every line must split into exactly two tab-separated fields. A taxon
    that appears more than once keeps its last state. Line numbers in error
    messages count entries of ``data_lines`` starting at 1.

    Raises:
        PhykitUserError: if a line does not have exactly two fields.
    """
    traits = {}
    for line_no, row in enumerate(data_lines, start=1):
        fields = row.split("\t")
        if len(fields) != 2:
            raise PhykitUserError(
                [f"Line {line_no} has {len(fields)} columns; expected 2 (taxon, state)."],
                code=2,
            )
        taxon, state = fields
        traits[taxon] = state.strip()

    return _validate_shared_taxa(traits, tree_tips)
265
+
266
+
267
def _validate_shared_taxa(
    traits: Dict[str, str], tree_tips: List[str]
) -> Dict[str, str]:
    """Restrict ``traits`` to taxa that are also tips of the tree.

    Taxa found on only one side are reported as warnings on stderr. The
    returned mapping lists the shared taxa in the order they appear in
    ``tree_tips``, so the result is the same on every run. It was previously
    built by iterating a set, whose order is not stable for strings.

    Raises:
        PhykitUserError: if fewer than 3 taxa are shared.
    """
    tree_tip_set = set(tree_tips)
    trait_taxa_set = set(traits)

    tree_only = tree_tip_set - trait_taxa_set
    trait_only = trait_taxa_set - tree_tip_set

    if tree_only:
        print(
            f"Warning: {len(tree_only)} taxa in tree but not in trait file: "
            f"{', '.join(sorted(tree_only))}",
            file=sys.stderr,
        )
    if trait_only:
        print(
            f"Warning: {len(trait_only)} taxa in trait file but not in tree: "
            f"{', '.join(sorted(trait_only))}",
            file=sys.stderr,
        )

    # dict.fromkeys drops repeated tip names while keeping first-seen order.
    shared = [taxon for taxon in dict.fromkeys(tree_tips) if taxon in traits]

    if len(shared) < 3:
        raise PhykitUserError(
            [
                f"Only {len(shared)} shared taxa between tree and trait file.",
                "At least 3 shared taxa are required.",
            ],
            code=2,
        )

    return {taxon: traits[taxon] for taxon in shared}
@@ -230,6 +230,9 @@ class Phykit:
230
230
  fit_continuous (alias: fitcontinuous; fc)
231
231
  - compare continuous trait evolution models
232
232
  (BM, OU, EB, Lambda, Delta, Kappa, White)
233
+ fit_discrete (alias: fitdiscrete; fd)
234
+ - compare discrete trait evolution models
235
+ (ER, SYM, ARD)
233
236
  ouwie (alias: fit_ouwie; multi_regime_ou)
234
237
  - fit multi-regime OU models (BM1, BMS, OU1,
235
238
  OUM, OUMV, OUMA, OUMVA; Beaulieu et al. 2012)
@@ -3987,6 +3990,65 @@ class Phykit:
3987
3990
  _add_json_argument(parser)
3988
3991
  _run_service(parser, argv, RateHeterogeneity)
3989
3992
 
3993
+ @staticmethod
3994
+ def fit_discrete(argv):
3995
+ parser = _new_parser(
3996
+ description=textwrap.dedent(
3997
+ f"""\
3998
+ {help_header}
3999
+
4000
+ Compare models of discrete trait evolution on a phylogeny.
4001
+
4002
+ Fits ER (Equal Rates), SYM (Symmetric), and ARD (All Rates
4003
+ Different) Mk models of discrete character evolution via
4004
+ maximum likelihood. Compares models using AIC and BIC.
4005
+
4006
+ Analogous to R's geiger::fitDiscrete().
4007
+
4008
+ Aliases:
4009
+ fit_discrete, fitdiscrete, fd
4010
+ Command line interfaces:
4011
+ pk_fit_discrete, pk_fitdiscrete, pk_fd
4012
+
4013
+ Usage:
4014
+ phykit fit_discrete -t <tree> -d <trait_data> -c <trait>
4015
+ [--models ER,SYM,ARD] [--json]
4016
+
4017
+ Options
4018
+ =====================================================
4019
+ -t/--tree tree file (required)
4020
+
4021
+ -d/--trait_data trait data file in TSV format
4022
+ (required)
4023
+
4024
+ -c/--trait column name for the discrete
4025
+ trait in the data file
4026
+ (required)
4027
+
4028
+ --models comma-separated list of models
4029
+ to fit (default: ER,SYM,ARD)
4030
+
4031
+ --json optional argument to output
4032
+ results as JSON
4033
+ """
4034
+ ),
4035
+ )
4036
+ parser.add_argument(
4037
+ "-t", "--tree", type=str, required=True, help=SUPPRESS, metavar=""
4038
+ )
4039
+ parser.add_argument(
4040
+ "-d", "--trait_data", type=str, required=True, help=SUPPRESS, metavar=""
4041
+ )
4042
+ parser.add_argument(
4043
+ "-c", "--trait", type=str, required=True, help=SUPPRESS, metavar=""
4044
+ )
4045
+ parser.add_argument(
4046
+ "--models", type=str, required=False, default=None,
4047
+ help=SUPPRESS, metavar=""
4048
+ )
4049
+ _add_json_argument(parser)
4050
+ _run_service(parser, argv, FitDiscrete)
4051
+
3990
4052
  @staticmethod
3991
4053
  def fit_continuous(argv):
3992
4054
  parser = _new_parser(
@@ -6530,6 +6592,10 @@ def fit_continuous(argv=None):
6530
6592
  Phykit.fit_continuous(sys.argv[1:])
6531
6593
 
6532
6594
 
6595
+ def fit_discrete(argv=None):
6596
+ Phykit.fit_discrete(sys.argv[1:])
6597
+
6598
+
6533
6599
  def ouwie(argv=None):
6534
6600
  Phykit.ouwie(sys.argv[1:])
6535
6601
 
@@ -85,6 +85,7 @@ PolytomyTest = _LazyServiceFactory("phykit.services.tree.polytomy_test", "Polyto
85
85
  PrintTree = _LazyServiceFactory("phykit.services.tree.print_tree", "PrintTree")
86
86
  PruneTree = _LazyServiceFactory("phykit.services.tree.prune_tree", "PruneTree")
87
87
  RenameTreeTips = _LazyServiceFactory("phykit.services.tree.rename_tree_tips", "RenameTreeTips")
88
+ FitDiscrete = _LazyServiceFactory("phykit.services.tree.fit_discrete", "FitDiscrete")
88
89
  KuhnerFelsensteinDistance = _LazyServiceFactory("phykit.services.tree.kf_distance", "KuhnerFelsensteinDistance")
89
90
  RobinsonFouldsDistance = _LazyServiceFactory("phykit.services.tree.rf_distance", "RobinsonFouldsDistance")
90
91
  RootTree = _LazyServiceFactory("phykit.services.tree.root_tree", "RootTree")
@@ -12,6 +12,7 @@ _EXPORTS = {
12
12
  "DiscordanceAsymmetry": "discordance_asymmetry",
13
13
  "EvolutionaryRate": "evolutionary_rate",
14
14
  "EvoTempoMap": "evo_tempo_map",
15
+ "FitDiscrete": "fit_discrete",
15
16
  "HiddenParalogyCheck": "hidden_paralogy_check",
16
17
  "InternalBranchStats": "internal_branch_stats",
17
18
  "InternodeLabeler": "internode_labeler",
@@ -10,6 +10,13 @@ from scipy.optimize import minimize
10
10
  from .base import Tree
11
11
  from ...helpers.json_output import print_json
12
12
  from ...helpers.plot_config import PlotConfig
13
+ from ...helpers.discrete_models import (
14
+ build_q_matrix,
15
+ matrix_exp,
16
+ felsenstein_pruning,
17
+ fit_q_matrix,
18
+ parse_discrete_traits,
19
+ )
13
20
  from ...errors import PhykitUserError
14
21
 
15
22
 
@@ -1087,132 +1094,20 @@ class AncestralReconstruction(Tree):
1087
1094
  return {taxon: traits[taxon] for taxon in shared}
1088
1095
 
1089
1096
  # ------------------------------------------------------------------
1090
- # Mk model primitives (shared with StochasticCharacterMap)
1097
+ # Mk model primitives (delegated to phykit.helpers.discrete_models)
1091
1098
  # ------------------------------------------------------------------
1092
1099
 
1093
- def _build_q_matrix(
1094
- self, params: np.ndarray, k: int, model: str
1095
- ) -> np.ndarray:
1096
- Q = np.zeros((k, k))
1097
- if model == "ER":
1098
- rate = params[0]
1099
- Q[:] = rate
1100
- np.fill_diagonal(Q, 0.0)
1101
- elif model == "SYM":
1102
- idx = 0
1103
- for i in range(k):
1104
- for j in range(i + 1, k):
1105
- Q[i, j] = params[idx]
1106
- Q[j, i] = params[idx]
1107
- idx += 1
1108
- elif model == "ARD":
1109
- idx = 0
1110
- for i in range(k):
1111
- for j in range(k):
1112
- if i != j:
1113
- Q[i, j] = params[idx]
1114
- idx += 1
1115
- # Set diagonal
1116
- for i in range(k):
1117
- Q[i, i] = -np.sum(Q[i, :])
1118
- return Q
1119
-
1120
- def _matrix_exp(self, Q: np.ndarray, t: float) -> np.ndarray:
1121
- return expm(Q * t)
1122
-
1123
- def _felsenstein_pruning(
1124
- self, tree, tip_states: Dict[str, str], Q: np.ndarray,
1125
- pi: np.ndarray, states: List[str]
1126
- ) -> Tuple[Dict, float]:
1127
- k = len(states)
1128
- state_idx = {s: i for i, s in enumerate(states)}
1129
- cond_liks: Dict[int, np.ndarray] = {}
1130
-
1131
- for clade in tree.find_clades(order="postorder"):
1132
- if clade.is_terminal():
1133
- lik = np.zeros(k)
1134
- if clade.name in tip_states:
1135
- lik[state_idx[tip_states[clade.name]]] = 1.0
1136
- cond_liks[id(clade)] = lik
1137
- else:
1138
- lik = np.ones(k)
1139
- for child in clade.clades:
1140
- t = child.branch_length if child.branch_length else 1e-8
1141
- P = self._matrix_exp(Q, t)
1142
- child_lik = cond_liks[id(child)]
1143
- lik *= P @ child_lik
1144
- cond_liks[id(clade)] = lik
1145
-
1146
- root_lik = cond_liks[id(tree.root)]
1147
- total_lik = np.sum(pi * root_lik)
1148
- if total_lik <= 0:
1149
- loglik = -1e20
1150
- else:
1151
- loglik = np.log(total_lik)
1152
-
1153
- return cond_liks, loglik
1100
+ def _build_q_matrix(self, params, k, model):
1101
+ return build_q_matrix(params, k, model)
1154
1102
 
1155
- def _fit_q_matrix(
1156
- self, tree, tip_states: Dict[str, str],
1157
- states: List[str], model: str
1158
- ) -> Tuple[np.ndarray, float]:
1159
- k = len(states)
1160
-
1161
- if model == "ER":
1162
- n_params = 1
1163
- elif model == "SYM":
1164
- n_params = k * (k - 1) // 2
1165
- elif model == "ARD":
1166
- n_params = k * (k - 1)
1167
- else:
1168
- raise PhykitUserError(
1169
- [f"Unknown model '{model}'. Use ER, SYM, or ARD."],
1170
- code=2,
1171
- )
1172
-
1173
- pi = np.ones(k) / k
1174
-
1175
- def neg_loglik(params):
1176
- Q = self._build_q_matrix(np.abs(params), k, model)
1177
- _, ll = self._felsenstein_pruning(tree, tip_states, Q, pi, states)
1178
- return -ll
1179
-
1180
- bounds = [(1e-8, 100.0)] * n_params
1181
-
1182
- # Multi-start optimization for robustness
1183
- starting_values = [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
1184
- best_negll = np.inf
1185
- best_params = np.ones(n_params) * 0.1
1186
-
1187
- for sv in starting_values:
1188
- x0 = np.ones(n_params) * sv
1189
- for opt_method in ["L-BFGS-B", "Nelder-Mead"]:
1190
- try:
1191
- kwargs = {"method": opt_method}
1192
- if opt_method == "L-BFGS-B":
1193
- kwargs["bounds"] = bounds
1194
- result = minimize(neg_loglik, x0, **kwargs)
1195
- if result.fun < best_negll:
1196
- best_negll = result.fun
1197
- best_params = np.abs(result.x)
1198
- except (ValueError, np.linalg.LinAlgError):
1199
- continue
1200
-
1201
- # Refine best result with Nelder-Mead
1202
- try:
1203
- result = minimize(
1204
- neg_loglik, best_params, method="Nelder-Mead",
1205
- options={"maxiter": 10000, "xatol": 1e-10, "fatol": 1e-10},
1206
- )
1207
- if result.fun < best_negll:
1208
- best_params = np.abs(result.x)
1209
- except (ValueError, np.linalg.LinAlgError):
1210
- pass
1103
+ def _matrix_exp(self, Q, t):
1104
+ return matrix_exp(Q, t)
1211
1105
 
1212
- Q = self._build_q_matrix(best_params, k, model)
1213
- _, loglik = self._felsenstein_pruning(tree, tip_states, Q, pi, states)
1106
+ def _felsenstein_pruning(self, tree, tip_states, Q, pi, states):
1107
+ return felsenstein_pruning(tree, tip_states, Q, pi, states)
1214
1108
 
1215
- return Q, loglik
1109
+ def _fit_q_matrix(self, tree, tip_states, states, model):
1110
+ return fit_q_matrix(tree, tip_states, states, model)
1216
1111
 
1217
1112
  # ------------------------------------------------------------------
1218
1113
  # Discrete marginal posteriors (upward-downward belief propagation)