phykit 2.1.83__tar.gz → 2.1.84__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {phykit-2.1.83 → phykit-2.1.84}/PKG-INFO +1 -1
- {phykit-2.1.83 → phykit-2.1.84}/phykit/cli_registry.py +3 -0
- phykit-2.1.84/phykit/helpers/pgls_utils.py +181 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/plot_config.py +116 -0
- phykit-2.1.84/phykit/helpers/trait_parsing.py +133 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/phykit.py +69 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/service_factories.py +1 -0
- phykit-2.1.84/phykit/services/alignment/occupancy_filter.py +299 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/ancestral_reconstruction.py +1 -15
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/base.py +40 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/concordance_asr.py +1 -1
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/cont_map.py +1 -15
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/density_map.py +1 -1
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/fit_continuous.py +3 -44
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/fit_discrete.py +1 -11
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/independent_contrasts.py +1 -11
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/ltt.py +1 -15
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/ou_shift_detection.py +1 -15
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/ouwie.py +1 -15
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/parsimony_score.py +1 -8
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/phenogram.py +1 -15
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/phylo_anova.py +1 -6
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/phylo_heatmap.py +11 -78
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/phylo_impute.py +1 -15
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/phylo_logistic.py +3 -117
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/phylo_path.py +14 -91
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/phylogenetic_glm.py +3 -116
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/phylogenetic_ordination.py +5 -143
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/phylogenetic_regression.py +14 -244
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/phylogenetic_signal.py +5 -160
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/phylomorphospace.py +3 -116
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/quartet_pie.py +17 -73
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/rate_heterogeneity.py +1 -15
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/simmap_summary.py +10 -43
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/stochastic_character_map.py +1 -15
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/threshold_model.py +1 -15
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/trait_correlation.py +5 -119
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/trait_rate_map.py +1 -19
- phykit-2.1.84/phykit/version.py +1 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit.egg-info/PKG-INFO +1 -1
- {phykit-2.1.83 → phykit-2.1.84}/phykit.egg-info/SOURCES.txt +3 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit.egg-info/entry_points.txt +6 -89
- phykit-2.1.84/setup.py +63 -0
- phykit-2.1.83/phykit/version.py +0 -1
- phykit-2.1.83/setup.py +0 -356
- {phykit-2.1.83 → phykit-2.1.84}/LICENSE.md +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/README.md +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/__init__.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/__main__.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/errors.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/__init__.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/boolean_argument_parsing.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/caching.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/circular_layout.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/color_annotations.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/discrete_models.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/files.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/json_output.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/parallel.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/parsimony_utils.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/quartet_utils.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/stats_summary.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/helpers/streaming.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/__init__.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/__init__.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/alignment_entropy.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/alignment_length.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/alignment_length_no_gaps.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/alignment_outlier_taxa.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/alignment_recoding.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/alignment_subsample.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/base.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/column_score.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/composition_per_taxon.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/compositional_bias_per_site.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/create_concatenation_matrix.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/dfoil.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/dna_threader.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/dstatistic.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/evolutionary_rate_per_site.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/faidx.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/gc_content.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/identity_matrix.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/mask_alignment.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/occupancy_per_taxon.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/pairwise_identity.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/parsimony_informative_sites.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/phylo_gwas.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/plot_alignment_qc.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/rcv.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/rcvt.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/rename_fasta_entries.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/sum_of_pairs_score.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/taxon_groups.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/alignment/variable_sites.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/base.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/__init__.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/bipartition_support_stats.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/branch_length_multiplier.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/character_map.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/collapse_branches.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/consensus_network.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/consensus_tree.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/cophylo.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/covarying_evolutionary_rates.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/discordance_asymmetry.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/dvmc.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/evo_tempo_map.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/evolutionary_rate.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/hidden_paralogy_check.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/hybridization.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/internal_branch_stats.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/internode_labeler.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/kf_distance.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/last_common_ancestor_subtree.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/lb_score.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/monophyly_check.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/nearest_neighbor_interchange.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/neighbor_net.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/network_signal.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/patristic_distances.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/polytomy_test.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/print_tree.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/prune_tree.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/quartet_network.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/relative_rate_test.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/rename_tree_tips.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/rf_distance.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/root_tree.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/saturation.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/spectral_discordance.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/spr.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/spurious_sequence.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/terminal_branch_stats.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/tip_labels.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/tip_to_tip_distance.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/tip_to_tip_node_distance.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/total_tree_length.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/tree_space.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/treeness.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/treeness_over_rcv.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit/services/tree/vcv_utils.py +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit.egg-info/dependency_links.txt +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit.egg-info/requires.txt +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/phykit.egg-info/top_level.txt +0 -0
- {phykit-2.1.83 → phykit-2.1.84}/setup.cfg +0 -0
|
@@ -217,6 +217,9 @@ ALIAS_TO_HANDLER: Dict[str, str] = {
|
|
|
217
217
|
"tree_landscape": "tree_space",
|
|
218
218
|
"tgroups": "taxon_groups",
|
|
219
219
|
"shared_taxa": "taxon_groups",
|
|
220
|
+
"occupancy_filter": "occupancy_filter",
|
|
221
|
+
"occ_filter": "occupancy_filter",
|
|
222
|
+
"filter_occupancy": "occupancy_filter",
|
|
220
223
|
# Helper aliases
|
|
221
224
|
"create_concat": "create_concatenation_matrix",
|
|
222
225
|
"cc": "create_concatenation_matrix",
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared PGLS (Phylogenetic Generalized Least Squares) utilities.
|
|
3
|
+
|
|
4
|
+
Provides reusable functions for:
|
|
5
|
+
- Pagel's lambda estimation via ML
|
|
6
|
+
- Concentrated PGLS log-likelihood
|
|
7
|
+
- GLS model fitting
|
|
8
|
+
- Lambda upper bound computation
|
|
9
|
+
|
|
10
|
+
Used by phylogenetic_regression, phylo_path, phylogenetic_signal,
|
|
11
|
+
phylogenetic_ordination, fit_continuous, and other comparative methods.
|
|
12
|
+
"""
|
|
13
|
+
from typing import Tuple
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
from scipy.optimize import minimize_scalar
|
|
17
|
+
|
|
18
|
+
from ..errors import PhykitUserError
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def max_lambda(tree) -> float:
    """Compute the upper bound for Pagel's lambda.

    For ultrametric trees, returns max_tip_height / max_parent_height,
    where max_parent_height is the largest root-to-node distance over
    all nodes that have children.
    Returns 1.0 for non-ultrametric trees, for trees whose internal
    nodes all sit at height 0 (e.g. a star tree), and for degenerate
    trees in which no tip has a positive height.

    Parameters
    ----------
    tree : tree object exposing ``root``; each clade must expose
        ``clades`` (its children) and ``branch_length`` (``None`` is
        treated as 0.0). NOTE(review): presumably a Bio.Phylo tree --
        confirm against callers.

    Returns
    -------
    Upper bound for lambda as a float.
    """
    tip_heights = []
    max_parent_height = 0.0

    # One iterative walk that carries the root-to-node height along,
    # rather than a separate path search from the root for every node.
    pending = [(tree.root, 0.0)]
    while pending:
        clade, height = pending.pop()
        children = clade.clades
        if not children:
            tip_heights.append(height)
            continue
        if height > max_parent_height:
            max_parent_height = height
        for child in children:
            pending.append((child, height + (child.branch_length or 0.0)))

    max_tip_height = max(tip_heights)
    min_tip_height = min(tip_heights)

    # Without a positive tip height the relative spread below is
    # undefined (0 / 0), so fall back to the conventional bound.
    if max_tip_height <= 0:
        return 1.0

    is_ultrametric = (max_tip_height - min_tip_height) / max_tip_height < 1e-6
    if not is_ultrametric:
        return 1.0

    if max_parent_height == 0.0:
        return 1.0

    return max_tip_height / max_parent_height
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def pgls_log_likelihood(
    y: np.ndarray, X: np.ndarray, C: np.ndarray
) -> float:
    """Concentrated log-likelihood with beta and sigma^2 profiled out.

    Parameters
    ----------
    y : response vector (n,)
    X : design matrix (n, p)
    C : phylogenetic VCV matrix (n, n), possibly lambda-transformed

    Returns
    -------
    The log-likelihood as a float, or the penalty value -1e20 when it
    cannot be evaluated: C or X' C^-1 X is singular, det(C) is not
    positive, the ML variance estimate is not positive, or the result
    is NaN/inf.
    """
    penalty = -1e20
    n = len(y)

    try:
        sign, logdet_C = np.linalg.slogdet(C)
        # "not sign > 0" (rather than "sign <= 0") also rejects NaN.
        if not sign > 0:
            return penalty
        C_inv = np.linalg.inv(C)
        XtCiX = X.T @ C_inv @ X
        XtCiX_inv = np.linalg.inv(XtCiX)
    except np.linalg.LinAlgError:
        return penalty

    beta_hat = XtCiX_inv @ X.T @ C_inv @ y
    e = y - X @ beta_hat
    sigma2_ml = float(e @ C_inv @ e) / n

    if not sigma2_ml > 0:
        return penalty

    ll = -0.5 * (
        n * np.log(2 * np.pi) + n * np.log(sigma2_ml) + logdet_C + n
    )
    # inv() does not always raise on near-singular or NaN input; never
    # hand a non-finite objective value back to an optimizer.
    if not np.isfinite(ll):
        return penalty
    return float(ll)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def estimate_lambda(
    y: np.ndarray,
    X: np.ndarray,
    vcv: np.ndarray,
    max_lam: float = 1.0,
) -> Tuple[float, float]:
    """Find the ML estimate of Pagel's lambda on [0, max_lam].

    The range is cut into ten equal sub-intervals; a bounded scalar
    minimization of the negative PGLS log-likelihood is run on each and
    the best optimum across sub-intervals is kept.

    Parameters
    ----------
    y : response vector (n,)
    X : design matrix (n, p)
    vcv : phylogenetic VCV matrix (n, n)
    max_lam : upper bound for lambda (default 1.0)

    Returns
    -------
    (lambda_hat, log_likelihood_at_lambda)
    """
    n_intervals = 10
    original_diag = np.diag(vcv).copy()

    def lambda_transformed(lam):
        # Scale every entry by lambda, then restore the original diagonal.
        transformed = vcv * lam
        np.fill_diagonal(transformed, original_diag)
        return transformed

    def objective(lam):
        candidate = lambda_transformed(lam)
        try:
            log_lik = pgls_log_likelihood(y, X, candidate)
        except (np.linalg.LinAlgError, FloatingPointError, ValueError):
            return 1e10
        return -log_lik

    lower_edges = np.linspace(0, max_lam - max_lam / n_intervals, n_intervals)
    upper_edges = np.linspace(max_lam / n_intervals, max_lam, n_intervals)

    lambda_hat = 0.0
    best_log_lik = -np.inf
    for lower, upper in zip(lower_edges, upper_edges):
        result = minimize_scalar(
            objective, bounds=(lower, upper), method="bounded"
        )
        candidate_log_lik = -result.fun
        if candidate_log_lik > best_log_lik:
            best_log_lik, lambda_hat = candidate_log_lik, result.x

    # Re-evaluate at the winning lambda so the reported likelihood comes
    # straight from pgls_log_likelihood.
    fitted_log_lik = pgls_log_likelihood(y, X, lambda_transformed(lambda_hat))

    return float(lambda_hat), float(fitted_log_lik)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def fit_gls(
    y: np.ndarray, X: np.ndarray, C_inv: np.ndarray
) -> Tuple[np.ndarray, np.ndarray, float, np.ndarray]:
    """Fit a GLS model: beta_hat = (X' C_inv X)^{-1} X' C_inv y.

    Parameters
    ----------
    y : response vector (n,)
    X : design matrix (n, p)
    C_inv : inverse of phylogenetic VCV matrix (n, n)

    Returns
    -------
    (beta_hat, residuals, sigma2_reml, var_beta), where sigma2_reml
    divides the weighted residual sum of squares by max(n - p, 1) and
    var_beta = sigma2_reml * (X' C_inv X)^{-1}.

    Raises
    ------
    PhykitUserError (code 2) if X' C_inv X cannot be inverted.
    """
    n_obs, n_coef = X.shape
    weighted_Xt = X.T @ C_inv
    normal_matrix = weighted_Xt @ X

    try:
        normal_inv = np.linalg.inv(normal_matrix)
    except np.linalg.LinAlgError:
        raise PhykitUserError(
            [
                "Singular design matrix: cannot estimate coefficients.",
                "Check that predictors are not collinear.",
            ],
            code=2,
        )

    coefficients = normal_inv @ X.T @ C_inv @ y
    resid = y - X @ coefficients

    # Clamp the residual degrees of freedom at 1 so a saturated model
    # does not divide by zero (or by a negative number).
    denominator = max(n_obs - n_coef, 1)
    sigma2_reml = float(resid @ C_inv @ resid) / denominator

    return coefficients, resid, sigma2_reml, sigma2_reml * normal_inv
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def apply_lambda(vcv: np.ndarray, lambda_val: float) -> np.ndarray:
    """Return a lambda-transformed copy of a VCV matrix.

    Off-diagonal entries are multiplied by ``lambda_val``; the diagonal
    keeps its original values. The input matrix is not modified.
    """
    original_diagonal = np.array(np.diag(vcv))
    transformed = vcv * lambda_val
    np.fill_diagonal(transformed, original_diagonal)
    return transformed
|
|
@@ -248,3 +248,119 @@ def compute_node_x_cladogram(tree, parent_map):
|
|
|
248
248
|
else:
|
|
249
249
|
node_x[cid] = float(node_depth.get(cid, 0)) * step_size
|
|
250
250
|
return node_x
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# ---- Shared rectangular tree plotting utilities ----
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def build_parent_map(tree):
    """Map ``id(child)`` to its parent clade for every non-root node.

    Clades are visited via ``tree.find_clades(order="preorder")``; the
    root has no entry because it is nobody's child.
    """
    return {
        id(child): parent
        for parent in tree.find_clades(order="preorder")
        for child in parent.clades
    }
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def compute_node_positions(tree, parent_map, cladogram=False):
    """Lay out a tree for rectangular plotting.

    Parameters
    ----------
    tree : Bio.Phylo tree
    parent_map : dict from build_parent_map()
    cladogram : if True, x-positions come from
        compute_node_x_cladogram(); otherwise x is the cumulative
        branch length from the root (missing lengths count as 0.0)

    Returns
    -------
    (node_x, node_y) : dicts mapping node id -> coordinate. Tips get
    consecutive integer y values in get_terminals() order; an internal
    node gets the mean y of its children.
    """
    import numpy as np

    node_y = {id(tip): row for row, tip in enumerate(tree.get_terminals())}
    root = tree.root

    if cladogram:
        node_x = compute_node_x_cladogram(tree, parent_map)
    else:
        node_x = {}
        # Preorder guarantees a parent's x is stored before its children.
        for clade in tree.find_clades(order="preorder"):
            key = id(clade)
            if clade == root:
                node_x[key] = 0.0
                continue
            if key not in parent_map:
                continue
            parent_x = node_x.get(id(parent_map[key]), 0.0)
            node_x[key] = parent_x + (clade.branch_length or 0.0)

    # Postorder guarantees children are placed before their parent.
    for clade in tree.find_clades(order="postorder"):
        key = id(clade)
        if clade.is_terminal() or key in node_y:
            continue
        child_rows = [
            node_y[id(child)] for child in clade.clades if id(child) in node_y
        ]
        node_y[key] = float(np.mean(child_rows)) if child_rows else 0.0

    return node_x, node_y
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def draw_tree_branches(
    ax, tree, node_x, node_y, parent_map,
    color="black", lw=1.5, vertical_color="black", vertical_lw=0.8,
):
    """Draw a rectangular tree: one horizontal branch per node plus the
    vertical connector joining it to its parent.

    ``color`` may be a fixed color or a callable taking the clade and
    returning the color of its horizontal branch, e.g.
        color=lambda clade: "red" if ... else "black"
    Nodes without a parent entry, or whose own or parent x-coordinate
    is missing, are skipped. Missing y-coordinates default to 0.
    """
    root = tree.root
    color_is_callable = callable(color)

    for clade in tree.find_clades(order="preorder"):
        child_key = id(clade)
        if clade == root or child_key not in parent_map:
            continue

        parent_key = id(parent_map[child_key])
        if parent_key not in node_x or child_key not in node_x:
            continue

        parent_x, child_x = node_x[parent_key], node_x[child_key]
        parent_y = node_y.get(parent_key, 0)
        child_y = node_y.get(child_key, 0)

        stroke = color(clade) if color_is_callable else color
        # Horizontal segment at the child's height ...
        ax.plot([parent_x, child_x], [child_y, child_y], color=stroke, lw=lw)
        # ... and the vertical connector at the parent's x-position.
        ax.plot(
            [parent_x, parent_x], [parent_y, child_y],
            color=vertical_color, lw=vertical_lw,
        )
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def draw_tip_labels(
    ax, tree, node_x, node_y, fontsize=9, offset_fraction=0.03,
):
    """Draw taxon name labels at tree tips.

    Each label is placed at the tip's coordinates, shifted right by
    ``offset_fraction`` times the largest x in ``node_x`` (or times 1.0
    when ``node_x`` is empty). Nothing is drawn when ``fontsize`` is
    not positive. Tips that lack an entry in ``node_x`` or ``node_y``
    are skipped, as draw_tree_branches() does for nodes without
    coordinates.
    """
    # Decide whether to draw at all before touching the tree.
    if fontsize <= 0:
        return

    max_x = max(node_x.values()) if node_x else 1.0
    offset = max_x * offset_fraction

    for tip in tree.get_terminals():
        key = id(tip)
        if key not in node_x or key not in node_y:
            continue
        ax.text(
            node_x[key] + offset, node_y[key],
            tip.name, va="center", fontsize=fontsize,
        )
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def cleanup_tree_axes(ax, show_xlabel=True):
    """Strip a rectangular tree plot down to its x-axis.

    Removes the y ticks, hides the top, right and left spines, and
    labels the x-axis "Branch length" unless ``show_xlabel`` is false.
    """
    ax.set_yticks([])
    for side in ("top", "right", "left"):
        ax.spines[side].set_visible(False)
    if not show_xlabel:
        return
    ax.set_xlabel("Branch length")
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared trait file parsing utilities.
|
|
3
|
+
|
|
4
|
+
Provides a single implementation for parsing tab-delimited multi-trait
|
|
5
|
+
files with a header row, used across phylogenetic regression, signal,
|
|
6
|
+
ordination, path analysis, ANOVA, and other comparative methods.
|
|
7
|
+
"""
|
|
8
|
+
import sys
|
|
9
|
+
from typing import Dict, List, Tuple
|
|
10
|
+
|
|
11
|
+
from ..errors import PhykitUserError
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def parse_multi_trait_file(
    path: str,
    tree_tips: List[str],
    min_shared: int = 3,
    min_columns: int = 2,
) -> Tuple[List[str], Dict[str, List[float]]]:
    """Parse a tab-delimited multi-trait file with a header row.

    Format:
        taxon<tab>trait1<tab>trait2<tab>...
        species_A<tab>1.2<tab>3.4<tab>...

    Blank lines and lines starting with "#" are ignored. The first
    remaining line is the header. If a taxon appears on several rows,
    the last row wins. Taxa present on only one side (tree or file) are
    reported as warnings on stderr.

    Parameters
    ----------
    path : path to TSV file
    tree_tips : list of tip names from the tree
    min_shared : minimum shared taxa between tree and file (default 3)
    min_columns : minimum columns in header (default 2: taxon + 1 trait)

    Returns
    -------
    (trait_names, traits_dict) where traits_dict maps taxon -> [float values]
    for the taxa shared by tree and file, ordered as in ``tree_tips``.

    Raises
    ------
    PhykitUserError (code 2) if the file is missing, has fewer than two
    data lines, has too few header columns, contains a row with the
    wrong column count or a non-numeric value, or shares fewer than
    ``min_shared`` taxa with the tree.
    """
    # Materialize once: the tips are needed both as a set and in order.
    tree_tips = list(tree_tips)

    try:
        with open(path) as f:
            # Keep each retained line's 1-based position in the file so
            # error messages point at the line the user actually sees.
            data_lines = [
                (line_no, stripped)
                for line_no, stripped in enumerate(
                    (raw.strip() for raw in f), 1
                )
                if stripped and not stripped.startswith("#")
            ]
    except FileNotFoundError:
        raise PhykitUserError(
            [
                f"{path} corresponds to no such file or directory.",
                "Please check filename and pathing",
            ],
            code=2,
        )

    if len(data_lines) < 2:
        raise PhykitUserError(
            ["Multi-trait file must have a header row and at least one data row."],
            code=2,
        )

    # First data line is the header
    header_parts = data_lines[0][1].split("\t")
    n_cols = len(header_parts)
    if n_cols < min_columns:
        raise PhykitUserError(
            [
                f"Header must have at least {min_columns} columns "
                f"(taxon + at least {min_columns - 1} trait(s)).",
            ],
            code=2,
        )
    trait_names = header_parts[1:]

    traits = {}
    for line_idx, line in data_lines[1:]:
        parts = line.split("\t")
        if len(parts) != n_cols:
            raise PhykitUserError(
                [
                    f"Line {line_idx} has {len(parts)} columns; expected {n_cols}.",
                    f"Each line should have: taxon_name<tab>"
                    f"{'<tab>'.join(['trait'] * len(trait_names))}",
                ],
                code=2,
            )
        taxon = parts[0]
        values = []
        for trait_name, val_str in zip(trait_names, parts[1:]):
            try:
                values.append(float(val_str))
            except ValueError:
                raise PhykitUserError(
                    [
                        f"Non-numeric trait value '{val_str}' for taxon '{taxon}' "
                        f"(trait '{trait_name}') on line {line_idx}.",
                    ],
                    code=2,
                )
        traits[taxon] = values

    tree_tip_set = set(tree_tips)
    trait_taxa_set = set(traits)
    shared = tree_tip_set & trait_taxa_set

    tree_only = tree_tip_set - trait_taxa_set
    trait_only = trait_taxa_set - tree_tip_set

    if tree_only:
        print(
            f"Warning: {len(tree_only)} taxa in tree but not in trait file: "
            f"{', '.join(sorted(tree_only))}",
            file=sys.stderr,
        )
    if trait_only:
        print(
            f"Warning: {len(trait_only)} taxa in trait file but not in tree: "
            f"{', '.join(sorted(trait_only))}",
            file=sys.stderr,
        )

    if len(shared) < min_shared:
        raise PhykitUserError(
            [
                f"Only {len(shared)} shared taxa between tree and trait file.",
                f"At least {min_shared} shared taxa are required.",
            ],
            code=2,
        )

    # Follow tree-tip order so the result does not depend on set
    # iteration order, which varies between interpreter runs.
    filtered = {taxon: traits[taxon] for taxon in tree_tips if taxon in traits}
    return trait_names, filtered
|
|
@@ -175,6 +175,8 @@ class Phykit:
|
|
|
175
175
|
in an alignment
|
|
176
176
|
taxon_groups (alias: tgroups; shared_taxa)
|
|
177
177
|
- group tree or FASTA files by their taxon set
|
|
178
|
+
occupancy_filter (alias: occ_filter; filter_occupancy)
|
|
179
|
+
- filter alignments/trees by cross-file taxon occupancy
|
|
178
180
|
|
|
179
181
|
Tree-based commands
|
|
180
182
|
===================
|
|
@@ -8606,6 +8608,69 @@ class Phykit:
|
|
|
8606
8608
|
_add_json_argument(parser)
|
|
8607
8609
|
_run_service(parser, argv, TaxonGroups)
|
|
8608
8610
|
|
|
8611
|
+
@staticmethod
|
|
8612
|
+
def occupancy_filter(argv):
|
|
8613
|
+
parser = _new_parser(
|
|
8614
|
+
description=textwrap.dedent(
|
|
8615
|
+
f"""\
|
|
8616
|
+
{help_header}
|
|
8617
|
+
|
|
8618
|
+
Filter alignments and/or trees by cross-file taxon
|
|
8619
|
+
occupancy. Counts how many files each taxon appears in
|
|
8620
|
+
and retains only taxa meeting a minimum threshold.
|
|
8621
|
+
Outputs filtered copies of each input file.
|
|
8622
|
+
|
|
8623
|
+
For FASTA files, removes sequences of filtered taxa.
|
|
8624
|
+
For tree files, prunes tips of filtered taxa.
|
|
8625
|
+
|
|
8626
|
+
Aliases:
|
|
8627
|
+
occupancy_filter, occ_filter, filter_occupancy
|
|
8628
|
+
Command line interfaces:
|
|
8629
|
+
pk_occupancy_filter, pk_occ_filter, pk_filter_occupancy
|
|
8630
|
+
|
|
8631
|
+
Usage:
|
|
8632
|
+
phykit occupancy_filter -l <file_list>
|
|
8633
|
+
[-f/--format fasta|trees] [-t/--threshold <int>]
|
|
8634
|
+
[-o/--output-dir <dir>] [--suffix <str>] [--json]
|
|
8635
|
+
|
|
8636
|
+
Options
|
|
8637
|
+
=====================================================
|
|
8638
|
+
-l/--list file listing paths to
|
|
8639
|
+
alignment or tree files,
|
|
8640
|
+
one per line (required)
|
|
8641
|
+
|
|
8642
|
+
-f/--format input file format: fasta
|
|
8643
|
+
or trees (default: fasta)
|
|
8644
|
+
|
|
8645
|
+
-t/--threshold minimum occupancy to retain
|
|
8646
|
+
a taxon. Values between 0
|
|
8647
|
+
and 1 are treated as a
|
|
8648
|
+
fraction (e.g., 0.5 = 50%
|
|
8649
|
+
of files). Values >= 1 are
|
|
8650
|
+
treated as an absolute
|
|
8651
|
+
count. (default: 0.5)
|
|
8652
|
+
|
|
8653
|
+
-o/--output-dir directory for filtered
|
|
8654
|
+
output files (default:
|
|
8655
|
+
same directory as input)
|
|
8656
|
+
|
|
8657
|
+
--suffix suffix added to output
|
|
8658
|
+
filenames before the
|
|
8659
|
+
extension (default:
|
|
8660
|
+
".filtered")
|
|
8661
|
+
|
|
8662
|
+
--json output results as JSON
|
|
8663
|
+
"""
|
|
8664
|
+
),
|
|
8665
|
+
)
|
|
8666
|
+
parser.add_argument("-l", "--list", type=str, required=True, help=SUPPRESS, metavar="")
|
|
8667
|
+
parser.add_argument("-f", "--format", type=str, default="fasta", choices=["fasta", "trees"], help=SUPPRESS, metavar="")
|
|
8668
|
+
parser.add_argument("-t", "--threshold", type=float, default=0.5, help=SUPPRESS, metavar="")
|
|
8669
|
+
parser.add_argument("-o", "--output-dir", type=str, default=None, help=SUPPRESS, metavar="")
|
|
8670
|
+
parser.add_argument("--suffix", type=str, default=".filtered", help=SUPPRESS, metavar="")
|
|
8671
|
+
_add_json_argument(parser)
|
|
8672
|
+
_run_service(parser, argv, OccupancyFilter)
|
|
8673
|
+
|
|
8609
8674
|
### Helper commands
|
|
8610
8675
|
@staticmethod
|
|
8611
8676
|
def create_concatenation_matrix(argv):
|
|
@@ -9225,3 +9290,7 @@ def trait_rate_map(argv=None):
|
|
|
9225
9290
|
|
|
9226
9291
|
def taxon_groups(argv=None):
|
|
9227
9292
|
Phykit.taxon_groups(sys.argv[1:])
|
|
9293
|
+
|
|
9294
|
+
|
|
9295
|
+
def occupancy_filter(argv=None):
    """Entry point that forwards to ``Phykit.occupancy_filter``.

    Parameters
    ----------
    argv : argument list to parse; when None (the default), the
        process arguments ``sys.argv[1:]`` are used
    """
    # Honor an explicitly supplied argument list instead of always
    # reading the process arguments.
    Phykit.occupancy_filter(sys.argv[1:] if argv is None else argv)
|
|
@@ -47,6 +47,7 @@ SumOfPairsScore = _LazyServiceFactory("phykit.services.alignment.sum_of_pairs_sc
|
|
|
47
47
|
PhyloAnova = _LazyServiceFactory("phykit.services.tree.phylo_anova", "PhyloAnova")
|
|
48
48
|
PhyloPath = _LazyServiceFactory("phykit.services.tree.phylo_path", "PhyloPath")
|
|
49
49
|
PhyloGwas = _LazyServiceFactory("phykit.services.alignment.phylo_gwas", "PhyloGwas")
|
|
50
|
+
OccupancyFilter = _LazyServiceFactory("phykit.services.alignment.occupancy_filter", "OccupancyFilter")
|
|
50
51
|
TaxonGroups = _LazyServiceFactory("phykit.services.alignment.taxon_groups", "TaxonGroups")
|
|
51
52
|
VariableSites = _LazyServiceFactory("phykit.services.alignment.variable_sites", "VariableSites")
|
|
52
53
|
|