phykit 2.1.71__tar.gz → 2.1.72__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. {phykit-2.1.71 → phykit-2.1.72}/PKG-INFO +1 -1
  2. {phykit-2.1.71 → phykit-2.1.72}/phykit/cli_registry.py +2 -0
  3. {phykit-2.1.71 → phykit-2.1.72}/phykit/phykit.py +74 -0
  4. {phykit-2.1.71 → phykit-2.1.72}/phykit/service_factories.py +1 -0
  5. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/__init__.py +1 -0
  6. phykit-2.1.72/phykit/services/tree/phylo_impute.py +438 -0
  7. phykit-2.1.72/phykit/version.py +1 -0
  8. {phykit-2.1.71 → phykit-2.1.72}/phykit.egg-info/PKG-INFO +1 -1
  9. {phykit-2.1.71 → phykit-2.1.72}/phykit.egg-info/SOURCES.txt +1 -0
  10. {phykit-2.1.71 → phykit-2.1.72}/phykit.egg-info/entry_points.txt +3 -0
  11. {phykit-2.1.71 → phykit-2.1.72}/setup.py +3 -0
  12. phykit-2.1.71/phykit/version.py +0 -1
  13. {phykit-2.1.71 → phykit-2.1.72}/LICENSE.md +0 -0
  14. {phykit-2.1.71 → phykit-2.1.72}/README.md +0 -0
  15. {phykit-2.1.71 → phykit-2.1.72}/phykit/__init__.py +0 -0
  16. {phykit-2.1.71 → phykit-2.1.72}/phykit/__main__.py +0 -0
  17. {phykit-2.1.71 → phykit-2.1.72}/phykit/errors.py +0 -0
  18. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/__init__.py +0 -0
  19. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/boolean_argument_parsing.py +0 -0
  20. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/caching.py +0 -0
  21. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/circular_layout.py +0 -0
  22. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/color_annotations.py +0 -0
  23. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/discrete_models.py +0 -0
  24. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/files.py +0 -0
  25. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/json_output.py +0 -0
  26. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/parallel.py +0 -0
  27. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/parsimony_utils.py +0 -0
  28. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/plot_config.py +0 -0
  29. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/quartet_utils.py +0 -0
  30. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/stats_summary.py +0 -0
  31. {phykit-2.1.71 → phykit-2.1.72}/phykit/helpers/streaming.py +0 -0
  32. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/__init__.py +0 -0
  33. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/__init__.py +0 -0
  34. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/alignment_entropy.py +0 -0
  35. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/alignment_length.py +0 -0
  36. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/alignment_length_no_gaps.py +0 -0
  37. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/alignment_outlier_taxa.py +0 -0
  38. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/alignment_recoding.py +0 -0
  39. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/alignment_subsample.py +0 -0
  40. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/base.py +0 -0
  41. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/column_score.py +0 -0
  42. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/composition_per_taxon.py +0 -0
  43. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/compositional_bias_per_site.py +0 -0
  44. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/create_concatenation_matrix.py +0 -0
  45. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/dfoil.py +0 -0
  46. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/dna_threader.py +0 -0
  47. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/dstatistic.py +0 -0
  48. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/evolutionary_rate_per_site.py +0 -0
  49. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/faidx.py +0 -0
  50. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/gc_content.py +0 -0
  51. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/identity_matrix.py +0 -0
  52. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/mask_alignment.py +0 -0
  53. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/occupancy_per_taxon.py +0 -0
  54. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/pairwise_identity.py +0 -0
  55. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/parsimony_informative_sites.py +0 -0
  56. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/phylo_gwas.py +0 -0
  57. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/plot_alignment_qc.py +0 -0
  58. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/rcv.py +0 -0
  59. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/rcvt.py +0 -0
  60. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/rename_fasta_entries.py +0 -0
  61. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/sum_of_pairs_score.py +0 -0
  62. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/alignment/variable_sites.py +0 -0
  63. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/base.py +0 -0
  64. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/ancestral_reconstruction.py +0 -0
  65. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/base.py +0 -0
  66. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/bipartition_support_stats.py +0 -0
  67. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/branch_length_multiplier.py +0 -0
  68. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/character_map.py +0 -0
  69. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/collapse_branches.py +0 -0
  70. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/concordance_asr.py +0 -0
  71. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/consensus_network.py +0 -0
  72. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/consensus_tree.py +0 -0
  73. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/cont_map.py +0 -0
  74. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/cophylo.py +0 -0
  75. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/covarying_evolutionary_rates.py +0 -0
  76. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/density_map.py +0 -0
  77. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/discordance_asymmetry.py +0 -0
  78. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/dvmc.py +0 -0
  79. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/evo_tempo_map.py +0 -0
  80. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/evolutionary_rate.py +0 -0
  81. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/fit_continuous.py +0 -0
  82. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/fit_discrete.py +0 -0
  83. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/hidden_paralogy_check.py +0 -0
  84. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/independent_contrasts.py +0 -0
  85. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/internal_branch_stats.py +0 -0
  86. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/internode_labeler.py +0 -0
  87. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/kf_distance.py +0 -0
  88. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/last_common_ancestor_subtree.py +0 -0
  89. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/lb_score.py +0 -0
  90. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/ltt.py +0 -0
  91. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/monophyly_check.py +0 -0
  92. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/nearest_neighbor_interchange.py +0 -0
  93. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/network_signal.py +0 -0
  94. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/ou_shift_detection.py +0 -0
  95. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/ouwie.py +0 -0
  96. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/parsimony_score.py +0 -0
  97. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/patristic_distances.py +0 -0
  98. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/phenogram.py +0 -0
  99. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/phylo_heatmap.py +0 -0
  100. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/phylo_logistic.py +0 -0
  101. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/phylogenetic_glm.py +0 -0
  102. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/phylogenetic_ordination.py +0 -0
  103. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/phylogenetic_regression.py +0 -0
  104. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/phylogenetic_signal.py +0 -0
  105. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/phylomorphospace.py +0 -0
  106. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/polytomy_test.py +0 -0
  107. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/print_tree.py +0 -0
  108. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/prune_tree.py +0 -0
  109. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/quartet_network.py +0 -0
  110. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/quartet_pie.py +0 -0
  111. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/rate_heterogeneity.py +0 -0
  112. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/relative_rate_test.py +0 -0
  113. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/rename_tree_tips.py +0 -0
  114. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/rf_distance.py +0 -0
  115. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/root_tree.py +0 -0
  116. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/saturation.py +0 -0
  117. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/spectral_discordance.py +0 -0
  118. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/spurious_sequence.py +0 -0
  119. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/stochastic_character_map.py +0 -0
  120. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/terminal_branch_stats.py +0 -0
  121. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/threshold_model.py +0 -0
  122. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/tip_labels.py +0 -0
  123. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/tip_to_tip_distance.py +0 -0
  124. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/tip_to_tip_node_distance.py +0 -0
  125. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/total_tree_length.py +0 -0
  126. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/trait_correlation.py +0 -0
  127. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/trait_rate_map.py +0 -0
  128. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/tree_space.py +0 -0
  129. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/treeness.py +0 -0
  130. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/treeness_over_rcv.py +0 -0
  131. {phykit-2.1.71 → phykit-2.1.72}/phykit/services/tree/vcv_utils.py +0 -0
  132. {phykit-2.1.71 → phykit-2.1.72}/phykit.egg-info/dependency_links.txt +0 -0
  133. {phykit-2.1.71 → phykit-2.1.72}/phykit.egg-info/requires.txt +0 -0
  134. {phykit-2.1.71 → phykit-2.1.72}/phykit.egg-info/top_level.txt +0 -0
  135. {phykit-2.1.71 → phykit-2.1.72}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: phykit
3
- Version: 2.1.71
3
+ Version: 2.1.72
4
4
  Home-page: https://github.com/jlsteenwyk/phykit
5
5
  Author: Jacob L. Steenwyk
6
6
  Author-email: jlsteenwyk@gmail.com
@@ -188,6 +188,8 @@ ALIAS_TO_HANDLER: Dict[str, str] = {
188
188
  "da": "discordance_asymmetry",
189
189
  "spec_disc": "spectral_discordance",
190
190
  "sd": "spectral_discordance",
191
+ "impute": "phylo_impute",
192
+ "phylo_imp": "phylo_impute",
191
193
  "trait_correlation": "trait_correlation",
192
194
  "trait_corr": "trait_correlation",
193
195
  "phylo_corr": "trait_correlation",
@@ -227,6 +227,9 @@ class Phykit:
227
227
  trait_correlation (alias: trait_corr; phylo_corr)
228
228
  - compute phylogenetic correlations between all pairs
229
229
  of traits and display as a heatmap
230
+ phylo_impute (alias: impute; phylo_imp)
231
+ - impute missing trait values using phylogenetic
232
+ relationships and between-trait correlations
230
233
  phylogenetic_ordination (alias: phylo_ordination; ordination; ord;
231
234
  phylo_pca; phyl_pca; ppca; phylo_dimreduce; dimreduce; pdr)
232
235
  - phylogenetic ordination (PCA, t-SNE, or UMAP) on
@@ -3681,6 +3684,73 @@ class Phykit:
3681
3684
  _add_json_argument(parser)
3682
3685
  _run_service(parser, argv, TraitCorrelation)
3683
3686
 
3687
+ @staticmethod
3688
+ def phylo_impute(argv):
3689
+ parser = _new_parser(
3690
+ description=textwrap.dedent(
3691
+ f"""\
3692
+ {help_header}
3693
+
3694
+ Phylogenetic imputation of missing trait values using
3695
+ conditional multivariate normal distributions.
3696
+
3697
+ Captures both phylogenetic relationships (via the
3698
+ tree's variance-covariance matrix) and between-trait
3699
+ correlations to predict missing values. Reports
3700
+ imputed values with standard errors and 95% CIs.
3701
+
3702
+ Missing values in the input trait file may be marked
3703
+ as NA, na, ?, or left empty.
3704
+
3705
+ Input is a phylogenetic tree and a tab-delimited
3706
+ multi-trait file with a header row:
3707
+ taxon<tab>trait1<tab>trait2<tab>...
3708
+
3709
+ Aliases:
3710
+ phylo_impute, impute, phylo_imp
3711
+ Command line interfaces:
3712
+ pk_phylo_impute, pk_impute, pk_phylo_imp
3713
+
3714
+ Usage:
3715
+ phykit phylo_impute -t <tree> -d <trait_data> -o <output>
3716
+ [-g <gene_trees>] [--json]
3717
+
3718
+ Options
3719
+ =====================================================
3720
+ -t/--tree tree file (required)
3721
+
3722
+ -d/--trait-data multi-trait TSV with header
3723
+ row; missing values marked
3724
+ as NA, ?, or empty
3725
+ (required)
3726
+
3727
+ -o/--output output TSV file with
3728
+ imputed values (required)
3729
+
3730
+ -g/--gene-trees optional multi-Newick file
3731
+ of gene trees for
3732
+ discordance-aware VCV
3733
+
3734
+ --json optional argument to output
3735
+ results as JSON
3736
+ """
3737
+ ),
3738
+ )
3739
+ parser.add_argument(
3740
+ "-t", "--tree", type=str, required=True, help=SUPPRESS, metavar=""
3741
+ )
3742
+ parser.add_argument(
3743
+ "-d", "--trait-data", type=str, required=True, help=SUPPRESS, metavar=""
3744
+ )
3745
+ parser.add_argument(
3746
+ "-o", "--output", type=str, required=True, help=SUPPRESS, metavar=""
3747
+ )
3748
+ parser.add_argument(
3749
+ "-g", "--gene-trees", type=str, default=None, help=SUPPRESS, metavar=""
3750
+ )
3751
+ _add_json_argument(parser)
3752
+ _run_service(parser, argv, PhyloImpute)
3753
+
3684
3754
  @staticmethod
3685
3755
  def phylogenetic_ordination(argv):
3686
3756
  parser = _new_parser(
@@ -8562,5 +8632,9 @@ def tree_space(argv=None):
8562
8632
  Phykit.tree_space(sys.argv[1:])
8563
8633
 
8564
8634
 
8635
+ def phylo_impute(argv=None):
8636
+ Phykit.phylo_impute(sys.argv[1:])
8637
+
8638
+
8565
8639
  def trait_rate_map(argv=None):
8566
8640
  Phykit.trait_rate_map(sys.argv[1:])
@@ -112,6 +112,7 @@ TreenessOverRCV = _LazyServiceFactory("phykit.services.tree.treeness_over_rcv",
112
112
  EvoTempoMap = _LazyServiceFactory("phykit.services.tree.evo_tempo_map", "EvoTempoMap")
113
113
  DiscordanceAsymmetry = _LazyServiceFactory("phykit.services.tree.discordance_asymmetry", "DiscordanceAsymmetry")
114
114
  SpectralDiscordance = _LazyServiceFactory("phykit.services.tree.spectral_discordance", "SpectralDiscordance")
115
+ PhyloImpute = _LazyServiceFactory("phykit.services.tree.phylo_impute", "PhyloImpute")
115
116
  TraitCorrelation = _LazyServiceFactory("phykit.services.tree.trait_correlation", "TraitCorrelation")
116
117
  TraitRateMap = _LazyServiceFactory("phykit.services.tree.trait_rate_map", "TraitRateMap")
117
118
  TreeSpace = _LazyServiceFactory("phykit.services.tree.tree_space", "TreeSpace")
@@ -49,6 +49,7 @@ _EXPORTS = {
49
49
  "TreenessOverRCV": "treeness_over_rcv",
50
50
  "ConcordanceAsr": "concordance_asr",
51
51
  "PhyloLogistic": "phylo_logistic",
52
+ "PhyloImpute": "phylo_impute",
52
53
  "TraitCorrelation": "trait_correlation",
53
54
  "TraitRateMap": "trait_rate_map",
54
55
  "TreeSpace": "tree_space",
@@ -0,0 +1,438 @@
1
+ """
2
+ Phylogenetic imputation of missing trait values using conditional
3
+ multivariate normal distributions that capture both phylogenetic
4
+ relationships and between-trait correlations.
5
+ """
6
+ import sys
7
+ from typing import Dict, List, Tuple
8
+
9
+ import numpy as np
10
+
11
+ from .base import Tree
12
+ from ...helpers.json_output import print_json
13
+ from ...errors import PhykitUserError
14
+
15
+
16
class PhyloImpute(Tree):
    """Impute missing continuous trait values from a tree and trait covariances.

    Workflow implemented by :meth:`run`:

    1. Parse the tree and a tab-delimited multi-trait table that may
       contain missing cells.
    2. Build a phylogenetic variance-covariance (VCV) matrix, optionally a
       discordance-aware one when gene trees are supplied.
    3. Estimate the GLS phylogenetic mean of every trait and the between-trait
       covariance from taxa with complete data.
    4. Predict each missing cell from the observed values of the same trait
       in the other taxa (phylogenetic part) and from the taxon's own observed
       traits (trait-correlation part), reporting a standard error and a
       normal-approximation 95% confidence interval.
    5. Write the completed table as TSV and print a text or JSON summary.
    """

    # z-multiplier of a two-sided 95% normal confidence interval.
    _CI_Z = 1.96

    # Lower-cased cell contents that denote a missing value in the trait file.
    _MISSING_MARKERS = frozenset({"na", "?", ""})

    def __init__(self, args) -> None:
        """Store the parsed command-line options.

        Args:
            args: argparse-style namespace; see :meth:`process_args` for the
                attributes that are read.
        """
        parsed = self.process_args(args)
        super().__init__(tree_file_path=parsed["tree_file_path"])
        self.trait_data_path = parsed["trait_data_path"]
        self.output_path = parsed["output_path"]
        self.gene_trees_path = parsed["gene_trees_path"]
        self.json_output = parsed["json_output"]

    def run(self) -> None:
        """Run the imputation, write the output TSV and print a summary.

        Raises:
            PhykitUserError: if the tree or trait file is unusable, or fewer
                than two taxa have complete data for all traits.
        """
        from .vcv_utils import build_vcv_matrix, build_discordance_vcv, parse_gene_trees

        tree = self.read_tree_file()
        self._validate_tree(tree)

        tree_tips = self.get_tip_names_from_tree(tree)
        trait_names, trait_data, _missing_info = self._parse_trait_file_with_na(
            self.trait_data_path, tree_tips
        )

        ordered_names = sorted(trait_data)
        p = len(trait_names)

        # Data matrix (taxa x traits); missing cells are NaN.
        Y = np.array([trait_data[name] for name in ordered_names], dtype=float)

        if self.gene_trees_path:
            gene_trees = parse_gene_trees(self.gene_trees_path)
            vcv, vcv_meta = build_discordance_vcv(tree, gene_trees, ordered_names)
            shared = set(vcv_meta["shared_taxa"])
            if shared != set(ordered_names):
                # Keep only taxa that are also present in the gene trees.
                # NOTE(review): assumes the returned VCV is already restricted
                # to the shared taxa in this same order -- confirm in vcv_utils.
                keep_idx = [
                    i for i, name in enumerate(ordered_names) if name in shared
                ]
                ordered_names = [ordered_names[i] for i in keep_idx]
                Y = Y[keep_idx, :]
            vcv_type = "discordance"
        else:
            vcv = build_vcv_matrix(tree, ordered_names)
            vcv_type = "BM"

        n = len(ordered_names)

        # Taxa with no missing value in any trait.
        complete_idx = np.flatnonzero(~np.isnan(Y).any(axis=1))
        n_complete = int(complete_idx.size)

        if n_complete < 2:
            raise PhykitUserError(
                [
                    f"Only {n_complete} taxa have complete data for all traits.",
                    "At least 2 taxa with complete data are required for imputation.",
                ],
                code=2,
            )

        a_hat, Sigma_trait = self._estimate_parameters(
            Y[complete_idx, :], vcv[np.ix_(complete_idx, complete_idx)]
        )

        imputed_results = []
        Y_imputed = Y.copy()

        for i in range(n):
            missing_mask = np.isnan(Y[i, :])
            if not missing_mask.any():
                continue

            missing_traits = np.flatnonzero(missing_mask).tolist()
            observed_traits = np.flatnonzero(~missing_mask).tolist()

            # Condition on the observed matrix Y, never on Y_imputed: feeding
            # earlier imputations back in as if they were data would make the
            # result depend on taxon order and understate the standard errors.
            imp_values, imp_ses = self._impute_taxon(
                Y, vcv, i, observed_traits, missing_traits, a_hat, Sigma_trait
            )

            for j in missing_traits:
                val = imp_values[j]
                se = imp_ses[j]
                half_width = self._CI_Z * se

                Y_imputed[i, j] = val

                imputed_results.append({
                    "taxon": ordered_names[i],
                    "trait": trait_names[j],
                    "value": round(val, 6),
                    "se": round(se, 6),
                    "ci_lower": round(val - half_width, 6),
                    "ci_upper": round(val + half_width, 6),
                })

        self._write_output_tsv(
            self.output_path, trait_names, ordered_names, Y_imputed
        )

        if self.json_output:
            self._print_json(
                n, p, trait_names, vcv_type, imputed_results, self.output_path
            )
        else:
            self._print_text(
                n, p, trait_names, vcv_type, imputed_results,
                self.trait_data_path, self.output_path,
            )

    def process_args(self, args) -> Dict:
        """Map the argparse namespace onto the keys used by ``__init__``.

        ``gene_trees`` and ``json`` are optional on the namespace and default
        to ``None`` and ``False`` respectively.
        """
        return dict(
            tree_file_path=args.tree,
            trait_data_path=args.trait_data,
            output_path=args.output,
            gene_trees_path=getattr(args, "gene_trees", None),
            json_output=getattr(args, "json", False),
        )

    def _validate_tree(self, tree) -> None:
        """Require at least 3 tips and a length on every non-root branch.

        Raises:
            PhykitUserError: if either requirement is violated.
        """
        tips = list(tree.get_terminals())
        if len(tips) < 3:
            raise PhykitUserError(
                ["Tree must have at least 3 tips for phylogenetic imputation."],
                code=2,
            )
        for clade in tree.find_clades():
            if clade.branch_length is None and clade != tree.root:
                raise PhykitUserError(
                    ["All branches in the tree must have lengths."],
                    code=2,
                )

    def _parse_trait_file_with_na(
        self, path: str, tree_tips: List[str]
    ) -> Tuple[List[str], Dict[str, List[float]], List[dict]]:
        """Parse a multi-trait TSV in which cells may be missing.

        Blank lines and lines starting with ``#`` are skipped. The first
        remaining line is the header (``taxon<tab>trait1<tab>...``). A cell is
        missing when it is empty, ``?`` or ``NA`` in any letter case, or when
        it parses to NaN.

        Args:
            path: path of the trait file.
            tree_tips: tip names of the tree; only taxa present in both the
                tree and the file are returned.

        Returns:
            ``(trait_names, values_by_taxon, missing_cells)`` where missing
            values are ``nan`` and ``missing_cells`` lists one
            ``{"taxon", "trait", "line"}`` dict per missing cell of a
            returned taxon.

        Raises:
            PhykitUserError: on a missing file, malformed table, non-numeric
                value, or fewer than 3 taxa shared with the tree.
        """
        try:
            with open(path) as f:
                raw_lines = f.readlines()
        except FileNotFoundError:
            raise PhykitUserError(
                [
                    f"{path} corresponds to no such file or directory.",
                    "Please check filename and pathing",
                ],
                code=2,
            )

        # Keep the real file line number with each retained line so that
        # error messages stay correct when comments or blank lines are skipped.
        rows = []
        for line_no, raw in enumerate(raw_lines, 1):
            probe = raw.strip()
            if not probe or probe.startswith("#"):
                continue
            # Only the line ending and leading whitespace are removed here:
            # trailing tabs delimit empty (= missing) cells and must survive.
            rows.append((line_no, raw.rstrip("\r\n").lstrip()))

        if len(rows) < 2:
            raise PhykitUserError(
                [
                    "Multi-trait file must have a header row and at least one data row.",
                ],
                code=2,
            )

        header_parts = rows[0][1].strip().split("\t")
        n_cols = len(header_parts)
        if n_cols < 2:
            raise PhykitUserError(
                [
                    "Header must have at least 2 columns (taxon + at least 1 trait).",
                ],
                code=2,
            )
        trait_names = header_parts[1:]

        traits = {}
        missing_info = []
        for line_no, text in rows[1:]:
            parts = text.split("\t")
            # Tolerate surplus trailing tabs/whitespace beyond the header width.
            while len(parts) > n_cols and not parts[-1].strip():
                parts.pop()
            if len(parts) != n_cols:
                raise PhykitUserError(
                    [
                        f"Line {line_no} has {len(parts)} columns; expected {n_cols}.",
                    ],
                    code=2,
                )

            taxon = parts[0]
            values = []
            for trait_name, val_str in zip(trait_names, parts[1:]):
                cell = val_str.strip()
                if cell.lower() in self._MISSING_MARKERS:
                    value = float("nan")
                else:
                    try:
                        value = float(cell)
                    except ValueError:
                        raise PhykitUserError(
                            [
                                f"Non-numeric trait value '{cell}' for taxon '{taxon}' "
                                f"(trait '{trait_name}') on line {line_no}.",
                            ],
                            code=2,
                        )
                if np.isnan(value):
                    # Covers both explicit markers and a literal "nan" cell.
                    missing_info.append({
                        "taxon": taxon,
                        "trait": trait_name,
                        "line": line_no,
                    })
                values.append(value)
            traits[taxon] = values

        tree_tip_set = set(tree_tips)
        trait_taxa_set = set(traits)
        shared = tree_tip_set & trait_taxa_set

        tree_only = tree_tip_set - trait_taxa_set
        trait_only = trait_taxa_set - tree_tip_set

        if tree_only:
            print(
                f"Warning: {len(tree_only)} taxa in tree but not in trait file: "
                f"{', '.join(sorted(tree_only))}",
                file=sys.stderr,
            )
        if trait_only:
            print(
                f"Warning: {len(trait_only)} taxa in trait file but not in tree: "
                f"{', '.join(sorted(trait_only))}",
                file=sys.stderr,
            )

        if len(shared) < 3:
            raise PhykitUserError(
                [
                    f"Only {len(shared)} shared taxa between tree and trait file.",
                    "At least 3 shared taxa are required.",
                ],
                code=2,
            )

        filtered = {taxon: traits[taxon] for taxon in shared}
        filtered_missing = [m for m in missing_info if m["taxon"] in shared]
        return trait_names, filtered, filtered_missing

    @staticmethod
    def _invert(matrix: np.ndarray) -> np.ndarray:
        """Return the inverse of ``matrix``, or its pseudo-inverse if singular."""
        try:
            return np.linalg.inv(matrix)
        except np.linalg.LinAlgError:
            return np.linalg.pinv(matrix)

    @staticmethod
    def _estimate_parameters(
        Y_complete: np.ndarray, C_complete: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Estimate GLS trait means and trait covariance from complete cases.

        Args:
            Y_complete: (n_complete x p) trait values without NaN.
            C_complete: (n_complete x n_complete) VCV of those taxa.

        Returns:
            ``(a_hat, Sigma_trait)``: the length-p vector of GLS means and the
            (p x p) covariance of the GLS residuals, divided by
            ``n_complete - 1``. The caller guarantees ``n_complete >= 2``.
        """
        n_complete = Y_complete.shape[0]
        C_inv = PhyloImpute._invert(C_complete)

        ones = np.ones(n_complete)
        gls_weights = ones @ C_inv
        a_hat = (gls_weights @ Y_complete) / (gls_weights @ ones)

        residuals = Y_complete - a_hat
        Sigma_trait = (residuals.T @ C_inv @ residuals) / (n_complete - 1)
        return a_hat, Sigma_trait

    @staticmethod
    def _impute_taxon(
        Y: np.ndarray,
        C_phylo: np.ndarray,
        taxon_idx: int,
        observed_trait_indices: List[int],
        missing_trait_indices: List[int],
        a_hat: np.ndarray,
        Sigma_trait: np.ndarray,
    ) -> Tuple[Dict[int, float], Dict[int, float]]:
        """Impute the missing traits of one taxon.

        For each missing trait ``j`` the prediction is the trait mean plus a
        phylogenetic correction from the other taxa in which ``j`` is
        non-NaN in ``Y``, plus (when the taxon has observed traits and their
        covariance block is invertible) a correction from those traits.

        Args:
            Y: (n x p) trait matrix; NaN marks cells that are not used as data.
            C_phylo: (n x n) phylogenetic VCV in the row order of ``Y``.
            taxon_idx: row of the taxon being imputed.
            observed_trait_indices: columns observed for this taxon.
            missing_trait_indices: columns to impute for this taxon.
            a_hat: length-p vector of trait means.
            Sigma_trait: (p x p) between-trait covariance.

        Returns:
            Two dicts keyed by trait index: imputed values and their standard
            errors.
        """
        n = C_phylo.shape[0]
        other_idx = np.array([k for k in range(n) if k != taxon_idx], dtype=int)
        c_ii = C_phylo[taxon_idx, taxon_idx]
        observed = list(observed_trait_indices)

        imputed = {}
        imputed_se = {}

        for j in missing_trait_indices:
            # Other taxa that carry a value for trait j.
            donors = other_idx[~np.isnan(Y[other_idx, j])]

            if donors.size == 0:
                # Nothing to condition on: fall back to the trait mean.
                imputed[j] = float(a_hat[j])
                imputed_se[j] = float(np.sqrt(max(Sigma_trait[j, j], 0.0)))
                continue

            # Phylogenetic prediction from the donors.
            c_io = C_phylo[taxon_idx, donors]
            C_oo_inv = PhyloImpute._invert(C_phylo[np.ix_(donors, donors)])
            weights = c_io @ C_oo_inv
            y_hat = float(a_hat[j] + weights @ (Y[donors, j] - a_hat[j]))

            # Unit-variance conditional factor; scaled by a trait variance below.
            phylo_factor = c_ii - weights @ c_io
            variance = phylo_factor * Sigma_trait[j, j]

            if observed:
                Sigma_jo = Sigma_trait[j, observed]
                Sigma_oo = Sigma_trait[np.ix_(observed, observed)]
                try:
                    Sigma_oo_inv = np.linalg.inv(Sigma_oo)
                except np.linalg.LinAlgError:
                    pass  # keep the phylogeny-only prediction
                else:
                    deviation = Y[taxon_idx, observed] - a_hat[observed]
                    y_hat += float(Sigma_jo @ Sigma_oo_inv @ deviation)

                    cond_var_trait = (
                        Sigma_trait[j, j] - Sigma_jo @ Sigma_oo_inv @ Sigma_jo
                    )
                    # Same quantity as cond_var * cond_var_trait / Sigma[j, j]
                    # but without the division, which produced NaN whenever
                    # the estimated variance of trait j was zero.
                    variance = phylo_factor * cond_var_trait

            imputed[j] = float(y_hat)
            imputed_se[j] = float(np.sqrt(max(variance, 0.0)))

        return imputed, imputed_se

    @staticmethod
    def _write_output_tsv(
        path: str,
        trait_names: List[str],
        ordered_names: List[str],
        Y: np.ndarray,
    ) -> None:
        """Write the completed trait matrix as TSV.

        The file has a ``taxon`` column followed by one column per trait;
        every value is written with six decimal places.
        """
        with open(path, "w") as f:
            f.write("taxon\t" + "\t".join(trait_names) + "\n")
            f.writelines(
                f"{name}\t" + "\t".join(f"{value:.6f}" for value in row) + "\n"
                for name, row in zip(ordered_names, Y)
            )

    def _print_text(
        self, n, p, trait_names, vcv_type, imputed_results, data_path, output_path
    ) -> None:
        """Print a human-readable summary and a table of the imputed cells."""
        print("Phylogenetic Imputation")
        print(f"Data: {data_path}")
        print(f"Taxa: {n}")
        print(f"Traits: {p} ({', '.join(trait_names)})")
        print(f"Missing values: {len(imputed_results)}")
        vcv_label = "standard" if vcv_type == "BM" else "discordance-aware"
        print(f"VCV: {vcv_type} ({vcv_label})")

        if imputed_results:
            max_taxon = max(
                len("Taxon"), max(len(r["taxon"]) for r in imputed_results)
            )
            max_trait = max(
                len("Trait"), max(len(r["trait"]) for r in imputed_results)
            )

            print("\nImputed values:")
            # Header and rows use the same cell widths and separators so the
            # columns line up.
            print(
                f" {'Taxon':<{max_taxon}} "
                f"{'Trait':<{max_trait}} "
                f"{'Imputed':>10} "
                f"{'SE':>10} "
                f"{'95% CI':>20}"
            )

            for r in imputed_results:
                ci_str = f"[{r['ci_lower']:.4f}, {r['ci_upper']:.4f}]"
                print(
                    f" {r['taxon']:<{max_taxon}} "
                    f"{r['trait']:<{max_trait}} "
                    f"{r['value']:>10.4f} "
                    f"{r['se']:>10.4f} "
                    f"{ci_str:>20}"
                )

        print(f"\nOutput: {output_path}")

    def _print_json(self, n, p, trait_names, vcv_type, imputed_results, output_path):
        """Print the run summary and all imputed cells as a JSON document."""
        payload = {
            "n_taxa": n,
            "n_traits": p,
            "trait_names": trait_names,
            "n_missing": len(imputed_results),
            "vcv_type": vcv_type,
            "imputed": imputed_results,
            "output_file": output_path,
        }
        print_json(payload)
@@ -0,0 +1 @@
1
+ __version__ = "2.1.72"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: phykit
3
- Version: 2.1.71
3
+ Version: 2.1.72
4
4
  Home-page: https://github.com/jlsteenwyk/phykit
5
5
  Author: Jacob L. Steenwyk
6
6
  Author-email: jlsteenwyk@gmail.com
@@ -97,6 +97,7 @@ phykit/services/tree/parsimony_score.py
97
97
  phykit/services/tree/patristic_distances.py
98
98
  phykit/services/tree/phenogram.py
99
99
  phykit/services/tree/phylo_heatmap.py
100
+ phykit/services/tree/phylo_impute.py
100
101
  phykit/services/tree/phylo_logistic.py
101
102
  phykit/services/tree/phylogenetic_glm.py
102
103
  phykit/services/tree/phylogenetic_ordination.py
@@ -96,6 +96,7 @@ pk_ibs = phykit.phykit:internal_branch_stats
96
96
  pk_id_matrix = phykit.phykit:identity_matrix
97
97
  pk_identity_matrix = phykit.phykit:identity_matrix
98
98
  pk_il = phykit.phykit:internode_labeler
99
+ pk_impute = phykit.phykit:phylo_impute
99
100
  pk_independent_contrasts = phykit.phykit:independent_contrasts
100
101
  pk_internal_branch_stats = phykit.phykit:internal_branch_stats
101
102
  pk_internode_labeler = phykit.phykit:internode_labeler
@@ -158,6 +159,8 @@ pk_phylo_dimreduce = phykit.phykit:phylogenetic_ordination
158
159
  pk_phylo_glm = phykit.phykit:phylogenetic_glm
159
160
  pk_phylo_gwas = phykit.phykit:phylo_gwas
160
161
  pk_phylo_heatmap = phykit.phykit:phylo_heatmap
162
+ pk_phylo_imp = phykit.phykit:phylo_impute
163
+ pk_phylo_impute = phykit.phykit:phylo_impute
161
164
  pk_phylo_logistic = phykit.phykit:phylo_logistic
162
165
  pk_phylo_logreg = phykit.phykit:phylo_logistic
163
166
  pk_phylo_ordination = phykit.phykit:phylogenetic_ordination
@@ -303,6 +303,9 @@ setup(
303
303
  "pk_spectral_discordance = phykit.phykit:spectral_discordance",
304
304
  "pk_spec_disc = phykit.phykit:spectral_discordance",
305
305
  "pk_sd = phykit.phykit:spectral_discordance",
306
+ "pk_phylo_impute = phykit.phykit:phylo_impute",
307
+ "pk_impute = phykit.phykit:phylo_impute",
308
+ "pk_phylo_imp = phykit.phykit:phylo_impute",
306
309
  "pk_trait_correlation = phykit.phykit:trait_correlation",
307
310
  "pk_trait_corr = phykit.phykit:trait_correlation",
308
311
  "pk_phylo_corr = phykit.phykit:trait_correlation",
@@ -1 +0,0 @@
1
- __version__ = "2.1.71"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes