phykit 2.1.35__tar.gz → 2.1.37__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {phykit-2.1.35 → phykit-2.1.37}/PKG-INFO +1 -1
- {phykit-2.1.35 → phykit-2.1.37}/phykit/cli_registry.py +2 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/phykit.py +105 -1
- {phykit-2.1.35 → phykit-2.1.37}/phykit/service_factories.py +1 -0
- phykit-2.1.37/phykit/services/tree/spectral_discordance.py +643 -0
- phykit-2.1.37/phykit/version.py +1 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit.egg-info/PKG-INFO +1 -1
- {phykit-2.1.35 → phykit-2.1.37}/phykit.egg-info/SOURCES.txt +1 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit.egg-info/entry_points.txt +8 -0
- {phykit-2.1.35 → phykit-2.1.37}/setup.py +8 -0
- phykit-2.1.35/phykit/version.py +0 -1
- {phykit-2.1.35 → phykit-2.1.37}/LICENSE.md +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/README.md +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/__init__.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/__main__.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/errors.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/helpers/__init__.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/helpers/boolean_argument_parsing.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/helpers/caching.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/helpers/files.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/helpers/json_output.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/helpers/parallel.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/helpers/stats_summary.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/helpers/streaming.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/__init__.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/__init__.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/alignment_entropy.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/alignment_length.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/alignment_length_no_gaps.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/alignment_outlier_taxa.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/alignment_recoding.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/base.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/column_score.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/composition_per_taxon.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/compositional_bias_per_site.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/create_concatenation_matrix.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/dna_threader.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/evolutionary_rate_per_site.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/faidx.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/gc_content.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/mask_alignment.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/occupancy_per_taxon.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/pairwise_identity.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/parsimony_informative_sites.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/plot_alignment_qc.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/rcv.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/rcvt.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/rename_fasta_entries.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/sum_of_pairs_score.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/alignment/variable_sites.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/base.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/__init__.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/ancestral_reconstruction.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/base.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/bipartition_support_stats.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/branch_length_multiplier.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/collapse_branches.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/concordance_asr.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/consensus_network.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/consensus_tree.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/cont_map.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/cophylo.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/covarying_evolutionary_rates.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/density_map.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/discordance_asymmetry.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/dvmc.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/evo_tempo_map.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/evolutionary_rate.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/fit_continuous.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/hidden_paralogy_check.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/internal_branch_stats.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/internode_labeler.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/last_common_ancestor_subtree.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/lb_score.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/ltt.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/monophyly_check.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/nearest_neighbor_interchange.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/network_signal.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/ou_shift_detection.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/ouwie.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/patristic_distances.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/phenogram.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/phylogenetic_glm.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/phylogenetic_ordination.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/phylogenetic_regression.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/phylogenetic_signal.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/phylomorphospace.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/polytomy_test.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/print_tree.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/prune_tree.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/quartet_network.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/rate_heterogeneity.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/relative_rate_test.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/rename_tree_tips.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/rf_distance.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/root_tree.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/saturation.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/spurious_sequence.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/stochastic_character_map.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/terminal_branch_stats.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/threshold_model.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/tip_labels.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/tip_to_tip_distance.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/tip_to_tip_node_distance.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/total_tree_length.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/treeness.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/treeness_over_rcv.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit/services/tree/vcv_utils.py +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit.egg-info/dependency_links.txt +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit.egg-info/requires.txt +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/phykit.egg-info/top_level.txt +0 -0
- {phykit-2.1.35 → phykit-2.1.37}/setup.cfg +0 -0
|
@@ -157,6 +157,8 @@ ALIAS_TO_HANDLER: Dict[str, str] = {
|
|
|
157
157
|
"etm": "evo_tempo_map",
|
|
158
158
|
"disc_asym": "discordance_asymmetry",
|
|
159
159
|
"da": "discordance_asymmetry",
|
|
160
|
+
"spec_disc": "spectral_discordance",
|
|
161
|
+
"sd": "spectral_discordance",
|
|
160
162
|
# Helper aliases
|
|
161
163
|
"create_concat": "create_concatenation_matrix",
|
|
162
164
|
"cc": "create_concatenation_matrix",
|
|
@@ -277,7 +277,10 @@ class Phykit:
|
|
|
277
277
|
treeness (alias: tness)
|
|
278
278
|
- reports treeness or stemminess, a measure of signal-to-
|
|
279
279
|
noise ratio in a phylogeny
|
|
280
|
-
|
|
280
|
+
spectral_discordance (alias: spec_disc; sd)
|
|
281
|
+
- PCA + spectral clustering of gene tree space via
|
|
282
|
+
bipartition decomposition
|
|
283
|
+
|
|
281
284
|
Alignment- and tree-based commands
|
|
282
285
|
==================================
|
|
283
286
|
saturation (alias: sat)
|
|
@@ -5000,6 +5003,95 @@ class Phykit:
|
|
|
5000
5003
|
_add_json_argument(parser)
|
|
5001
5004
|
_run_service(parser, argv, DiscordanceAsymmetry)
|
|
5002
5005
|
|
|
5006
|
+
@staticmethod
def spectral_discordance(argv):
    """Parse ``argv`` for the spectral_discordance command and run it.

    Builds a parser whose description is the help text below, registers
    the command's options, and hands the parser, ``argv`` and the
    ``SpectralDiscordance`` service to ``_run_service``.
    """
    # NOTE(review): runs of spaces and leading indentation inside the help
    # text are not recoverable from this extract; the option table's column
    # alignment should be confirmed against the repository copy.
    parser = _new_parser(
        description=textwrap.dedent(
            f"""\
            {help_header}

            Spectral discordance decomposition — decompose gene tree
            space via PCA on a bipartition presence/absence (or
            branch-length) matrix, with spectral clustering and
            automatic cluster detection via the eigengap heuristic.

            Each gene tree is encoded as a vector over the union of
            all bipartitions observed across gene trees. PCA reveals
            the axes of topological variation, with loading vectors
            identifying which bipartitions drive each PC. Spectral
            clustering groups genes sharing similar topologies.

            Two metrics are available:
            - nrf (default): binary presence/absence (normalized RF)
            - wrf: branch-length weighted

            Aliases:
            spectral_discordance, spec_disc, sd
            Command line interfaces:
            pk_spectral_discordance, pk_spec_disc, pk_sd

            Usage:
            phykit spectral_discordance -g <gene_trees> [-t <tree>] [--metric nrf|wrf] [--clusters K] [--n-pcs N] [--top-loadings N] [--plot <prefix>] [--json]

            Options
            =====================================================
            -g/--gene-trees file of gene trees (one
            Newick per line, or file
            of filenames)

            -t/--tree species tree (optional; flags
            species-tree bipartitions in
            loading output)

            --metric distance metric: nrf or wrf
            (default: nrf)

            --clusters override auto-detected K

            --n-pcs number of PCs to report
            (default: min(10, G-1))

            --top-loadings top bipartitions per PC
            (default: 5)

            --plot output prefix for plots
            (generates _scatter.png and
            _eigengap.png)

            --json output results as JSON
            """
        ),
    )
    # Every option passes help=SUPPRESS and an empty metavar; the
    # description text above is what documents the options.
    # The gene-tree file is the only required option.
    parser.add_argument(
        "-g", "--gene-trees", type=str, required=True, help=SUPPRESS, metavar=""
    )
    # Optional species tree; defaults to None when omitted.
    parser.add_argument(
        "-t", "--tree", type=str, required=False, default=None,
        help=SUPPRESS, metavar=""
    )
    # Metric is restricted to the two documented choices.
    parser.add_argument(
        "--metric", type=str, required=False, default="nrf",
        choices=["nrf", "wrf"], help=SUPPRESS, metavar=""
    )
    # None for --clusters / --n-pcs leaves the choice to the service.
    parser.add_argument(
        "--clusters", type=int, required=False, default=None,
        help=SUPPRESS, metavar=""
    )
    parser.add_argument(
        "--n-pcs", type=int, required=False, default=None,
        help=SUPPRESS, metavar=""
    )
    parser.add_argument(
        "--top-loadings", type=int, required=False, default=5,
        help=SUPPRESS, metavar=""
    )
    # Plot output prefix; None means no plots are requested.
    parser.add_argument(
        "--plot", type=str, required=False, default=None,
        help=SUPPRESS, metavar=""
    )
    _add_json_argument(parser)
    _run_service(parser, argv, SpectralDiscordance)
|
|
5094
|
+
|
|
5003
5095
|
### Helper commands
|
|
5004
5096
|
@staticmethod
|
|
5005
5097
|
def create_concatenation_matrix(argv):
|
|
@@ -5453,3 +5545,15 @@ def create_concatenation_matrix(argv=None):
|
|
|
5453
5545
|
|
|
5454
5546
|
def thread_dna(argv=None):
    """Module-level wrapper around ``Phykit.thread_dna``.

    Parameters
    ----------
    argv : list of str or None
        Command-line arguments to forward. When ``None`` (the default),
        ``sys.argv[1:]`` is used.
    """
    # Honor an explicitly supplied argv; fall back to the process arguments
    # only when the caller passed nothing.
    Phykit.thread_dna(sys.argv[1:] if argv is None else argv)
|
|
5548
|
+
|
|
5549
|
+
|
|
5550
|
+
def evo_tempo_map(argv=None):
    """Module-level wrapper around ``Phykit.evo_tempo_map``.

    Parameters
    ----------
    argv : list of str or None
        Command-line arguments to forward. When ``None`` (the default),
        ``sys.argv[1:]`` is used.
    """
    # Honor an explicitly supplied argv; fall back to the process arguments
    # only when the caller passed nothing.
    Phykit.evo_tempo_map(sys.argv[1:] if argv is None else argv)
|
|
5552
|
+
|
|
5553
|
+
|
|
5554
|
+
def discordance_asymmetry(argv=None):
    """Module-level wrapper around ``Phykit.discordance_asymmetry``.

    Parameters
    ----------
    argv : list of str or None
        Command-line arguments to forward. When ``None`` (the default),
        ``sys.argv[1:]`` is used.
    """
    # Honor an explicitly supplied argv; fall back to the process arguments
    # only when the caller passed nothing.
    Phykit.discordance_asymmetry(sys.argv[1:] if argv is None else argv)
|
|
5556
|
+
|
|
5557
|
+
|
|
5558
|
+
def spectral_discordance(argv=None):
    """Module-level wrapper around ``Phykit.spectral_discordance``.

    Parameters
    ----------
    argv : list of str or None
        Command-line arguments to forward. When ``None`` (the default),
        ``sys.argv[1:]`` is used.
    """
    # Honor an explicitly supplied argv; fall back to the process arguments
    # only when the caller passed nothing.
    Phykit.spectral_discordance(sys.argv[1:] if argv is None else argv)
|
|
@@ -98,6 +98,7 @@ Treeness = _LazyServiceFactory("phykit.services.tree.treeness", "Treeness")
|
|
|
98
98
|
TreenessOverRCV = _LazyServiceFactory("phykit.services.tree.treeness_over_rcv", "TreenessOverRCV")
|
|
99
99
|
EvoTempoMap = _LazyServiceFactory("phykit.services.tree.evo_tempo_map", "EvoTempoMap")
|
|
100
100
|
DiscordanceAsymmetry = _LazyServiceFactory("phykit.services.tree.discordance_asymmetry", "DiscordanceAsymmetry")
|
|
101
|
+
# Factory for the SpectralDiscordance service, identified by module path and
# class name. NOTE(review): presumably the import is deferred until first
# use — confirm in _LazyServiceFactory.
SpectralDiscordance = _LazyServiceFactory("phykit.services.tree.spectral_discordance", "SpectralDiscordance")
|
|
101
102
|
|
|
102
103
|
SERVICE_FACTORIES: Dict[str, _LazyServiceFactory] = {
|
|
103
104
|
name: value
|
|
@@ -0,0 +1,643 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import sys
|
|
3
|
+
from io import StringIO
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, List, Tuple
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from Bio import Phylo
|
|
9
|
+
|
|
10
|
+
from .base import Tree
|
|
11
|
+
from ...helpers.json_output import print_json
|
|
12
|
+
from ...errors import PhykitUserError
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SpectralDiscordance(Tree):
|
|
16
|
+
def __init__(self, args) -> None:
|
|
17
|
+
parsed = self.process_args(args)
|
|
18
|
+
tree_file = parsed["tree_file_path"]
|
|
19
|
+
if tree_file is not None:
|
|
20
|
+
super().__init__(tree_file_path=tree_file)
|
|
21
|
+
else:
|
|
22
|
+
self.tree_file_path = None
|
|
23
|
+
self.gene_trees_path = parsed["gene_trees_path"]
|
|
24
|
+
self.metric = parsed["metric"]
|
|
25
|
+
self.n_clusters = parsed["n_clusters"]
|
|
26
|
+
self.n_pcs = parsed["n_pcs"]
|
|
27
|
+
self.top_loadings = parsed["top_loadings"]
|
|
28
|
+
self.plot_output = parsed["plot_output"]
|
|
29
|
+
self.json_output = parsed["json_output"]
|
|
30
|
+
|
|
31
|
+
def process_args(self, args) -> Dict:
    """Translate the parsed argument namespace into an options dict.

    ``args.gene_trees`` must exist; every other attribute is optional and
    falls back to the default shown here when absent.
    """
    return {
        "tree_file_path": getattr(args, "tree", None),
        "gene_trees_path": args.gene_trees,
        "metric": getattr(args, "metric", "nrf"),
        "n_clusters": getattr(args, "clusters", None),
        "n_pcs": getattr(args, "n_pcs", None),
        "top_loadings": getattr(args, "top_loadings", 5),
        "plot_output": getattr(args, "plot", None),
        "json_output": getattr(args, "json", False),
    }
|
|
42
|
+
|
|
43
|
+
def run(self) -> None:
    """Run the full analysis and emit the results.

    Steps: load gene trees (and the species tree when a path is set),
    restrict to shared taxa, build the bipartition matrix, run PCA and
    spectral clustering, optionally write the two plots, then print
    either JSON or the text report.
    """
    gene_trees = self._parse_gene_trees(self.gene_trees_path)
    species_tree = (
        self.read_tree_file() if self.tree_file_path is not None else None
    )
    shared_taxa = self._get_shared_taxa(gene_trees, species_tree)

    matrix, bip_index, sp_flags = self._build_bipartition_matrix(
        gene_trees, shared_taxa, metric=self.metric, species_tree=species_tree
    )

    n_trees = matrix.shape[0]
    if n_trees < 5:
        print(
            f"Warning: only {n_trees} gene trees; clustering may be unreliable.",
            file=sys.stderr,
        )

    scores, var_explained, loadings = self._run_pca(matrix)
    # A falsy --n-pcs (None or 0) selects the default of up to 10 PCs; the
    # result is always capped at the number of score columns.
    available_pcs = scores.shape[1]
    n_pcs = min(self.n_pcs or min(10, available_pcs), available_pcs)

    # Clustering works on the column-centred matrix.
    labels, n_clusters, eigengaps = self._spectral_cluster(
        matrix - matrix.mean(axis=0), n_clusters=self.n_clusters
    )

    top_loadings = self._get_top_loadings(
        loadings, bip_index, sp_flags, n_pcs, self.top_loadings
    )

    if self.plot_output:
        self._plot_scatter(
            scores, labels, n_clusters, self.plot_output + "_scatter.png"
        )
        self._plot_eigengap(
            eigengaps, n_clusters, self.plot_output + "_eigengap.png"
        )

    if not self.json_output:
        self._print_text(
            scores, var_explained, top_loadings, labels, n_clusters,
            eigengaps, n_pcs, n_trees, len(shared_taxa), len(bip_index)
        )
        return

    print_json(
        self._format_json(
            scores, var_explained, top_loadings, labels, n_clusters,
            eigengaps, n_pcs, bip_index, sp_flags
        )
    )
|
|
91
|
+
|
|
92
|
+
def _bipartition_to_str(self, bip: frozenset) -> str:
|
|
93
|
+
return "{" + ",".join(sorted(bip)) + "}"
|
|
94
|
+
|
|
95
|
+
def _get_top_loadings(
|
|
96
|
+
self, loadings, bip_index, sp_flags, n_pcs, top_n,
|
|
97
|
+
) -> Dict[str, list]:
|
|
98
|
+
result = {}
|
|
99
|
+
for pc in range(min(n_pcs, loadings.shape[0])):
|
|
100
|
+
pc_loadings = loadings[pc]
|
|
101
|
+
top_idx = np.argsort(np.abs(pc_loadings))[::-1][:top_n]
|
|
102
|
+
entries = []
|
|
103
|
+
for idx in top_idx:
|
|
104
|
+
bip = bip_index[idx]
|
|
105
|
+
entries.append({
|
|
106
|
+
"bipartition": self._bipartition_to_str(bip),
|
|
107
|
+
"loading": float(pc_loadings[idx]),
|
|
108
|
+
"in_species_tree": sp_flags.get(bip, False),
|
|
109
|
+
})
|
|
110
|
+
result[f"PC{pc + 1}"] = entries
|
|
111
|
+
return result
|
|
112
|
+
|
|
113
|
+
def _format_json(
|
|
114
|
+
self, scores, var_explained, top_loadings, labels, K,
|
|
115
|
+
eigengaps, n_pcs, bip_index, sp_flags,
|
|
116
|
+
) -> Dict:
|
|
117
|
+
G = scores.shape[0]
|
|
118
|
+
score_dict = {}
|
|
119
|
+
for g in range(G):
|
|
120
|
+
row = {}
|
|
121
|
+
for pc in range(min(n_pcs, scores.shape[1])):
|
|
122
|
+
row[f"PC{pc + 1}"] = float(scores[g, pc])
|
|
123
|
+
row["cluster"] = int(labels[g])
|
|
124
|
+
score_dict[f"gene_tree_{g + 1}"] = row
|
|
125
|
+
|
|
126
|
+
return {
|
|
127
|
+
"metric": self.metric,
|
|
128
|
+
"n_gene_trees": G,
|
|
129
|
+
"n_bipartitions": len(bip_index),
|
|
130
|
+
"n_clusters": int(K),
|
|
131
|
+
"scores": score_dict,
|
|
132
|
+
"variance_explained": {
|
|
133
|
+
f"PC{i + 1}": float(var_explained[i])
|
|
134
|
+
for i in range(min(n_pcs, len(var_explained)))
|
|
135
|
+
},
|
|
136
|
+
"top_loadings": top_loadings,
|
|
137
|
+
"cluster_assignments": [int(l) for l in labels],
|
|
138
|
+
"eigengap_values": [float(g) for g in eigengaps],
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
def _print_text(
    self, scores, var_explained, top_loadings, labels, K,
    eigengaps, n_pcs, n_trees, n_taxa, n_bips,
) -> None:
    """Print the human-readable report to stdout.

    Sections, in order: a summary header, variance explained per PC, the
    top loadings per PC, and a table of PC scores with cluster labels.
    ``eigengaps`` is accepted but not used in this method.
    """
    # NOTE(review): runs of spaces inside the format strings below may have
    # been collapsed in this extract — confirm the column padding against
    # the repository copy.
    print("Spectral Discordance Decomposition")
    print(f"\nMetric: {self.metric}")
    print(f"Gene trees: {n_trees}")
    print(f"Shared taxa: {n_taxa}")
    print(f"Unique bipartitions: {n_bips}")
    print(f"Clusters detected: {K}")

    print("\nVariance explained:")
    for i in range(min(n_pcs, len(var_explained))):
        # Bar length is int(value * 50) '#' characters.
        bar = "#" * int(var_explained[i] * 50)
        print(f" PC{i + 1:<3d} {var_explained[i]:>8.4f} {bar}")

    for pc_name, entries in top_loadings.items():
        print(f"\nTop loadings for {pc_name}:")
        for e in entries:
            # A trailing " *" marks bipartitions flagged as in the species tree.
            flag = " *" if e["in_species_tree"] else ""
            print(f" {e['loading']:>8.4f} {e['bipartition']}{flag}")

    # The score table shows at most the first five PCs.
    show_pcs = min(n_pcs, scores.shape[1], 5)
    header = f" {'Gene tree':<14s}"
    for pc in range(show_pcs):
        header += f"{'PC' + str(pc + 1):>10s}"
    header += f"{'Cluster':>10s}"
    print(f"\nPC scores:")
    print(header)
    for g in range(scores.shape[0]):
        row = f" {'gene_tree_' + str(g + 1):<14s}"
        for pc in range(show_pcs):
            row += f"{scores[g, pc]:>10.4f}"
        # The 'd' format requires each label to be an integer type.
        row += f"{labels[g]:>10d}"
        print(row)
|
|
176
|
+
|
|
177
|
+
def _plot_scatter(self, scores, labels, K, output_path) -> None:
    """Save a PC1-vs-PC2 scatter plot coloured by cluster.

    Each point is annotated with its 1-based row index in ``scores``.
    Prints a message and exits with status 2 if matplotlib cannot be
    imported.
    """
    try:
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
    except ImportError:
        print("matplotlib is required for plotting. Install matplotlib and retry.")
        raise SystemExit(2)

    figure, axes = plt.subplots(figsize=(8, 6))
    palette = plt.get_cmap("tab10")
    annotation_style = dict(
        fontsize=7, ha="center", va="bottom",
        xytext=(0, 4), textcoords="offset points",
    )

    for cluster in range(K):
        members = labels == cluster
        axes.scatter(
            scores[members, 0], scores[members, 1],
            c=[palette(cluster)], label=f"Cluster {cluster + 1}",
            s=60, edgecolors="black", linewidths=0.5, alpha=0.8,
        )
        for row in np.where(members)[0]:
            axes.annotate(
                str(row + 1), (scores[row, 0], scores[row, 1]),
                **annotation_style,
            )

    axes.set_xlabel("PC1")
    axes.set_ylabel("PC2")
    axes.set_title("Gene Tree Space — Spectral Discordance")
    axes.legend()
    figure.tight_layout()
    figure.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close(figure)
    print(f"Saved scatter plot: {output_path}")
|
|
210
|
+
|
|
211
|
+
def _plot_eigengap(self, eigengaps, K, output_path) -> None:
    """Save a bar chart of the eigengap values with the K-th bar in red.

    Bars sit at x = 1..len(eigengaps); a dashed vertical line is drawn at
    x = K. Prints a message and exits with status 2 if matplotlib cannot
    be imported.
    """
    try:
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
    except ImportError:
        print("matplotlib is required for plotting. Install matplotlib and retry.")
        raise SystemExit(2)

    figure, axes = plt.subplots(figsize=(8, 4))
    n_gaps = len(eigengaps)
    positions = np.arange(1, n_gaps + 1)
    # Zero-based position K - 1 is the bar drawn at x = K.
    bar_colors = [
        "tab:red" if position == K - 1 else "tab:blue"
        for position in range(n_gaps)
    ]
    axes.bar(
        positions, eigengaps, color=bar_colors, edgecolor="black", linewidth=0.5
    )
    axes.set_xlabel("Eigenvalue index (i)")
    axes.set_ylabel(r"Eigengap ($\lambda_{i+1} - \lambda_i$)")
    axes.set_title(f"Eigengap Heuristic — K = {K} clusters")
    axes.axvline(K, color="tab:red", linestyle="--", alpha=0.7, label=f"K = {K}")
    axes.legend()
    figure.tight_layout()
    figure.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close(figure)
    print(f"Saved eigengap plot: {output_path}")
|
|
234
|
+
|
|
235
|
+
# ------------------------------------------------------------------
|
|
236
|
+
# Task 3: Gene-tree loading and bipartition extraction
|
|
237
|
+
# ------------------------------------------------------------------
|
|
238
|
+
|
|
239
|
+
def _parse_gene_trees(self, path: str) -> list:
    """Load gene trees from ``path``.

    Each non-blank line is either an inline Newick string (recognised by a
    leading ``(``) or the path of a file holding a single Newick tree.

    Returns the trees as a list of Bio.Phylo tree objects, in file order.
    Raises PhykitUserError (code 2) when ``path`` or a listed file does
    not exist, or when fewer than two trees were read.
    """

    def missing(target):
        # Shared error for a nonexistent listing file or listed tree file.
        return PhykitUserError(
            [
                f"{target} corresponds to no such file or directory.",
                "Please check filename and pathing",
            ],
            code=2,
        )

    listing = Path(path)
    if not listing.exists():
        raise missing(path)

    trees: list = []
    with open(listing) as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if not entry:
                continue

            if entry.startswith("("):
                # Inline Newick string.
                trees.append(Phylo.read(StringIO(entry), "newick"))
                continue

            # Otherwise the line names a tree file (file-of-filenames).
            tree_file = Path(entry)
            if not tree_file.exists():
                raise missing(entry)
            trees.append(Phylo.read(str(tree_file), "newick"))

    if len(trees) < 2:
        raise PhykitUserError(
            [
                "At least 2 gene trees are required for spectral discordance analysis.",
                f"Only {len(trees)} tree(s) found in {path}.",
            ],
            code=2,
        )

    return trees
|
|
290
|
+
|
|
291
|
+
def _get_shared_taxa(self, gene_trees, species_tree=None) -> set:
|
|
292
|
+
"""Compute the intersection of tip names across all gene trees
|
|
293
|
+
(and the species tree, if provided).
|
|
294
|
+
|
|
295
|
+
Raises PhykitUserError if fewer than 4 taxa are shared.
|
|
296
|
+
"""
|
|
297
|
+
if not gene_trees:
|
|
298
|
+
raise PhykitUserError(
|
|
299
|
+
["No gene trees provided."],
|
|
300
|
+
code=2,
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
shared = set(tip.name for tip in gene_trees[0].get_terminals())
|
|
304
|
+
for gt in gene_trees[1:]:
|
|
305
|
+
shared &= set(tip.name for tip in gt.get_terminals())
|
|
306
|
+
|
|
307
|
+
if species_tree is not None:
|
|
308
|
+
sp_taxa = set(tip.name for tip in species_tree.get_terminals())
|
|
309
|
+
shared &= sp_taxa
|
|
310
|
+
|
|
311
|
+
if len(shared) < 4:
|
|
312
|
+
raise PhykitUserError(
|
|
313
|
+
[
|
|
314
|
+
"Fewer than 4 shared taxa across gene trees.",
|
|
315
|
+
f"Only {len(shared)} shared taxa found.",
|
|
316
|
+
"At least 4 shared taxa are required for meaningful "
|
|
317
|
+
"bipartition analysis.",
|
|
318
|
+
],
|
|
319
|
+
code=2,
|
|
320
|
+
)
|
|
321
|
+
|
|
322
|
+
return shared
|
|
323
|
+
|
|
324
|
+
@staticmethod
|
|
325
|
+
def _canonical_split(taxa_side, all_taxa) -> frozenset:
|
|
326
|
+
"""Return the canonical representation of a bipartition split.
|
|
327
|
+
|
|
328
|
+
The canonical form is the smaller side of the bipartition.
|
|
329
|
+
Ties (equal-sized sides) are broken lexicographically: the side
|
|
330
|
+
whose sorted tuple is lexicographically smaller is chosen.
|
|
331
|
+
"""
|
|
332
|
+
complement = all_taxa - taxa_side
|
|
333
|
+
if len(taxa_side) < len(complement):
|
|
334
|
+
return frozenset(taxa_side)
|
|
335
|
+
elif len(taxa_side) > len(complement):
|
|
336
|
+
return frozenset(complement)
|
|
337
|
+
else:
|
|
338
|
+
# Tie-breaking: pick the lexicographically smaller side
|
|
339
|
+
if sorted(taxa_side) <= sorted(complement):
|
|
340
|
+
return frozenset(taxa_side)
|
|
341
|
+
else:
|
|
342
|
+
return frozenset(complement)
|
|
343
|
+
|
|
344
|
+
def _extract_splits(self, tree, all_taxa_fs) -> set:
|
|
345
|
+
"""Extract canonical bipartitions from a tree via postorder traversal.
|
|
346
|
+
|
|
347
|
+
Skips trivial splits (single-tip on both sides) and the full-taxa split.
|
|
348
|
+
Returns a set of frozensets.
|
|
349
|
+
"""
|
|
350
|
+
splits = set()
|
|
351
|
+
# Map terminal clades to their single-taxon sets
|
|
352
|
+
clade_taxa: dict = {}
|
|
353
|
+
|
|
354
|
+
for clade in tree.find_clades(order="postorder"):
|
|
355
|
+
if clade.is_terminal():
|
|
356
|
+
if clade.name in all_taxa_fs:
|
|
357
|
+
clade_taxa[id(clade)] = frozenset({clade.name})
|
|
358
|
+
else:
|
|
359
|
+
clade_taxa[id(clade)] = frozenset()
|
|
360
|
+
else:
|
|
361
|
+
taxa = frozenset()
|
|
362
|
+
for child in clade.clades:
|
|
363
|
+
taxa = taxa | clade_taxa.get(id(child), frozenset())
|
|
364
|
+
clade_taxa[id(clade)] = taxa
|
|
365
|
+
|
|
366
|
+
# Skip trivial and full-taxa splits
|
|
367
|
+
if len(taxa) <= 1:
|
|
368
|
+
continue
|
|
369
|
+
complement_size = len(all_taxa_fs) - len(taxa)
|
|
370
|
+
if complement_size <= 0:
|
|
371
|
+
continue
|
|
372
|
+
# A split where one side has 1 taxon is trivial only if the
|
|
373
|
+
# complement also has 1 taxon (i.e. 2-taxon tree). For larger
|
|
374
|
+
# trees, a split of {A} vs rest is still informative only if
|
|
375
|
+
# rest > 1, but we skip single-tip sides as they are trivial.
|
|
376
|
+
# Actually, the spec says skip trivial (single-tip) splits:
|
|
377
|
+
# that means skip if len(taxa)==1 (already handled above) OR
|
|
378
|
+
# if complement is 1 taxon.
|
|
379
|
+
if complement_size < 1:
|
|
380
|
+
continue
|
|
381
|
+
|
|
382
|
+
canonical = self._canonical_split(taxa, all_taxa_fs)
|
|
383
|
+
# Skip if canonical is single-tip (trivial)
|
|
384
|
+
if len(canonical) <= 1:
|
|
385
|
+
continue
|
|
386
|
+
# Skip full-taxa split
|
|
387
|
+
if canonical == all_taxa_fs:
|
|
388
|
+
continue
|
|
389
|
+
splits.add(canonical)
|
|
390
|
+
|
|
391
|
+
return splits
|
|
392
|
+
|
|
393
|
+
def _extract_splits_with_lengths(self, tree, all_taxa_fs) -> Dict[frozenset, float]:
|
|
394
|
+
"""Extract canonical bipartitions with their branch lengths.
|
|
395
|
+
|
|
396
|
+
Same traversal as _extract_splits but returns a dict mapping
|
|
397
|
+
canonical split -> branch length (for the wrf metric).
|
|
398
|
+
"""
|
|
399
|
+
split_lengths: Dict[frozenset, float] = {}
|
|
400
|
+
clade_taxa: dict = {}
|
|
401
|
+
|
|
402
|
+
for clade in tree.find_clades(order="postorder"):
|
|
403
|
+
if clade.is_terminal():
|
|
404
|
+
if clade.name in all_taxa_fs:
|
|
405
|
+
clade_taxa[id(clade)] = frozenset({clade.name})
|
|
406
|
+
else:
|
|
407
|
+
clade_taxa[id(clade)] = frozenset()
|
|
408
|
+
else:
|
|
409
|
+
taxa = frozenset()
|
|
410
|
+
for child in clade.clades:
|
|
411
|
+
taxa = taxa | clade_taxa.get(id(child), frozenset())
|
|
412
|
+
clade_taxa[id(clade)] = taxa
|
|
413
|
+
|
|
414
|
+
if len(taxa) <= 1:
|
|
415
|
+
continue
|
|
416
|
+
complement_size = len(all_taxa_fs) - len(taxa)
|
|
417
|
+
if complement_size <= 0:
|
|
418
|
+
continue
|
|
419
|
+
|
|
420
|
+
canonical = self._canonical_split(taxa, all_taxa_fs)
|
|
421
|
+
if len(canonical) <= 1:
|
|
422
|
+
continue
|
|
423
|
+
if canonical == all_taxa_fs:
|
|
424
|
+
continue
|
|
425
|
+
|
|
426
|
+
bl = clade.branch_length if clade.branch_length is not None else 0.0
|
|
427
|
+
split_lengths[canonical] = bl
|
|
428
|
+
|
|
429
|
+
return split_lengths
|
|
430
|
+
|
|
431
|
+
def _build_bipartition_matrix(
|
|
432
|
+
self,
|
|
433
|
+
gene_trees,
|
|
434
|
+
shared_taxa,
|
|
435
|
+
metric="nrf",
|
|
436
|
+
species_tree=None,
|
|
437
|
+
) -> Tuple[np.ndarray, List[frozenset], Dict[frozenset, bool]]:
|
|
438
|
+
"""Build the gene-tree x bipartition matrix.
|
|
439
|
+
|
|
440
|
+
Parameters
|
|
441
|
+
----------
|
|
442
|
+
gene_trees : list
|
|
443
|
+
List of Bio.Phylo tree objects.
|
|
444
|
+
shared_taxa : set
|
|
445
|
+
Set of taxon names shared across all trees.
|
|
446
|
+
metric : str
|
|
447
|
+
"nrf" for binary (presence/absence) or "wrf" for branch-length
|
|
448
|
+
weighted matrix.
|
|
449
|
+
species_tree : Bio.Phylo tree or None
|
|
450
|
+
If provided, bipartitions are flagged as concordant/discordant
|
|
451
|
+
with the species tree.
|
|
452
|
+
|
|
453
|
+
Returns
|
|
454
|
+
-------
|
|
455
|
+
X : np.ndarray of shape (G, B)
|
|
456
|
+
The bipartition matrix.
|
|
457
|
+
bip_index : list of frozenset
|
|
458
|
+
Ordered list of bipartitions (columns of X).
|
|
459
|
+
sp_flags : dict
|
|
460
|
+
Maps each bipartition to True if it appears in the species tree,
|
|
461
|
+
False otherwise. Empty dict if no species tree provided.
|
|
462
|
+
"""
|
|
463
|
+
all_taxa_fs = frozenset(shared_taxa)
|
|
464
|
+
|
|
465
|
+
# Prune gene trees to shared taxa and extract splits
|
|
466
|
+
per_tree_data: list = []
|
|
467
|
+
all_bipartitions: set = set()
|
|
468
|
+
|
|
469
|
+
for gt in gene_trees:
|
|
470
|
+
pruned = copy.deepcopy(gt)
|
|
471
|
+
tips_to_remove = [
|
|
472
|
+
tip.name
|
|
473
|
+
for tip in pruned.get_terminals()
|
|
474
|
+
if tip.name not in shared_taxa
|
|
475
|
+
]
|
|
476
|
+
for tip_name in tips_to_remove:
|
|
477
|
+
pruned.prune(tip_name)
|
|
478
|
+
|
|
479
|
+
if metric == "wrf":
|
|
480
|
+
split_data = self._extract_splits_with_lengths(pruned, all_taxa_fs)
|
|
481
|
+
else:
|
|
482
|
+
split_data = self._extract_splits(pruned, all_taxa_fs)
|
|
483
|
+
|
|
484
|
+
per_tree_data.append(split_data)
|
|
485
|
+
|
|
486
|
+
if isinstance(split_data, dict):
|
|
487
|
+
all_bipartitions.update(split_data.keys())
|
|
488
|
+
else:
|
|
489
|
+
all_bipartitions.update(split_data)
|
|
490
|
+
|
|
491
|
+
# Stable sort: by (length of split, sorted taxon names)
|
|
492
|
+
bip_index = sorted(
|
|
493
|
+
all_bipartitions,
|
|
494
|
+
key=lambda s: (len(s), sorted(s)),
|
|
495
|
+
)
|
|
496
|
+
|
|
497
|
+
# Build matrix
|
|
498
|
+
n_genes = len(gene_trees)
|
|
499
|
+
n_bips = len(bip_index)
|
|
500
|
+
X = np.zeros((n_genes, n_bips), dtype=float)
|
|
501
|
+
|
|
502
|
+
for i, tree_data in enumerate(per_tree_data):
|
|
503
|
+
for j, bip in enumerate(bip_index):
|
|
504
|
+
if metric == "wrf":
|
|
505
|
+
# tree_data is a dict: split -> branch_length
|
|
506
|
+
if bip in tree_data:
|
|
507
|
+
X[i, j] = tree_data[bip]
|
|
508
|
+
else:
|
|
509
|
+
# tree_data is a set of splits
|
|
510
|
+
if bip in tree_data:
|
|
511
|
+
X[i, j] = 1.0
|
|
512
|
+
|
|
513
|
+
# Species tree flags
|
|
514
|
+
sp_flags: Dict[frozenset, bool] = {}
|
|
515
|
+
if species_tree is not None:
|
|
516
|
+
sp_pruned = copy.deepcopy(species_tree)
|
|
517
|
+
sp_tips_to_remove = [
|
|
518
|
+
tip.name
|
|
519
|
+
for tip in sp_pruned.get_terminals()
|
|
520
|
+
if tip.name not in shared_taxa
|
|
521
|
+
]
|
|
522
|
+
for tip_name in sp_tips_to_remove:
|
|
523
|
+
sp_pruned.prune(tip_name)
|
|
524
|
+
|
|
525
|
+
sp_splits = self._extract_splits(sp_pruned, all_taxa_fs)
|
|
526
|
+
for bip in bip_index:
|
|
527
|
+
sp_flags[bip] = bip in sp_splits
|
|
528
|
+
|
|
529
|
+
return X, bip_index, sp_flags
|
|
530
|
+
|
|
531
|
+
# ------------------------------------------------------------------
|
|
532
|
+
# Task 4: PCA via SVD
|
|
533
|
+
# ------------------------------------------------------------------
|
|
534
|
+
|
|
535
|
+
def _run_pca(
|
|
536
|
+
self, X: np.ndarray
|
|
537
|
+
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
|
|
538
|
+
"""PCA via SVD on centered bipartition matrix.
|
|
539
|
+
|
|
540
|
+
Returns (scores, variance_explained, loadings).
|
|
541
|
+
scores: G x n_components
|
|
542
|
+
variance_explained: n_components (sums to 1.0)
|
|
543
|
+
loadings: n_components x B
|
|
544
|
+
"""
|
|
545
|
+
X_centered = X - X.mean(axis=0)
|
|
546
|
+
|
|
547
|
+
if np.allclose(X_centered, 0):
|
|
548
|
+
n_comp = min(X.shape)
|
|
549
|
+
return (
|
|
550
|
+
np.zeros((X.shape[0], n_comp)),
|
|
551
|
+
np.zeros(n_comp),
|
|
552
|
+
np.zeros((n_comp, X.shape[1])),
|
|
553
|
+
)
|
|
554
|
+
|
|
555
|
+
U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)
|
|
556
|
+
scores = U * S
|
|
557
|
+
total_var = np.sum(S ** 2)
|
|
558
|
+
var_explained = S ** 2 / total_var if total_var > 0 else np.zeros_like(S)
|
|
559
|
+
loadings = Vt
|
|
560
|
+
|
|
561
|
+
return scores, var_explained, loadings
|
|
562
|
+
|
|
563
|
+
# ------------------------------------------------------------------
|
|
564
|
+
# Task 5: Spectral clustering with eigengap
|
|
565
|
+
# ------------------------------------------------------------------
|
|
566
|
+
|
|
567
|
+
def _spectral_cluster(
|
|
568
|
+
self, X_centered: np.ndarray, n_clusters: int = None,
|
|
569
|
+
) -> Tuple[np.ndarray, int, np.ndarray]:
|
|
570
|
+
"""Spectral clustering with eigengap auto-K detection."""
|
|
571
|
+
from scipy.spatial.distance import pdist, squareform
|
|
572
|
+
|
|
573
|
+
G = X_centered.shape[0]
|
|
574
|
+
|
|
575
|
+
dists = squareform(pdist(X_centered, metric="euclidean"))
|
|
576
|
+
|
|
577
|
+
upper_tri = dists[np.triu_indices(G, k=1)]
|
|
578
|
+
sigma = float(np.median(upper_tri))
|
|
579
|
+
if sigma == 0:
|
|
580
|
+
nonzero = upper_tri[upper_tri > 0]
|
|
581
|
+
sigma = float(np.mean(nonzero)) if len(nonzero) > 0 else 1.0
|
|
582
|
+
|
|
583
|
+
W = np.exp(-dists ** 2 / (2 * sigma ** 2))
|
|
584
|
+
np.fill_diagonal(W, W.diagonal() + 1e-10)
|
|
585
|
+
|
|
586
|
+
d = np.sum(W, axis=1)
|
|
587
|
+
d_inv_sqrt = 1.0 / np.sqrt(d)
|
|
588
|
+
D_inv_sqrt = np.diag(d_inv_sqrt)
|
|
589
|
+
L_norm = np.eye(G) - D_inv_sqrt @ W @ D_inv_sqrt
|
|
590
|
+
|
|
591
|
+
eigenvalues, eigenvectors = np.linalg.eigh(L_norm)
|
|
592
|
+
|
|
593
|
+
max_k = min(20, G // 2)
|
|
594
|
+
if max_k < 2:
|
|
595
|
+
max_k = 2
|
|
596
|
+
eigengaps = np.diff(eigenvalues[:max_k + 1])
|
|
597
|
+
|
|
598
|
+
if n_clusters is not None:
|
|
599
|
+
K = n_clusters
|
|
600
|
+
else:
|
|
601
|
+
K = int(np.argmax(eigengaps[1:max_k]) + 2)
|
|
602
|
+
K = max(K, 2)
|
|
603
|
+
|
|
604
|
+
vecs = eigenvectors[:, :K]
|
|
605
|
+
row_norms = np.linalg.norm(vecs, axis=1, keepdims=True)
|
|
606
|
+
row_norms[row_norms == 0] = 1.0
|
|
607
|
+
vecs = vecs / row_norms
|
|
608
|
+
|
|
609
|
+
labels = self._kmeans(vecs, K)
|
|
610
|
+
|
|
611
|
+
return labels, K, eigengaps
|
|
612
|
+
|
|
613
|
+
@staticmethod
|
|
614
|
+
def _kmeans(X: np.ndarray, K: int, max_iter: int = 300) -> np.ndarray:
|
|
615
|
+
"""Simple K-means clustering (avoids sklearn dependency)."""
|
|
616
|
+
G = X.shape[0]
|
|
617
|
+
rng = np.random.RandomState(42)
|
|
618
|
+
|
|
619
|
+
centers = [X[rng.randint(G)]]
|
|
620
|
+
for _ in range(1, K):
|
|
621
|
+
dists = np.array([
|
|
622
|
+
min(np.sum((x - c) ** 2) for c in centers)
|
|
623
|
+
for x in X
|
|
624
|
+
])
|
|
625
|
+
probs = dists / dists.sum() if dists.sum() > 0 else np.ones(G) / G
|
|
626
|
+
centers.append(X[rng.choice(G, p=probs)])
|
|
627
|
+
centers = np.array(centers)
|
|
628
|
+
|
|
629
|
+
for _ in range(max_iter):
|
|
630
|
+
dists_to_centers = np.array([
|
|
631
|
+
np.sum((X - c) ** 2, axis=1) for c in centers
|
|
632
|
+
]).T
|
|
633
|
+
labels = np.argmin(dists_to_centers, axis=1)
|
|
634
|
+
|
|
635
|
+
new_centers = np.array([
|
|
636
|
+
X[labels == k].mean(axis=0) if np.any(labels == k) else centers[k]
|
|
637
|
+
for k in range(K)
|
|
638
|
+
])
|
|
639
|
+
if np.allclose(new_centers, centers):
|
|
640
|
+
break
|
|
641
|
+
centers = new_centers
|
|
642
|
+
|
|
643
|
+
return labels
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "2.1.37"
|
|
@@ -95,6 +95,7 @@ phykit/services/tree/rename_tree_tips.py
|
|
|
95
95
|
phykit/services/tree/rf_distance.py
|
|
96
96
|
phykit/services/tree/root_tree.py
|
|
97
97
|
phykit/services/tree/saturation.py
|
|
98
|
+
phykit/services/tree/spectral_discordance.py
|
|
98
99
|
phykit/services/tree/spurious_sequence.py
|
|
99
100
|
phykit/services/tree/stochastic_character_map.py
|
|
100
101
|
phykit/services/tree/terminal_branch_stats.py
|
|
@@ -49,17 +49,22 @@ pk_create_concat = phykit.phykit:create_concatenation_matrix
|
|
|
49
49
|
pk_create_concatenation_matrix = phykit.phykit:create_concatenation_matrix
|
|
50
50
|
pk_cs = phykit.phykit:column_score
|
|
51
51
|
pk_ctree = phykit.phykit:consensus_tree
|
|
52
|
+
pk_da = phykit.phykit:discordance_asymmetry
|
|
52
53
|
pk_degree_of_violation_of_a_molecular_clock = phykit.phykit:dvmc
|
|
53
54
|
pk_density_map = phykit.phykit:density_map
|
|
54
55
|
pk_densitymap = phykit.phykit:density_map
|
|
55
56
|
pk_detect_shifts = phykit.phykit:ou_shift_detection
|
|
56
57
|
pk_dimreduce = phykit.phykit:phylogenetic_ordination
|
|
58
|
+
pk_disc_asym = phykit.phykit:discordance_asymmetry
|
|
59
|
+
pk_discordance_asymmetry = phykit.phykit:discordance_asymmetry
|
|
57
60
|
pk_dmap = phykit.phykit:density_map
|
|
58
61
|
pk_dvmc = phykit.phykit:dvmc
|
|
59
62
|
pk_entropy = phykit.phykit:alignment_entropy
|
|
60
63
|
pk_erps = phykit.phykit:evolutionary_rate_per_site
|
|
64
|
+
pk_etm = phykit.phykit:evo_tempo_map
|
|
61
65
|
pk_evo_rate = phykit.phykit:evolutionary_rate
|
|
62
66
|
pk_evo_rate_per_site = phykit.phykit:evolutionary_rate_per_site
|
|
67
|
+
pk_evo_tempo_map = phykit.phykit:evo_tempo_map
|
|
63
68
|
pk_evolutionary_rate = phykit.phykit:evolutionary_rate
|
|
64
69
|
pk_evolutionary_rate_per_site = phykit.phykit:evolutionary_rate_per_site
|
|
65
70
|
pk_faidx = phykit.phykit:faidx
|
|
@@ -179,9 +184,12 @@ pk_rt = phykit.phykit:root_tree
|
|
|
179
184
|
pk_sat = phykit.phykit:saturation
|
|
180
185
|
pk_saturation = phykit.phykit:saturation
|
|
181
186
|
pk_scm = phykit.phykit:stochastic_character_map
|
|
187
|
+
pk_sd = phykit.phykit:spectral_discordance
|
|
182
188
|
pk_simmap = phykit.phykit:stochastic_character_map
|
|
183
189
|
pk_sop = phykit.phykit:sum_of_pairs_score
|
|
184
190
|
pk_sops = phykit.phykit:sum_of_pairs_score
|
|
191
|
+
pk_spec_disc = phykit.phykit:spectral_discordance
|
|
192
|
+
pk_spectral_discordance = phykit.phykit:spectral_discordance
|
|
185
193
|
pk_splitnet = phykit.phykit:consensus_network
|
|
186
194
|
pk_splits_network = phykit.phykit:consensus_network
|
|
187
195
|
pk_spurious_seq = phykit.phykit:spurious_sequence
|
|
@@ -256,6 +256,14 @@ setup(
|
|
|
256
256
|
"pk_treeness_over_rcv = phykit.phykit:treeness_over_rcv",
|
|
257
257
|
"pk_toverr = phykit.phykit:treeness_over_rcv",
|
|
258
258
|
"pk_tor = phykit.phykit:treeness_over_rcv",
|
|
259
|
+
"pk_evo_tempo_map = phykit.phykit:evo_tempo_map",
|
|
260
|
+
"pk_etm = phykit.phykit:evo_tempo_map",
|
|
261
|
+
"pk_discordance_asymmetry = phykit.phykit:discordance_asymmetry",
|
|
262
|
+
"pk_disc_asym = phykit.phykit:discordance_asymmetry",
|
|
263
|
+
"pk_da = phykit.phykit:discordance_asymmetry",
|
|
264
|
+
"pk_spectral_discordance = phykit.phykit:spectral_discordance",
|
|
265
|
+
"pk_spec_disc = phykit.phykit:spectral_discordance",
|
|
266
|
+
"pk_sd = phykit.phykit:spectral_discordance",
|
|
259
267
|
"pk_create_concatenation_matrix = phykit.phykit:create_concatenation_matrix", # Helper functions
|
|
260
268
|
"pk_create_concat = phykit.phykit:create_concatenation_matrix",
|
|
261
269
|
"pk_cc = phykit.phykit:create_concatenation_matrix",
|
phykit-2.1.35/phykit/version.py
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "2.1.35"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|