treesak 1.51.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of treesak might be problematic. Click here for more details.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +130 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +290 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +106 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/RootTreeGTDB226.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +19 -0
- TreeSAK/SplitScore1.py +178 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +597 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +158 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +77 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.51.2.data/scripts/TreeSAK +950 -0
- treesak-1.51.2.dist-info/LICENSE +674 -0
- treesak-1.51.2.dist-info/METADATA +27 -0
- treesak-1.51.2.dist-info/RECORD +125 -0
- treesak-1.51.2.dist-info/WHEEL +5 -0
- treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/PMSF.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import argparse
|
|
3
|
+
from distutils.spawn import find_executable
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
PMSF_usage = '''
|
|
7
|
+
==================== PMSF example commands ====================
|
|
8
|
+
|
|
9
|
+
# Dependency: iqtree2
|
|
10
|
+
|
|
11
|
+
TreeSAK PMSF -i in.aln -o get_PMSF_tree_wd -t 12
|
|
12
|
+
TreeSAK PMSF -i in.aln -o get_PMSF_tree_wd -t 12 -topo topo.tree
|
|
13
|
+
|
|
14
|
+
# This is a wrapper for:
|
|
15
|
+
iqtree2 -T 12 -B 1000 --alrt 1000 --quiet --seqtype AA -s in.aln --prefix guide_tree -m LG+F+G
|
|
16
|
+
iqtree2 -T 12 -B 1000 --alrt 1000 --quiet --seqtype AA -s in.aln --prefix PMSF -m LG+C60+F+G -ft guide_tree.treefile
|
|
17
|
+
|
|
18
|
+
# more information: http://www.iqtree.org/doc/Complex-Models
|
|
19
|
+
|
|
20
|
+
# Reference: The evolutionary origin of host association in the Rickettsiales
|
|
21
|
+
Maximum likelihood phylogenetic reconstructions were done under the PMSF approximation
|
|
22
|
+
(with 100 non-parametric bootstraps; guidetree under LG+G+F) of the LG+C60+F+Γ4 model
|
|
23
|
+
(selected by ModelFinder) for both supermatrix alignments with IQTREE v1.6.5.
|
|
24
|
+
|
|
25
|
+
===============================================================
|
|
26
|
+
'''
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def PMSF(args):
    """Infer a PMSF tree with IQ-TREE: build a guide tree, then the PMSF tree.

    Args:
        args: dict of options with the keys
            'i'    path to the input MSA file
            'gm'   IQ-TREE model used for the guide tree
            'm'    IQ-TREE model used for the PMSF tree
            'o'    output directory
            'p'    prefix of the PMSF output files inside the output directory
            'f'    True to delete and recreate an existing output directory
            't'    number of threads (passed to -T)
            'topo' constraint tree passed to -g, or None

    Side effects:
        Creates <o>/ and <o>/guide_tree/, writes both commands to <o>/cmds.txt
        and runs IQ-TREE twice.

    Raises:
        SystemExit: with status 1 if IQ-TREE or the MSA is missing, if the
            output directory exists and 'f' is not True, or if an IQ-TREE run
            returns a non-zero status.
    """
    # Function-scope imports: only os/argparse are guaranteed by the module header.
    import shlex
    import shutil
    import subprocess
    import sys

    msa_in = args['i']
    iqtree_model_guide_tree = args['gm']
    iqtree_model = args['m']
    op_dir = args['o']
    tree_prefix = args['p']
    force_overwrite = args['f']
    num_of_threads = args['t']
    topo_constraint_txt = args['topo']

    guide_tree_wd = '%s/guide_tree' % op_dir
    pwd_guide_tree = '%s/guide_tree.treefile' % guide_tree_wd
    pwd_cmd_txt = '%s/cmds.txt' % op_dir

    # prefer iqtree2, fall back to iqtree
    iqtree_exe = next((exe for exe in ('iqtree2', 'iqtree') if shutil.which(exe)), None)
    if iqtree_exe is None:
        print('iqtree not detected, program exited!')
        sys.exit(1)

    # check input file
    if not os.path.isfile(msa_in):
        print('MSA file not found, program exited!')
        sys.exit(1)

    # create output dir (no shell involved, so paths with spaces are safe)
    if os.path.isdir(op_dir):
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('output folder already exist, program exited!')
            sys.exit(1)
    os.makedirs(guide_tree_wd)

    # commands are kept as argument lists and only rendered to text for logging
    guidetree_cmd = [iqtree_exe, '-s', msa_in,
                     '--prefix', '%s/guide_tree' % guide_tree_wd,
                     '--seqtype', 'AA', '-m', iqtree_model_guide_tree,
                     '-T', str(num_of_threads),
                     '-B', '1000', '--alrt', '1000', '--quiet']
    iqtree_cmd = [iqtree_exe, '-s', msa_in,
                  '--prefix', '%s/%s' % (op_dir, tree_prefix),
                  '--seqtype', 'AA', '-m', iqtree_model,
                  '-T', str(num_of_threads),
                  '-B', '1000', '--alrt', '1000', '--quiet',
                  '-ft', pwd_guide_tree]
    if topo_constraint_txt is not None:
        guidetree_cmd += ['-g', topo_constraint_txt]
        iqtree_cmd += ['-g', topo_constraint_txt]

    guidetree_cmd_txt = ' '.join(shlex.quote(part) for part in guidetree_cmd)
    iqtree_cmd_txt = ' '.join(shlex.quote(part) for part in iqtree_cmd)

    # write out commands
    with open(pwd_cmd_txt, 'w') as pwd_cmd_txt_handle:
        pwd_cmd_txt_handle.write(guidetree_cmd_txt + '\n')
        pwd_cmd_txt_handle.write(iqtree_cmd_txt + '\n')

    # get guide tree
    print('Building guide tree')
    print(guidetree_cmd_txt)
    if subprocess.run(guidetree_cmd).returncode != 0:
        # the PMSF step needs guide_tree.treefile, so there is no point continuing
        print('Guide tree inference failed, program exited!')
        sys.exit(1)

    # get PMSF tree
    print('Building PMSF tree with model %s' % iqtree_model)
    print(iqtree_cmd_txt)
    if subprocess.run(iqtree_cmd).returncode != 0:
        print('PMSF tree inference failed, program exited!')
        sys.exit(1)

    print('Done!')
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
if __name__ == '__main__':

    # Command-line entry point: parse the options and hand them to PMSF() as a dict.
    # The usage text is passed to the parser so -h shows the example commands.
    PMSF_parser = argparse.ArgumentParser(usage=PMSF_usage)
    PMSF_parser.add_argument('-i',    required=True,                              help='input MSA file')
    PMSF_parser.add_argument('-gm',   required=False, default='LG+F+G',           help='iqtree model for guide tree, default: LG+F+G')
    PMSF_parser.add_argument('-m',    required=False, default='LG+C60+F+G',       help='iqtree model, default: LG+C60+F+G')
    # -o is used as a directory by PMSF(); the previous help text said "output plot"
    PMSF_parser.add_argument('-o',    required=True,                              help='output directory')
    PMSF_parser.add_argument('-p',    required=False, default='PMSF',             help='tree prefix, default: PMSF')
    PMSF_parser.add_argument('-topo', required=False, default=None,               help='topological constraint tree, pass to -g, default is None')
    PMSF_parser.add_argument('-t',    required=False, type=int, default=1,        help='num of threads')
    PMSF_parser.add_argument('-f',    required=False, action="store_true",        help='force overwrite')
    args = vars(PMSF_parser.parse_args())
    PMSF(args)
|
TreeSAK/PhyloBiAssoc.R
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# PhyloBiAssoc.R
#
# For every trait column of the data file (all columns after "ID" and "cate"):
#   1) test for phylogenetic signal with phytools::phylosig (lambda method);
#   2) if the phylosig p-value is <= 0.05, test the association between the
#      trait and "cate" with ape::binaryPGLMM, otherwise with chisq.test;
#   3) print one tab-separated result line to stdout (nothing is printed for a
#      trait whose phylosig p-value is NA/NaN).
#
# Usage:  Rscript PhyloBiAssoc.R -t demo.tre -d demo.txt
# The header of the first two columns has to be "ID" and "cate".

suppressMessages(suppressWarnings(library("ape")))
suppressMessages(suppressWarnings(library("phytools")))
suppressMessages(suppressWarnings(library("optparse")))

################################################################################

option_list = list(
  make_option(c("-t", "--treefile"), type="character", default=NULL, help="tree file"),
  make_option(c("-d", "--datafile"), type="character", default=NULL, help="data file"));
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

tree_file = opt$treefile
data_file = opt$datafile

# fail early with the help text instead of an opaque read.tree()/read.table() error
if (is.null(tree_file) || is.null(data_file)) {
  print_help(opt_parser)
  stop("both -t (tree file) and -d (data file) are required", call. = FALSE)
}

################################################################################

geotree <- read.tree(tree_file)
geodata <- read.table(data_file, header = TRUE, sep = "\t")

# sort rows in df according to the order of tips in the tree
row.names(geodata) <- geodata$ID
geodata <- geodata[geotree$tip.label,]
colnames_all = colnames(geodata)
colnames_to_process = tail(colnames_all, -2)   # drop "ID" and "cate"

cat('ID', "phylosig", "binaryPGLMM", "chisq.test", "coefficient", "significance", '\n', fill=FALSE, sep = "\t")

for (i in colnames_to_process){

  trait_values = setNames(geodata[, i], geodata$ID)

  # perform phylosig test
  phylosig_test <- phylosig(tree = geotree, x = trait_values, method = "lambda", test = TRUE)
  phylosig_test_pvalue = phylosig_test$P

  # do nothing if phylosig returns NA/NaN; is.na() covers both, whereas the
  # former string comparison with 'NaN' made if() fail on a plain NA
  if (is.na(phylosig_test_pvalue)) {
    next
  }

  association_coefficient = 'na'
  significant = 'n'

  if (phylosig_test_pvalue <= 0.05) {

    # significant phylogenetic signal: binaryPGLMM
    binaryPGLMM_result <- binaryPGLMM(trait_values ~ geodata$cate, phy = geotree)
    # NOTE(review): B[1] is the first coefficient while the p-value below is
    # taken from position 2 -- confirm that the first coefficient is the one
    # that should be reported.
    association_coefficient = binaryPGLMM_result$B[1]
    association_p_value = binaryPGLMM_result$B.pvalue[2]
    if (!is.na(association_p_value) && association_p_value <= 0.05) {
      significant = 'y'
    }
    cat(i, phylosig_test_pvalue, association_p_value, 'na', association_coefficient, significant, '\n', fill=FALSE, sep = "\t")

  } else {

    # no phylogenetic signal: chi-squared test
    chisq_test <- chisq.test(table(geodata$cate, trait_values))
    association_p_value = chisq_test$p.value
    # the p-value can be NaN for a degenerate table; treat that as not significant
    if (!is.na(association_p_value) && association_p_value <= 0.05) {
      significant = 'y'
    }
    cat(i, phylosig_test_pvalue, 'na', association_p_value, association_coefficient, significant, '\n', fill=FALSE, sep = "\t")
  }
}
|
TreeSAK/PhyloBiAssoc.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import argparse
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import multiprocessing as mp
|
|
5
|
+
from statsmodels.stats.multitest import multipletests
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Help text shown by the PhyloBiAssoc argument parser (passed as usage=).
# The example command uses the TreeSAK entry point, like the other modules of this package.
PhyloBiAssoc_usage = '''
============================= PhyloBiAssoc example commands =============================

TreeSAK PhyloBiAssoc -i demo.tre -d demo.txt -o op_dir -t 10 -f

# Note, header for the first two columns in -d has to be "ID" and "cate"!!!

# It will perform:
# 1) binaryPGLMM test if phylosig p-value <= 0.05 (significant phylogenetic signal)
# 2) chi-squared test if phylosig p-value > 0.05  (no phylogenetic signal)
# 3) do nothing if phylosig returns NaN (might due to the same value across all genomes)

# https://www.rdocumentation.org/packages/ape/versions/5.7-1/topics/binaryPGLMM

=========================================================================================
'''
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def sep_path_basename_ext(file_in):
    """Split a file path into (directory, basename without extension, extension).

    The directory is reported as '.' when *file_in* has no directory part.
    The extension keeps its leading dot and is '' when there is none.
    """
    f_path, file_name = os.path.split(file_in)
    f_base, f_ext = os.path.splitext(file_name)
    return (f_path or '.'), f_base, f_ext
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def subset_df(file_in, rows_to_keep, cols_to_keep, sep_symbol, row_name_pos, column_name_pos, file_out):
    """Write a row/column subset of a delimited table to *file_out*.

    Args:
        file_in: path of the input table.
        rows_to_keep: row labels to keep; an empty collection keeps all rows.
        cols_to_keep: column labels to keep; an empty collection keeps all columns.
        sep_symbol: field separator used for reading and writing.
        row_name_pos: index_col passed to pandas.read_csv.
        column_name_pos: header passed to pandas.read_csv.
        file_out: path of the output table.

    Raises:
        KeyError: if a requested label is not present in the table.
    """

    def as_indexer(wanted, axis_labels):
        # An empty selection means "keep everything".
        if len(wanted) == 0:
            return slice(None)
        # Recent pandas versions reject sets as .loc indexers, so sets are
        # turned into a list that follows the order of the table.
        if isinstance(wanted, (set, frozenset)):
            missing = wanted.difference(axis_labels)
            if missing:
                raise KeyError('labels not found: %s' % sorted(missing, key=str))
            return [label for label in axis_labels if label in wanted]
        return list(wanted)

    df = pd.read_csv(file_in, sep=sep_symbol, header=column_name_pos, index_col=row_name_pos)

    selected = df.loc[as_indexer(rows_to_keep, df.index), as_indexer(cols_to_keep, df.columns)]
    selected.to_csv(file_out, sep=sep_symbol)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def PhyloBiAssoc(args):
    """Run PhyloBiAssoc.R on every trait column of a table and combine the results.

    Args:
        args: dict of options with the keys
            'i' tree file
            'd' tab-separated data file; column 1 is the ID, column 2 is "cate",
                every further column is one trait
            'o' output directory
            't' number of worker processes
            'f' True to delete and recreate an existing output directory

    Outputs (inside the output directory):
        cmds.txt                         the Rscript commands that were run
        df_subset/<trait>.tab            "cate" plus one trait column
        stats_results/<trait>.txt        raw output of PhyloBiAssoc.R
        stats_results_all.txt            all result lines
        stats_results_0.05.txt           traits flagged significant by the R script
        stats_results_0.05_adjusted.txt  traits still <= 0.05 after Bonferroni

    Raises:
        SystemExit: with status 1 if the output directory exists and 'f' is not True.
    """
    # Function-scope imports: not provided by the module header.
    import shlex
    import shutil
    import sys

    tree_file = args['i']
    data_file = args['d']
    op_dir = args['o']
    num_threads = args['t']
    force_create_op_dir = args['f']

    # the R script is expected next to this module
    current_script_path = os.path.dirname(os.path.realpath(__file__))
    PhyloBiAssoc_R = '%s/PhyloBiAssoc.R' % current_script_path

    cmd_txt = '%s/cmds.txt' % op_dir
    df_subset_dir = '%s/df_subset' % op_dir
    stats_op_dir = '%s/stats_results' % op_dir
    combined_stats_txt = '%s/stats_results_all.txt' % op_dir
    combined_stats_txt_sig = '%s/stats_results_0.05.txt' % op_dir
    combined_stats_txt_adjusted = '%s/stats_results_0.05_adjusted.txt' % op_dir

    # create op_dir (no shell involved, so paths with spaces are safe)
    if os.path.isdir(op_dir):
        if force_create_op_dir is True:
            shutil.rmtree(op_dir)
        else:
            print('output directory exist, program exited!')
            sys.exit(1)
    os.makedirs(df_subset_dir)
    os.makedirs(stats_op_dir)

    # read in dataframe; the first data column is "cate", the rest are traits
    df = pd.read_csv(data_file, sep='\t', header=0, index_col=0)
    trait_list = list(df.columns.values)[1:]

    # subset dataframe and prepare one Rscript command per trait
    stats_cmd_list = []
    op_stats_txt_list = []
    with open(cmd_txt, 'w') as cmd_txt_handle:
        for each_trait in trait_list:
            df_subset_file = '%s/%s.tab' % (df_subset_dir, each_trait)
            stats_out_txt = '%s/%s.txt' % (stats_op_dir, each_trait)
            subset_df(data_file, set(), ['cate', each_trait], '\t', 0, 0, df_subset_file)
            # every path is quoted because the command is run through the shell
            stats_cmd = 'Rscript %s -t %s -d %s > %s' % (shlex.quote(PhyloBiAssoc_R),
                                                         shlex.quote(tree_file),
                                                         shlex.quote(df_subset_file),
                                                         shlex.quote(stats_out_txt))
            cmd_txt_handle.write(stats_cmd + '\n')
            stats_cmd_list.append(stats_cmd)
            op_stats_txt_list.append(stats_out_txt)

    print('Processing %s objects with %s cores' % (len(stats_cmd_list), num_threads))
    pool = mp.Pool(processes=num_threads)
    try:
        exit_status_list = pool.map(os.system, stats_cmd_list)
    finally:
        pool.close()
        pool.join()
    failed_cmd_num = sum(1 for status in exit_status_list if status != 0)
    if failed_cmd_num > 0:
        print('Warning: %s of %s Rscript commands returned a non-zero status' % (failed_cmd_num, len(stats_cmd_list)))

    # combine stats results
    sig_list_id = []
    sig_list_value = []
    header_cols = ['ID', 'phylosig', 'binaryPGLMM', 'chisq.test', 'coefficient', 'significance']
    with open(combined_stats_txt, 'w') as combined_handle, open(combined_stats_txt_sig, 'w') as sig_handle:
        # tab-separated, like the data lines written below
        combined_handle.write('\t'.join(header_cols) + '\n')
        for each_file in sorted(op_stats_txt_list):
            f_path, f_base, f_ext = sep_path_basename_ext(each_file)
            with open(each_file) as stats_handle:
                for each_line in stats_handle:
                    if each_line.startswith('ID\tphylosig\tbinaryPGLMM\tchisq.test'):
                        continue
                    each_line_split = each_line.strip().split('\t')
                    # skip blank or truncated lines (e.g. from a failed R run)
                    if len(each_line_split) < 6:
                        continue
                    combined_handle.write('%s\t%s\n' % (f_base, '\t'.join(each_line_split[1:])))
                    if each_line_split[5] == 'y':
                        sig_handle.write(f_base + '\n')
                        sig_bi = each_line_split[2]
                        sig_chi = each_line_split[3]
                        # exactly one of the two tests is run per trait, the other column is 'na'
                        current_sig = sig_chi if sig_bi == 'na' else sig_bi
                        sig_list_id.append(f_base)
                        sig_list_value.append(float(current_sig))

    # perform Bonferroni correction
    # NOTE(review): only the p-values already flagged significant are corrected,
    # so the number of tests used is smaller than the number of traits tested --
    # confirm this is intended.
    if sig_list_value:
        sig_list_value_adjusted = list(multipletests(sig_list_value, alpha=0.1, method='bonferroni')[1])
    else:
        # nothing to correct; calling multipletests with no p-values is avoided
        sig_list_value_adjusted = []

    # write out adjusted p values
    with open(combined_stats_txt_adjusted, 'w') as adjusted_handle:
        adjusted_handle.write('ID\tadjusted_p_value\n')
        for (trait_id, adjusted_p) in zip(sig_list_id, sig_list_value_adjusted):
            if adjusted_p <= 0.05:
                adjusted_handle.write('%s\t%s\n' % (trait_id, adjusted_p))

    # Final report
    print('Results exported to: \n%s\n%s' % (combined_stats_txt, combined_stats_txt_adjusted))
    print('Done')
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
if __name__ == "__main__":

    # Command-line entry point: declare the options as a table, register them,
    # then hand the parsed values to PhyloBiAssoc() as a dict.
    PhyloBiAssoc_parser = argparse.ArgumentParser(usage=PhyloBiAssoc_usage)
    option_table = (
        ('-i', dict(required=True, help='tree file')),
        ('-d', dict(required=True, help='data file')),
        ('-o', dict(required=True, help='output directory')),
        ('-t', dict(required=False, type=int, default=1, help='number of threads, default: 1')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
    )
    for option_flag, option_settings in option_table:
        PhyloBiAssoc_parser.add_argument(option_flag, **option_settings)
    args = vars(PhyloBiAssoc_parser.parse_args())
    PhyloBiAssoc(args)
|
TreeSAK/PlotMCMC.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import matplotlib as mpl
|
|
2
|
+
mpl.use('Agg')
|
|
3
|
+
import matplotlib.pyplot as plt
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def PlotMCMC(num_list_x, num_list_y, err_range_x, err_range_y, pwd_figure):
    """Plot points with asymmetric horizontal and vertical error bars.

    Args:
        num_list_x: x coordinate of each point.
        num_list_y: y coordinate of each point.
        err_range_x: one (lower, upper) pair per point giving the x interval.
        err_range_y: one (lower, upper) pair per point giving the y interval.
        pwd_figure: path the figure is saved to.

    Raises:
        IndexError: if num_list_y, err_range_x or err_range_y is shorter than
            num_list_x.
    """
    # errorbar() wants distances from the point, not interval bounds
    x_err_l = []
    x_err_r = []
    y_err_l = []
    y_err_u = []
    for n, x_value in enumerate(num_list_x):
        y_value = num_list_y[n]
        x_low, x_high = err_range_x[n][0], err_range_x[n][1]
        y_low, y_high = err_range_y[n][0], err_range_y[n][1]
        x_err_l.append(abs(x_value - x_low))
        x_err_r.append(abs(x_high - x_value))
        y_err_l.append(abs(y_value - y_low))
        y_err_u.append(abs(y_high - y_value))

    try:
        # zero-size markers: only the error bars are drawn
        plt.scatter(num_list_x, num_list_y, s=0)
        plt.errorbar(num_list_x, num_list_y, xerr=[x_err_l, x_err_r], yerr=[y_err_l, y_err_u], ls='none', ecolor='black', elinewidth=1)
        plt.tight_layout()
        plt.savefig(pwd_figure)
    finally:
        # always release the current figure, even if drawing or saving failed
        plt.close()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Demo data for PlotMCMC(): five points with their (lower, upper) intervals.
num_list_x  = [1, 2, 3, 4, 5]
num_list_y  = [1, 2, 3, 4, 5]
err_range_x = [[0.8, 1.1], [1.5, 2.1], [2.9, 3.7], [3.2, 4.1], [4.5, 5.5]]
err_range_y = [[0.7, 1.3], [1.9, 2.5], [2.9, 3.7], [3.1, 4.4], [4.5, 5.5]]
pwd_figure  = '/Users/songweizhi/Desktop/aaa.png'

# Only draw the demo when the file is run as a script, so that importing the
# module does not try to write to the hard-coded path above.
if __name__ == '__main__':
    PlotMCMC(num_list_x, num_list_y, err_range_x, err_range_y, pwd_figure)
|
TreeSAK/PlotMcmcNode.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import argparse
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import plotly.express as px
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Example commands and input format for PlotMcmcNode.
# The example uses only the options the parser defines (-i and -o).
PlotMcmcNode_usage = '''
========================== PlotMcmcNode example commands ==========================

TreeSAK PlotMcmcNode -i topo123_nodes.txt -o topo123_age.pdf

# txt file format (-i, tab separated, no header)
# column 1: mcmc file
# column 2: node id
# column 3: mcmc file description
# column 4: node description

path/to/topo1_clock3_mcmc.txt t_n171 Topo1 Symbiosis_event_1
path/to/topo1_clock3_mcmc.txt t_n151 Topo1 Symbiosis_event_2
path/to/topo1_clock3_mcmc.txt t_n131 Topo1 Symbiosis_event_3
path/to/topo2_clock3_mcmc.txt t_n171 Topo2 Symbiosis_event_1
path/to/topo3_clock3_mcmc.txt t_n171 Topo3 Symbiosis_event_1

===================================================================================
'''
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def sep_path_basename_ext(file_in):
    """Split a file path into (directory, basename without extension, extension).

    The directory is reported as '.' when *file_in* has no directory part.
    The extension keeps its leading dot and is '' when there is none.
    """
    # separate path and file name
    f_path, file_name = os.path.split(file_in)

    # separate file basename and extension
    f_base, f_ext = os.path.splitext(file_name)

    return (f_path or '.'), f_base, f_ext
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def plot_distribution(df_txt, output_plot):
    """Draw horizontal half-violins of the 'Value' column and save the figure.

    Args:
        df_txt: comma-separated table with the columns Value, Node and Setting.
        output_plot: path the figure is written to.
    """
    df = pd.read_table(df_txt, sep=',')
    setting_count = len(df['Setting'].unique())
    node_count = len(df['Node'].unique())

    # sort dataframe by run id
    df = df.sort_values(by='Setting', ascending=False)

    # 100 px per setting, but never less than 360 px
    plot_width = 900
    plot_height = max(setting_count * 100, 360)

    fig = px.violin(df, x="Value", y="Setting", color="Node", points=False, orientation="h", width=plot_width, height=plot_height)
    fig = fig.update_layout(xaxis_title="Age", yaxis_title="Settings")

    # a single node gets a filled violin, several nodes get transparent outlines
    if node_count == 1:
        trace_style = dict(side="positive", fillcolor='lightblue', width=1.6, opacity=0.75)
    else:
        trace_style = dict(side="positive", fillcolor='rgba(0,0,0,0)', width=1.6)
    fig.update_traces(**trace_style)

    fig.update_traces(showlegend=True)
    fig.layout.template = "simple_white"
    fig.write_image(output_plot)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def PlotMcmcNode(args):
    """Plot the value distribution of selected nodes from one or more mcmc files.

    Args:
        args: dict of options with the keys
            'i' text file listing, per line, whitespace-separated:
                mcmc file, node id, optional mcmc file label, optional node label
            'o' output plot; '.pdf' is appended if it does not already end with it

    Raises:
        SystemExit: with status 1 if a line of the input has fewer than two
            columns, if a listed mcmc file does not exist, or if none of the
            requested nodes is found in the mcmc files.
    """
    # Function-scope imports: not provided by the module header.
    import csv
    import sys

    node_txt = args['i']
    output_plot = args['o']

    if output_plot[-4:] not in ['.PDF', '.pdf']:
        output_plot = output_plot + '.pdf'
    op_df_tmp = output_plot + '.txt'

    # read in txt file
    file_to_node_dict = dict()
    file_rename_dict = dict()
    node_rename_dict = dict()
    not_found_file_set = set()
    with open(node_txt) as node_txt_handle:
        for each_node in node_txt_handle:
            line_split = each_node.strip().split()

            # blank lines (e.g. a trailing newline) carry no node
            if not line_split:
                continue
            if len(line_split) < 2:
                print('Line without node id: %s, program exited!' % each_node.strip())
                sys.exit(1)

            mcmc_file = line_split[0]
            node_id = line_split[1]

            if os.path.isfile(mcmc_file) is False:
                not_found_file_set.add(mcmc_file)

            # get file_to_node_dict
            file_to_node_dict.setdefault(mcmc_file, set()).add(node_id)

            # get file_rename_dict
            if len(line_split) >= 3:
                file_rename_dict[mcmc_file] = line_split[2]

            # get node_rename_dict; extra columns after the 4th are ignored
            if len(line_split) >= 4:
                node_rename_dict['%s_____%s' % (mcmc_file, node_id)] = line_split[3]

    if len(not_found_file_set) > 0:
        print('The following files not found program exited!')
        print('\n'.join(sorted(not_found_file_set)))
        sys.exit(1)

    try:
        # Collect the values in a temporary CSV read by plot_distribution().
        # csv.writer quotes labels that contain a comma.
        found_matched_node = False
        with open(op_df_tmp, 'w', newline='') as op_df_tmp_handle:
            tmp_writer = csv.writer(op_df_tmp_handle, lineterminator='\n')
            tmp_writer.writerow(['Value', 'Node', 'Setting'])
            for mcmc_file, current_node_set in file_to_node_dict.items():
                file_name_to_plot = file_rename_dict.get(mcmc_file, mcmc_file)
                mcmc_df = pd.read_table(mcmc_file, index_col=0)
                for each_col in mcmc_df:
                    if each_col not in current_node_set:
                        continue
                    node_desc_to_plot = node_rename_dict.get('%s_____%s' % (mcmc_file, each_col), each_col)
                    found_matched_node = True
                    for each_value in mcmc_df[each_col].values:
                        tmp_writer.writerow(['%s' % each_value, node_desc_to_plot, file_name_to_plot])

        if found_matched_node is False:
            print('Provided node(s) not found, program exited!')
            sys.exit(1)

        # plot distribution
        plot_distribution(op_df_tmp, output_plot)
    finally:
        # remove tmp files, also when plotting failed or no node matched
        if os.path.isfile(op_df_tmp):
            os.remove(op_df_tmp)

    # final report
    print('Plot exported to %s, done!' % output_plot)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
if __name__ == '__main__':

    # Command-line entry point: register the two required options from a table
    # and pass the parsed values to PlotMcmcNode() as a dict.
    PlotMcmcNode_parser = argparse.ArgumentParser()
    for option_flag, option_help in (('-i', 'input txt file'), ('-o', 'output plot')):
        PlotMcmcNode_parser.add_argument(option_flag, required=True, help=option_help)
    args = vars(PlotMcmcNode_parser.parse_args())
    PlotMcmcNode(args)
|