treesak 1.51.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of treesak might be problematic. Click here for more details.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +130 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +290 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +106 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/RootTreeGTDB226.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +19 -0
- TreeSAK/SplitScore1.py +178 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +597 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +158 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +77 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.51.2.data/scripts/TreeSAK +950 -0
- treesak-1.51.2.dist-info/LICENSE +674 -0
- treesak-1.51.2.dist-info/METADATA +27 -0
- treesak-1.51.2.dist-info/RECORD +125 -0
- treesak-1.51.2.dist-info/WHEEL +5 -0
- treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/TaxaCountStats.R
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
|
|
2
|
+
################################################################################
|
|
3
|
+
|
|
4
|
+
# R script for ranking marker proteins for generating concatenated species trees (127 taxa set)
|
|
5
|
+
# Dombrowski et al., 2020
|
|
6
|
+
# Finalized: February 2020
|
|
7
|
+
|
|
8
|
+
# modified by Weizhi
|
|
9
|
+
# Rscript /Users/songweizhi/PycharmProjects/Sponge_Hologenome/Scripts/TaxaCountStats.R -t treefile_v2.tre -l List_of_trees_2.txt -g mapping_3.txt -x MarkerList.txt -s TaxaCounts_op.txt -r Genes_to_remove.txt -o a.txt
|
|
10
|
+
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
suppressMessages(library(optparse))
|
|
14
|
+
suppressMessages(library(plyr))
|
|
15
|
+
suppressMessages(library(dbplyr))
|
|
16
|
+
suppressMessages(library(dplyr))
|
|
17
|
+
suppressMessages(library(tidyr))
|
|
18
|
+
suppressMessages(library(ggplot2))
|
|
19
|
+
suppressMessages(library(data.table))
|
|
20
|
+
suppressMessages(library(RColorBrewer))
|
|
21
|
+
suppressMessages(library(gplots))
|
|
22
|
+
suppressMessages(library(ape))
|
|
23
|
+
|
|
24
|
+
####################################### argument parser ######################################
|
|
25
|
+
|
|
26
|
+
option_list = list(
|
|
27
|
+
make_option(c("-t", "--tree"), type="character", help="combined_contree_file"),
|
|
28
|
+
make_option(c("-l", "--treelist"), type="character", help="list_of_trees_txt"),
|
|
29
|
+
make_option(c("-g", "--mapping"), type="character", help="mapping_txt"),
|
|
30
|
+
make_option(c("-x", "--markerlist"), type="character", help="marker_list_txt"),
|
|
31
|
+
make_option(c("-s", "--cstop"), type="character", help="combined_count_sister_taxa_output"),
|
|
32
|
+
make_option(c("-r", "--removegene"), type="character", help="genes_to_remove_txt"),
|
|
33
|
+
make_option(c("-o", "--output"), type="character", help="output table"));
|
|
34
|
+
|
|
35
|
+
opt_parser = OptionParser(option_list=option_list);
|
|
36
|
+
opt = parse_args(opt_parser);
|
|
37
|
+
|
|
38
|
+
combined_contree_file = opt$tree
|
|
39
|
+
list_of_trees_txt = opt$treelist
|
|
40
|
+
mapping_txt = opt$mapping
|
|
41
|
+
marker_list_txt = opt$markerlist
|
|
42
|
+
combined_count_sister_taxa_op = opt$cstop
|
|
43
|
+
genes_to_remove_txt = opt$removegene
|
|
44
|
+
output_table = opt$output
|
|
45
|
+
|
|
46
|
+
################################################################################
|
|
47
|
+
|
|
48
|
+
#rm(list=ls())
|
|
49
|
+
sessionInfo()
|
|
50
|
+
|
|
51
|
+
################################################################################
|
|
52
|
+
#0.1 setting working directory (!adjust wdir accordingly!)
|
|
53
|
+
################################################################################
|
|
54
|
+
|
|
55
|
+
# setting working directory (!adjust wdir accordingly!)
|
|
56
|
+
#wdir <- "/Users/songweizhi/Desktop/Anja_paper/Nina/4_151Marker_analyses/127_taxa"
|
|
57
|
+
#wdir <- "/Users/songweizhi/Desktop/Input_folder_to_R"
|
|
58
|
+
#setwd(wdir)
|
|
59
|
+
|
|
60
|
+
# Weizhi
|
|
61
|
+
# List_of_trees_2.txt id of marker genes (HOGs) (152, mind order)
|
|
62
|
+
# treefile_v2.tre tree corresponding to each marker gene (HOG)
|
|
63
|
+
# mapping_3.txt taxonomy/cluster/group/color for each genome (Domain is higher than cluster)
|
|
64
|
+
# Genes_to_remove.txt basically a list of marker gene ids
|
|
65
|
+
# MarkerList.txt basically a list of marker gene ids (152, mind order)
|
|
66
|
+
# TaxaCounts_151MarkerGenes_ArcRefv5UAP2_129taxa_v5.txt concatenated output from count_sister_taxa.py
|
|
67
|
+
|
|
68
|
+
################################################################################
|
|
69
|
+
#1. read in the treefiles (all concatenated in one large document)
|
|
70
|
+
################################################################################
|
|
71
|
+
#read in concatenated tree and list of trees
|
|
72
|
+
tree_order <- read.table(list_of_trees_txt, sep="\t", header=F, fill=TRUE, quote = "")
|
|
73
|
+
|
|
74
|
+
trees<-read.tree(combined_contree_file)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
#make a table from taxa labels and count how many taxa are in each tree
|
|
78
|
+
# read.tree() returns a single "phylo" object (not a "multiPhylo" list) when the
# file holds only one tree; wrap it so that it is handled like a list of trees
if (inherits(trees, "phylo")) {
  trees <- list(trees)
}

# every tree needs exactly one id from the tree list, otherwise the counts
# would be labelled with the wrong marker
if (length(trees) != nrow(tree_order)) {
  stop("Number of trees in ", combined_contree_file, " (", length(trees),
       ") does not match the number of ids in ", list_of_trees_txt,
       " (", nrow(tree_order), ")", call. = FALSE)
}

# one row per tree (row name = id from the first column of the tree list),
# one column "NrSpecies" holding the number of tip labels of that tree;
# seq_along() keeps this valid for an empty tree list (1:length() would give 1,0)
Species_in_tree <- data.frame(
  NrSpecies = vapply(seq_along(trees),
                     function(i) length(trees[[i]]$tip.label),
                     integer(1)),
  row.names = as.character(tree_order$V1)
)
|
|
87
|
+
#head(Species_in_tree)
|
|
88
|
+
|
|
89
|
+
################################################################################
|
|
90
|
+
#2. read in taxa mapping files and stats from tom's script
|
|
91
|
+
################################################################################
|
|
92
|
+
|
|
93
|
+
#taxa - taxonomy mapping file
|
|
94
|
+
mapping <- read.table(mapping_txt, sep="\t", header=T, fill=TRUE, quote = "", comment.char = "", check.names = FALSE)
|
|
95
|
+
#head(mapping)
|
|
96
|
+
|
|
97
|
+
#cleanup and shorten mapping file
|
|
98
|
+
mapping_clean <-unique(mapping[,c("Cluster", "Domain")])
|
|
99
|
+
#head(mapping_clean)
|
|
100
|
+
#dim(mapping_clean)
|
|
101
|
+
|
|
102
|
+
#list of genes to remove because in these trees archaea were not monophyletic
|
|
103
|
+
genes_to_remove <- read.table(genes_to_remove_txt, sep="\t", header=T, fill=TRUE, quote = "")
|
|
104
|
+
|
|
105
|
+
#list of total markers used in these analyses
|
|
106
|
+
MarkerList <- read.table(marker_list_txt, sep="\t", header=T, fill=TRUE, quote = "")
|
|
107
|
+
|
|
108
|
+
#read in statistics file from count_sister_taxa.py
|
|
109
|
+
SisterCounts <- read.table(combined_count_sister_taxa_op, sep="\t", header=T, fill=TRUE, quote = "")
|
|
110
|
+
#colnames(SisterCounts)
|
|
111
|
+
|
|
112
|
+
################################################################################
|
|
113
|
+
#3. transform data
|
|
114
|
+
################################################################################
|
|
115
|
+
#reduce the table by removing hits with low support (Normalized2_sum_of_occurances below 0.1 in this case, but the cutoff can be changed)
|
|
116
|
+
SisterCounts_temp0 <- subset(SisterCounts, Normalized2_sum_of_occurances >= 0.1)
|
|
117
|
+
|
|
118
|
+
#control, whether something is missing in the mapping file, the first setdiff is the relevant one!
|
|
119
|
+
List_taxa <- as.character(unique(SisterCounts_temp0$Group_of_interest))
|
|
120
|
+
#setdiff(List_taxa, mapping$Cluster)
|
|
121
|
+
|
|
122
|
+
#add in Group Info (i.e. DPANN, Eury, TACK) for Group of interest (needed to define HGT events)
|
|
123
|
+
SisterCounts_temp1 <- merge(SisterCounts_temp0, mapping_clean, by.x = "Group_of_interest", by.y = "Cluster", all.x = T)
|
|
124
|
+
colnames(SisterCounts_temp1) <- c( "Group_of_interest", "MarkerID", "Sister_taxa", "Normalized_sum_of_occurances","splits","Normalized2_sum_of_occurances","Clusters", "Group_of_interest_Group")
|
|
125
|
+
#head(SisterCounts_temp1)
|
|
126
|
+
|
|
127
|
+
#add in Group Info (i.e. DPANN, Eury, TACK) for Sister_taxa (needed to define HGT events)
|
|
128
|
+
SisterCounts_temp2 <- merge(SisterCounts_temp1, mapping_clean, by.x = "Sister_taxa", by.y = "Cluster", all.x = T)
|
|
129
|
+
colnames(SisterCounts_temp2) <- c( "Sister_taxa", "Group_of_interest", "MarkerID","Normalized_sum_of_occurances", "splits","Normalized2_sum_of_occurances","Clusters","Group_of_interest_Group", "Sister_taxon_Group")
|
|
130
|
+
#head(SisterCounts_temp2)
|
|
131
|
+
|
|
132
|
+
#resort dataframe for aesthetics
|
|
133
|
+
SisterCounts_temp3 <- SisterCounts_temp2[,c("MarkerID","Group_of_interest", "Group_of_interest_Group", "Sister_taxa", "Sister_taxon_Group", "Normalized_sum_of_occurances","splits", "Normalized2_sum_of_occurances","Clusters")]
|
|
134
|
+
#head(SisterCounts_temp3)
|
|
135
|
+
|
|
136
|
+
#count nr of total splits
#Notice: The column "Clusters" lists if there is a split (i.e. if UAP2 12 then all 12 MAGs are together, if UAP2 has 8,4 then UAP2 is split once with one cluster with 8 and the other with 4 taxa)
cluster_sizes <- as.character(SisterCounts_temp3$Clusters)

# number of comma-separated entries in "Clusters" = number of commas + 1
# (counted directly on the strings, so no text connection is left open and
# every row gets a value)
nr_commas <- lengths(regmatches(cluster_sizes, gregexpr(",", cluster_sizes, fixed = TRUE)))

# name the new column right away instead of renaming an auto-generated
# (deparsed and truncated) column name afterwards
SisterCounts_temp4 <- cbind(SisterCounts_temp3, NrSplits = nr_commas + 1L)
#head(SisterCounts_temp4)

#make new column and make a remark whether clusters of interest are split or not
SisterCounts_temp4$SplitGroups <- ifelse(grepl(",", cluster_sizes, fixed = TRUE), "split", "no")
#head(SisterCounts_temp4)
|
|
147
|
+
#head(SisterCounts_temp4)
|
|
148
|
+
|
|
149
|
+
#remove duplicates and keep the Group_of_interest with the best Normalized2_sum_of_occurances value
|
|
150
|
+
#this is done to have only one hit per arcog and group of interest, to better normalize the data by the total nr of arcogs and to have a consistent link to the total number of phylogenetic clusters
|
|
151
|
+
SisterCounts_best <-
|
|
152
|
+
SisterCounts_temp4 %>%
|
|
153
|
+
group_by(MarkerID,Group_of_interest) %>%
|
|
154
|
+
filter(Normalized2_sum_of_occurances == max(Normalized2_sum_of_occurances))
|
|
155
|
+
|
|
156
|
+
#count nr of taxonomic groups (i.e. clusters) in each tree
|
|
157
|
+
Nr_clusters <- Number_of_taxa <- ddply(SisterCounts_best, .(MarkerID), summarize, NrClusters = length(Group_of_interest))
|
|
158
|
+
#head(Nr_clusters)
|
|
159
|
+
|
|
160
|
+
#merge nr of clusters and nr of species with datatable
|
|
161
|
+
SisterCounts_best_temp1 <- merge(SisterCounts_best, Nr_clusters, by = "MarkerID")
|
|
162
|
+
SisterCounts_best_temp2 <- merge(SisterCounts_best_temp1,Species_in_tree, by.x = "MarkerID", by.y = "row.names" )
|
|
163
|
+
#head(SisterCounts_best_temp2)
|
|
164
|
+
|
|
165
|
+
#print table
|
|
166
|
+
# write.table(SisterCounts_best_temp1, "2_Output/Taxa_Summary_1.txt", sep = "\t", row.names = F, quote =F)
|
|
167
|
+
|
|
168
|
+
################################################################################
|
|
169
|
+
#4. summarize split events to be able to rank marker genes
|
|
170
|
+
################################################################################
|
|
171
|
+
#summarize splits/cluster: per marker, the number of groups of interest that are split ("split") or not ("no")
Split_counts <- ddply(SisterCounts_best_temp1, .(MarkerID,SplitGroups, NrClusters), summarise, quantity = length(NrSplits))

# fill = 0: a marker without any row for one of the two categories has a count
# of zero for it; leaving it NA would make SplitsPerCluster (and every rank
# derived from it) NA for markers without a single split cluster
Split_counts_wide <- spread(Split_counts, SplitGroups, quantity, fill = 0)

# when no marker at all has a split cluster, spread() creates no "split" column
if (!"split" %in% colnames(Split_counts_wide)) {
  Split_counts_wide$split <- rep(0, nrow(Split_counts_wide))
}

#make new column to calculate the percentage of split clusters
Split_counts_wide$SplitsPerCluster <- round((Split_counts_wide$split/Split_counts_wide$NrClusters)*100, digits = 1)
|
|
177
|
+
#head(Split_counts_wide)
|
|
178
|
+
|
|
179
|
+
#summarize and count the total number of splits
|
|
180
|
+
Split_Total <- ddply(SisterCounts_best_temp1, .(MarkerID, NrClusters), summarise, TotalSplits = sum(NrSplits))
|
|
181
|
+
#head(Split_Total)
|
|
182
|
+
|
|
183
|
+
#combine the two dataframes generated above
|
|
184
|
+
#Summary_temp1 <- merge(Split_counts_wide[,c("MarkerID","NrClusters","SplitsPerCluster")],HGT_Counts, by = "MarkerID" )
|
|
185
|
+
Summary_temp2 <- merge(Split_counts_wide[,c("MarkerID","NrClusters","SplitsPerCluster")], Split_Total, by = "MarkerID")
|
|
186
|
+
Summary_temp3 <- merge(Summary_temp2, Species_in_tree, by.x = "MarkerID", by.y = "row.names")
|
|
187
|
+
Summary_temp3$TotalSplits_to_Species <- round((Summary_temp3$TotalSplits/Summary_temp3$NrSpecies)*100, digits = 1)
|
|
188
|
+
#head(Summary_temp3)
|
|
189
|
+
|
|
190
|
+
#subset to only print relevant info
|
|
191
|
+
Summary_temp4 <- Summary_temp3[,c("MarkerID","NrSpecies", "NrClusters.x", "SplitsPerCluster","TotalSplits", "TotalSplits_to_Species" )]
|
|
192
|
+
#head(Summary_temp4)
|
|
193
|
+
|
|
194
|
+
#if genes were lost already during the tree building step, add that info in
|
|
195
|
+
Summary_temp5 <- merge(MarkerList, Summary_temp4, by = "MarkerID", all.x = T)
|
|
196
|
+
#head(Summary_temp5)
|
|
197
|
+
|
|
198
|
+
################################################################################
|
|
199
|
+
#5. find highest/lowest 25/50% ranking markers
|
|
200
|
+
################################################################################
|
|
201
|
+
#make vector of genes that are not good marker genes based on literature and that are not monophyletic
|
|
202
|
+
genes_to_remove_vector <- as.character(genes_to_remove$MarkerID)
|
|
203
|
+
#genes_to_remove_vector
|
|
204
|
+
|
|
205
|
+
#remove genes from dataframe
|
|
206
|
+
Stats_temp1A <- Summary_temp4[ ! Summary_temp4$MarkerID %in% genes_to_remove_vector, ]
|
|
207
|
+
#dim(Summary_temp4)
|
|
208
|
+
#dim(Stats_temp1A)
|
|
209
|
+
|
|
210
|
+
#define a cutoff to remove gene trees that have less than 50% of the species as we do not want to use these genes for concatenations
|
|
211
|
+
cutoff <- mean(Stats_temp1A$NrSpecies)/2
|
|
212
|
+
#cutoff
|
|
213
|
+
|
|
214
|
+
#remove genes that have few species
|
|
215
|
+
Stats_temp1B <- subset(Stats_temp1A, Stats_temp1A[ , "NrSpecies"] > cutoff)
|
|
216
|
+
#dim(Stats_temp1A)
|
|
217
|
+
#dim(Stats_temp1B)
|
|
218
|
+
|
|
219
|
+
#rank according to the split clusters in percentage from 1 to xx (lowest/best value = lowest nr) = RankA
|
|
220
|
+
Stats_temp2 <- Stats_temp1B %>% mutate(RankA = rank(SplitsPerCluster, ties.method = 'first'))
|
|
221
|
+
|
|
222
|
+
#rank according to the total splits normalized by the total number of species from 1 to xx (lowest value = lowest nr)
|
|
223
|
+
Stats_temp3 <- Stats_temp2 %>% mutate(RankB = rank(TotalSplits_to_Species, ties.method = 'first'))
|
|
224
|
+
|
|
225
|
+
#combine RankA and RankB to get the best for each method
|
|
226
|
+
Stats_temp3$RankA_B <- Stats_temp3$RankA+Stats_temp3$RankB
|
|
227
|
+
#dim(Stats_temp3)
|
|
228
|
+
|
|
229
|
+
#define the concatenated marker sets and create vectors
|
|
230
|
+
nr_genes <- length(Stats_temp3$MarkerID)
|
|
231
|
+
cutoff_25perc <- round(nr_genes/4, digits = 0)
|
|
232
|
+
cutoff_50perc<- round(nr_genes/2, digits = 0)
|
|
233
|
+
#cutoff_25perc
|
|
234
|
+
#cutoff_50perc
|
|
235
|
+
|
|
236
|
+
#subset the tables for the different cutoffs
|
|
237
|
+
best_50perc <- as.data.frame(Stats_temp3 %>% top_n(-cutoff_50perc, RankA_B))
|
|
238
|
+
Stats_temp3$best_50perc <- best_50perc$MarkerID[match(Stats_temp3$MarkerID, best_50perc$MarkerID)]
|
|
239
|
+
|
|
240
|
+
best_25perc <- Stats_temp3 %>% top_n(-cutoff_25perc, RankA_B)
|
|
241
|
+
Stats_temp3$best_25perc <- best_25perc$MarkerID[match(Stats_temp3$MarkerID, best_25perc$MarkerID)]
|
|
242
|
+
|
|
243
|
+
worst_50perc <- Stats_temp3 %>% top_n(cutoff_50perc, RankA_B)
|
|
244
|
+
Stats_temp3$worst_50perc <- worst_50perc$MarkerID[match(Stats_temp3$MarkerID, worst_50perc$MarkerID)]
|
|
245
|
+
|
|
246
|
+
worst_25perc <- Stats_temp3 %>% top_n(cutoff_25perc, RankA_B)
|
|
247
|
+
Stats_temp3$worst_25perc <- worst_25perc$MarkerID[match(Stats_temp3$MarkerID, worst_25perc$MarkerID)]
|
|
248
|
+
|
|
249
|
+
Stats_temp3$FullSet <- Stats_temp3$MarkerID
|
|
250
|
+
|
|
251
|
+
#merge with original table (to keep the statistics)
|
|
252
|
+
Stats_temp4 <- merge(Summary_temp5, Stats_temp3[,c("MarkerID", "RankA", "RankB", "RankA_B", "FullSet", "best_50perc", "best_25perc", "worst_50perc","worst_25perc")], by = "MarkerID", all.x = T)
|
|
253
|
+
|
|
254
|
+
#print
|
|
255
|
+
write.table(Stats_temp4, output_table, sep = "\t", row.names = F, quote =F, na = "")
|
|
256
|
+
|
TreeSAK/TaxonTree.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from ete3 import Tree
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
TaxonTree_usage = '''
|
|
6
|
+
================================ TaxonTree example commands ================================
|
|
7
|
+
|
|
8
|
+
TreeSAK TaxonTree -i ar53_r220.tree -tax o__Nitrososphaerales -o o__Nitrososphaerales.tree
|
|
9
|
+
|
|
10
|
+
============================================================================================
|
|
11
|
+
'''
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def TaxonTree(args):
    """Write the subtree rooted at the single node matching a taxon name.

    Args:
        args: dict with the keys 'i' (input tree file), 'tax' (taxon to look
            for; a node matches when this string is contained in its name)
            and 'o' (output tree file).

    The output file is only written when exactly one node matches. With no
    match or with several matches a message is printed and the program exits.
    """

    tree_file_in     = args['i']
    interested_taxon = args['tax']
    tree_file_out    = args['o']

    input_tree = Tree(tree_file_in, quoted_node_names=True, format=1)

    # a substring test also covers the exact-name case
    matched_nodes = [node for node in input_tree.traverse()
                     if interested_taxon in node.name]

    # no match used to be reported as "multiple matched nodes"
    if not matched_nodes:
        print('No node matched %s. program exited!' % interested_taxon)
        exit()

    if len(matched_nodes) > 1:
        print('There are multiple matched nodes. program exited!')
        print('Matched nodes: %s' % ','.join(node.name for node in matched_nodes))
        exit()

    # exactly one hit: export the subtree below it
    matched_nodes[0].write(outfile=tree_file_out)

    print('Subset tree exported to: %s' % tree_file_out)
    print('Done!')
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
if __name__ == '__main__':

    # command-line entry point: all three options are mandatory
    TaxonTree_parser = argparse.ArgumentParser()
    for option_flag, option_help in (('-i',   'input tree file'),
                                     ('-tax', 'interested taxon'),
                                     ('-o',   'output tree file')):
        TaxonTree_parser.add_argument(option_flag, required=True, help=option_help)

    # TaxonTree() expects a plain dict keyed by option name
    args = vars(TaxonTree_parser.parse_args())
    TaxonTree(args)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
# extract path to the config file
pwd_config_file = os.path.realpath(__file__)

# directory holding this config file; os.path.dirname works with whatever
# separator the platform uses, whereas splitting on '/' yields an empty
# directory when the resolved path contains no '/'
config_file_path = os.path.dirname(pwd_config_file)

# specify full path to corresponding executables at the right side of colon
# (bare names are used as given; the '%s/...' entries point to files expected
# next to this config file)
# NOTE(review): confirm that every file referenced via config_file_path is
# actually shipped with the package
config_dict = {'config_file_path'     : config_file_path,
               'prodigal'             : 'prodigal',
               'hmmsearch'            : 'hmmsearch',
               'hmmfetch'             : 'hmmfetch',
               'hmmalign'             : 'hmmalign',
               'hmmstat'              : 'hmmstat',
               'mafft'                : 'mafft',
               'bowtie2'              : 'bowtie2',
               'bowtie2_build'        : 'bowtie2-build',
               'blastp'               : 'blastp',
               'blastn'               : 'blastn',
               'makeblastdb'          : 'makeblastdb',
               'fasttree'             : 'FastTree',
               'ranger_mac'           : '%s/Ranger-DTL-Dated.mac'          % config_file_path,
               'ranger_linux'         : '%s/Ranger-DTL-Dated.linux'        % config_file_path,
               'path_to_hmm'          : '%s/MetaCHIP_phylo.hmm'            % config_file_path,
               'circos_HGT_R'         : '%s/MetaCHIP_circos_HGT.R'         % config_file_path,
               'VisHPD95_R'           : '%s/VisHPD95.R'                    % config_file_path,
               'label_tree_R'         : '%s/label_tree.R'                  % config_file_path,
               'cdd2cog_perl'         : '%s/cdd2cog.pl'                    % config_file_path,
               'get_sankey_plot_R'    : '%s/get_sankey_plot.R'             % config_file_path,
               'compare_trees_R'      : '%s/compare_trees.R'               % config_file_path,
               'ko00001_keg'          : '%s/ko00001.keg'                   % config_file_path,
               'MetaCyc_rxns_with_ec' : '%s/MetaCyc_reactions_with_ec.txt' % config_file_path
               }
|
TreeSAK/VERSION
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
1.51.2
|
|
2
|
+
- fixed bugs
|
|
3
|
+
|
|
4
|
+
1.51.0
|
|
5
|
+
- new module added: iTOL_msa_stats
|
|
6
|
+
|
|
7
|
+
1.50.0
|
|
8
|
+
- new module added: RootTreeGTDB226
|
|
9
|
+
|
|
10
|
+
1.49.0
|
|
11
|
+
- new module added: mcmctree_vs_reltime
|
|
12
|
+
|
|
13
|
+
1.48.0
|
|
14
|
+
- new module added: parse_reltime
|
|
15
|
+
|
|
16
|
+
1.47.0
|
|
17
|
+
- new module added: GeneTree
|
|
18
|
+
|
|
19
|
+
1.46.0
|
|
20
|
+
- new module added: cogTree
|
|
21
|
+
|
|
22
|
+
1.45.0
|
|
23
|
+
- new module added: iTOL_gene_tree
|
|
24
|
+
|
|
25
|
+
1.44.0
|
|
26
|
+
- new module added: koTree
|
|
27
|
+
|
|
28
|
+
1.43.0
|
|
29
|
+
- new module added: ALE7
|
|
30
|
+
|
|
31
|
+
1.42.0
|
|
32
|
+
- new module added: mcmc2tree
|
|
33
|
+
|
|
34
|
+
1.41.0
|
|
35
|
+
- new module added: mcmcTC
|
|
36
|
+
|
|
37
|
+
1.40.0
|
|
38
|
+
- new module added: TaxonTree
|
|
39
|
+
|
|
40
|
+
1.39.0
|
|
41
|
+
- new module added: supertree
|
|
42
|
+
|
|
43
|
+
1.38.0
|
|
44
|
+
- new module added: pruneMSA
|
|
45
|
+
|
|
46
|
+
1.37.0
|
|
47
|
+
- new module added: recode
|
|
48
|
+
|
|
49
|
+
1.36.0
|
|
50
|
+
- new module added: gap_stats
|
|
51
|
+
|
|
52
|
+
1.35.0
|
|
53
|
+
- new module added: AssessPB
|
|
54
|
+
|
|
55
|
+
1.34.0
|
|
56
|
+
- new module added: PB
|
|
57
|
+
|
|
58
|
+
1.33.0
|
|
59
|
+
- new module added: RootTreeGTDB214 and RootTreeGTDB220
|
|
60
|
+
|
|
61
|
+
1.32.0
|
|
62
|
+
- new module added: replace_clade
|
|
63
|
+
|
|
64
|
+
1.31.0
|
|
65
|
+
- new module added: PhyloBiAssoc
|
|
66
|
+
|
|
67
|
+
1.30.0
|
|
68
|
+
- new module added: ALE6
|
|
69
|
+
|
|
70
|
+
1.29.0
|
|
71
|
+
- new module added: LcaToLeaves
|
|
72
|
+
|
|
73
|
+
1.28.0
|
|
74
|
+
- new module added: SingleAleHGT
|
|
75
|
+
|
|
76
|
+
1.27.0
|
|
77
|
+
- new module added: ConcateMSA
|
|
78
|
+
|
|
79
|
+
1.26.0
|
|
80
|
+
- new module added: pRTC
|
|
81
|
+
|
|
82
|
+
1.25.0
|
|
83
|
+
- new module added: BMGE
|
|
84
|
+
|
|
85
|
+
1.24.0
|
|
86
|
+
- new module added: AlignmentPruner
|
|
87
|
+
|
|
88
|
+
1.23.0
|
|
89
|
+
- new module added: RootTree
|
|
90
|
+
|
|
91
|
+
1.22.0
|
|
92
|
+
- new module added: OMA2
|
|
93
|
+
|
|
94
|
+
1.21.0
|
|
95
|
+
- new module added: print_leaves
|
|
96
|
+
|
|
97
|
+
1.20.0
|
|
98
|
+
- new module added: ALE1, ALE2, ALE3, ALE4
|
|
99
|
+
|
|
100
|
+
1.19.0
|
|
101
|
+
- new module added: OMA
|
|
102
|
+
|
|
103
|
+
1.18.0
|
|
104
|
+
- new module added: ExtractMarkerSeq
|
|
105
|
+
|
|
106
|
+
1.17.0
|
|
107
|
+
- new module added: MarkerSeq2Tree
|
|
108
|
+
|
|
109
|
+
1.16.0
|
|
110
|
+
- new module added: SplitScoreStep1 and SplitScoreStep2
|
|
111
|
+
|
|
112
|
+
1.15.0
|
|
113
|
+
- new module added: SingleLinePhy
|
|
114
|
+
|
|
115
|
+
1.14.0
|
|
116
|
+
- new module added: PMSF
|
|
117
|
+
|
|
118
|
+
1.13.0
|
|
119
|
+
- new module added: VisHPD95
|
|
120
|
+
|
|
121
|
+
1.12.0
|
|
122
|
+
- new module added: PlotMcmcNode
|
|
123
|
+
|
|
124
|
+
1.11.0
|
|
125
|
+
- new module added: CompareMCMC
|
|
126
|
+
|
|
127
|
+
1.10.0
|
|
128
|
+
- new module added: fa2phy
|
|
129
|
+
|
|
130
|
+
1.9.0
|
|
131
|
+
- new module added: Dating
|
|
132
|
+
|
|
133
|
+
1.8.0
|
|
134
|
+
- new module added: AssessMarkerDeltaLL
|
|
135
|
+
|
|
136
|
+
1.7.0
|
|
137
|
+
- new module added: AssessMarkerPA
|
|
138
|
+
|
|
139
|
+
1.6.0
|
|
140
|
+
- new module added: Marker2Tree
|
|
141
|
+
|
|
142
|
+
1.5.0
|
|
143
|
+
- new module added: SliceMSA
|
|
144
|
+
|
|
145
|
+
1.4.0
|
|
146
|
+
- new module added: ConvertMSA
|
|
147
|
+
|
|
148
|
+
1.3.0
|
|
149
|
+
- new module added: get_arCOG_seq
|
|
150
|
+
|
|
151
|
+
1.2.0
|
|
152
|
+
- new module added: parse_deltall_stdout
|
|
153
|
+
|
|
154
|
+
1.1.0
|
|
155
|
+
- new module added: AssessCVG
|
|
156
|
+
|
|
157
|
+
1.0.0
|
|
158
|
+
- initial release
|
TreeSAK/VisHPD95.R
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
library(ggplot2)
|
|
2
|
+
library(optparse)
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
# Plot grouped point ranges (Mean with Low/High bounds) per Var and save them.
#
# data_file:   table with a header line; the columns Var, Mean, Low, High,
#              Test (mapped to colour) and Shape (mapped to point shape) are used
# plot_width:  width passed to ggsave()
# plot_height: height passed to ggsave()
# plot_file:   output file passed to ggsave()
plot_grouped_HPD95 <- function(data_file, plot_width, plot_height, plot_file){

  dat <- read.table(data_file, header = TRUE)

  hpd_plot <- ggplot(dat, aes(x = Var, y = Mean, ymin = Low, ymax = High)) +
    geom_pointrange(aes(col = factor(Test), shape=factor(Shape)),
                    position=position_dodge(width=0.6),                       # controls distance between groups
                    linewidth = 0.9,                                          # line width
                    size=0.75) +                                              # size of shape
    theme_bw() +                                                              # remove background
    theme(panel.grid.major=element_blank(),                                   # remove grid
          panel.grid.minor=element_blank()) +                                 # remove grid
    xlab("") +                                                                # x-axis label text
    ylab("95% HPD CI") +                                                      # y-axis label text
    theme(axis.text.x=element_text(size=12, color='black', angle=30, hjust=1), # x-axis text, rotated by 30 degrees
          axis.text.y=element_text(size=12, color='black'),                   # y-axis text
          legend.text=element_text(size=10)) +                                # legend text
    scale_color_discrete(name="Color") +                                      # title of the colour legend
    guides(color=guide_legend(override.aes=list(linetype=0))) +               # colour legend: no line through the keys
    scale_shape_discrete(name="Shape") +                                      # title of the shape legend
    guides(shape=guide_legend(override.aes=list(linetype=0, color='grey')))   # shape legend: grey keys without line

  # write to file; the plot is passed explicitly instead of relying on last_plot()
  ggsave(plot_file, plot = hpd_plot, width=plot_width, height=plot_height, dpi=300)
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# command-line options of the script
option_list = list(
  make_option(c("-i", "--datain"),  type="character", default=NULL, help="input data matrix"),
  make_option(c("-x", "--width"),   type="double",    default=8,    help="plot width"),
  make_option(c("-y", "--height"),  type="double",    default=5,    help="plot height"),
  make_option(c("-o", "--plotout"), type="character", default=NULL, help="output plot"));

opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
data_matrix_txt = opt$datain
plot_width      = opt$width
plot_height     = opt$height
output_plot     = opt$plotout

# input and output have no usable default (NULL); stop with the usage text
# instead of failing later inside read.table()/ggsave()
if (is.null(data_matrix_txt) || is.null(output_plot)) {
  print_help(opt_parser)
  stop("Both -i/--datain and -o/--plotout must be provided", call. = FALSE)
}

plot_grouped_HPD95(data_matrix_txt, plot_width, plot_height, output_plot)
|