treesak 1.51.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of treesak might be problematic. Click here for more details.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +130 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +290 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +106 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/RootTreeGTDB226.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +19 -0
- TreeSAK/SplitScore1.py +178 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +597 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +158 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +77 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.51.2.data/scripts/TreeSAK +950 -0
- treesak-1.51.2.dist-info/LICENSE +674 -0
- treesak-1.51.2.dist-info/METADATA +27 -0
- treesak-1.51.2.dist-info/RECORD +125 -0
- treesak-1.51.2.dist-info/WHEEL +5 -0
- treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/deltall.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
deltall_usage = '''
|
|
5
|
+
========================= deltall example commands =========================
|
|
6
|
+
|
|
7
|
+
TreeSAK deltall -i nohup.out -o DeltaLL_op_summary.txt
|
|
8
|
+
|
|
9
|
+
# This script was wrote to parse the stdout of deltaLL.rb from Sishuo Wang
|
|
10
|
+
|
|
11
|
+
============================================================================
|
|
12
|
+
'''
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def deltall(args):
    """Rank markers from the stdout of deltaLL.rb and write a summary table.

    Args:
        args: dict with keys
            'i': path to the captured stdout (e.g. nohup.out); tab-separated
                 lines of "<marker>\\t<value>".
            'o': path of the tab-separated summary file to write.

    The first value seen for a marker is treated as metric 1 and the second
    as metric 2. Metric 1 is ranked from highest to lowest, metric 2 from
    lowest to highest, and markers are written in order of the sum of the
    two ranks.

    Raises:
        IndexError: if a data line has no second column, or a marker has
            fewer than two values.
        ValueError: if a value cannot be parsed as a float.
    """

    deltall_stdout_txt = args['i']
    summary_txt = args['o']

    # marker id -> values in the order they appear in the input
    deltall_op_dict = dict()
    with open(deltall_stdout_txt) as deltall_stdout_handle:
        for each_line in deltall_stdout_handle:
            # diagnostic lines emitted by the wrapped tools are not data
            if each_line.startswith(('WARNING:', 'awk:')):
                continue
            # blank lines carry no marker/value pair
            if not each_line.strip():
                continue
            each_line_split = each_line.strip().split('\t')
            marker_id = each_line_split[0]
            value = float(each_line_split[1])
            deltall_op_dict.setdefault(marker_id, []).append(value)

    # both metrics are rounded to two decimals before ranking
    metric_1_dict = dict()
    metric_2_dict = dict()
    for each_marker, marker_values in deltall_op_dict.items():
        metric_1_dict[each_marker] = float("{0:.2f}".format(marker_values[0]))
        metric_2_dict[each_marker] = float("{0:.2f}".format(marker_values[1]))

    # metric 1: ascending stable sort, then reversed (highest value gets rank 1)
    markers_by_metric_1 = sorted(metric_1_dict, key=metric_1_dict.get)[::-1]
    # metric 2: ascending stable sort (lowest value gets rank 1)
    markers_by_metric_2 = sorted(metric_2_dict, key=metric_2_dict.get)

    metric_1_score_dict = {m: rank for rank, m in enumerate(markers_by_metric_1, start=1)}
    metric_2_score_dict = {m: rank for rank, m in enumerate(markers_by_metric_2, start=1)}

    overall_score_dict = {m: metric_1_score_dict[m] + metric_2_score_dict[m] for m in deltall_op_dict}

    with open(summary_txt, 'w') as summary_txt_handle:
        summary_txt_handle.write('Marker\tmetric1\tmetric1_score\tmetric2\tmetric2_score\toverall_score\n')
        for each_marker in sorted(overall_score_dict, key=overall_score_dict.get):
            summary_txt_handle.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (each_marker,
                                                                   metric_1_dict[each_marker],
                                                                   metric_1_score_dict[each_marker],
                                                                   metric_2_dict[each_marker],
                                                                   metric_2_score_dict[each_marker],
                                                                   overall_score_dict[each_marker]))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
if __name__ == '__main__':

    # command-line entry point: both options are mandatory
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', help='input file (e.g., nohup.out)', required=True)
    parser.add_argument('-o', help='output summary', required=True)
    deltall(vars(parser.parse_args()))
|
TreeSAK/do_rrtc.rb
ADDED
|
@@ -0,0 +1,464 @@
|
|
|
1
|
+
#! /bin/env ruby
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
################################################################################
|
|
5
|
+
|
|
6
|
+
# updated on 2023-01-11
|
|
7
|
+
# auto-detect input type (marginal or joint)
|
|
8
|
+
# updated on 2023-01-09
|
|
9
|
+
# --rtc a folder as an input allowed
|
|
10
|
+
# updated on 2022-11-18
|
|
11
|
+
# bugs fixed
|
|
12
|
+
# updated on 2022-11-18 for non-independent ASR based on scm
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
################################################################################
|
|
16
|
+
|
|
17
|
+
require 'find'
|
|
18
|
+
require 'getoptlong'
|
|
19
|
+
require 'csv'
|
|
20
|
+
require 'parallel'
|
|
21
|
+
require 'bio-nwk'
|
|
22
|
+
require 'colorize'
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
################################################################################
|
|
26
|
+
class Dir
  # Create +path+ together with any missing parent directories, one level
  # at a time. Returns true once +path+ is a directory; a failing Dir.mkdir
  # raises its usual exception.
  def self.mkdirs(path)
    return true if File.directory?(path)

    # make sure the parent exists before creating this level
    parent_ready = mkdirs(File.dirname(path))
    return false unless parent_ready

    mkdir(path)
    true
  end
end
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
################################################################################
|
|
40
|
+
# Create the output directory +outdir+.
#
# outdir      - String path of the directory to create.
# is_force    - when true and +outdir+ exists, delete it and recreate it empty.
# is_tolerate - when true and +outdir+ exists, leave it untouched
#               (takes precedence over +is_force+).
#
# Raises RuntimeError if +outdir+ is not a String, or if it already exists
# and neither flag is set.
def mkdir_with_force(outdir, is_force=false, is_tolerate=false)
  # FileUtils replaces the former `mkdir -p` / `rm -rf` shell calls, so paths
  # containing spaces or shell metacharacters are handled literally.
  require 'fileutils'

  if outdir.class != String
    raise "outdir wrong? Exiting ......"
  end

  # Dir.exist? instead of Dir.exists? (the latter was removed in Ruby 3.2)
  if ! Dir.exist?(outdir)
    FileUtils.mkdir_p(outdir)
  elsif is_tolerate
    # existing directory is accepted as-is
  elsif is_force
    FileUtils.rm_rf(outdir)
    FileUtils.mkdir_p(outdir)
  else
    raise "The outdir #{outdir} has already existed!"
  end
end
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# List the entries of +indir+.
#
# indir            - directory to scan.
# suffix           - String or Array of Strings; each is interpolated into a
#                    regexp anchored at the end of the name. An empty String
#                    means "no filtering".
# is_all_subfolder - false: only the direct entries of +indir+ (sub-directories
#                    included), returned as File.join(indir, entry).
#                    true: walk +indir+ recursively and return files only.
#
# Entries whose basename starts with '.' are always skipped.
# Returns an Array of paths.
def read_infiles(indir, suffix='', is_all_subfolder=false)
  infiles = Array.new
  if ! is_all_subfolder
    Dir.foreach(indir) do |b|
      next if b =~ /^\./
      if suffix.is_a?(String)
        next if suffix != '' and b !~ /#{suffix}$/
      elsif suffix.is_a?(Array)
        next unless suffix.any?{|i| b =~ /#{i}$/ }
      end
      infiles << File.join(indir, b)
    end
  else
    Find.find(indir) do |path|
      next if File.directory?(path)
      next if File.basename(path) =~ /^\./
      if suffix.is_a?(String)
        # An empty suffix used to build /\.$/ here, which matched nothing and
        # made the recursive listing return no files at all.
        # A non-empty String suffix must follow a literal dot in this branch.
        next if suffix != '' and path !~ /\.#{suffix}$/
      else
        next unless suffix.any?{|i| path =~ /#{i}$/ }
      end
      infiles << path
    end
  end
  return(infiles)
end
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# Return the direct entries of +indir+ (as listed by read_infiles) whose
# File.extname is contained in +suffices+.
def getFilesBySuffices(indir, suffices)
  candidates = read_infiles(indir)
  matching = candidates.select { |entry| suffices.include?(File.extname(entry)) }
  return(matching)
end
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# Return the target of +file+ when it is a symbolic link (one level, as
# given by File.readlink); otherwise return +file+ unchanged.
def get_file_path(file)
  return File.readlink(file) if File.symlink?(file)
  file
end
|
|
101
|
+
|
|
102
|
+
################################################################################
|
|
103
|
+
|
|
104
|
+
# all credits to Wolfgang Teuber (https://gitlab.com/knugie)
|
|
105
|
+
|
|
106
|
+
# Draw one key of +weights+ at random, with probability equal to its value.
#
# weights - Hash mapping a value to its probability, e.g.
#           {"a"=>0.4, "b"=>0.4, "c"=>0.2}.
#
# Returns the selected key.
# Raises RuntimeError when the probabilities do not sum to 1 (within a small
# floating-point tolerance) or when any of them is negative.
def weighted_rand(weights = {})
  # Floating-point sums are rarely exactly 1.0 (ten times 0.1 adds up to
  # 0.9999999999999999), so compare with a tolerance instead of ==.
  total = weights.values.inject(0.0, :+)
  raise 'Probabilities must sum up to 1' unless (total - 1.0).abs <= 1e-9
  raise 'Probabilities must not be negative' unless weights.values.all? { |p| p >= 0 }

  # Ignore elements with probability 0
  weights = weights.reject { |k, v| v == 0.0 } # e.g. => {"a"=>0.4, "b"=>0.4, "c"=>0.2}

  # Accumulate probabilities and map them to a value
  u = 0.0
  ranges = weights.map { |v, p| [u += p, v] } # e.g. => [[0.4, "a"], [0.8, "b"], [1.0, "c"]]

  # Generate a (pseudo-)random floating point number between 0.0(included) and 1.0(excluded)
  u = rand # e.g. => 0.4651073966724186

  # Find the first value whose accumulated probability exceeds u. When the
  # accumulated total is marginally below 1.0 and u lands above it, nothing
  # matches; the last range is the correct pick in that case.
  selected = ranges.find { |p, v| p > u } || ranges.last
  selected.last # e.g. => "b"
end
|
|
125
|
+
|
|
126
|
+
################################################################################
|
|
127
|
+
|
|
128
|
+
class Bio::Tree
  # Rename every tip: drop the first whitespace-separated token of its name
  # and join the following tokens (at most 100) with underscores.
  def de_no_for_tips
    allTips.each do |leaf|
      tokens = leaf.name.split(' ')
      leaf.name = tokens[1, 100].join('_')
    end
  end
end
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# A clade identified by tip names, plus a node number assigned later.
class Subclade
  attr_accessor :name, :num

  # arr - either a flat Array of names (stored sorted) or an Array of such
  #       Arrays (each inner Array is stored sorted).
  def initialize(arr)
    @name = if arr[0].is_a?(Array)
              arr.map { |group| group.sort }
            else
              arr.sort
            end
    @num = nil
  end
end
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# A symbiont clade together with its candidate host clades.
class Symbiont < Subclade
  attr_accessor :hosts

  # arr is stored as the name without sorting; hosts starts out empty.
  def initialize(arr)
    @name = arr
    @hosts = Array.new
  end

  # Pick one host (see pick_host) and return true when the value of +data+
  # at the host's column (num - 1) is >= the value at this symbiont's column.
  def is_co_evolve?(data:, is_strict:)
    chosen = pick_host(is_strict)
    data[chosen.num-1].to_f >= data[@num-1].to_f
  end

  # Variant for Array-valued num: the chosen host's numbers are paired with
  # this symbiont's numbers and every pair must satisfy host >= symbiont.
  def is_co_evolve2?(data:, is_strict:)
    chosen = pick_host(is_strict)
    chosen.num.zip(@num).all? { |host_col, own_col| data[host_col-1].to_f >= data[own_col-1].to_f }
  end

  private

  # Rebuild the host => probability table, then return either the host that
  # sorts last by probability (strict) or a weighted random draw.
  def pick_host(is_strict)
    @host2prob = Hash.new
    @hosts.each { |host| @host2prob[host] = host.prob }
    if is_strict
      @hosts.sort_by { |host| host.prob }.last
    else
      weighted_rand(@host2prob)
    end
  end
end
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# A host clade carrying the probability read from the input table.
class Host < Subclade
  attr_reader :prob

  # The probability is always stored as a Float (converted with to_f).
  def prob=(value)
    @prob = value.to_f
  end
end
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
##############################################
|
|
189
|
+
# Return the bootstrap attribute of the lowest common ancestor of the first
# two names in +names+.
#
# names     - Array of tip names (only the first two are used).
# name2node - Hash mapping a tip name to its node.
# tree      - object responding to lowest_common_ancestor(node_a, node_b).
#
# Raises RuntimeError carrying the tab-joined nodes when reading the
# bootstrap value fails.
def get_lca_bootstrap(names, name2node, tree)
  nodes = names.map { |tip_name| name2node[tip_name] }
  ancestor = tree.lowest_common_ancestor(nodes[0], nodes[1])
  begin
    ancestor.bootstrap
  rescue
    raise nodes.join("\t")
  end
end
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# Read the tree printed on the line that follows "Species tree for FigTree"
# in +file+ and return it after tip names have been normalised with
# de_no_for_tips.
#
# The tree line looks like:
#   (((1_t5, 2_t9) 33 , (3_t21, 4_t22) 35 ) 32 , ... ) 31 ;
#
# Raises RuntimeError when no tree could be read after the marker line.
def read_mcmctree_out(file)
  is_start = false
  tree = nil

  # block form closes the handle even if parsing raises
  File.open(file, 'r') do |in_fh|
    in_fh.each_line do |line|
      line.chomp!
      if is_start
        tree = getTreeObjFromNwkString(line)
        break
      end
      is_start = true if line =~ /^Species tree for FigTree/
    end
  end

  # fail with a clear message instead of a NoMethodError on nil below
  raise "No tree found after the 'Species tree for FigTree' line in #{file}" if tree.nil?

  tree.de_no_for_tips

  return(tree)
end
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# Parse a tab-separated table of symbionts and their candidate hosts.
#
# Each non-comment, non-empty line holds a comma-separated symbiont name list
# in the first column, followed by one "<names>:<prob>" column per host.
# When the host probabilities of a line sum to less than 1, an extra host
# built from +root_two_children_names+ receives the remaining probability.
#
# Returns a Hash mapping the symbiont name Array to its Symbiont object.
def get_rtc(file, root_two_children_names)
  rtc_info = Hash.new

  # block form closes the handle even if a line fails to parse
  File.open(file, 'r') do |in_fh|
    in_fh.each_line do |line|
      line.chomp!
      next if line =~ /^#|^$/
      line_arr = line.split("\t")

      # NOTE(review): gsub('_', '_') is a no-op as written, whereas get_scm
      # replaces spaces with underscores - confirm which one is intended.
      symbiont = line_arr[0].gsub('_', '_').split(',')
      rtc = Symbiont.new(symbiont)

      line_arr.drop(1).each do |ele|
        host_names, prob = ele.split(':')
        host_obj = Host.new(host_names.gsub('_', '_').split(','))
        host_obj.prob = prob
        rtc.hosts << host_obj
      end

      root_obj = Host.new(root_two_children_names)

      # start from 0.0 so that a line without host columns yields 0.0 rather
      # than nil (which made the comparison below raise)
      curr_total_prob = rtc.hosts.map{|host|host.prob}.reduce(0.0, :+)
      if curr_total_prob < 1
        root_obj.prob = 1 - curr_total_prob
        rtc.hosts << root_obj
      end

      rtc_info[symbiont] = rtc
    end
  end

  return(rtc_info)
end
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
# Parse a tab-separated joint-reconstruction table.
#
# The first non-comment, non-empty line is the header: one comma-separated
# symbiont name list per column (spaces replaced by underscores). Every
# following line lists one host name list per symbiont column and a
# probability in the last column. A host list starting with 'root' is
# replaced by +root_two_children_names+. When the probabilities sum to less
# than 1, a root-only host receives the remaining probability.
#
# Returns a one-entry Hash mapping the symbiont lists to the Symbiont object.
# Raises RuntimeError when the file contains no header line.
def get_scm(file, root_two_children_names)
  scm_info = Hash.new
  symbionts = nil
  rtc = nil

  # block form closes the handle even if a line fails to parse
  File.open(file, 'r') do |in_fh|
    in_fh.each_line do |line|
      line.chomp!
      next if line =~ /^#|^$/
      line_arr = line.split("\t")

      # The header is the first line that survives the filter above. Testing
      # the physical line number instead missed it whenever the file started
      # with a comment or an empty line.
      if rtc.nil?
        symbionts = line_arr.map{|i| i.gsub(' ', '_').split(',') }
        rtc = Symbiont.new(symbionts)
        next
      end

      prob = line_arr[-1].to_f
      hosts = line_arr[0, line_arr.size-1].map{|i|i.split(',')}
      hosts.map!{|a| a[0] == 'root' ? root_two_children_names : a } #2023-01, in case of >=1 root (fl; Z)
      host_obj = Host.new(hosts)
      host_obj.prob = prob
      rtc.hosts << host_obj
    end
  end

  raise "No header line with symbiont names found in #{file}" if rtc.nil?

  root_obj = Host.new([root_two_children_names] * symbionts.size)
  # start from 0.0 so that a header-only file yields 0.0 rather than nil
  curr_total_prob = rtc.hosts.map{|host|host.prob}.reduce(0.0, :+)
  if curr_total_prob < 1
    root_obj.prob = 1 - curr_total_prob
    rtc.hosts << root_obj
  end
  scm_info[symbionts] = rtc

  return(scm_info)
end
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
##############################################
|
|
296
|
+
# Identify whether the input tables are 'marginal' or 'joint' by inspecting
# the first line of the first file: a ":0" / ":1" entry (optionally followed
# by decimals) marks the marginal format, anything else is taken as joint.
#
# rtc_files - Array of file paths.
#
# Returns 'marginal' or 'joint', or nil when +rtc_files+ is empty so that the
# caller can report the missing input itself.
def auto_detect_mj(rtc_files) # identify whether marginal or joint
  first_file = rtc_files[0]
  # nothing to inspect; previously this ended in File.open(nil)
  return nil if first_file.nil?

  # block form closes the handle even if readline raises (e.g. empty file)
  first_line = File.open(first_file, 'r') { |in_fh| in_fh.readline.chomp }

  if first_line =~ /:[10] (\b | \.\d+)/x
    type = 'marginal'
  else
    type = 'joint'
  end
  STDERR.puts "type auto-detected\t" + type.colorize(:yellow)
  return(type)
end
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
##############################################
|
|
312
|
+
if __FILE__ == $0
  # Command-line driver: keep only the rows of the mcmc.txt table for which
  # every symbiont passes the host >= symbiont comparison (is_co_evolve? /
  # is_co_evolve2?), then print the retained rows to stdout.
  indir = nil
  infile = nil
  type = nil
  rtc_files = Array.new
  mcmctxt_file = nil
  is_rtc = true
  is_renum = true
  is_strict = false

  rtc_info = Hash.new
  scm_info = Hash.new

  ##############################################
  opts = GetoptLong.new(
    ['--indir', GetoptLong::REQUIRED_ARGUMENT],
    ['-i', GetoptLong::REQUIRED_ARGUMENT],
    ['--marginal', GetoptLong::REQUIRED_ARGUMENT],
    ['--joint', GetoptLong::REQUIRED_ARGUMENT],
    ['--rtc', '--rrtc', GetoptLong::REQUIRED_ARGUMENT],
    ['--mcmctxt', GetoptLong::REQUIRED_ARGUMENT],
    ['--is_rtc', '--is_rrtc', '--rrtc_file', '--rtc_file', GetoptLong::REQUIRED_ARGUMENT],
    ['--strict', '--is_strict', GetoptLong::NO_ARGUMENT],
    ['--no_renum', '--no_re_num', GetoptLong::NO_ARGUMENT],
  )

  opts.each do |opt, value|
    case opt
      when '--indir'
        indir = value
      when '-i'
        infile = value
      when '--marginal'
        rtc_files = File.directory?(value) ? read_infiles(value) : [value]
        type = 'marginal'
      when '--joint'
        rtc_files = File.directory?(value) ? read_infiles(value) : [value]
        type = 'joint'
      when /^--r?rtc(file)?$/
        rtc_files = File.directory?(value) ? read_infiles(value) : [value]
      when '--mcmctxt'
        mcmctxt_file = value
      when '--is_rtc', '--is_rrtc'
        # anchor both alternatives; /^true|T$/i also accepted any value that
        # merely started with "true" or ended in "t"
        is_rtc = value =~ /^(true|T)$/i ? true : false
      when '--is_strict', '--strict'
        is_strict = true
        STDERR.puts "is_strict:\t" + "true".colorize(:yellow)
      when '--no_renum', '--no_re_num'
        is_renum = false
    end
  end

  ##############################################
  # --indir points at an mcmctree output folder holding 'out' and 'mcmc.txt'.
  # (A leftover block referencing the undefined names is_output / outfile
  # used to raise NameError here whenever --indir was given.)
  unless indir.nil?
    infile = File.join(indir, 'out') # file 'out' from the mcmctree output
    mcmctxt_file = File.join(indir, 'mcmc.txt')
  end

  type = auto_detect_mj(rtc_files) if type.nil? # identify whether marginal or joint

  ##############################################
  tree = read_mcmctree_out(infile) # "out", NOT "mcmc.txt"

  # first tip below each child of the root
  root_two_children_names = tree.children(tree.root).map{|i|tree.tips(i)[0].name}

  case type
    when 'marginal'
      rtc_files.each do |rtc_file|
        rtc_info.merge! get_rtc(rtc_file, root_two_children_names)
      end
    when 'joint'
      rtc_files.each do |scm_file|
        # note scm_info
        scm_info.merge! get_scm(scm_file, root_two_children_names)
      end
    else
      raise "rtc_file or scm_file has to be provided! Exiting ......"
  end

  name2node, node2name = tree.getNameNodeRela
  # drop entries that mention a name absent from the tree
  rtc_info.delete_if{|names, rtc| not names.all?{|name|name2node.include?(name)}}
  scm_info.delete_if{|names, rtc| not names.flatten.all?{|name|name2node.include?(name)}}

  # subtracted from every LCA bootstrap value to obtain the stored num
  minus = tree.allTips.size - 1

  ##############################################
  rtc_info.each_pair do |names, rtc|
    begin
      rtc.num = get_lca_bootstrap(names, name2node, tree) - minus
      rtc.hosts.map{|obj|obj.num = get_lca_bootstrap(obj.name, name2node, tree) - minus}
    rescue
      raise "species #{names} or #{rtc.hosts} not found"
    end
  end

  scm_info.each_pair do |names, rtc|
    rtc.num = names.map{|names2|get_lca_bootstrap(names2, name2node, tree) - minus}
    rtc.hosts.each do |obj|
      obj.num = obj.name.map{|host_names| get_lca_bootstrap(host_names, name2node, tree) - minus}
    end
  end

  ##############################################
  col_data = Array.new
  # each row is read as a single CSV field and split on tabs here
  CSV.foreach(mcmctxt_file) do |row|
    data = row[0].split("\t")
    if is_rtc
      if not rtc_info.empty?
        col_data << row if rtc_info.all?{|rtc_name, rtc| rtc.is_co_evolve?(data:data, is_strict:is_strict) }
      elsif not scm_info.empty?
        col_data << row if scm_info.all?{|rtc_name, rtc| rtc.is_co_evolve2?(data:data, is_strict:is_strict) }
      end
    else
      col_data << row
    end
  end

  STDERR.puts "# of samples after filtering\t" + col_data.size.to_s.colorize(:red)

  col_data.each_with_index do |row_arr, index| # row_arr: single-element array
    # renumber the first column of every retained row except the first one
    if is_renum and index > 0
      posteriors = row_arr[0].split("\t")
      posteriors[0] = index
      row_arr = [posteriors.join("\t")]
    end
    puts row_arr
  end
end
|
|
463
|
+
|
|
464
|
+
|
TreeSAK/fa2phy.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from Bio import AlignIO
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
fa2phy_usage = '''
|
|
6
|
+
======= fa2phy example commands =======
|
|
7
|
+
|
|
8
|
+
TreeSAK fa2phy -i msa.fa -o msa.phy
|
|
9
|
+
|
|
10
|
+
=======================================
|
|
11
|
+
'''
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def fa2phy(args):
    """Convert a FASTA alignment into a one-line-per-sequence PHYLIP file.

    Args:
        args: dict with keys 'i' (input MSA in fasta format) and
            'o' (output MSA in phylip format).

    The first output line holds the number of sequences and the alignment
    length; every following line is a sequence id padded with spaces to the
    longest id plus two, immediately followed by the sequence.
    """

    fasta_in = args['i']
    phy_out = args['o']

    alignment = AlignIO.read(fasta_in, 'fasta')

    # ids are left-justified in a column two characters wider than the longest id
    id_column_width = max([len(record.id) for record in alignment] + [0]) + 2

    with open(phy_out, 'w') as msa_out_handle:
        msa_out_handle.write('%s %s\n' % (len(alignment), alignment.get_alignment_length()))
        for record in alignment:
            padded_id = record.id.ljust(id_column_width)
            msa_out_handle.write('%s%s\n' % (padded_id, str(record.seq)))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
if __name__ == '__main__':

    # command-line entry point: both options are mandatory
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', help='input MSA in fasta format', required=True)
    parser.add_argument('-o', help='output MSA in phylip format', required=True)
    fa2phy(vars(parser.parse_args()))
|
TreeSAK/format_leaf_name.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import argparse
|
|
3
|
+
from ete3 import Tree
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
format_leaf_name_usage = '''
|
|
7
|
+
========= FLN (Format Leaf Name) example commands =========
|
|
8
|
+
|
|
9
|
+
BioSAK FLN -i input.tree -o output.tree -s2u -nsqm -ndqm
|
|
10
|
+
BioSAK FLN -i input.tree -o output.tree -ns -nsqm
|
|
11
|
+
|
|
12
|
+
===========================================================
|
|
13
|
+
'''
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def format_leaf_name(args):
    """Clean up the leaf names of a tree and write the tree back out.

    Args:
        args: dict with keys
            'i':    input tree file
            'fmt':  ete3 tree format (int, or a string holding an int)
            'o':    output tree file
            'ns':   True to remove spaces from leaf names
            's2u':  True to replace spaces in leaf names with underscores
            'nsqm': True to remove single quotation marks
            'ndqm': True to remove double quotation marks

    Prints a message and exits when the input file does not exist or when
    both 'ns' and 's2u' are requested.
    """

    tree_file_in = args['i']
    tree_format = args['fmt']
    tree_file_out = args['o']
    no_space = args['ns']
    space_to_underscore = args['s2u']
    no_single_quotation_mark = args['nsqm']
    no_double_quotation_mark = args['ndqm']

    if os.path.isfile(tree_file_in) is False:
        print('Tree file not found, program exited!')
        exit()

    if (no_space is True) and (space_to_underscore is True):
        print('Two actions (-ns and -s2u) specified to spaces in tree leaves, program exited!')
        exit()

    # a format given on the command line arrives as a string, while the
    # default is the integer 1; normalise before handing it to ete3
    tree_format = int(tree_format)

    t = Tree(tree_file_in, format=tree_format)

    # rename every leaf in place, applying the requested clean-ups in order
    for leaf in t:
        leaf_name = leaf.name
        if space_to_underscore is True:
            leaf_name = leaf_name.replace(' ', '_')
        if no_space is True:
            leaf_name = leaf_name.replace(' ', '')
        if no_single_quotation_mark is True:
            leaf_name = leaf_name.replace("'", '')
        if no_double_quotation_mark is True:
            leaf_name = leaf_name.replace('"', '')
        leaf.name = leaf_name

    t.write(format=tree_format, outfile=tree_file_out)

    print('Done!')
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
if __name__ == '__main__':

    # command-line entry point
    format_leaf_name_parser = argparse.ArgumentParser()
    format_leaf_name_parser.add_argument('-i', required=True, help='input tree')
    # type=int: without it a user-supplied format reaches ete3 as a string,
    # unlike the integer default
    format_leaf_name_parser.add_argument('-fmt', required=False, default=1, type=int, help='tree format, default: 1')
    format_leaf_name_parser.add_argument('-o', required=True, help='output tree')
    format_leaf_name_parser.add_argument('-s2u', required=False, action="store_true", help='change space in tree leaves to underscore')
    format_leaf_name_parser.add_argument('-ns', required=False, action="store_true", help='remove space from leaf names')
    format_leaf_name_parser.add_argument('-nsqm', required=False, action="store_true", help='remove single quotation marks from leaf names')
    format_leaf_name_parser.add_argument('-ndqm', required=False, action="store_true", help='remove double quotation marks from leaf names')
    args = vars(format_leaf_name_parser.parse_args())
    format_leaf_name(args)
|
TreeSAK/gap_stats.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from Bio import SeqIO
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
gap_stats_usage = '''
|
|
6
|
+
======= gap_stats example commands =======
|
|
7
|
+
|
|
8
|
+
TreeSAK gap_stats -i msa.fasta
|
|
9
|
+
|
|
10
|
+
==========================================
|
|
11
|
+
'''
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def gap_stats(args):
    """Print the percentage of gap characters ('-') for each sequence.

    Args:
        args: dict with key 'i', the path to an MSA in fasta format.

    Output is a tab-separated table on stdout with a 'Sequence\\tGap' header,
    sorted by gap percentage in ascending order. Percentages are rounded to
    two decimals. A record with an empty sequence is reported as 0.0 instead
    of raising ZeroDivisionError.
    """

    msa_in_fa = args['i']

    gap_pct_dict = dict()
    for each_seq in SeqIO.parse(msa_in_fa, 'fasta'):
        seq_str = str(each_seq.seq)
        if seq_str:
            gap_pct = seq_str.count('-')*100/len(seq_str)
        else:
            # empty record: no characters, hence no gaps
            gap_pct = 0.0
        gap_pct_dict[each_seq.id] = float("{0:.2f}".format(gap_pct))

    print('Sequence\tGap')
    for seq_id, gap_pct in sorted(gap_pct_dict.items(), key=lambda x: x[1]):
        print('%s\t%s' % (seq_id, gap_pct))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
if __name__ == '__main__':

    # command-line entry point: a single mandatory input alignment
    gap_stats_parser = argparse.ArgumentParser()
    gap_stats_parser.add_argument('-i', help='MSA in fasta format', required=True)
    gap_stats(vars(gap_stats_parser.parse_args()))
|