treesak 1.53.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +113 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/C60SR4.nex +127 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +299 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +115 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB.py +371 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +21 -0
- TreeSAK/SplitScore1.py +177 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +608 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +164 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/batch_itol.py +171 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/filter_rename_ar53.py +118 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_gene_tree_by_gnm.py +34 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +78 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/tmp_4.py +43 -0
- TreeSAK/tmp_5.py +12 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.53.3.data/scripts/TreeSAK +955 -0
- treesak-1.53.3.dist-info/LICENSE +674 -0
- treesak-1.53.3.dist-info/METADATA +27 -0
- treesak-1.53.3.dist-info/RECORD +131 -0
- treesak-1.53.3.dist-info/WHEEL +5 -0
- treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/do_rrtc.rb
ADDED
|
@@ -0,0 +1,464 @@
|
|
|
1
|
+
#! /bin/env ruby
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
################################################################################
|
|
5
|
+
|
|
6
|
+
# updated on 2023-01-11
|
|
7
|
+
# auto-detect input type (marginal or joint)
|
|
8
|
+
# updated on 2023-01-09
|
|
9
|
+
# --rtc a folder as an input allowed
|
|
10
|
+
# updated on 2022-11-18
|
|
11
|
+
# bugs fixed
|
|
12
|
+
# updated on 2022-11-18 for non-independent ASR based on scm
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
################################################################################
|
|
16
|
+
|
|
17
|
+
require 'find'
|
|
18
|
+
require 'getoptlong'
|
|
19
|
+
require 'csv'
|
|
20
|
+
require 'parallel'
|
|
21
|
+
require 'bio-nwk'
|
|
22
|
+
require 'colorize'
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
################################################################################
|
|
26
|
+
class Dir
|
|
27
|
+
def self.mkdirs(path)
|
|
28
|
+
if(!File.directory?(path))
|
|
29
|
+
if(!mkdirs(File.dirname(path)))
|
|
30
|
+
return false;
|
|
31
|
+
end
|
|
32
|
+
mkdir(path)
|
|
33
|
+
end
|
|
34
|
+
return true
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
################################################################################
|
|
40
|
+
def mkdir_with_force(outdir, is_force=false, is_tolerate=false)
|
|
41
|
+
if outdir.class != String
|
|
42
|
+
raise "outdir wrong? Exiting ......"
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
if ! Dir.exists?(outdir)
|
|
46
|
+
`mkdir -p #{outdir}`
|
|
47
|
+
else
|
|
48
|
+
if is_tolerate
|
|
49
|
+
;
|
|
50
|
+
elsif is_force
|
|
51
|
+
`rm -rf #{outdir}`
|
|
52
|
+
`mkdir -p #{outdir}`
|
|
53
|
+
else
|
|
54
|
+
raise "The outdir #{outdir} has already existed!"
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def read_infiles(indir, suffix='', is_all_subfolder=false)
|
|
61
|
+
infiles = Array.new
|
|
62
|
+
if ! is_all_subfolder
|
|
63
|
+
Dir.foreach(indir) do |b|
|
|
64
|
+
next if b =~ /^\./
|
|
65
|
+
if suffix.is_a?(String)
|
|
66
|
+
if suffix != ''
|
|
67
|
+
next if b !~ /#{suffix}$/
|
|
68
|
+
end
|
|
69
|
+
elsif suffix.is_a?(Array)
|
|
70
|
+
next unless suffix.any?{|i| b =~ /#{i}$/ }
|
|
71
|
+
end
|
|
72
|
+
infiles << File.join(indir, b)
|
|
73
|
+
end
|
|
74
|
+
else
|
|
75
|
+
Find.find(indir) do |path|
|
|
76
|
+
next if File.directory?(path)
|
|
77
|
+
next if File.basename(path) =~ /^\./
|
|
78
|
+
infiles << path if suffix.is_a?(String) ? path =~ /\.#{suffix}$/ : suffix.any?{|i| path =~ /#{i}$/ }
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
return(infiles)
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# Return the direct entries of +indir+ (as listed by read_infiles) whose
# File.extname, leading dot included, is contained in +suffices+.
def getFilesBySuffices(indir, suffices)
  read_infiles(indir).select do |candidate|
    suffices.include?(File.extname(candidate))
  end
end
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def get_file_path(file)
|
|
98
|
+
path = File.symlink?(file) ? File.readlink(file) : file
|
|
99
|
+
return(path)
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
################################################################################
|
|
103
|
+
|
|
104
|
+
# all credits to Wolfgang Teuber (https://gitlab.com/knugie)
|
|
105
|
+
|
|
106
|
+
def weighted_rand(weights = {})
|
|
107
|
+
raise 'Probabilities must sum up to 1' unless weights.values.inject(&:+) == 1.0
|
|
108
|
+
raise 'Probabilities must not be negative' unless weights.values.all? { |p| p >= 0 }
|
|
109
|
+
# Do more sanity checks depending on the amount of trust in the software component using this method,
|
|
110
|
+
# e.g. don't allow duplicates, don't allow non-numeric values, etc.
|
|
111
|
+
|
|
112
|
+
# Ignore elements with probability 0
|
|
113
|
+
weights = weights.reject { |k, v| v == 0.0 } # e.g. => {"a"=>0.4, "b"=>0.4, "c"=>0.2}
|
|
114
|
+
|
|
115
|
+
# Accumulate probabilities and map them to a value
|
|
116
|
+
u = 0.0
|
|
117
|
+
ranges = weights.map { |v, p| [u += p, v] } # e.g. => [[0.4, "a"], [0.8, "b"], [1.0, "c"]]
|
|
118
|
+
|
|
119
|
+
# Generate a (pseudo-)random floating point number between 0.0(included) and 1.0(excluded)
|
|
120
|
+
u = rand # e.g. => 0.4651073966724186
|
|
121
|
+
|
|
122
|
+
# Find the first value that has an accumulated probability greater than the random number u
|
|
123
|
+
ranges.find { |p, v| p > u }.last # e.g. => "b"
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
################################################################################
|
|
127
|
+
|
|
128
|
+
class Bio::Tree
  # Rewrite every tip name in place: split it on whitespace, discard the
  # first field and join the remaining fields with '_'.
  #
  # NOTE(review): presumably the discarded field is the numeric prefix that
  # MCMCtree puts in front of each species name -- confirm against the parser.
  def de_no_for_tips
    allTips.each do |tip|
      # drop(1) keeps all remaining fields; the former [1,100] slice
      # silently discarded everything after the 100th field.
      tip.name = tip.name.split(' ').drop(1).join('_')
    end
  end
end
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class Subclade
|
|
138
|
+
attr_accessor :name, :num
|
|
139
|
+
def initialize(arr)
|
|
140
|
+
#@name = arr
|
|
141
|
+
@name = arr[0].is_a?(Array) ? arr.map{|i|i.sort} : arr.sort
|
|
142
|
+
#@name = arr[0] =~ /,/ ? arr.map{|i|i.split(',').sort} : arr.sort
|
|
143
|
+
@num = nil
|
|
144
|
+
end
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# A symbiont clade together with its candidate host clades.
#
# Unlike Subclade#initialize, the names are stored exactly as given (not
# sorted) and @num is left unset until a caller assigns it.
class Symbiont < Subclade
  attr_accessor :hosts

  def initialize(arr)
    @name = arr
    @hosts = Array.new
  end

  # Pick one host and report whether data[host.num-1] >= data[@num-1] after
  # converting both values with to_f.
  #
  # data      - indexable collection of values (read at num-1).
  # is_strict - true: take the host with the highest prob; false: draw a
  #             host at random, weighted by prob.
  def is_co_evolve?(data:, is_strict:)
    chosen = pick_host(is_strict)
    data[chosen.num-1].to_f >= data[@num-1].to_f
  end

  # Same as is_co_evolve?, but +num+ of both the host and the symbiont is an
  # Array; every paired index must satisfy the comparison.
  def is_co_evolve2?(data:, is_strict:)
    chosen = pick_host(is_strict)
    chosen.num.zip(@num).all? do |host_idx, own_idx|
      data[host_idx-1].to_f >= data[own_idx-1].to_f
    end
  end

  private

  # Rebuild @host2prob (host => prob) and select a host from it.
  def pick_host(is_strict)
    @host2prob = Hash.new
    @hosts.each { |host| @host2prob[host] = host.prob }
    return @hosts.sort_by { |host| host.prob }.last if is_strict

    weighted_rand(@host2prob)
  end
end
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# A host clade carrying the probability attached to it.
class Host < Subclade
  attr_reader :prob

  # Store the probability as a Float (the argument is converted with to_f).
  def prob=(value)
    @prob = value.to_f
  end
end
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
##############################################
|
|
189
|
+
def get_lca_bootstrap(names, name2node, tree)
|
|
190
|
+
nodes = names.map{|i| name2node[i] }
|
|
191
|
+
lca = tree.lowest_common_ancestor(nodes[0], nodes[1])
|
|
192
|
+
begin
|
|
193
|
+
return(lca.bootstrap)
|
|
194
|
+
rescue
|
|
195
|
+
raise nodes.join("\t")
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# Read the tree printed on the line that follows the first line starting
# with "Species tree for FigTree" in +file+.
#
# The line is parsed with getTreeObjFromNwkString and the tip names are
# rewritten by de_no_for_tips before the tree is returned.
#
# Raises RuntimeError when no tree could be read after the marker line.
def read_mcmctree_out(file)
  is_start = false
  tree = nil
  #(((1_t5, 2_t9) 33 , ((3_t21, (4_t22, 5_t18) 36 ) 35 , (6_t3, 7_t6) 37 ) 34 ) 32 , ... ) 31 ;
  # File.foreach closes the file even when the block breaks or raises.
  File.foreach(file) do |line|
    line.chomp!
    if is_start
      tree = getTreeObjFromNwkString(line)
      break
    end
    is_start = true if line =~ /^Species tree for FigTree/
  end

  # Fail with a readable message instead of NoMethodError on nil below.
  raise "no tree could be read after 'Species tree for FigTree' in #{file}" if tree.nil?

  tree.de_no_for_tips

  return(tree)
end
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# Parse a tab-separated file with one symbiont per line.
#
# Line layout, as read by the code: column 1 is a comma-separated list of
# symbiont tip names; every further column is "<comma-separated host tip
# names>:<probability>". Lines that are empty or start with '#' are skipped.
#
# When the host probabilities of a line sum to less than 1, an extra Host
# built from +root_two_children_names+ receives the remainder.
#
# Returns a Hash mapping the symbiont name Array to its Symbiont object.
def get_rtc(file, root_two_children_names)
  rtc_info = Hash.new
  # File.foreach closes the file even when parsing raises.
  File.foreach(file) do |line|
    line.chomp!
    next if line =~ /^#|^$/
    line_arr = line.split("\t")

    # NOTE(review): gsub('_', '_') is a no-op as written; get_scm uses
    # gsub(' ', '_') at the same place -- confirm which one is intended.
    symbiont = line_arr[0].gsub('_', '_').split(',')
    rtc = Symbiont.new(symbiont)

    line_arr.drop(1).each do |ele|
      host_names, prob = ele.split(':')
      host_obj = Host.new(host_names.gsub('_', '_').split(','))
      host_obj.prob = prob
      rtc.hosts << host_obj
    end

    root_obj = Host.new(root_two_children_names)

    # Seed the sum with 0.0 so that a line without host columns gives 0.0
    # rather than nil (which made the comparison below raise).
    curr_total_prob = rtc.hosts.map{|host|host.prob}.reduce(0.0, :+)
    if curr_total_prob < 1
      root_obj.prob = 1 - curr_total_prob
      rtc.hosts << root_obj
    end

    rtc_info[symbiont] = rtc
  end
  return(rtc_info)
end
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
# Parse a tab-separated joint-reconstruction file.
#
# The first line that is neither empty nor a '#' comment is the header: one
# column per symbiont, each a comma-separated list of tip names (spaces are
# replaced by '_'). Every later line holds one host per symbiont column
# (comma-separated tip names, or 'root') followed by a probability in the
# last column. 'root' is replaced by +root_two_children_names+.
#
# When the probabilities sum to less than 1, an extra Host made of
# +root_two_children_names+ repeated once per symbiont gets the remainder.
#
# Returns a one-entry Hash mapping the symbiont name Arrays to the Symbiont.
# Raises RuntimeError when the file holds no header line.
def get_scm(file, root_two_children_names)
  scm_info = Hash.new
  symbionts = nil
  rtc = nil

  # File.foreach closes the file even when parsing raises.
  File.foreach(file) do |line|
    line.chomp!
    next if line =~ /^#|^$/
    line_arr = line.split("\t")

    # The header is the first data line, wherever it sits in the file; the
    # former `$. == 1` test missed it when comments or blank lines came first.
    if rtc.nil?
      symbionts = line_arr.map{|i| i.gsub(' ', '_').split(',') }
      rtc = Symbiont.new(symbionts)
      next
    end

    prob = line_arr[-1].to_f
    hosts = line_arr[0, line_arr.size-1].map{|i|i.split(',')}
    hosts.map!{|a| a[0] == 'root' ? root_two_children_names : a } #2023-01, in case of >=1 root (fl; Z)
    host_obj = Host.new(hosts)
    host_obj.prob = prob
    rtc.hosts << host_obj
  end

  raise "no header line with symbiont names found in #{file}" if rtc.nil?

  root_obj = Host.new([root_two_children_names] * symbionts.size)
  # Seeded with 0.0 so a header-only file gives 0.0 rather than nil.
  curr_total_prob = rtc.hosts.map{|host|host.prob}.reduce(0.0, :+)
  if curr_total_prob < 1
    root_obj.prob = 1 - curr_total_prob
    rtc.hosts << root_obj
  end
  scm_info[symbionts] = rtc

  return(scm_info)
end
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
##############################################
|
|
296
|
+
def auto_detect_mj(rtc_files) # identify whether marginal or joint
|
|
297
|
+
first_file = rtc_files[0]
|
|
298
|
+
in_fh = File.open(first_file, 'r')
|
|
299
|
+
first_line = in_fh.readline.chomp
|
|
300
|
+
if first_line =~ /:[10] (\b | \.\d+)/x
|
|
301
|
+
type = 'marginal'
|
|
302
|
+
else
|
|
303
|
+
type = 'joint'
|
|
304
|
+
end
|
|
305
|
+
in_fh.close
|
|
306
|
+
STDERR.puts "type auto-detected\t" + type.colorize(:yellow)
|
|
307
|
+
return(type)
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
##############################################
|
|
312
|
+
# Script entry point: filter the rows of an MCMCtree 'mcmc.txt' sample file
# by the symbiont/host constraints read from --rtc/--marginal/--joint, and
# print the surviving rows to STDOUT.
if __FILE__ == $0
  indir = nil
  infile = nil
  type = nil
  rtc_files = Array.new
  mcmctxt_file = nil
  is_rtc = true
  is_renum = true
  is_strict = false

  rtc_info = Hash.new
  scm_info = Hash.new

  ##############################################
  opts = GetoptLong.new(
    ['--indir', GetoptLong::REQUIRED_ARGUMENT],
    ['-i', GetoptLong::REQUIRED_ARGUMENT],
    ['--marginal', GetoptLong::REQUIRED_ARGUMENT],
    ['--joint', GetoptLong::REQUIRED_ARGUMENT],
    ['--rtc', '--rrtc', GetoptLong::REQUIRED_ARGUMENT],
    ['--mcmctxt', GetoptLong::REQUIRED_ARGUMENT],
    ['--is_rtc', '--is_rrtc', '--rrtc_file', '--rtc_file', GetoptLong::REQUIRED_ARGUMENT],
    ['--strict', '--is_strict', GetoptLong::NO_ARGUMENT],
    ['--no_renum', '--no_re_num', GetoptLong::NO_ARGUMENT],
  )

  opts.each do |opt, value|
    case opt
      when '--indir'
        indir = value
      when '-i'
        infile = value
      when '--marginal'
        rtc_files = File.directory?(value) ? read_infiles(value) : [value]
        type = 'marginal'
      when '--joint'
        rtc_files = File.directory?(value) ? read_infiles(value) : [value]
        type = 'joint'
      when /^--r?rtc(file)?$/
        rtc_files = File.directory?(value) ? read_infiles(value) : [value]
      when '--mcmctxt'
        mcmctxt_file = value
      when '--is_rtc', '--is_rrtc'
        # Grouped alternation: the former /^true|T$/i parsed as
        # (^true)|(T$) and so accepted any value merely ending in "t".
        is_rtc = value =~ /^(true|t)$/i ? true : false
      when '--is_strict', '--strict'
        is_strict = true
        STDERR.puts "is_strict:\t" + "true".colorize(:yellow)
      when '--no_renum', '--no_re_num'
        is_renum = false
    end
  end

  ##############################################
  unless indir.nil?
    infile = File.join(indir, 'out') # file 'out' from the mcmctree output
    mcmctxt_file = File.join(indir, 'mcmc.txt')
    # The former `if is_output` block here referenced the undefined names
    # is_output and outfile, raising NameError whenever --indir was given;
    # nothing it assigned was used afterwards, so it is dropped.
  end

  raise "the mcmctree 'out' file has to be provided (--indir or -i)! Exiting ......" if infile.nil?
  raise "the mcmc.txt file has to be provided (--indir or --mcmctxt)! Exiting ......" if mcmctxt_file.nil?
  raise "rtc_file or scm_file has to be provided! Exiting ......" if rtc_files.empty?

  type = auto_detect_mj(rtc_files) if type.nil? # identify whether marginal or joint

  ##############################################
  tree = read_mcmctree_out(infile) # "out", NOT "mcmc.txt"

  root_two_children_names = tree.children(tree.root).map{|i|tree.tips(i)[0].name}

  case type
    when 'marginal'
      rtc_files.each do |rtc_file|
        rtc_info.merge! get_rtc(rtc_file, root_two_children_names)
      end
    when 'joint'
      rtc_files.each do |scm_file|
        # note scm_info
        scm_info.merge! get_scm(scm_file, root_two_children_names)
      end
    else
      raise "rtc_file or scm_file has to be provided! Exiting ......"
  end

  # Drop every constraint that mentions a name absent from the tree.
  name2node, _node2name = tree.getNameNodeRela
  rtc_info.delete_if{|names, rtc| not names.all?{|name|name2node.include?(name)}}
  scm_info.delete_if{|names, rtc| not names.flatten.all?{|name|name2node.include?(name)}}

  # Subtracted from each node's bootstrap value to obtain the 1-based index
  # that is_co_evolve?/is_co_evolve2? read from a sample row.
  minus = tree.allTips.size - 1

  ##############################################
  rtc_info.each_pair do |names, rtc|
    begin
      rtc.num = get_lca_bootstrap(names, name2node, tree) - minus
      rtc.hosts.map{|obj|obj.num = get_lca_bootstrap(obj.name, name2node, tree) - minus}
    rescue
      raise "species #{names} or #{rtc.hosts} not found"
    end
  end

  scm_info.each_pair do |names, rtc|
    rtc.num = names.map{|names2|get_lca_bootstrap(names2, name2node, tree) - minus}
    rtc.hosts.each do |obj|
      # One index per symbiont column of this joint host assignment.
      obj.num = obj.name.map{|host_names| get_lca_bootstrap(host_names, name2node, tree) - minus}
    end
  end

  ##############################################
  # mcmc.txt is tab-separated, so CSV (comma separator) hands back each line
  # as a one-element row; the header line is read as the first row too.
  col_data = Array.new
  CSV.foreach(mcmctxt_file) do |row|
    data = row[0].split("\t")
    if is_rtc
      if not rtc_info.empty?
        col_data << row if rtc_info.all?{|rtc_name, rtc| rtc.is_co_evolve?(data:data, is_strict:is_strict) }
      elsif not scm_info.empty?
        col_data << row if scm_info.all?{|rtc_name, rtc| rtc.is_co_evolve2?(data:data, is_strict:is_strict) }
      end
    else
      col_data << row
    end
  end

  STDERR.puts "# of samples after filtering\t" + col_data.size.to_s.colorize(:red)

  col_data.each_with_index do |row_arr, index| # row_arr: single-element array
    # Renumber the first column of every kept row except the one at index 0.
    if is_renum and index > 0
      posteriors = row_arr[0].split("\t")
      posteriors[0] = index
      row_arr = [posteriors.join("\t")]
    end
    puts row_arr
  end
end
|
|
463
|
+
|
|
464
|
+
|
TreeSAK/fa2phy.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from Bio import AlignIO
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
fa2phy_usage = '''
|
|
6
|
+
======= fa2phy example commands =======
|
|
7
|
+
|
|
8
|
+
TreeSAK fa2phy -i msa.fa -o msa.phy
|
|
9
|
+
|
|
10
|
+
=======================================
|
|
11
|
+
'''
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def fa2phy(args):
    """Write the FASTA alignment ``args['i']`` to ``args['o']`` as text.

    The output starts with a ``"<number of sequences> <alignment length>"``
    line, followed by one line per sequence: the sequence id padded with
    spaces to the longest id plus two characters, then the full sequence.
    """
    fasta_in = args['i']
    phy_out = args['o']

    msa = AlignIO.read(fasta_in, 'fasta')

    # width of the id column: longest id plus two separating spaces
    id_column_width = max((len(record.id) for record in msa), default=0) + 2

    with open(phy_out, 'w') as out_handle:
        out_handle.write('%s %s\n' % (len(msa), msa.get_alignment_length()))
        for record in msa:
            padded_id = record.id.ljust(id_column_width)
            out_handle.write('%s%s\n' % (padded_id, str(record.seq)))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
if __name__ == '__main__':

    # command-line entry point: both options are mandatory
    parser = argparse.ArgumentParser()
    for flag, description in (('-i', 'input MSA in fasta format'),
                              ('-o', 'output MSA in phylip format')):
        parser.add_argument(flag, required=True, help=description)
    args = vars(parser.parse_args())
    fa2phy(args)
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import argparse
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
filter_rename_ar53_usage = '''
|
|
8
|
+
===================================== filter_rename_ar53 example commands =====================================
|
|
9
|
+
|
|
10
|
+
TreeSAK filter_rename_ar53 -i seq_dir -x fa -g interested_gnms.txt -m interested_marker.txt -o seq_dir_renamed
|
|
11
|
+
|
|
12
|
+
===============================================================================================================
|
|
13
|
+
'''
|
|
14
|
+
|
|
15
|
+
def sep_path_basename_ext(file_in):
    """Split a path into ``(file name, folder, base name, extension)``.

    The folder is ``'.'`` when *file_in* has no directory part; the extension
    is returned without its leading dot.
    """
    folder, file_name = os.path.split(file_in)
    stem, dotted_ext = os.path.splitext(file_name)
    return file_name, folder or '.', stem, dotted_ext[1:]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_shared_uniq_elements(list_1, list_2):
    """Compare two collections.

    Returns a tuple ``(shared, only_in_1, only_in_2)``: the set of elements
    present in both inputs, then the elements of each input (original order,
    duplicates kept) that are not shared.
    """
    shared_set = set(list_1).intersection(list_2)
    only_in_1 = [item for item in list_1 if item not in shared_set]
    only_in_2 = [item for item in list_2 if item not in shared_set]
    return shared_set, only_in_1, only_in_2
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _read_id_set(id_txt):
    """Return the ids listed in *id_txt*, or None when no such file is given.

    One id per line; surrounding whitespace is stripped and blank lines are
    ignored.
    """
    if id_txt is None or not os.path.isfile(id_txt):
        return None
    with open(id_txt) as id_handle:
        return {line.strip() for line in id_handle if line.strip()}


def filter_rename_ar53(args):
    """Copy marker FASTA files, optionally filtered, with renamed headers.

    Keys read from *args*:
        i: folder holding the marker sequence files
        x: file extension of those files (without the dot)
        g: optional text file of genome ids to keep, one per line
        m: optional text file of marker ids to keep, one per line
        o: output folder (created; must not exist unless ``f`` is set)
        f: when true, an existing output folder is removed first

    For every selected marker the sequences are written to
    ``<o>/<marker>.<x>`` with the header ``><sequence id>_XXX``. Sequences
    are kept when no genome file is given or when their id is listed in it.
    The program exits (``exit()``) when the output folder exists without
    ``f``, when a given id file holds no id, or when no input file matches.
    """
    import shutil  # local import: not guaranteed to be imported at module level

    marker_seq_dir = args['i']
    marker_seq_ext = args['x']
    interested_gnm_txt = args['g']
    interested_marker_txt = args['m']
    op_dir = args['o']
    force_overwrite = args['f']

    # create output folder
    if os.path.isdir(op_dir):
        if force_overwrite:
            # rmtree instead of `rm -r` through a shell: no quoting problems
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()
    os.mkdir(op_dir)

    # None means "no filter requested": -g and -m are optional and default to
    # None, which os.path.isfile() rejects with a TypeError.
    interested_marker_set = _read_id_set(interested_marker_txt)
    if interested_marker_set is not None and not interested_marker_set:
        print('No marker provided in %s, program exited!' % interested_marker_txt)
        exit()

    interested_gnm_set = _read_id_set(interested_gnm_txt)
    if interested_gnm_set is not None and not interested_gnm_set:
        print('No genome provided in %s, program exited!' % interested_gnm_txt)
        exit()

    marker_seq_re = '%s/*.%s' % (marker_seq_dir, marker_seq_ext)
    marker_seq_list = [os.path.basename(i) for i in glob.glob(marker_seq_re)]
    marker_seq_id_list = [('.'.join(i.split('.')[:-1])) for i in marker_seq_list]
    if not marker_seq_list:
        print('No file found in %s, program exited!' % marker_seq_dir)
        exit()

    if interested_marker_set is None:
        marker_to_process = marker_seq_id_list
    else:
        shared_set, _, interested_marker_uniq = get_shared_uniq_elements(marker_seq_id_list, interested_marker_set)
        if interested_marker_uniq:
            print('Sequences for the following interested markers were not found:')
            print(','.join(interested_marker_uniq))
        marker_to_process = shared_set

    for marker_id in marker_to_process:
        pwd_file_in = '%s/%s.%s' % (marker_seq_dir, marker_id, marker_seq_ext)
        pwd_file_out = '%s/%s.%s' % (op_dir, marker_id, marker_seq_ext)
        with open(pwd_file_out, 'w') as pwd_op_file_handle:
            for each_seq in SeqIO.parse(pwd_file_in, 'fasta'):
                gnm_id = each_seq.id
                if interested_gnm_set is None or gnm_id in interested_gnm_set:
                    pwd_op_file_handle.write('>%s_XXX\n' % gnm_id)
                    pwd_op_file_handle.write('%s\n' % str(each_seq.seq))

    print('Done!')
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
if __name__ == '__main__':

    # command-line entry point; -g, -m and -f are optional (argparse defaults:
    # None for the two id files, False for the flag)
    filter_rename_ar53_parser = argparse.ArgumentParser()
    filter_rename_ar53_parser.add_argument('-i', required=True, help='sequence folder')
    filter_rename_ar53_parser.add_argument('-x', required=True, help='file extension')
    filter_rename_ar53_parser.add_argument('-g', help='interested genome, no ext, one id per line')
    filter_rename_ar53_parser.add_argument('-m', help='interested marker, no ext, one id per line')
    filter_rename_ar53_parser.add_argument('-o', required=True, help='output folder')
    filter_rename_ar53_parser.add_argument('-f', action="store_true", help='force overwrite')
    args = vars(filter_rename_ar53_parser.parse_args())
    filter_rename_ar53(args)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import argparse
|
|
3
|
+
from ete3 import Tree
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
format_leaf_name_usage = '''
|
|
7
|
+
========= FLN (Format Leaf Name) example commands =========
|
|
8
|
+
|
|
9
|
+
BioSAK FLN -i input.tree -o output.tree -s2u -nsqm -ndqm
|
|
10
|
+
BioSAK FLN -i input.tree -o output.tree -ns -nsqm
|
|
11
|
+
|
|
12
|
+
===========================================================
|
|
13
|
+
'''
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _clean_leaf_name(leaf_name, space_to_underscore, no_space,
                     no_single_quotation_mark, no_double_quotation_mark):
    """Return *leaf_name* with the requested characters replaced or removed."""
    if space_to_underscore:
        leaf_name = leaf_name.replace(' ', '_')
    if no_space:
        leaf_name = leaf_name.replace(' ', '')
    if no_single_quotation_mark:
        leaf_name = leaf_name.replace("'", '')
    if no_double_quotation_mark:
        leaf_name = leaf_name.replace('"', '')
    return leaf_name


def format_leaf_name(args):
    """Rewrite the leaf names of a tree file and save the result.

    Keys read from *args*:
        i:    input tree file
        fmt:  tree format code passed to ete3 for reading and writing
        o:    output tree file
        ns:   remove spaces from leaf names
        s2u:  replace spaces in leaf names with underscores
        nsqm: remove single quotation marks from leaf names
        ndqm: remove double quotation marks from leaf names

    The program exits (``exit()``) when the input file does not exist or
    when both ``ns`` and ``s2u`` are requested.
    """
    tree_file_in = args['i']
    tree_format = args['fmt']
    tree_file_out = args['o']
    no_space = args['ns']
    space_to_underscore = args['s2u']
    no_single_quotation_mark = args['nsqm']
    no_double_quotation_mark = args['ndqm']

    if os.path.isfile(tree_file_in) is False:
        print('Tree file not found, program exited!')
        exit()

    if no_space and space_to_underscore:
        print('Two actions (-ns and -s2u) specified to spaces in tree leaves, program exited!')
        exit()

    # A -fmt value typed on the command line reaches us as a string (the
    # parser declares no type), so coerce it to an integer here.
    # NOTE(review): ete3 appears to key its newick formats by integer code --
    # confirm against the ete3 Tree documentation.
    tree_format = int(tree_format)

    t = Tree(tree_file_in, format=tree_format)

    # The new name depends only on the old one, so each leaf is renamed in a
    # single pass.
    for leaf in t:
        leaf.name = _clean_leaf_name(leaf.name, space_to_underscore, no_space,
                                     no_single_quotation_mark, no_double_quotation_mark)
    t.write(format=tree_format, outfile=tree_file_out)

    print('Done!')
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
if __name__ == '__main__':

    # command-line entry point
    format_leaf_name_parser = argparse.ArgumentParser()
    format_leaf_name_parser.add_argument('-i', required=True, help='input tree')
    # type=int so that a value given on the command line has the same type
    # as the integer default
    format_leaf_name_parser.add_argument('-fmt', required=False, default=1, type=int, help='tree format, default: 1')
    format_leaf_name_parser.add_argument('-o', required=True, help='output tree')
    format_leaf_name_parser.add_argument('-s2u', required=False, action="store_true", help='change space in tree leaves to underscore')
    format_leaf_name_parser.add_argument('-ns', required=False, action="store_true", help='remove space from leaf names')
    format_leaf_name_parser.add_argument('-nsqm', required=False, action="store_true", help='remove single quotation marks from leaf names')
    format_leaf_name_parser.add_argument('-ndqm', required=False, action="store_true", help='remove double quotation marks from leaf names')
    args = vars(format_leaf_name_parser.parse_args())
    format_leaf_name(args)
|