treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/do_rrtc.rb ADDED
@@ -0,0 +1,464 @@
1
+ #! /bin/env ruby
2
+
3
+
4
+ ################################################################################
5
+
6
+ # updated on 2023-01-11
7
+ # auto-detect input type (marginal or joint)
8
+ # updated on 2023-01-09
9
+ # --rtc a folder as an input allowed
10
+ # updated on 2022-11-18
11
+ # bugs fixed
12
+ # updated on 2022-11-18 for non-independent ASR based on scm
13
+
14
+
15
+ ################################################################################
16
+
17
+ require 'find'
18
+ require 'getoptlong'
19
+ require 'csv'
20
+ require 'parallel'
21
+ require 'bio-nwk'
22
+ require 'colorize'
23
+
24
+
25
+ ################################################################################
26
+ class Dir
27
+ def self.mkdirs(path)
28
+ if(!File.directory?(path))
29
+ if(!mkdirs(File.dirname(path)))
30
+ return false;
31
+ end
32
+ mkdir(path)
33
+ end
34
+ return true
35
+ end
36
+ end
37
+
38
+
39
+ ################################################################################
40
+ def mkdir_with_force(outdir, is_force=false, is_tolerate=false)
41
+ if outdir.class != String
42
+ raise "outdir wrong? Exiting ......"
43
+ end
44
+
45
+ if ! Dir.exists?(outdir)
46
+ `mkdir -p #{outdir}`
47
+ else
48
+ if is_tolerate
49
+ ;
50
+ elsif is_force
51
+ `rm -rf #{outdir}`
52
+ `mkdir -p #{outdir}`
53
+ else
54
+ raise "The outdir #{outdir} has already existed!"
55
+ end
56
+ end
57
+ end
58
+
59
+
60
+ def read_infiles(indir, suffix='', is_all_subfolder=false)
61
+ infiles = Array.new
62
+ if ! is_all_subfolder
63
+ Dir.foreach(indir) do |b|
64
+ next if b =~ /^\./
65
+ if suffix.is_a?(String)
66
+ if suffix != ''
67
+ next if b !~ /#{suffix}$/
68
+ end
69
+ elsif suffix.is_a?(Array)
70
+ next unless suffix.any?{|i| b =~ /#{i}$/ }
71
+ end
72
+ infiles << File.join(indir, b)
73
+ end
74
+ else
75
+ Find.find(indir) do |path|
76
+ next if File.directory?(path)
77
+ next if File.basename(path) =~ /^\./
78
+ infiles << path if suffix.is_a?(String) ? path =~ /\.#{suffix}$/ : suffix.any?{|i| path =~ /#{i}$/ }
79
+ end
80
+ end
81
+ return(infiles)
82
+ end
83
+
84
+
85
# Return the direct entries of +indir+ (as listed by read_infiles) whose
# extension, as reported by File.extname (i.e. including the leading dot),
# is contained in +suffices+.
def getFilesBySuffices(indir, suffices)
  read_infiles(indir).select do |candidate|
    suffices.include?(File.extname(candidate))
  end
end
95
+
96
+
97
+ def get_file_path(file)
98
+ path = File.symlink?(file) ? File.readlink(file) : file
99
+ return(path)
100
+ end
101
+
102
+ ################################################################################
103
+
104
+ # all credits to Wolfgang Teuber (https://gitlab.com/knugie)
105
+
106
+ def weighted_rand(weights = {})
107
+ raise 'Probabilities must sum up to 1' unless weights.values.inject(&:+) == 1.0
108
+ raise 'Probabilities must not be negative' unless weights.values.all? { |p| p >= 0 }
109
+ # Do more sanity checks depending on the amount of trust in the software component using this method,
110
+ # e.g. don't allow duplicates, don't allow non-numeric values, etc.
111
+
112
+ # Ignore elements with probability 0
113
+ weights = weights.reject { |k, v| v == 0.0 } # e.g. => {"a"=>0.4, "b"=>0.4, "c"=>0.2}
114
+
115
+ # Accumulate probabilities and map them to a value
116
+ u = 0.0
117
+ ranges = weights.map { |v, p| [u += p, v] } # e.g. => [[0.4, "a"], [0.8, "b"], [1.0, "c"]]
118
+
119
+ # Generate a (pseudo-)random floating point number between 0.0(included) and 1.0(excluded)
120
+ u = rand # e.g. => 0.4651073966724186
121
+
122
+ # Find the first value that has an accumulated probability greater than the random number u
123
+ ranges.find { |p, v| p > u }.last # e.g. => "b"
124
+ end
125
+
126
+ ################################################################################
127
+
128
class Bio::Tree
  # Rename every tip in place: the name is split on whitespace, the first
  # token is dropped and the following (at most 100) tokens are joined with '_'.
  # NOTE(review): presumably this strips the numeric prefix of the mcmctree
  # tip labels (e.g. "1_t5") -- confirm how the Newick parser splits names.
  def de_no_for_tips
    allTips.each do |tip|
      tokens = tip.name.split(' ')
      tip.name = tokens[1, 100].join('_')
    end
  end
end
135
+
136
+
137
+ class Subclade
138
+ attr_accessor :name, :num
139
+ def initialize(arr)
140
+ #@name = arr
141
+ @name = arr[0].is_a?(Array) ? arr.map{|i|i.sort} : arr.sort
142
+ #@name = arr[0] =~ /,/ ? arr.map{|i|i.split(',').sort} : arr.sort
143
+ @num = nil
144
+ end
145
+ end
146
+
147
+
148
# A symbiont clade together with its candidate host clades.
#
# name  - stored exactly as given (not sorted, unlike Subclade)
# hosts - Array of Host objects, filled by the caller
class Symbiont < Subclade
  attr_accessor :hosts

  def initialize(arr)
    @name = arr
    @hosts = Array.new
  end

  # True when, in the sample +data+, the value at the selected host's column
  # is greater than or equal to the value at this symbiont's column.
  # +num+ values are 1-based positions into +data+; entries are compared as floats.
  def is_co_evolve?(data:, is_strict:)
    host = pick_host(is_strict)
    data[host.num-1].to_f >= data[@num-1].to_f
  end

  # Variant for objects whose +num+ is an Array: the condition above must
  # hold for every (host column, symbiont column) pair.
  def is_co_evolve2?(data:, is_strict:)
    host = pick_host(is_strict)
    host.num.zip(@num).all? { |host_col, own_col| data[host_col-1].to_f >= data[own_col-1].to_f }
  end

  private

  # Rebuild the host => probability table, then return either the host that
  # sorts last by probability (strict) or one drawn by weighted_rand.
  def pick_host(is_strict)
    @host2prob = Hash.new
    @hosts.each { |host| @host2prob[host] = host.prob }
    if is_strict
      @hosts.sort_by { |host| host.prob }.last
    else
      weighted_rand(@host2prob)
    end
  end
end
175
+
176
+
177
# A host clade carrying the probability assigned to it.
class Host < Subclade
  attr_reader :prob

  # The probability is always stored as a Float, whatever type is assigned.
  def prob=(prob)
    @prob = prob.to_f
  end
end
186
+
187
+
188
+ ##############################################
189
+ def get_lca_bootstrap(names, name2node, tree)
190
+ nodes = names.map{|i| name2node[i] }
191
+ lca = tree.lowest_common_ancestor(nodes[0], nodes[1])
192
+ begin
193
+ return(lca.bootstrap)
194
+ rescue
195
+ raise nodes.join("\t")
196
+ end
197
+ end
198
+
199
+
200
# Read the species tree from an mcmctree "out" file.
#
# The tree is taken from the line that immediately follows the first line
# starting with "Species tree for FigTree", parsed with
# getTreeObjFromNwkString, and its tips are renamed with de_no_for_tips.
#
# Raises RuntimeError when no such tree line is present in +file+.
def read_mcmctree_out(file)
  is_start = false
  tree = nil
  # example of the expected tree line:
  #(((1_t5, 2_t9) 33 , ((3_t21, (4_t22, 5_t18) 36 ) 35 , (6_t3, 7_t6) 37 ) 34 ) 32 , (((((8_t8, 9_t12) 42 , (10_t23, 11_t14) 43 ) 41 , (12_t2, 13_t4) 44 ) 40 , 14_t24) 39 , ((((15_t30, 16_t20) 48 , 17_t13) 47 , (((18_t11, 19_t10) 51 , ((20_t1, 21_t26) 53 , (22_t7, (23_t19, 24_t27) 55 ) 54 ) 52 ) 50 , ((((25_t25, 26_t17) 59 , 27_t15) 58 , 28_t16) 57 , 29_t29) 56 ) 49 ) 46 , 30_t28) 45 ) 38 ) 31 ;
  # File.foreach closes the file even when parsing raises or the loop breaks.
  File.foreach(file) do |line|
    line.chomp!
    if is_start
      tree = getTreeObjFromNwkString(line)
      break
    end
    is_start = true if line =~ /^Species tree for FigTree/
  end

  # Fail with a readable message instead of a NoMethodError on nil below.
  raise "No tree found after the 'Species tree for FigTree' line in #{file}" if tree.nil?

  tree.de_no_for_tips

  #get_internal_node_index(tree)
  return(tree)
end
220
+
221
+
222
# Parse a marginal RTC file.
#
# Each non-empty line not starting with '#' is tab-separated: the first field
# is the comma-separated symbiont clade, every further field has the form
# "<comma-separated host clade>:<probability>".
# When the host probabilities of a line sum to less than 1, a Host built from
# +root_two_children_names+ is appended with the remaining probability.
#
# Returns a Hash mapping the symbiont name Array to its Symbiont object.
def get_rtc(file, root_two_children_names)
  rtc_info = Hash.new
  # File.foreach closes the handle even if a line fails to parse.
  File.foreach(file) do |line|
    line.chomp!
    next if line =~ /^#|^$/
    line_arr = line.split("\t")

    # NOTE(review): the original applied gsub('_', '_') to the symbiont and host
    # fields, which changes nothing and is therefore dropped; get_scm uses
    # gsub(' ', '_') -- confirm whether that was intended here as well.
    symbiont = line_arr[0].split(',')
    rtc = Symbiont.new(symbiont)

    line_arr.drop(1).each do |ele|
      host_field, prob = ele.split(':')
      host_obj = Host.new(host_field.split(','))
      host_obj.prob = prob
      rtc.hosts << host_obj
    end

    root_obj = Host.new(root_two_children_names)

    # Start the sum at 0.0 so that a line without host fields yields 0.0
    # (the root then receives probability 1) instead of nil.
    curr_total_prob = rtc.hosts.map{|host| host.prob }.inject(0.0, :+)
    if curr_total_prob < 1
      root_obj.prob = 1 - curr_total_prob
      rtc.hosts << root_obj
    end

    rtc_info[symbiont] = rtc
  end
  return(rtc_info)
end
256
+
257
+
258
# Parse a joint (scm) file.
#
# The first non-empty line not starting with '#' is the header: tab-separated
# symbiont clades, each a comma-separated name list (spaces become '_').
# Every later line lists one host clade per symbiont (tab-separated,
# comma-separated names; a clade whose first name is 'root' is replaced by
# +root_two_children_names+) followed by the probability in the last field.
# When the probabilities sum to less than 1, a Host made of
# +root_two_children_names+ for every symbiont receives the remainder.
#
# Returns a Hash with a single entry: symbiont clades => Symbiont object.
# Raises RuntimeError when the file contains no header line.
def get_scm(file, root_two_children_names)
  scm_info = Hash.new
  symbionts = nil
  rtc = nil

  # File.foreach closes the handle even if a line fails to parse.
  File.foreach(file) do |line|
    line.chomp!
    next if line =~ /^#|^$/
    line_arr = line.split("\t")

    # The header is the first data line. It was formerly recognised by
    # physical line number 1, which failed when the file started with a
    # comment or a blank line.
    if symbionts.nil?
      symbionts = line_arr.map{|i| i.gsub(' ', '_').split(',') }
      rtc = Symbiont.new(symbionts)
      next
    end

    prob = line_arr[-1].to_f
    hosts = line_arr[0, line_arr.size-1].map{|i| i.split(',') }
    hosts.map!{|a| a[0] == 'root' ? root_two_children_names : a } #2023-01, in case of >=1 root (fl; Z)
    host_obj = Host.new(hosts)
    host_obj.prob = prob
    rtc.hosts << host_obj
  end

  raise "No header line found in #{file}" if symbionts.nil?

  root_obj = Host.new([root_two_children_names] * symbionts.size)
  # Start the sum at 0.0 so that a file without host lines yields 0.0, not nil.
  curr_total_prob = rtc.hosts.map{|host| host.prob }.inject(0.0, :+)
  if curr_total_prob < 1
    root_obj.prob = 1 - curr_total_prob
    rtc.hosts << root_obj
  end
  scm_info[symbionts] = rtc

  return(scm_info)
end
293
+
294
+
295
+ ##############################################
296
+ def auto_detect_mj(rtc_files) # identify whether marginal or joint
297
+ first_file = rtc_files[0]
298
+ in_fh = File.open(first_file, 'r')
299
+ first_line = in_fh.readline.chomp
300
+ if first_line =~ /:[10] (\b | \.\d+)/x
301
+ type = 'marginal'
302
+ else
303
+ type = 'joint'
304
+ end
305
+ in_fh.close
306
+ STDERR.puts "type auto-detected\t" + type.colorize(:yellow)
307
+ return(type)
308
+ end
309
+
310
+
311
+ ##############################################
312
# Script entry point: filter the MCMC samples of an mcmctree run so that only
# samples compatible with the given relative time constraints are printed.
if __FILE__ == $0
  indir = nil
  infile = nil
  type = nil
  rtc_files = Array.new
  mcmctxt_file = nil
  is_rtc = true
  is_renum = true
  is_strict = false

  rtc_info = Hash.new
  scm_info = Hash.new

  ##############################################
  opts = GetoptLong.new(
    ['--indir', GetoptLong::REQUIRED_ARGUMENT],
    ['-i', GetoptLong::REQUIRED_ARGUMENT],
    ['--marginal', GetoptLong::REQUIRED_ARGUMENT],
    ['--joint', GetoptLong::REQUIRED_ARGUMENT],
    ['--rtc', '--rrtc', GetoptLong::REQUIRED_ARGUMENT],
    ['--mcmctxt', GetoptLong::REQUIRED_ARGUMENT],
    ['--is_rtc', '--is_rrtc', '--rrtc_file', '--rtc_file', GetoptLong::REQUIRED_ARGUMENT],
    ['--strict', '--is_strict', GetoptLong::NO_ARGUMENT],
    ['--no_renum', '--no_re_num', GetoptLong::NO_ARGUMENT],
  )

  opts.each do |opt, value|
    case opt
      when '--indir'
        indir = value
      when '-i'
        infile = value
      when '--marginal'
        rtc_files = File.directory?(value) ? read_infiles(value) : [value]
        type = 'marginal'
      when '--joint'
        rtc_files = File.directory?(value) ? read_infiles(value) : [value]
        type = 'joint'
      when /^--r?rtc(file)?$/
        rtc_files = File.directory?(value) ? read_infiles(value) : [value]
      when '--mcmctxt'
        mcmctxt_file = value
      when '--is_rtc', '--is_rrtc'
        # Both alternatives are anchored; the former /^true|T$/i also accepted
        # any value merely ending in "t" (e.g. "not").
        is_rtc = value =~ /^(true|t)$/i ? true : false
      when '--is_strict', '--strict'
        is_strict = true
        STDERR.puts "is_strict:\t" + "true".colorize(:yellow)
      when '--no_renum', '--no_re_num'
        is_renum = false
    end
  end

  ##############################################
  # --indir is a shortcut for the two standard mcmctree output files.
  # (A former block here referenced the undefined names is_output and outfile
  # and raised NameError whenever --indir was used; it produced nothing that
  # was used afterwards and has been removed.)
  unless indir.nil?
    infile = File.join(indir, 'out') # file 'out' from the mcmctree output
    mcmctxt_file = File.join(indir, 'mcmc.txt')
  end

  raise "the mcmctree 'out' file has to be provided (-i or --indir)! Exiting ......" if infile.nil?
  raise "the mcmc.txt file has to be provided (--mcmctxt or --indir)! Exiting ......" if mcmctxt_file.nil?

  # identify whether marginal or joint; without any RTC file the type stays nil
  # and the case statement below reports the missing input
  type = auto_detect_mj(rtc_files) if type.nil? and not rtc_files.empty?

  ##############################################
  tree = read_mcmctree_out(infile) # "out", NOT "mcmc.txt"

  # one representative tip from each child clade of the root
  root_two_children_names = tree.children(tree.root).map{|i| tree.tips(i)[0].name }

  case type
    when 'marginal'
      rtc_files.each do |rtc_file|
        rtc_info.merge! get_rtc(rtc_file, root_two_children_names)
      end
    when 'joint'
      rtc_files.each do |scm_file|
        # note scm_info
        scm_info.merge! get_scm(scm_file, root_two_children_names)
      end
    else
      raise "rtc_file or scm_file has to be provided! Exiting ......"
  end

  # drop constraints that mention species absent from the tree
  name2node, node2name = tree.getNameNodeRela
  rtc_info.delete_if{|names, rtc| not names.all?{|name| name2node.include?(name) } }
  scm_info.delete_if{|names, rtc| not names.flatten.all?{|name| name2node.include?(name) } }

  # subtracted from each node's bootstrap value to obtain the position stored in num
  # NOTE(review): presumably this maps mcmctree node labels onto mcmc.txt columns -- confirm
  minus = tree.allTips.size - 1

  ##############################################
  rtc_info.each_pair do |names, rtc|
    begin
      rtc.num = get_lca_bootstrap(names, name2node, tree) - minus
      rtc.hosts.each{|obj| obj.num = get_lca_bootstrap(obj.name, name2node, tree) - minus }
    rescue
      raise "species #{names} or #{rtc.hosts} not found"
    end
  end

  scm_info.each_pair do |names, rtc|
    rtc.num = names.map{|clade_names| get_lca_bootstrap(clade_names, name2node, tree) - minus }
    rtc.hosts.each do |obj|
      obj.num = obj.name.map{|clade_names| get_lca_bootstrap(clade_names, name2node, tree) - minus }
    end
  end

  ##############################################
  col_data = Array.new
  # Every line is read as a CSV row; its first cell is split on tabs.
  CSV.foreach(mcmctxt_file) do |row|
    data = row[0].split("\t")
    if is_rtc
      # a sample is kept only when every constraint holds;
      # when both tables are empty nothing is kept
      if not rtc_info.empty?
        col_data << row if rtc_info.all?{|rtc_name, rtc| rtc.is_co_evolve?(data:data, is_strict:is_strict) }
      elsif not scm_info.empty?
        col_data << row if scm_info.all?{|rtc_name, rtc| rtc.is_co_evolve2?(data:data, is_strict:is_strict) }
      end
    else
      col_data << row
    end
  end

  STDERR.puts "# of samples after filtering\t" + col_data.size.to_s.colorize(:red)

  col_data.each_with_index do |row_arr, index| # row_arr: single-element array
    # renumber the first column of every retained row except the first one
    if is_renum and index > 0
      posteriors = row_arr[0].split("\t")
      posteriors[0] = index
      row_arr = [posteriors.join("\t")]
    end
    puts row_arr
  end
end
463
+
464
+
TreeSAK/fa2phy.py ADDED
@@ -0,0 +1,42 @@
1
+ import argparse
2
+ from Bio import AlignIO
3
+
4
+
5
+ fa2phy_usage = '''
6
+ ======= fa2phy example commands =======
7
+
8
+ TreeSAK fa2phy -i msa.fa -o msa.phy
9
+
10
+ =======================================
11
+ '''
12
+
13
+
14
def fa2phy(args):
    """Convert a FASTA alignment into a PHYLIP-style text file.

    Args:
        args: mapping with the keys ``'i'`` (input FASTA alignment) and
            ``'o'`` (output file).

    The output starts with a ``<number of sequences> <alignment length>``
    line, followed by one line per sequence: the id padded with spaces to the
    longest id plus two characters, then the sequence.
    """
    fasta_in = args['i']
    phy_out = args['o']

    msa = AlignIO.read(fasta_in, 'fasta')

    # width of the id column: longest id plus two separating spaces
    id_col_width = max((len(record.id) for record in msa), default=0) + 2

    with open(phy_out, 'w') as phy_handle:
        phy_handle.write('%s %s\n' % (len(msa), msa.get_alignment_length()))
        for record in msa:
            padded_id = record.id.ljust(id_col_width)
            phy_handle.write('%s%s\n' % (padded_id, str(record.seq)))
33
+
34
+
35
if __name__ == '__main__':

    # command-line interface: (flag, help text) for each required option
    required_options = [
        ('-i', 'input MSA in fasta format'),
        ('-o', 'output MSA in phylip format'),
    ]
    parser = argparse.ArgumentParser()
    for flag, help_text in required_options:
        parser.add_argument(flag, required=True, help=help_text)
    args = vars(parser.parse_args())
    fa2phy(args)
@@ -0,0 +1,118 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+
6
+
7
+ filter_rename_ar53_usage = '''
8
+ ===================================== filter_rename_ar53 example commands =====================================
9
+
10
+ TreeSAK filter_rename_ar53 -i seq_dir -x fa -g interested_gnms.txt -m interested_marker.txt -o seq_dir_renamed
11
+
12
+ ===============================================================================================================
13
+ '''
14
+
15
def sep_path_basename_ext(file_in):
    """Split a file path into its components.

    Returns:
        A tuple ``(file name, folder, base name, extension)``. The folder is
        ``'.'`` when the path has no directory part, and the extension is
        returned without its leading dot.
    """
    folder, file_name = os.path.split(file_in)
    stem, dotted_ext = os.path.splitext(file_name)
    return file_name, folder or '.', stem, dotted_ext[1:]
24
+
25
+
26
def get_shared_uniq_elements(list_1, list_2):
    """Compare two collections of hashable elements.

    Returns:
        A tuple ``(shared, only_in_1, only_in_2)``: the set of elements
        present in both inputs, and two lists holding the elements found in
        only one input, in that input's iteration order (duplicates kept).
    """
    in_both = set(list_1).intersection(list_2)
    only_in_1 = [item for item in list_1 if item not in in_both]
    only_in_2 = [item for item in list_2 if item not in in_both]
    return in_both, only_in_1, only_in_2
37
+
38
+
39
def filter_rename_ar53(args):
    """Copy selected marker sequence files, keeping selected genomes only.

    Args:
        args: mapping with the keys
            ``'i'``: folder holding one FASTA file per marker,
            ``'x'``: extension of these files (without dot),
            ``'g'``: text file with one genome id per line, or None to keep
                every sequence,
            ``'m'``: text file with one marker id per line, or None to
                process every marker file,
            ``'o'``: output folder,
            ``'f'``: True to replace an existing output folder.

    For each processed marker a file of the same name is written to the
    output folder. A sequence is kept when its id is among the genomes of
    interest (or when no genome list is given); its header becomes
    ``>`` + id + ``_XXX``.

    The program exits (after printing a message) when the output folder
    exists and ``'f'`` is not set, when a given list file is missing or
    contains no id, or when no marker file is found.
    """
    # Imported here so the function carries its own dependencies.
    import shutil
    import sys

    marker_seq_dir = args['i']
    marker_seq_ext = args['x']
    interested_gnm_txt = args['g']
    interested_marker_txt = args['m']
    op_dir = args['o']
    force_overwrite = args['f']

    # create output folder
    if os.path.isdir(op_dir):
        if force_overwrite is True:
            # shutil.rmtree instead of 'rm -r' through the shell: works for
            # paths with spaces and does not depend on a Unix shell
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            sys.exit()
    os.mkdir(op_dir)

    # Both lists are optional. Passing None used to crash in
    # os.path.isfile(None); it now means "no filtering".
    interested_marker_set = set()
    if interested_marker_txt is not None:
        if os.path.isfile(interested_marker_txt):
            with open(interested_marker_txt) as marker_handle:
                interested_marker_set = {line.strip() for line in marker_handle if line.strip()}
        if not interested_marker_set:
            # report the marker file (the genome file name was printed here before)
            print('No marker provided in %s, program exited!' % interested_marker_txt)
            sys.exit()

    interested_gnm_set = set()
    if interested_gnm_txt is not None:
        if os.path.isfile(interested_gnm_txt):
            with open(interested_gnm_txt) as gnm_handle:
                interested_gnm_set = {line.strip() for line in gnm_handle if line.strip()}
        if not interested_gnm_set:
            print('No genome provided in %s, program exited!' % interested_gnm_txt)
            sys.exit()

    marker_seq_re = '%s/*.%s' % (marker_seq_dir, marker_seq_ext)
    marker_seq_list = [os.path.basename(i) for i in glob.glob(marker_seq_re)]
    marker_seq_id_list = ['.'.join(i.split('.')[:-1]) for i in marker_seq_list]
    if not marker_seq_list:
        print('No file found in %s, program exited!' % marker_seq_dir)
        sys.exit()

    if not interested_marker_set:
        marker_to_process = marker_seq_id_list
    else:
        shared_set, marker_seq_uniq, interested_marker_uniq = get_shared_uniq_elements(marker_seq_id_list, interested_marker_set)
        if interested_marker_uniq:
            print('Sequences for the following interested markers were not found:')
            print(','.join(interested_marker_uniq))
        marker_to_process = shared_set

    for marker_id in marker_to_process:
        pwd_file_in = '%s/%s.%s' % (marker_seq_dir, marker_id, marker_seq_ext)
        pwd_file_out = '%s/%s.%s' % (op_dir, marker_id, marker_seq_ext)
        # the with-block closes the output file even if parsing fails
        with open(pwd_file_out, 'w') as pwd_op_file_handle:
            for each_seq in SeqIO.parse(pwd_file_in, 'fasta'):
                gnm_id = each_seq.id
                if (interested_gnm_txt is None) or (gnm_id in interested_gnm_set):
                    pwd_op_file_handle.write('>%s_XXX\n' % gnm_id)
                    pwd_op_file_handle.write('%s\n' % str(each_seq.seq))

    print('Done!')
106
+
107
+
108
if __name__ == '__main__':

    # command-line interface: (flag, keyword arguments for add_argument),
    # registered in this order
    option_specs = [
        ('-i', dict(required=True, help='sequence folder')),
        ('-x', dict(required=True, help='file extension')),
        ('-g', dict(required=False, default=None, help='interested genome, no ext, one id per line')),
        ('-m', dict(required=False, default=None, help='interested marker, no ext, one id per line')),
        ('-o', dict(required=True, help='output folder')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
    ]
    filter_rename_ar53_parser = argparse.ArgumentParser()
    for flag, spec in option_specs:
        filter_rename_ar53_parser.add_argument(flag, **spec)
    args = vars(filter_rename_ar53_parser.parse_args())
    filter_rename_ar53(args)
@@ -0,0 +1,70 @@
1
+ import os
2
+ import argparse
3
+ from ete3 import Tree
4
+
5
+
6
+ format_leaf_name_usage = '''
7
+ ========= FLN (Format Leaf Name) example commands =========
8
+
9
+ BioSAK FLN -i input.tree -o output.tree -s2u -nsqm -ndqm
10
+ BioSAK FLN -i input.tree -o output.tree -ns -nsqm
11
+
12
+ ===========================================================
13
+ '''
14
+
15
+
16
def format_leaf_name(args):
    """Clean up the leaf names of a tree and write the result to a new file.

    Args:
        args: mapping with the keys
            ``'i'``: input tree file,
            ``'fmt'``: tree format passed to ``Tree(...)`` and ``write(...)``,
            ``'o'``: output tree file,
            ``'ns'``: True to remove spaces,
            ``'s2u'``: True to turn spaces into underscores,
            ``'nsqm'``: True to remove single quotation marks,
            ``'ndqm'``: True to remove double quotation marks.

    The program exits (after printing a message) when the input file does not
    exist or when both ``'ns'`` and ``'s2u'`` are requested.
    """
    tree_file_in = args['i']
    tree_format = args['fmt']
    tree_file_out = args['o']
    no_space = args['ns']
    space_to_underscore = args['s2u']
    no_single_quotation_mark = args['nsqm']
    no_double_quotation_mark = args['ndqm']

    if not os.path.isfile(tree_file_in):
        print('Tree file not found, program exited!')
        exit()

    if (no_space is True) and (space_to_underscore is True):
        print('Two actions (-ns and -s2u) specified to spaces in tree leaves, program exited!')
        exit()

    t = Tree(tree_file_in, format=tree_format)

    # (switch, text to replace, replacement), applied in this order
    replacements = [
        (space_to_underscore, ' ', '_'),
        (no_space, ' ', ''),
        (no_single_quotation_mark, "'", ''),
        (no_double_quotation_mark, '"', ''),
    ]

    # rename every leaf in place
    for leaf in t:
        cleaned_name = leaf.name
        for enabled, old_text, new_text in replacements:
            if enabled is True:
                cleaned_name = cleaned_name.replace(old_text, new_text)
        leaf.name = cleaned_name
    t.write(format=tree_format, outfile=tree_file_out)

    print('Done!')
57
+
58
+
59
if __name__ == '__main__':

    # command-line interface for running this module as a script
    format_leaf_name_parser = argparse.ArgumentParser()
    format_leaf_name_parser.add_argument('-i', required=True, help='input tree')
    # type=int: without it a value given on the command line arrives as a
    # string (e.g. '1'), unlike the integer default, and is then passed
    # unchanged to Tree(..., format=...) and write(format=...)
    format_leaf_name_parser.add_argument('-fmt', required=False, default=1, type=int, help='tree format, default: 1')
    format_leaf_name_parser.add_argument('-o', required=True, help='output tree')
    format_leaf_name_parser.add_argument('-s2u', required=False, action="store_true", help='change space in tree leaves to underscore')
    format_leaf_name_parser.add_argument('-ns', required=False, action="store_true", help='remove space from leaf names')
    format_leaf_name_parser.add_argument('-nsqm', required=False, action="store_true", help='remove single quotation marks from leaf names')
    format_leaf_name_parser.add_argument('-ndqm', required=False, action="store_true", help='remove double quotation marks from leaf names')
    args = vars(format_leaf_name_parser.parse_args())
    format_leaf_name(args)