treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/SplitScore2.py ADDED
@@ -0,0 +1,597 @@
1
from __future__ import print_function

import argparse
import glob
import multiprocessing as mp
import os
import shutil
import subprocess
from collections import defaultdict
from operator import itemgetter

import numpy
from ete3 import Tree
11
+
12
+
13
# usage/help text shown by the TreeSAK wrapper for the SplitScore2 subcommand
SplitScore2_usage = '''
======================== SplitScore2 example commands ========================

TreeSAK SplitScore2 -i step1_op_dir -g gnm_cluster.tsv -k gnm_taxon.txt -f -t 10 -o step_2_op_dir -c 25,50,75
TreeSAK SplitScore2 -i step1_op_dir -g gnm_cluster.tsv -k gnm_taxon.txt -f -t 10 -o step_2_op_dir -c 20,40,60,80

# format of gnm_cluster.tsv (tab separated)
GCA_013330055.1 c01_UBA8516
GCA_023251795.1 c01_UBA8516
GCA_023251295.1 c01_UBA8516
GCA_005877305.1 c02_TA-20
GCA_013287585.1 c02_TA-20

# gnm_taxon.txt: GTDB format

# install R packages
install.packages("optparse")
install.packages("plyr")
install.packages("dbplyr")
install.packages("dplyr")
install.packages("tidyr")
install.packages("ggplot2")
install.packages("data.table")
install.packages("RColorBrewer")
install.packages("gplots")
install.packages("ape")

=============================================================================
'''
42
+
43
+
44
def sep_path_basename_ext(file_in):
    """Split a file path into (directory, base name, extension).

    The directory is '.' when the path has no directory component.
    """
    dir_part, name_part = os.path.split(file_in)
    base_part, ext_part = os.path.splitext(name_part)
    return (dir_part or '.'), base_part, ext_part
50
+
51
+
52
def check_executables(program_list):
    """Exit the program when any required executable is missing from PATH.

    :param program_list: names of executables that must be callable
    """
    # shutil.which avoids spawning a `which` subprocess per program and the
    # unclosed os.devnull file handle the old subprocess-based check leaked;
    # it is also portable to platforms without a `which` binary.
    not_detected_programs = [p for p in program_list if shutil.which(p) is None]

    if not_detected_programs:
        print('%s not detected, program exited!' % ','.join(not_detected_programs))
        exit()
63
+
64
+
65
def gtdb_gnm_metadata_parser(gtdb_genome_metadata):
    """Parse a GTDB genome metadata table.

    Returns four dicts keyed by accession (first three characters, e.g.
    'RS_'/'GB_', stripped): completeness, contamination, GTDB taxonomy
    and NCBI biosample.
    """
    completeness_dict = {}
    contamination_dict = {}
    taxon_dict = {}
    biosample_dict = {}
    header_index = {}
    with open(gtdb_genome_metadata) as metadata_handle:
        for metadata_line in metadata_handle:
            fields = metadata_line.strip().split('\t')
            if metadata_line.startswith('accession'):
                header_index = {field: idx for idx, field in enumerate(fields)}
            else:
                accession = fields[0][3:]
                # NOTE(review): completeness/contamination come from fixed
                # columns 2 and 3 while taxonomy/biosample are looked up by
                # header name — assumes that fixed layout; verify the input.
                completeness_dict[accession] = float(fields[2])
                contamination_dict[accession] = float(fields[3])
                taxon_dict[accession] = fields[header_index['gtdb_taxonomy']]
                biosample_dict[accession] = fields[header_index['ncbi_biosample']]

    return completeness_dict, contamination_dict, taxon_dict, biosample_dict
88
+
89
+
90
def get_rename_dict(tree_str_in, mag_cluster_dict, gtdb_gnm_tax_dict):
    """Build a leaf renaming map for one tree.

    Each leaf name becomes 'cluster|taxonomy|strain__genome', where the
    genome id is the leaf name without its trailing '_<n>' suffix.

    :return: dict {old_leaf_name: new_leaf_name}
    """
    rename_map = {}
    for leaf in Tree(tree_str_in, format=1):

        gnm_id = '_'.join(leaf.name.split('_')[:-1])
        cluster_id = mag_cluster_dict.get(gnm_id, 'cluster_0')

        # look up the GTDB taxonomy of this genome
        taxon_str = gtdb_gnm_tax_dict.get(gnm_id, 'NA')

        # fall back to the GCF accession when the GCA one is absent
        if taxon_str == 'NA':
            gnm_id_gcf = gnm_id.replace('GCA', 'GCF')
            taxon_str = gtdb_gnm_tax_dict.get(gnm_id_gcf, 'NA')

        # spaces -> underscores, rank separators ';' -> '|'
        taxon_str_fmt = taxon_str.replace(' ', '_').replace(';', '|')

        rename_map[leaf.name] = '%s|%s|strain__%s' % (cluster_id, taxon_str_fmt, gnm_id)

    return rename_map
118
+
119
+
120
def rename_tree(tree_str_in, rename_dict):
    """Return the newick string of `tree_str_in` with every leaf name
    substituted through `rename_dict` (unknown leaves stay unchanged)."""

    tree_obj = Tree(tree_str_in, format=1)
    for leaf in tree_obj:
        leaf.name = rename_dict.get(leaf.name, leaf.name)

    return tree_obj.write()
129
+
130
+
131
def parse_taxonomy(taxon_name):
    """Split a '|'-separated leaf name into a rank -> value map.

    Expects 8 fields (cluster, domain, phylum, class, order, family,
    genus, species) with an optional 9th NCBI id; exits the program on
    any other field count.
    """
    name_elements = taxon_name.split('|')
    if not (8 <= len(name_elements) <= 9):
        print("Nonstandard!")
        quit()

    rank_keys = ('cluster', 'domain', 'phylum', 'class', 'order',
                 'family', 'genus', 'species', 'ncbi_id')
    # zip truncates at 8 pairs when no ncbi_id field is present
    return dict(zip(rank_keys, name_elements))
150
+
151
+
152
def summarize_taxonomy(name_list, tax_level, name_to_tax_dict):
    """Summarize the taxonomic makeup of a clade at one rank.

    :return: dict {taxon_label: fraction of the clade}, normalized by
             the clade size so values sum to ~1.0
    """
    clade_size = float(len(name_list))
    breakdown = {}
    for member in name_list:
        level_label = name_to_tax_dict[member][tax_level]
        breakdown[level_label] = breakdown.get(level_label, 0.0) + 1.0 / clade_size
    return breakdown
162
+
163
+
164
def count_sister_taxa(target_label, tree_in_ml, tree_in_bs, output_file):
    """For every monophyletic clade of each `target_label` group, record the
    taxonomic makeup of its (smaller) sister clade across all bootstrap
    trees, and write a per-group frequency summary to `output_file`.

    :param target_label: taxonomic rank to compare at (e.g. 'cluster')
    :param tree_in_ml:   ML tree file (newick)
    :param tree_in_bs:   file with one bootstrap tree per line
    :param output_file:  tab-separated summary written here; columns are
                         group, sister taxon, summed frequency, mean clades
                         per tree, doubly-normalized frequency, ML clade sizes
    """

    # edit target_label to make the comparisons at a desired taxonomic level
    # compute the most frequent sister group of each monophyletic group on the
    # tree, to identify trends in gene transfers, "unstable" taxa, etc.

    # read the ML tree, set up the taxonomy stuff, and calculate the number of
    # clades per label, and the sizes of those clades (to report at the end)
    labels = {}
    name_to_tax_info = defaultdict(dict)
    all_tree_leaf_names = []
    ml_tree = Tree(tree_in_ml)  # note that ete3 treats this input tree as rooted
    for leaf in ml_tree:
        taxonomy = parse_taxonomy(leaf.name)
        name_to_tax_info[leaf.name] = taxonomy
        all_tree_leaf_names.append(leaf.name)
        leaf.add_feature("tax", taxonomy[target_label])
        labels[taxonomy[target_label]] = 1
    groups = labels.keys()

    # ML_groups[label]: list with the leaf count of each monophyletic clade of
    # that label in the ML tree; len(list) is the number of such clades
    ML_groups = defaultdict(list)
    for label in groups:
        node_num = 0
        for monophyletic_clade in ml_tree.get_monophyletic(values=[label], target_attr="tax"):
            # count the leaves in this monophyletic clade
            size_clade = 0
            for leaf in monophyletic_clade:
                size_clade += 1
            ML_groups[label].append(size_clade)
            node_num += 1

    summary = defaultdict(dict)           # summary[label][sister_taxon] -> accumulated frequency
    clades_per_group = defaultdict(list)  # clades_per_group[label][treeNum] -> clade count in that tree
    treeNum = -1
    for line in open(tree_in_bs):  # read in each bootstrap tree

        treeNum += 1
        tree = Tree(line.rstrip())
        for leaf in tree:
            tax = name_to_tax_info[leaf.name]  # this should set up taxonomy correctly...
            leaf.add_feature("tax", tax[target_label])  # tag the leaf with its label at the chosen rank
        for label in groups:
            clades_per_group[label].append(0.0)  # setup the clade counting for this particular tree
        # NOTE(review): presumably unrooted so the arbitrary root placement of
        # the bootstrap newick does not split clades — confirm
        tree.unroot()

        # iterate over groups that are monophyletic for the taxon label of choice.
        # Choose the smallest sister branch for the comparison.
        # (Assume root is within the larger sister clade.)
        for label in groups:
            monophyletic_clade_index = 1
            for monophyletic_clade in tree.get_monophyletic(values=[label], target_attr="tax"):
                clades_per_group[label][treeNum] += 1.0
                sister_clades = monophyletic_clade.get_sisters()
                monophyletic_clade_index += 1
                sister_index = 1
                # NOTE(review): this loop only builds throwaway per-sister leaf
                # lists; current_sister_leaf_list is never used afterwards
                for each_sister in sister_clades:
                    current_sister_leaf_list = []
                    for leaf in each_sister:
                        current_sister_leaf_list.append(leaf.name)
                    sister_index += 1

                if monophyletic_clade.is_root():  # monophyletic clade is root
                    continue

                # bifurcation
                elif len(sister_clades) == 1:  # not at the trifurcation. Do something a bit hacky to find the bigger sister clade

                    taxa_in_sister = []
                    for leaf in sister_clades[0]:
                        taxa_in_sister.append(leaf.name)

                    size_sister = len(taxa_in_sister)

                    taxa_in_group = []
                    for leaf in monophyletic_clade:
                        taxa_in_group.append(leaf.name)

                    # everything that is neither in the clade nor in its sister
                    taxa_in_other_groups = []
                    for leaf_name in all_tree_leaf_names:
                        if leaf_name in taxa_in_sister:
                            continue
                        elif leaf_name in taxa_in_group:
                            continue
                        else:
                            taxa_in_other_groups.append(leaf_name)
                    size_other_groups = len(taxa_in_other_groups)

                    sister_tax = {}  # taxa in the smaller side (either the sister group or the rest of the tree)
                    if size_other_groups > size_sister:
                        sister_tax = summarize_taxonomy(taxa_in_sister, target_label, name_to_tax_info)
                    else:
                        sister_tax = summarize_taxonomy(taxa_in_other_groups, target_label, name_to_tax_info)

                    # store the tax info of the sister group
                    for element in sister_tax:
                        if element in summary[label]:
                            summary[label][element] += sister_tax[element]
                        else:
                            summary[label][element] = sister_tax[element]

                else:  # trifurcation in tree. Just treat the two sisters in the same way.

                    taxa_in_sisters_1 = []
                    for leaf in sister_clades[0]:
                        taxa_in_sisters_1.append(leaf.name)

                    taxa_in_sisters_2 = []
                    for leaf in sister_clades[1]:
                        taxa_in_sisters_2.append(leaf.name)

                    # get the size of two sisters
                    size_s1 = len(taxa_in_sisters_1)
                    size_s2 = len(taxa_in_sisters_2)

                    # get taxa in the smaller sister group
                    sister_tax = {}
                    if size_s1 > size_s2:
                        sister_tax = summarize_taxonomy(taxa_in_sisters_2, target_label, name_to_tax_info)
                    else:
                        sister_tax = summarize_taxonomy(taxa_in_sisters_1, target_label, name_to_tax_info)

                    for element in sister_tax:
                        if element in summary[label]:
                            summary[label][element] += sister_tax[element]
                        else:
                            summary[label][element] = sister_tax[element]

    # now print out the summary: for each label, the sorted list of sister taxa and their frequencies
    outh = open(output_file, "w")
    for label in summary:
        num_groups = len(ML_groups[label])
        size_str = ''
        if num_groups == 1:
            size_str = ML_groups[label][0]
        else:
            size_str = ','.join(str(x) for x in (sorted(ML_groups[label], reverse=True)))

        avg_num_clades = float("{0:.4f}".format(numpy.mean(clades_per_group[label])))
        total_num_clades = numpy.sum(clades_per_group[label])
        sorted_sisters = sorted(summary[label].items(), key=itemgetter(1), reverse=True)

        for tup in sorted_sisters:
            # normalize the frequencies by the total number of clades, to account
            # for different bootstrap numbers/MCMC sample numbers
            double_normalize = float(tup[1]) / float(total_num_clades)
            double_normalize = float("{0:.4f}".format(double_normalize))
            str_to_write = '%s\t%s\t%s\t%s\t%s\t%s' % (label, tup[0], float("{0:.4f}".format(tup[1])), avg_num_clades, double_normalize, size_str)
            outh.write(str_to_write + '\n')
    outh.close()
309
+
310
+
311
def count_sister_taxa_worker(arg_list):
    """Worker for one gene: rename its ML and bootstrap trees, dump the
    renamed-leaf-to-cluster table and run count_sister_taxa on the
    renamed trees.

    :param arg_list: positional argument list (see run_count_sister_taxa)
    """
    (mag_cluster_dict, gnm_tax_dict, tree_ml, ufboot_file, target_label,
     tree_ml_renamed, ufboot_file_renamed, count_sister_taxa_op_txt,
     gene_id, renamed_gnm_to_cluster_dir) = arg_list

    # rename the leaves of the ML tree
    ml_rename_map = get_rename_dict(tree_ml, mag_cluster_dict, gnm_tax_dict)
    with open(tree_ml_renamed, 'w') as ml_out_handle:
        ml_out_handle.write(rename_tree(tree_ml, ml_rename_map) + '\n')

    # write out the renamed-leaf -> cluster mapping for this gene
    gnm_to_cluster_txt = '%s/%s.txt' % (renamed_gnm_to_cluster_dir, gene_id)
    with open(gnm_to_cluster_txt, 'w') as cluster_out_handle:
        for renamed_leaf in ml_rename_map.values():
            cluster_out_handle.write('%s\t%s\n' % (renamed_leaf, renamed_leaf.split('|')[0]))

    # rename the leaves of every bootstrap tree
    with open(ufboot_file_renamed, 'w') as ufboot_out_handle:
        for each_tree in open(ufboot_file):
            tree_str = each_tree.strip()
            bs_rename_map = get_rename_dict(tree_str, mag_cluster_dict, gnm_tax_dict)
            ufboot_out_handle.write(rename_tree(tree_str, bs_rename_map) + '\n')

    # count sister taxa on the renamed trees
    count_sister_taxa(target_label, tree_ml_renamed, ufboot_file_renamed, count_sister_taxa_op_txt)
350
+
351
+
352
def run_count_sister_taxa(gtdb_classification_txt, hog_list, contree_dir, ufboot_dir, gnm_cluster_txt, target_label, num_threads, output_dir, force_overwrite):
    """Rename all gene trees, run count_sister_taxa for each gene in
    parallel, then combine the per-gene genome-to-cluster tables and
    produce an iTOL annotation file (via the external BioSAK tool)."""

    # output paths
    renamed_gnm_to_cluster_dir = '%s/renamed_genome_to_cluster' % output_dir
    renamed_gnm_to_cluster_tmp_txt = '%s/renamed_genome_to_cluster_tmp.txt' % output_dir
    renamed_gnm_to_cluster_txt = '%s/renamed_genome_to_cluster.txt' % output_dir
    renamed_gnm_to_cluster_iTOL_txt = '%s/renamed_genome_to_cluster_iTOL.txt' % output_dir
    renamed_contree_dir = '%s/renamed_contree' % output_dir
    renamed_ufboot_dir = '%s/renamed_ufboot' % output_dir
    count_sister_taxa_op_dir = '%s/count_sister_taxa_op' % output_dir

    # create a clean output directory (refuse to overwrite unless forced)
    if os.path.isdir(output_dir) is True:
        if force_overwrite is True:
            os.system('rm -r %s' % output_dir)
        else:
            print('%s exist, program exited!' % output_dir)
            exit()
    for dir_to_create in [output_dir, renamed_contree_dir, renamed_ufboot_dir, count_sister_taxa_op_dir, renamed_gnm_to_cluster_dir]:
        os.mkdir(dir_to_create)

    ####################################################################################################################

    # genome id -> GTDB taxonomy string
    gnm_tax_dict = {}
    for tax_line in open(gtdb_classification_txt):
        if not tax_line.startswith('user_genome'):
            tax_fields = tax_line.strip().split('\t')
            gnm_tax_dict[tax_fields[0]] = tax_fields[1]

    ####################################################################################################################

    # genome id -> cluster id
    mag_cluster_dict = {}
    for cluster_line in open(gnm_cluster_txt):
        cluster_fields = cluster_line.strip().split('\t')
        mag_cluster_dict[cluster_fields[0]] = cluster_fields[1]

    # assemble one argument list per gene for the worker processes
    argument_lol = []
    for og_id in hog_list:

        tree_ml = '%s/%s.contree' % (contree_dir, og_id)
        ufboot_file = '%s/%s.ufboot' % (ufboot_dir, og_id)
        tree_ml_renamed = '%s/%s_renamed.contree' % (renamed_contree_dir, og_id)
        ufboot_file_renamed = '%s/%s_renamed.ufboot' % (renamed_ufboot_dir, og_id)
        count_sister_taxa_op_txt = '%s/%s_count_sister_taxa.txt' % (count_sister_taxa_op_dir, og_id)

        if os.path.isfile(tree_ml) is False:
            print('%s not found!' % tree_ml)
            exit()

        argument_lol.append([mag_cluster_dict, gnm_tax_dict, tree_ml, ufboot_file, target_label,
                             tree_ml_renamed, ufboot_file_renamed, count_sister_taxa_op_txt,
                             og_id, renamed_gnm_to_cluster_dir])

    # run with multiprocessing
    pool = mp.Pool(processes=num_threads)
    pool.map(count_sister_taxa_worker, argument_lol)
    pool.close()
    pool.join()

    # combine and deduplicate the per-gene renamed_gnm_to_cluster files
    os.system('cat %s/*.txt > %s' % (renamed_gnm_to_cluster_dir, renamed_gnm_to_cluster_tmp_txt))
    os.system('cat %s | sort | uniq > %s' % (renamed_gnm_to_cluster_tmp_txt, renamed_gnm_to_cluster_txt))
    BioSAK_iTOL_cmd = 'BioSAK iTOL -ColorRange -lg %s -lt Cluster -o %s' % (renamed_gnm_to_cluster_txt, renamed_gnm_to_cluster_iTOL_txt)
    os.system(BioSAK_iTOL_cmd)
419
+
420
+
421
def get_taxa_count_stats(step_1_op_dir, hog_list_sorted, get_taxa_count_stats_wd, force_overwrite, TaxaCountStats_Rscript):
    """Gather the per-gene outputs from the count-sister-taxa step into the
    combined input files expected by TaxaCountStats.R, then run that R
    script to rank the markers.

    :param step_1_op_dir:           dir holding count_sister_taxa_op/ and renamed_contree/
    :param hog_list_sorted:         gene (marker) ids to include
    :param get_taxa_count_stats_wd: working/output dir (created here)
    :param force_overwrite:         remove an existing working dir if True
    :param TaxaCountStats_Rscript:  path to TaxaCountStats.R
    """

    # define input files to R script
    combined_contree_file = '%s/combined.contree' % get_taxa_count_stats_wd
    genes_to_remove_txt = '%s/Genes_to_remove.txt' % get_taxa_count_stats_wd
    list_of_trees_txt = '%s/List_of_trees.txt' % get_taxa_count_stats_wd
    mapping_txt = '%s/mapping.txt' % get_taxa_count_stats_wd
    marker_list_txt = '%s/MarkerList.txt' % get_taxa_count_stats_wd
    combined_count_sister_taxa_op = '%s/combined_count_sister_taxa_op.txt' % get_taxa_count_stats_wd
    TaxaCountStats_op = '%s/TaxaCountStats_output.txt' % get_taxa_count_stats_wd

    # create the working directory (refuse to overwrite unless forced)
    if os.path.isdir(get_taxa_count_stats_wd) is True:
        if force_overwrite is True:
            os.system('rm -r %s' % get_taxa_count_stats_wd)
        else:
            print('%s exist, program exited!' % get_taxa_count_stats_wd)
            exit()
    os.mkdir(get_taxa_count_stats_wd)

    cluster_to_domain_dict = {}  # cluster id -> domain, filled from renamed tree leaves
    marker_list_txt_handle = open(marker_list_txt, 'w')
    marker_list_txt_handle.write('MarkerID\n')
    list_of_trees_txt_handle = open(list_of_trees_txt, 'w')
    combined_contree_file_handle = open(combined_contree_file, 'w')
    combined_count_sister_taxa_op_handle = open(combined_count_sister_taxa_op, 'w')
    combined_count_sister_taxa_op_handle.write('MarkerID\tGroup_of_interest\tSister_taxa\tNormalized_sum_of_occurances\tsplits\tNormalized2_sum_of_occurances\tClusters\n')
    for each_hog in hog_list_sorted:

        # write out to combined_count_sister_taxa_op, prefixing every line with the gene id
        pwd_count_sister_taxa_op_txt = '%s/count_sister_taxa_op/%s_count_sister_taxa.txt' % (step_1_op_dir, each_hog)
        with open(pwd_count_sister_taxa_op_txt) as count_sister_taxa_op_txt_opened:
            for each_line in count_sister_taxa_op_txt_opened:
                combined_count_sister_taxa_op_handle.write('%s\t%s' % (each_hog, each_line))

        # write out to combined_contree_file
        # NOTE(review): only the first line of each contree file is copied —
        # assumes one newick tree per file, terminated by a newline; confirm
        pwd_renamed_contree_file = '%s/renamed_contree/%s_renamed.contree' % (step_1_op_dir, each_hog)
        with open(pwd_renamed_contree_file, 'r') as pwd_renamed_contree_file_opened:
            combined_contree_file_handle.write(pwd_renamed_contree_file_opened.readline())

        # add to cluster_to_domain_dict (the contree file is parsed a second time here by ete3)
        t_in = Tree(pwd_renamed_contree_file, format=1)
        for leaf in t_in:
            # renamed leaves look like 'cluster|domain|...': field 0 is the
            # cluster id, field 1 the domain
            leaf_name_split = leaf.name.split('|')
            cluster_to_domain_dict[leaf_name_split[0]] = leaf_name_split[1]

        # write out to marker_list_txt and list_of_trees_txt
        marker_list_txt_handle.write(each_hog + '\n')
        list_of_trees_txt_handle.write(each_hog + '\n')

    marker_list_txt_handle.close()
    list_of_trees_txt_handle.close()
    combined_contree_file_handle.close()
    combined_count_sister_taxa_op_handle.close()

    # prepare mapping_txt (cluster -> domain)
    mapping_txt_handle = open(mapping_txt, 'w')
    mapping_txt_handle.write('Cluster\tDomain\n')
    for each_cluster in cluster_to_domain_dict:
        mapping_txt_handle.write('%s\t%s\n' % (each_cluster, cluster_to_domain_dict[each_cluster]))
    mapping_txt_handle.close()

    # prepare genes_to_remove_txt (header only: no genes are excluded)
    genes_to_remove_txt_handle = open(genes_to_remove_txt, 'w')
    genes_to_remove_txt_handle.write('MarkerID\n')
    genes_to_remove_txt_handle.close()

    # run TaxaCountStats.R
    get_TaxaCountStats_cmd = 'Rscript %s -t %s -l %s -g %s -x %s -s %s -r %s -o %s > /dev/null' % (TaxaCountStats_Rscript, combined_contree_file, list_of_trees_txt, mapping_txt, marker_list_txt, combined_count_sister_taxa_op, genes_to_remove_txt, TaxaCountStats_op)
    print('Running: ' + get_TaxaCountStats_cmd)
    os.system(get_TaxaCountStats_cmd)
491
+
492
+
493
def group_marker(taxa_counts_tats_op_txt, marker_seq_dir, marker_rank_cutoff_str, op_dir):
    """Copy marker sequences into best<cutoff>/worst<cutoff> folders.

    Markers are ranked by the RankA_B column of the TaxaCountStats output
    (smaller = better); for each percentage cutoff the top/bottom slice of
    the ranking is copied from marker_seq_dir into op_dir.
    """
    cutoff_list = marker_rank_cutoff_str.split(',')

    # parse the stats table: marker id -> RankA_B score
    score_dict = dict()
    col_index_dict = dict()
    for stats_line in open(taxa_counts_tats_op_txt):
        fields = stats_line.replace('\n', '').split('\t')
        if stats_line.startswith('MarkerID\t'):
            col_index_dict = {field: idx for idx, field in enumerate(fields)}
        else:
            marker_id = fields[col_index_dict['MarkerID']]
            score_dict[marker_id] = int(fields[col_index_dict['RankA_B']])

    # markers sorted by score, ascending (best first) and descending (worst first)
    markers_best_first = sorted(score_dict, key=score_dict.get)
    markers_worst_first = markers_best_first[::-1]

    for cutoff in cutoff_list:

        marker_num_rounded = round(len(markers_worst_first) * float(cutoff) / 100)
        seq_dir_best = '%s/best%s' % (op_dir, cutoff)
        seq_dir_worst = '%s/worst%s' % (op_dir, cutoff)

        os.system('mkdir %s' % seq_dir_best)
        os.system('mkdir %s' % seq_dir_worst)

        # copy the best markers
        for best_marker in markers_best_first[:marker_num_rounded]:
            os.system('cp %s/%s.fa %s/' % (marker_seq_dir, best_marker, seq_dir_best))

        # copy the worst markers
        for worst_marker in markers_worst_first[:marker_num_rounded]:
            os.system('cp %s/%s.fa %s/' % (marker_seq_dir, worst_marker, seq_dir_worst))
529
+
530
+
531
def SplitScore2(args):
    """Step-2 entry point: count sister taxa across bootstrap trees,
    summarise them with the bundled TaxaCountStats.R script and export
    markers grouped by split score.

    :param args: dict of parsed command line options (see the parser below)
    """
    step_1_op_dir = args['i']
    gnm_group_txt = args['g']
    gtdb_classification_txt = args['k']
    force_overwrite = args['f']
    num_of_threads = args['t']
    step_2_op_dir = args['o']
    marker_rank_cutoff_str = args['c']
    target_label = 'cluster'
    check_executables(['Rscript'])

    # locate the R script shipped next to this module
    current_file_path = '/'.join(os.path.realpath(__file__).split('/')[:-1])
    TaxaCountStats_Rscript = '%s/TaxaCountStats.R' % current_file_path
    qualified_og_seq_dir = '%s/qualified_OGs' % step_1_op_dir
    contree_file_re = '%s/*.contree' % qualified_og_seq_dir
    ufboot_file_re = '%s/*.ufboot' % qualified_og_seq_dir
    count_sister_taxa_op_dir = '%s/count_sister_taxa_wd' % step_2_op_dir
    get_taxa_count_stats_op_dir = '%s/get_taxa_count_stats_wd' % step_2_op_dir
    TaxaCountStats_output_txt = '%s/get_taxa_count_stats_wd/TaxaCountStats_output.txt' % step_2_op_dir

    # keep only genes that have both a .contree and a .ufboot file
    contree_base_set = {sep_path_basename_ext(f)[1] for f in glob.glob(contree_file_re)}
    ufboot_base_set = {sep_path_basename_ext(f)[1] for f in glob.glob(ufboot_file_re)}
    contree_ufboot_shared_sorted = sorted(contree_base_set & ufboot_base_set)

    # create output folder
    if os.path.isdir(step_2_op_dir) is True:
        if force_overwrite is True:
            os.system('rm -r %s' % step_2_op_dir)
        else:
            print('%s exist, program exited!' % step_2_op_dir)
            exit()
    os.mkdir(step_2_op_dir)

    print('Counting sister taxa with %s cores' % num_of_threads)
    run_count_sister_taxa(gtdb_classification_txt, contree_ufboot_shared_sorted, qualified_og_seq_dir, qualified_og_seq_dir, gnm_group_txt, target_label, num_of_threads, count_sister_taxa_op_dir, force_overwrite)

    print('Summarising sister taxa')
    get_taxa_count_stats(count_sister_taxa_op_dir, contree_ufboot_shared_sorted, get_taxa_count_stats_op_dir, force_overwrite, TaxaCountStats_Rscript)

    print('Exporting markers by split score')
    group_marker(TaxaCountStats_output_txt, qualified_og_seq_dir, marker_rank_cutoff_str, step_2_op_dir)

    print('Done!')
584
+
585
+
586
if __name__ == '__main__':

    # command line interface (mirrors the TreeSAK SplitScore2 subcommand)
    SplitScore2_parser = argparse.ArgumentParser()
    SplitScore2_parser.add_argument('-i', required=True, help='output dir from SplitScore1')
    SplitScore2_parser.add_argument('-g', required=True, help='genome group')
    SplitScore2_parser.add_argument('-k', required=True, help='genome taxon, GTDB format')
    SplitScore2_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    SplitScore2_parser.add_argument('-t', required=False, type=int, default=1, help='num of threads, default: 1')
    SplitScore2_parser.add_argument('-c', required=False, default='25,50,75', help='marker ranking cutoffs, default: 25,50,75')
    SplitScore2_parser.add_argument('-o', required=True, help='output directory')
    # pass options to SplitScore2 as a plain dict
    args = vars(SplitScore2_parser.parse_args())
    SplitScore2(args)