treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/SplitScore2.py ADDED
@@ -0,0 +1,608 @@
1
from __future__ import print_function

import argparse
import glob
import multiprocessing as mp
import os
import shutil
import subprocess
from collections import defaultdict
from operator import itemgetter

import numpy
from ete3 import Tree
11
+
12
+
13
# Help/usage text shown by the TreeSAK command line interface for this module.
SplitScore2_usage = '''
======================== SplitScore2 example commands ========================

TreeSAK SplitScore2 -i step1_op_dir -g gnm_cluster.tsv -k gnm_taxon.txt -f -t 10 -o step_2_op_dir -c 25,50,75
TreeSAK SplitScore2 -i step1_op_dir -g gnm_cluster.tsv -k gnm_taxon.txt -f -t 10 -o step_2_op_dir -c 20,40,60,80

# format of gnm_cluster.tsv (tab separated)
GCA_013330055.1 c01_UBA8516
GCA_023251795.1 c01_UBA8516
GCA_023251295.1 c01_UBA8516
GCA_005877305.1 c02_TA-20
GCA_013287585.1 c02_TA-20

# gnm_taxon.txt: GTDB format

# install R packages
install.packages("optparse")
install.packages("plyr")
install.packages("dbplyr")
install.packages("dplyr")
install.packages("tidyr")
install.packages("ggplot2")
install.packages("data.table")
install.packages("RColorBrewer")
install.packages("gplots")
install.packages("ape")

# As described in the Undinarchaeota paper (Nina Dombrowski 2020, NC)

=============================================================================
'''
44
+
45
+
46
def sep_path_basename_ext(file_in):
    """Split a file path into (directory, base name, extension).

    The directory is '.' when the path carries no directory component.
    """
    dir_part, full_name = os.path.split(file_in)
    base_name, extension = os.path.splitext(full_name)
    return (dir_part or '.'), base_name, extension
52
+
53
+
54
def check_executables(program_list):
    """Verify every program in program_list is on PATH.

    Prints the missing programs and terminates (via exit()) if any are
    not found.

    Fix: the original passed stdout=open(os.devnull, 'wb') and never closed
    the handle, leaking one file descriptor per checked program; use
    subprocess.DEVNULL instead.
    """
    not_detected_programs = []
    for needed_program in program_list:
        # 'which' exits non-zero when the program is not on PATH
        if subprocess.call(['which', needed_program], stdout=subprocess.DEVNULL) != 0:
            not_detected_programs.append(needed_program)

    if not_detected_programs:
        print('%s not detected, program exited!' % ','.join(not_detected_programs))
        exit()
65
+
66
+
67
def gtdb_gnm_metadata_parser(gtdb_genome_metadata):
    """Parse a GTDB genome metadata table (tab separated).

    Returns four dicts keyed by accession with the first three characters
    (e.g. 'RS_'/'GB_') stripped: completeness, contamination, GTDB taxonomy
    and NCBI biosample.

    NOTE(review): completeness/contamination come from fixed columns 2 and 3,
    while taxonomy/biosample are located via the header row — assumes the
    metadata release keeps those two columns in place; verify for the GTDB
    version in use.
    """
    gnm_to_taxon = {}
    gnm_to_biosample = {}
    gnm_to_completeness = {}
    gnm_to_contamination = {}
    header_index = {}
    for line in open(gtdb_genome_metadata):
        fields = line.strip().split('\t')
        if line.startswith('accession'):
            header_index = {name: idx for idx, name in enumerate(fields)}
        else:
            accession = fields[0][3:]
            gnm_to_completeness[accession] = float(fields[2])
            gnm_to_contamination[accession] = float(fields[3])
            gnm_to_taxon[accession] = fields[header_index['gtdb_taxonomy']]
            gnm_to_biosample[accession] = fields[header_index['ncbi_biosample']]

    return gnm_to_completeness, gnm_to_contamination, gnm_to_taxon, gnm_to_biosample
90
+
91
+
92
def get_rename_dict(tree_str_in, mag_cluster_dict, gtdb_gnm_tax_dict, seq_named_by_gnm):
    """Build a leaf rename map {old_name: 'cluster|taxonomy|strain__name'}.

    Genomes missing from mag_cluster_dict fall back to 'cluster_0'; taxonomy
    defaults to 'NA', retrying the lookup with GCA replaced by GCF.
    """
    rename_map = {}
    for leaf in Tree(tree_str_in, format=1):

        # derive the genome id: either the whole leaf name, or the leaf name
        # with its trailing '_<suffix>' removed
        if seq_named_by_gnm is True:
            gnm_id = leaf.name
        else:
            gnm_id = '_'.join(leaf.name.split('_')[:-1])

        cluster_id = mag_cluster_dict.get(gnm_id, 'cluster_0')

        # GTDB taxonomy lookup, retrying under the RefSeq-style (GCF) accession
        taxon_str = gtdb_gnm_tax_dict.get(gnm_id, 'NA')
        if taxon_str == 'NA':
            refseq_style_id = gnm_id.replace('GCA', 'GCF')
            taxon_str = gtdb_gnm_tax_dict.get(refseq_style_id, 'NA')

        taxon_str_clean = taxon_str.replace(' ', '_').replace(';', '|')
        if seq_named_by_gnm is True:
            new_name = '%s|%s|strain__%s' % (cluster_id, taxon_str_clean, leaf.name)
        else:
            new_name = '%s|%s|strain__%s' % (cluster_id, taxon_str_clean, '_'.join(leaf.name.split('_')[:-1]))

        rename_map[leaf.name] = new_name

    return rename_map
124
+
125
+
126
def rename_tree(tree_str_in, rename_dict):
    """Return a newick string with each leaf renamed via rename_dict;
    leaves without an entry keep their original name."""
    tree_obj = Tree(tree_str_in, format=1)
    for leaf in tree_obj:
        leaf.name = rename_dict.get(leaf.name, leaf.name)
    return tree_obj.write()
135
+
136
+
137
def parse_taxonomy(taxon_name):
    """Decompose a pipe-delimited leaf name into its taxonomic ranks.

    Expects 8 fields (cluster plus the 7 GTDB ranks), optionally followed by
    a 9th NCBI id field; any other field count aborts the program.
    """
    fields = taxon_name.split('|')
    if (len(fields) < 8) or (len(fields) > 9):
        print("Nonstandard!")
        quit()

    rank_names = ('cluster', 'domain', 'phylum', 'class',
                  'order', 'family', 'genus', 'species')
    name_map = dict(zip(rank_names, fields))
    if len(fields) == 9:
        name_map['ncbi_id'] = fields[8]
    return name_map
156
+
157
+
158
def summarize_taxonomy(name_list, tax_level, name_to_tax_dict):
    """Summarize a clade's taxonomy at tax_level as {label: fraction of leaves}.

    Fractions are normalized by the clade size, so they sum to ~1.0.
    """
    clade_size = len(name_list)
    breakdown = {}
    for member in name_list:
        level_label = name_to_tax_dict[member][tax_level]
        breakdown[level_label] = breakdown.get(level_label, 0.0) + 1.0 / float(clade_size)
    return breakdown
168
+
169
+
170
def count_sister_taxa(target_label, tree_in_ml, tree_in_bs, output_file):
    """For every monophyletic group at rank target_label, summarize the
    taxonomic makeup of its (smaller) sister clade across all bootstrap trees.

    target_label : taxonomic rank to compare at (a key of parse_taxonomy output)
    tree_in_ml   : path to the ML tree (newick; leaves named cluster|taxonomy|strain__id)
    tree_in_bs   : path to a file holding one bootstrap tree per line
    output_file  : tab-separated summary written here, one row per
                   (group, sister taxon) pair: group, sister taxon, summed
                   frequency, mean clades per bootstrap tree, frequency
                   normalized by the total clade count, ML clade size(s)
    """

    # edit target_label to make the comparisons at a desired taxonomic level
    # compute the most frequent sister group of each (monophyletic?) group on the tree, to identify trends in gene transfers, "unstable" taxa, etc.

    # read the ML tree, set up the taxonomy stuff, and calculate the number of clades per label, and the sizes of those clades (to report at the end)
    labels = {}
    name_to_tax_info = defaultdict(dict)
    all_tree_leaf_names = []
    ml_tree = Tree(tree_in_ml)  # note that ete3 treats this input tree as rooted
    for leaf in ml_tree:
        taxonomy = parse_taxonomy(leaf.name)
        name_to_tax_info[leaf.name] = taxonomy
        all_tree_leaf_names.append(leaf.name)
        leaf.add_feature("tax", taxonomy[target_label])
        labels[taxonomy[target_label]] = 1
    groups = labels.keys()

    # compute the number of clades (monophyletic groups) per label in the ML tree, and their sizes
    ML_groups = defaultdict(list)  # the list is the size of each clade, len(list) is the number of clades for that label in the ML tree
    for label in groups:
        node_num = 0
        for monophyletic_clade in ml_tree.get_monophyletic(values=[label], target_attr="tax"):  # get monophyletic groups for each target_label (e.g. genus)
            size_clade = 0  # the number of leaves in this monophyletic group
            for leaf in monophyletic_clade:
                size_clade += 1
            ML_groups[label].append(size_clade)
            node_num += 1

    summary = defaultdict(dict)
    clades_per_group = defaultdict(list)
    treeNum = -1
    for line in open(tree_in_bs):  # read in each bootstrap tree

        treeNum += 1
        tree = Tree(line.rstrip())
        for leaf in tree:
            tax = name_to_tax_info[leaf.name]  # taxonomy was parsed from the ML tree; leaf names are shared
            leaf.add_feature("tax", tax[target_label])  # adds a 'tax' feature holding the target-rank label
        for label in groups:
            clades_per_group[label].append(0.0)  # setup the clade counting for this particular tree
        tree.unroot()  # NOTE(review): presumably so the arbitrary bootstrap root does not split a genuine clade — confirm

        # iterate over groups that are monophyletic for the taxon label of choice.
        # Choose the smallest sister branch for the comparison. (Assume root is within the larger sister clade)
        for label in groups:
            monophyletic_clade_index = 1
            for monophyletic_clade in tree.get_monophyletic(values=[label], target_attr="tax"):  # node: monophyletic clade
                clades_per_group[label][treeNum] += 1.0
                sister_clades = monophyletic_clade.get_sisters()
                monophyletic_clade_index += 1
                sister_index = 1
                # NOTE(review): this loop only builds throwaway per-sister leaf
                # lists and counters; nothing below reads them
                for each_sister in sister_clades:
                    current_sister_leaf_list = []
                    for leaf in each_sister:
                        current_sister_leaf_list.append(leaf.name)
                    sister_index += 1

                if monophyletic_clade.is_root():  # monophyletic clade is root
                    continue

                # bifurcation
                elif len(sister_clades) == 1:  # not at the trifurcation. Do something a bit hacky to find the bigger sister clade

                    taxa_in_sister = []
                    for leaf in sister_clades[0]:
                        taxa_in_sister.append(leaf.name)

                    size_sister = len(taxa_in_sister)

                    taxa_in_group = []
                    for leaf in monophyletic_clade:
                        taxa_in_group.append(leaf.name)

                    taxa_in_other_groups = []  # leaves outside both the group and its sister
                    for leaf_name in all_tree_leaf_names:
                        if leaf_name in taxa_in_sister:
                            continue
                        elif leaf_name in taxa_in_group:
                            continue
                        else:
                            taxa_in_other_groups.append(leaf_name)
                    size_other_groups = len(taxa_in_other_groups)

                    sister_tax = {}  # taxa in the smaller side (either the sister group or the rest of the tree)
                    if size_other_groups > size_sister:
                        sister_tax = summarize_taxonomy(taxa_in_sister, target_label, name_to_tax_info)
                    else:
                        sister_tax = summarize_taxonomy(taxa_in_other_groups, target_label, name_to_tax_info)

                    # store the tax info of the sister group
                    for element in sister_tax:
                        if element in summary[label]:
                            summary[label][element] += sister_tax[element]
                        else:
                            summary[label][element] = sister_tax[element]

                else:  # trifurcation in tree. Just treat the two sisters in the same way.

                    taxa_in_sisters_1 = []
                    for leaf in sister_clades[0]:
                        taxa_in_sisters_1.append(leaf.name)

                    taxa_in_sisters_2 = []
                    for leaf in sister_clades[1]:
                        taxa_in_sisters_2.append(leaf.name)

                    # get the size of two sisters
                    size_s1 = len(taxa_in_sisters_1)
                    size_s2 = len(taxa_in_sisters_2)

                    # get taxa in the smaller sister group
                    sister_tax = {}
                    if size_s1 > size_s2:
                        sister_tax = summarize_taxonomy(taxa_in_sisters_2, target_label, name_to_tax_info)
                    else:
                        sister_tax = summarize_taxonomy(taxa_in_sisters_1, target_label, name_to_tax_info)

                    for element in sister_tax:
                        if element in summary[label]:
                            summary[label][element] += sister_tax[element]
                        else:
                            summary[label][element] = sister_tax[element]

    # write the summary: for each label, the sorted list of sister taxa and their frequencies
    outh = open(output_file, "w")
    for label in summary:
        num_groups = len(ML_groups[label])
        size_str = ''
        if num_groups == 1:
            size_str = ML_groups[label][0]
        else:
            size_str = ','.join(str(x) for x in (sorted(ML_groups[label], reverse=True)))

        avg_num_clades = float("{0:.4f}".format(numpy.mean(clades_per_group[label])))
        total_num_clades = numpy.sum(clades_per_group[label])
        sorted_sisters = sorted(summary[label].items(), key=itemgetter(1), reverse=True)

        for tup in sorted_sisters:
            double_normalize = float(tup[1]) / float(total_num_clades)  # normalize the frequencies by the total number of clades, to account for different bootstrap numbers/MCMC sample numbers
            double_normalize = float("{0:.4f}".format(double_normalize))
            str_to_write = '%s\t%s\t%s\t%s\t%s\t%s' % (label, tup[0], float("{0:.4f}".format(tup[1])), avg_num_clades, double_normalize, size_str)
            outh.write(str_to_write + '\n')
    outh.close()
315
+
316
+
317
def count_sister_taxa_worker(arg_list):
    """Rename one marker's ML tree and bootstrap trees, dump its renamed
    leaf-to-cluster mapping, then run count_sister_taxa on the renamed trees.

    arg_list holds, in order: mag_cluster_dict, gnm_tax_dict, tree_ml,
    ufboot_file, target_label, tree_ml_renamed, ufboot_file_renamed,
    count_sister_taxa_op_txt, gene_id, renamed_gnm_to_cluster_dir,
    seq_named_by_gnm.
    """
    (mag_cluster_dict, gnm_tax_dict, tree_ml, ufboot_file, target_label,
     tree_ml_renamed, ufboot_file_renamed, count_sister_taxa_op_txt,
     gene_id, renamed_gnm_to_cluster_dir, seq_named_by_gnm) = arg_list

    # rename the ML tree
    ml_rename_map = get_rename_dict(tree_ml, mag_cluster_dict, gnm_tax_dict, seq_named_by_gnm)
    with open(tree_ml_renamed, 'w') as renamed_ml_handle:
        renamed_ml_handle.write(rename_tree(tree_ml, ml_rename_map) + '\n')

    # record renamed leaf name -> cluster id for this gene
    gnm_to_cluster_txt = '%s/%s.txt' % (renamed_gnm_to_cluster_dir, gene_id)
    with open(gnm_to_cluster_txt, 'w') as gnm_to_cluster_handle:
        for renamed_leaf in ml_rename_map.values():
            gnm_to_cluster_handle.write('%s\t%s\n' % (renamed_leaf, renamed_leaf.split('|')[0]))

    # rename every bootstrap tree
    with open(ufboot_file_renamed, 'w') as renamed_ufboot_handle:
        for bs_tree_line in open(ufboot_file):
            bs_tree_str = bs_tree_line.strip()
            bs_rename_map = get_rename_dict(bs_tree_str, mag_cluster_dict, gnm_tax_dict, seq_named_by_gnm)
            renamed_ufboot_handle.write(rename_tree(bs_tree_str, bs_rename_map) + '\n')

    # count sister taxa on the renamed trees
    count_sister_taxa(target_label, tree_ml_renamed, ufboot_file_renamed, count_sister_taxa_op_txt)
357
+
358
+
359
def run_count_sister_taxa(gtdb_classification_txt, hog_list, contree_dir, ufboot_dir, gnm_cluster_txt, target_label, num_threads, output_dir, force_overwrite, seq_named_by_gnm):
    """Rename all marker trees and count sister taxa for each marker in parallel.

    gtdb_classification_txt : per-genome GTDB taxonomy (genome<TAB>taxonomy, header 'user_genome...')
    hog_list                : marker/gene ids (basenames of .contree/.ufboot files)
    contree_dir, ufboot_dir : dirs holding <id>.contree / <id>.ufboot files
    gnm_cluster_txt         : genome<TAB>cluster assignment
    target_label            : taxonomic rank passed through to count_sister_taxa
    num_threads             : worker process count
    output_dir              : created here; exits if it exists unless force_overwrite
    seq_named_by_gnm        : True when leaf names are genome ids (no gene suffix)

    Fix: removed a leftover debug print that echoed every line of
    gnm_cluster_txt to stdout.
    """

    # define file name
    renamed_gnm_to_cluster_dir = '%s/renamed_genome_to_cluster' % output_dir
    renamed_gnm_to_cluster_tmp_txt = '%s/renamed_genome_to_cluster_tmp.txt' % output_dir
    renamed_gnm_to_cluster_txt = '%s/renamed_genome_to_cluster.txt' % output_dir
    renamed_gnm_to_cluster_iTOL_txt = '%s/renamed_genome_to_cluster_iTOL.txt' % output_dir
    renamed_contree_dir = '%s/renamed_contree' % output_dir
    renamed_ufboot_dir = '%s/renamed_ufboot' % output_dir
    count_sister_taxa_op_dir = '%s/count_sister_taxa_op' % output_dir

    # create the output directory layout
    if os.path.isdir(output_dir) is True:
        if force_overwrite is True:
            os.system('rm -r %s' % output_dir)
        else:
            print('%s exist, program exited!' % output_dir)
            exit()

    os.mkdir(output_dir)
    os.mkdir(renamed_contree_dir)
    os.mkdir(renamed_ufboot_dir)
    os.mkdir(count_sister_taxa_op_dir)
    os.mkdir(renamed_gnm_to_cluster_dir)

    # read in genome taxonomy
    gnm_tax_dict = {}
    for each in open(gtdb_classification_txt):
        if not each.startswith('user_genome'):
            each_split = each.strip().split('\t')
            gnm_tax_dict[each_split[0]] = each_split[1]

    # read in genome-to-cluster assignment
    mag_cluster_dict = {}
    for each_gnm in open(gnm_cluster_txt):
        each_gnm_split = each_gnm.strip().split('\t')
        mag_cluster_dict[each_gnm_split[0]] = each_gnm_split[1]

    # assemble one argument list per marker
    argument_lol = []
    for og_id in hog_list:

        tree_ml = '%s/%s.contree' % (contree_dir, og_id)
        ufboot_file = '%s/%s.ufboot' % (ufboot_dir, og_id)
        tree_ml_renamed = '%s/%s_renamed.contree' % (renamed_contree_dir, og_id)
        ufboot_file_renamed = '%s/%s_renamed.ufboot' % (renamed_ufboot_dir, og_id)
        count_sister_taxa_op_txt = '%s/%s_count_sister_taxa.txt' % (count_sister_taxa_op_dir, og_id)

        if os.path.isfile(tree_ml) is False:
            print('%s not found!' % tree_ml)
            exit()

        argument_lol.append([mag_cluster_dict, gnm_tax_dict, tree_ml, ufboot_file, target_label, tree_ml_renamed, ufboot_file_renamed, count_sister_taxa_op_txt, og_id, renamed_gnm_to_cluster_dir, seq_named_by_gnm])

    # run with multiprocessing
    pool = mp.Pool(processes=num_threads)
    pool.map(count_sister_taxa_worker, argument_lol)
    pool.close()
    pool.join()

    # combine per-marker renamed_gnm_to_cluster files and build the iTOL annotation
    os.system('cat %s/*.txt > %s' % (renamed_gnm_to_cluster_dir, renamed_gnm_to_cluster_tmp_txt))
    os.system('cat %s | sort | uniq > %s' % (renamed_gnm_to_cluster_tmp_txt, renamed_gnm_to_cluster_txt))
    TreeSAK_iTOL_cmd = 'TreeSAK iTOL -ColorRange -lg %s -lt Cluster -o %s' % (renamed_gnm_to_cluster_txt, renamed_gnm_to_cluster_iTOL_txt)
    os.system(TreeSAK_iTOL_cmd)
427
+
428
+
429
def get_taxa_count_stats(step_1_op_dir, hog_list_sorted, get_taxa_count_stats_wd, force_overwrite, TaxaCountStats_Rscript):
    """Combine per-marker count_sister_taxa outputs and renamed trees, then
    run TaxaCountStats.R to produce the per-marker split-score table.

    step_1_op_dir           : output dir of run_count_sister_taxa
    hog_list_sorted         : marker/gene ids to include
    get_taxa_count_stats_wd : working dir created here (holds the R script's inputs and output)
    force_overwrite         : remove an existing working dir instead of exiting
    TaxaCountStats_Rscript  : path to TaxaCountStats.R
    """

    # define input files to R script
    combined_contree_file = '%s/combined.contree' % get_taxa_count_stats_wd
    genes_to_remove_txt = '%s/Genes_to_remove.txt' % get_taxa_count_stats_wd
    list_of_trees_txt = '%s/List_of_trees.txt' % get_taxa_count_stats_wd
    mapping_txt = '%s/mapping.txt' % get_taxa_count_stats_wd
    marker_list_txt = '%s/MarkerList.txt' % get_taxa_count_stats_wd
    combined_count_sister_taxa_op = '%s/combined_count_sister_taxa_op.txt' % get_taxa_count_stats_wd
    TaxaCountStats_op = '%s/TaxaCountStats_output.txt' % get_taxa_count_stats_wd

    # create (or, with force_overwrite, recreate) the working dir
    if os.path.isdir(get_taxa_count_stats_wd) is True:
        if force_overwrite is True:
            os.system('rm -r %s' % get_taxa_count_stats_wd)
        else:
            print('%s exist, program exited!' % get_taxa_count_stats_wd)
            exit()
    os.mkdir(get_taxa_count_stats_wd)

    cluster_to_domain_dict = {}
    marker_list_txt_handle = open(marker_list_txt, 'w')
    marker_list_txt_handle.write('MarkerID\n')
    list_of_trees_txt_handle = open(list_of_trees_txt, 'w')
    combined_contree_file_handle = open(combined_contree_file, 'w')
    combined_count_sister_taxa_op_handle = open(combined_count_sister_taxa_op, 'w')
    combined_count_sister_taxa_op_handle.write('MarkerID\tGroup_of_interest\tSister_taxa\tNormalized_sum_of_occurances\tsplits\tNormalized2_sum_of_occurances\tClusters\n')
    for each_hog in hog_list_sorted:

        # write out to combined_count_sister_taxa_op, prefixing each row with the marker id
        pwd_count_sister_taxa_op_txt = '%s/count_sister_taxa_op/%s_count_sister_taxa.txt' % (step_1_op_dir, each_hog)
        with open(pwd_count_sister_taxa_op_txt) as count_sister_taxa_op_txt_opened:
            for each_line in count_sister_taxa_op_txt_opened:
                combined_count_sister_taxa_op_handle.write('%s\t%s' % (each_hog, each_line))

        # write out to combined_contree_file (one newick line per marker)
        pwd_renamed_contree_file = '%s/renamed_contree/%s_renamed.contree' % (step_1_op_dir, each_hog)
        with open(pwd_renamed_contree_file, 'r') as pwd_renamed_contree_file_opened:
            combined_contree_file_handle.write(pwd_renamed_contree_file_opened.readline())

        # add to cluster_to_domain_dict
        # NOTE(review): the file path (not the newick string) is passed to
        # Tree — relies on ete3 accepting a filename here; confirm
        t_in = Tree(pwd_renamed_contree_file, format=1)
        for leaf in t_in:
            leaf_name_split = leaf.name.split('|')
            cluster_to_domain_dict[leaf_name_split[0]] = leaf_name_split[1]

        # write out to marker_list_txt
        marker_list_txt_handle.write(each_hog + '\n')
        list_of_trees_txt_handle.write(each_hog + '\n')

    marker_list_txt_handle.close()
    list_of_trees_txt_handle.close()
    combined_contree_file_handle.close()
    combined_count_sister_taxa_op_handle.close()

    # prepare mapping_txt (cluster id -> domain)
    mapping_txt_handle = open(mapping_txt, 'w')
    mapping_txt_handle.write('Cluster\tDomain\n')
    for each_cluster in cluster_to_domain_dict:
        mapping_txt_handle.write('%s\t%s\n' % (each_cluster, cluster_to_domain_dict[each_cluster]))
    mapping_txt_handle.close()

    # prepare genes_to_remove_txt (header only: no genes excluded)
    genes_to_remove_txt_handle = open(genes_to_remove_txt, 'w')
    genes_to_remove_txt_handle.write('MarkerID\n')
    genes_to_remove_txt_handle.close()

    # run TaxaCountStats.R
    get_TaxaCountStats_cmd = 'Rscript %s -t %s -l %s -g %s -x %s -s %s -r %s -o %s > /dev/null' % (TaxaCountStats_Rscript, combined_contree_file, list_of_trees_txt, mapping_txt, marker_list_txt, combined_count_sister_taxa_op, genes_to_remove_txt, TaxaCountStats_op)
    print('Running: ' + get_TaxaCountStats_cmd)
    os.system(get_TaxaCountStats_cmd)
499
+
500
+
501
def group_marker(taxa_counts_tats_op_txt, marker_seq_dir, marker_rank_cutoff_str, op_dir):
    """Split markers into best/worst sets by their RankA_B split score.

    For each percentage cutoff in marker_rank_cutoff_str (e.g. '25,50,75'),
    copy the top and bottom <cutoff>% of marker .fa files from marker_seq_dir
    into '<op_dir>/best<cutoff>' and '<op_dir>/worst<cutoff>'. A lower
    RankA_B value means a better marker.

    Fix: directories are now created and files copied with os.makedirs /
    shutil.copy instead of shelling out ('mkdir'/'cp' via os.system), which
    silently ignored failures and broke on paths containing spaces.
    """
    marker_rank_cutoff_list = marker_rank_cutoff_str.split(',')

    # read each marker's RankA_B score from the TaxaCountStats output table
    marker_score_dict = dict()
    header_index_dict = dict()
    for each_marker in open(taxa_counts_tats_op_txt):
        each_marker_split = each_marker.replace('\n', '').split('\t')
        if each_marker.startswith('MarkerID\t'):
            header_index_dict = {k: v for v, k in enumerate(each_marker_split)}
        else:
            marker_id = each_marker_split[header_index_dict['MarkerID']]
            marker_score = int(each_marker_split[header_index_dict['RankA_B']])
            marker_score_dict[marker_id] = marker_score

    # rank markers: ascending score = best first
    marker_list_best_to_worst = sorted(marker_score_dict, key=marker_score_dict.get)
    marker_list_worst_to_best = marker_list_best_to_worst[::-1]

    for each_cutoff in marker_rank_cutoff_list:

        marker_num_rounded = round(len(marker_list_worst_to_best) * float(each_cutoff) / 100)
        seq_dir_best = '%s/best%s' % (op_dir, each_cutoff)
        seq_dir_worst = '%s/worst%s' % (op_dir, each_cutoff)
        os.makedirs(seq_dir_best)
        os.makedirs(seq_dir_worst)

        # copy the best markers
        for bm in marker_list_best_to_worst[:marker_num_rounded]:
            shutil.copy('%s/%s.fa' % (marker_seq_dir, bm), seq_dir_best)

        # copy the worst markers
        for wm in marker_list_worst_to_best[:marker_num_rounded]:
            shutil.copy('%s/%s.fa' % (marker_seq_dir, wm), seq_dir_worst)
537
+
538
+
539
+ def SplitScore2(args):
540
+
541
+ step_1_op_dir = args['i']
542
+ gnm_group_txt = args['g']
543
+ gtdb_classification_txt = args['k']
544
+ force_overwrite = args['f']
545
+ num_of_threads = args['t']
546
+ step_2_op_dir = args['o']
547
+ marker_rank_cutoff_str = args['c']
548
+ seq_named_by_gnm = args['seq_named_by_gnm']
549
+ target_label = 'cluster'
550
+
551
+ check_executables(['Rscript'])
552
+
553
+ current_file_path = '/'.join(os.path.realpath(__file__).split('/')[:-1])
554
+ TaxaCountStats_Rscript = '%s/TaxaCountStats.R' % current_file_path
555
+ qualified_og_seq_dir = '%s/qualified_OGs' % step_1_op_dir
556
+ contree_file_re = '%s/*.contree' % qualified_og_seq_dir
557
+ ufboot_file_re = '%s/*.ufboot' % qualified_og_seq_dir
558
+ count_sister_taxa_op_dir = '%s/count_sister_taxa_wd' % step_2_op_dir
559
+ get_taxa_count_stats_op_dir = '%s/get_taxa_count_stats_wd' % step_2_op_dir
560
+ TaxaCountStats_output_txt = '%s/get_taxa_count_stats_wd/TaxaCountStats_output.txt' % step_2_op_dir
561
+
562
+ contree_file_set_base = set()
563
+ for each_contree_file in glob.glob(contree_file_re):
564
+ _, f_base, _ = sep_path_basename_ext(each_contree_file)
565
+ contree_file_set_base.add(f_base)
566
+
567
+ ufboot_file_set_base = set()
568
+ for each_ufboot_file in glob.glob(ufboot_file_re):
569
+ _, f_base, _ = sep_path_basename_ext(each_ufboot_file)
570
+ ufboot_file_set_base.add(f_base)
571
+
572
+ contree_ufboot_shared = set(contree_file_set_base).intersection(ufboot_file_set_base)
573
+ contree_ufboot_shared_sorted = sorted([i for i in contree_ufboot_shared])
574
+
575
+ # create output folder
576
+ if os.path.isdir(step_2_op_dir) is True:
577
+ if force_overwrite is True:
578
+ os.system('rm -r %s' % step_2_op_dir)
579
+ else:
580
+ print('%s exist, program exited!' % step_2_op_dir)
581
+ exit()
582
+ os.mkdir(step_2_op_dir)
583
+
584
+ print('Counting sister taxa with %s cores' % num_of_threads)
585
+ run_count_sister_taxa(gtdb_classification_txt, contree_ufboot_shared_sorted, qualified_og_seq_dir, qualified_og_seq_dir, gnm_group_txt, target_label, num_of_threads, count_sister_taxa_op_dir, force_overwrite, seq_named_by_gnm)
586
+
587
+ print('Summarising sister taxa')
588
+ get_taxa_count_stats(count_sister_taxa_op_dir, contree_ufboot_shared_sorted, get_taxa_count_stats_op_dir, force_overwrite, TaxaCountStats_Rscript)
589
+
590
+ print('Exporting markers by split score')
591
+ group_marker(TaxaCountStats_output_txt, qualified_og_seq_dir, marker_rank_cutoff_str, step_2_op_dir)
592
+
593
+ print('Done!')
594
+
595
+
596
if __name__ == '__main__':

    # command line interface for running SplitScore2 as a standalone script;
    # the same arguments are forwarded by the TreeSAK dispatcher
    SplitScore2_parser = argparse.ArgumentParser()
    SplitScore2_parser.add_argument('-i', required=True, help='output dir from SplitScore1')
    SplitScore2_parser.add_argument('-g', required=True, help='genome group')
    SplitScore2_parser.add_argument('-k', required=True, help='genome taxon, GTDB format')
    SplitScore2_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    SplitScore2_parser.add_argument('-t', required=False, type=int, default=1, help='num of threads, default: 1')
    SplitScore2_parser.add_argument('-c', required=False, default='25,50,75', help='marker ranking cutoffs, default: 25,50,75')
    SplitScore2_parser.add_argument('-seq_named_by_gnm', required=False, action="store_true", help='named_by_gnm, specify if sequence named by gnm')
    SplitScore2_parser.add_argument('-o', required=True, help='output directory')
    args = vars(SplitScore2_parser.parse_args())
    SplitScore2(args)