treesak-1.53.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/run_TaxaCountStats_R_s1.py ADDED
@@ -0,0 +1,455 @@
1
+ from __future__ import print_function
2
+ import os
3
+ import re
4
+ import sys
5
+ import numpy
6
+ import argparse
7
+ from ete3 import Tree
8
+ import multiprocessing as mp
9
+ from operator import itemgetter
10
+ from collections import defaultdict
11
+
12
+
13
+ def get_rename_dict(tree_str_in, mag_rename_dict, mag_cluster_dict, sponge_mag_tax_dict, gtdb_gnm_tax_dict):
14
+
15
+ # rename dict: {'old_name':'new_name'}
16
+
17
+ leaf_rename_dict = {}
18
+ for leaf in Tree(tree_str_in, format=1):
19
+
20
+ leaf_name_gnm = '_'.join(leaf.name.split('_')[:-1])
21
+ leaf_name_gnm = mag_rename_dict.get(leaf_name_gnm, leaf_name_gnm)
22
+ leaf_cluster = mag_cluster_dict.get(leaf_name_gnm, 'cluster_0')
23
+
24
+ leaf_name_gnm_no_source = leaf_name_gnm
25
+ if '.gtdb' in leaf_name_gnm_no_source:
26
+ leaf_name_gnm_no_source = leaf_name_gnm[:-5]
27
+ if '.ncbi' in leaf_name_gnm:
28
+ leaf_name_gnm_no_source = leaf_name_gnm[:-5]
29
+
30
+ # get mag_taxon_str
31
+ gnm_taxon_str = 'NA'
32
+ if leaf_name_gnm_no_source in sponge_mag_tax_dict:
33
+ gnm_taxon_str = sponge_mag_tax_dict[leaf_name_gnm_no_source]
34
+ if leaf_name_gnm_no_source in gtdb_gnm_tax_dict:
35
+ gnm_taxon_str = gtdb_gnm_tax_dict[leaf_name_gnm_no_source]
36
+
37
+ # get mag_taxon_str (GCA GCF things)
38
+ if gnm_taxon_str == 'NA':
39
+ mag_id_no_ext_no_source_GCF = leaf_name_gnm_no_source.replace('GCA', 'GCF')
40
+ if mag_id_no_ext_no_source_GCF in gtdb_gnm_tax_dict:
41
+ gnm_taxon_str = gtdb_gnm_tax_dict[mag_id_no_ext_no_source_GCF]
42
+
43
+ gnm_taxon_str_no_space = gnm_taxon_str.replace(' ', '_')
44
+ gnm_taxon_str_no_space = gnm_taxon_str_no_space.replace(';', '|')
45
+ leaf_name_new = '%s|%s|strain__%s' % (leaf_cluster, gnm_taxon_str_no_space, '_'.join(leaf.name.split('_')[:-1]))
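+ # the renamed leaf therefore looks like (hypothetical example): 'cluster_5|d__Archaea|p__Thermoproteota|c__Nitrososphaeria|o__Nitrososphaerales|f__Nitrosopumilaceae|g__Nitrosopumilus|s__|strain__MAG_00001'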
46
+
47
+ leaf_rename_dict[leaf.name] = leaf_name_new
48
+
49
+ return leaf_rename_dict
50
+
51
+
52
+ def rename_tree(tree_str_in, rename_dict):
53
+
54
+ t_in = Tree(tree_str_in, format=1)
55
+ for leaf in t_in:
56
+ leaf_name = leaf.name
57
+ leaf_name_new = rename_dict.get(leaf_name, leaf_name)
58
+ leaf.name = leaf_name_new
59
+
60
+ return t_in.write()
61
+
62
+
63
+ def gtdb_gnm_metadata_parser(gtdb_genome_metadata):
64
+
65
+ genome_to_taxon_dict = {}
66
+ genome_to_completeness_dict = {}
67
+ genome_to_contamination_dict = {}
68
+ genome_to_biosample_dict = {}
69
+
70
+ col_index = {}
71
+ for each_ref in open(gtdb_genome_metadata):
72
+ each_ref_split = each_ref.strip().split('\t')
73
+ if each_ref.startswith('accession'):
74
+ col_index = {key: i for i, key in enumerate(each_ref_split)}
75
+ else:
76
+ ref_accession = each_ref_split[0][3:]
77
+ gnm_completeness = float(each_ref_split[2])
78
+ gnm_contamination = float(each_ref_split[3])
79
+ gtdb_taxon = each_ref_split[col_index['gtdb_taxonomy']]
80
+ ncbi_biosample = each_ref_split[col_index['ncbi_biosample']]
81
+ genome_to_taxon_dict[ref_accession] = gtdb_taxon
82
+ genome_to_completeness_dict[ref_accession] = gnm_completeness
83
+ genome_to_contamination_dict[ref_accession] = gnm_contamination
84
+ genome_to_biosample_dict[ref_accession] = ncbi_biosample
85
+
86
+ return genome_to_completeness_dict, genome_to_contamination_dict, genome_to_taxon_dict, genome_to_biosample_dict
87
+
88
+
89
+ def parse_taxonomy(taxon_name): # given a taxon name, return whatever taxonomic info is available as a dict keyed by rank, from the highest-level classification down
90
+ #name_elements = re.split("\|", taxon_name)
91
+ name_elements = taxon_name.split('|')
92
+ #print('name_elements')
93
+ #print(name_elements)
94
+
95
+ if (len(name_elements) < 8) or (len(name_elements) > 9):
96
+ print("Nonstandard!")
97
+ quit()
98
+
99
+ name_map = {}
100
+ name_map['cluster'] = name_elements[0]
101
+ name_map['domain'] = name_elements[1]
102
+ name_map['phylum'] = name_elements[2]
103
+ name_map['class'] = name_elements[3]
104
+ name_map['order'] = name_elements[4]
105
+ name_map['family'] = name_elements[5]
106
+ name_map['genus'] = name_elements[6]
107
+ name_map['species'] = name_elements[7]
108
+ if len(name_elements) == 9:
109
+ name_map['ncbi_id'] = name_elements[8]
110
+ return name_map
111
+
112
+
113
+ def summarize_taxonomy(name_list, tax_level, name_to_tax_dict): # take a list of names from a clade and summarize taxonomic info (labels and their frequencies)
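+ # e.g. (hypothetical) a three-leaf clade whose members belong to cluster_1, cluster_1 and cluster_2
+ # returns approximately {'cluster_1': 0.667, 'cluster_2': 0.333} at tax_level 'cluster'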
114
+ total_size = len(name_list) # it perhaps makes sense to normalize by the size of the clade
115
+ breakdown = {}
116
+ for name in name_list:
117
+ info = name_to_tax_dict[name]
118
+ if info[tax_level] in breakdown:
119
+ breakdown[info[tax_level]] += 1.0 / float(total_size)
120
+ else:
121
+ breakdown[info[tax_level]] = 1.0 / float(total_size)
122
+ return breakdown
123
+
124
+
125
+ def count_sister_taxa(target_label, tree_in_ml, tree_in_bs, output_file):
126
+
127
+ # read in argument
128
+ # target_label = args['l']
129
+ # tree_in_ml = args['ml']
130
+ # tree_in_bs = args['bs']
131
+ # output_file = args['out']
132
+
133
+ # edit target_label to make the comparisons at a desired taxonomic level
134
+
135
+ # compute the most frequent sister group of each (monophyletic?) group on the tree, to identify trends in gene transfers, "unstable" taxa, etc.
136
+
137
+ # read the ML tree, set up the taxonomy stuff, and calculate the number of clades per label, and the sizes of those clades (to report at the end)
138
+ labels = {}
139
+ name_to_tax_info = defaultdict(dict)
140
+ all_tree_leaf_names = []
141
+ ml_tree = Tree(tree_in_ml) # note that ete3 treats this input tree as rooted
142
+ for leaf in ml_tree:
143
+ taxonomy = parse_taxonomy(leaf.name)
144
+ name_to_tax_info[leaf.name] = taxonomy
145
+ all_tree_leaf_names.append(leaf.name)
146
+ leaf.add_feature("tax", taxonomy[target_label])
147
+ labels[taxonomy[target_label]] = 1
148
+ groups = labels.keys()
149
+
150
+ # compute the number of clades (weizhi: monophyletic group) per label in the ML tree, and their sizes
151
+ ML_groups = defaultdict(list) # the list holds the size of each clade; len(list) is the number of clades for that label in the ML tree
152
+ # ML_groups: the number of leaves in each monophyletic group of the corresponding target_label (e.g. genus)
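+ # e.g. (hypothetical) ML_groups['cluster_3'] = [12, 2] would mean the cluster_3 leaves fall into two monophyletic clades of 12 and 2 leaves in the ML tree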
153
+ for label in groups:
154
+ node_num = 0
155
+ for monophyletic_clade in ml_tree.get_monophyletic(values=[label], target_attr="tax"): # get monophyletic groups for each target_label (e.g. genus)
156
+ # print('node')
157
+ # print(node)
158
+ size_clade = 0 # get the number of leaf (size_clade) in the monophyletic group
159
+ for leaf in monophyletic_clade:
160
+ size_clade += 1
161
+ ML_groups[label].append(size_clade)
162
+ node_num += 1
163
+ # print('The number of monophyletic clade (node_num) of %s (label):\t%s' % (label, node_num))
164
+
165
+ # print()
166
+ # print('Dict holds the number of monophyletic clades per taxon, and their sizes')
167
+ # print('ML_groups:\t %s' % ML_groups)
168
+ # print()
169
+
170
+ summary = defaultdict(dict)
171
+ clades_per_group = defaultdict(list)
172
+ treeNum = -1
173
+ for line in open(tree_in_bs): # read in each bootstrap tree
174
+
175
+ treeNum += 1
176
+ tree = Tree(line.rstrip())
177
+ for leaf in tree:
178
+ tax = name_to_tax_info[leaf.name] # this should set up taxonomy correctly...
179
+ leaf.add_feature("tax", tax[target_label]) # this adds a feature called tax to the leaf, with the attribute of the phylum name
180
+ for label in groups:
181
+ clades_per_group[label].append(0.0) # setup the clade counting for this particular tree
182
+ tree.unroot() # Weizhi: why is this
183
+
184
+ # iterate over groups that are monophyletic for the taxon label of choice.
185
+ # Choose the smallest sister branch for the comparison. (Assume root is within the larger sister clade (Weizhi:why?))
186
+ for label in groups:
187
+
188
+ # print('tree.get_monophyletic(values=[label], target_attr="tax")')
189
+ # print(tree.get_monophyletic(values=[label], target_attr="tax"))
190
+ # print('---------------------------------------------------------------------------------------------------v')
191
+ # print('label: %s' % label)
192
+ monophyletic_clade_index = 1
193
+ for monophyletic_clade in tree.get_monophyletic(values=[label], target_attr="tax"): # node: monophyletic clade
194
+ clades_per_group[label][treeNum] += 1.0
195
+ # print node.get_ascii()
196
+ sister_clades = monophyletic_clade.get_sisters()
197
+
198
+ # print('--------------------v')
199
+ # print('monophyletic clade %s in %s (label)' % (monophyletic_clade_index, label))
200
+ monophyletic_clade_index += 1
201
+ #print(monophyletic_clade)
202
+ # print(len(sisters))
203
+ # for leaf in sisters[0]:
204
+ # print(leaf.name)
205
+ # print(sisters)
206
+ # print('sisters of current monophyletic clade')
207
+ sister_index = 1
208
+ for each_sister in sister_clades:
209
+ current_sister_leaf_list = []
210
+ for leaf in each_sister:
211
+ current_sister_leaf_list.append(leaf.name)
212
+ # print('sister %s has %s leaves: %s' % (sister_index, len(current_sister_leaf_list), ','.join([])))
213
+ sister_index += 1
214
+
215
+ if monophyletic_clade.is_root(): # monophyletic clade is root
216
+ continue
217
+
218
+ # Weizhi: bifurcation
219
+ elif len(sister_clades) == 1: # not at the trifurcation. Do something a bit hacky to find the bigger sister clade
220
+
221
+ taxa_in_sister = []
222
+ for leaf in sister_clades[0]:
223
+ taxa_in_sister.append(leaf.name)
224
+
225
+ size_sister = len(taxa_in_sister)
226
+
227
+ taxa_in_group = []
228
+ for leaf in monophyletic_clade:
229
+ taxa_in_group.append(leaf.name)
230
+
231
+ taxa_in_other_groups = [] # what does OG mean? (other groups?)
232
+ for leaf_name in all_tree_leaf_names:
233
+ if leaf_name in taxa_in_sister:
234
+ continue
235
+ elif leaf_name in taxa_in_group:
236
+ continue
237
+ else:
238
+ taxa_in_other_groups.append(leaf_name)
239
+ size_other_groups = len(taxa_in_other_groups)
240
+
241
+ sister_tax = {} # taxa in the smaller groups (either the sister group or the OG)
242
+ if size_other_groups > size_sister:
243
+ sister_tax = summarize_taxonomy(taxa_in_sister, target_label, name_to_tax_info)
244
+ else:
245
+ sister_tax = summarize_taxonomy(taxa_in_other_groups, target_label, name_to_tax_info)
246
+
247
+ # print('size_sister: %s' % size_sister)
248
+ # print('size_other_groups: %s' % size_other_groups)
249
+ # print('sister_tax (not really, actually taxa in the smaller one (either the sister group or the OG))')
250
+ # print(sister_tax)
251
+
252
+ # store the tax info of the sister group
253
+ for element in sister_tax:
254
+ # print('element: %s' % element)
255
+ #print('summary[label]: %s' % summary[label])
256
+ if element in summary[label]:
257
+ summary[label][element] += sister_tax[element]
258
+ #print('summary (in): %s' % summary)
259
+ else:
260
+ summary[label][element] = sister_tax[element]
261
+ #print('summary (not in): %s' % summary)
262
+
263
+ else: # trifurcation in tree. Just treat the two sisters in the same way.
264
+
265
+ taxa_in_sisters_1 = []
266
+ for leaf in sister_clades[0]:
267
+ taxa_in_sisters_1.append(leaf.name)
268
+
269
+ taxa_in_sisters_2 = []
270
+ for leaf in sister_clades[1]:
271
+ taxa_in_sisters_2.append(leaf.name)
272
+
273
+ # get the size of two sisters
274
+ size_s1 = len(taxa_in_sisters_1)
275
+ size_s2 = len(taxa_in_sisters_2)
276
+
277
+ # print('size_s1: %s' % size_s1)
278
+ # print('size_s2: %s' % size_s2)
279
+
280
+ # get taxa in the smaller sister group
281
+ sister_tax = {}
282
+ if size_s1 > size_s2:
283
+ sister_tax = summarize_taxonomy(taxa_in_sisters_2, target_label, name_to_tax_info)
284
+ else:
285
+ sister_tax = summarize_taxonomy(taxa_in_sisters_1, target_label, name_to_tax_info)
286
+
287
+ # print('sister_tax (taxa in the smaller sister group)')
288
+ # print(sister_tax)
289
+
290
+ for element in sister_tax:
291
+ if element in summary[label]:
292
+ summary[label][element] += sister_tax[element]
293
+ else:
294
+ summary[label][element] = sister_tax[element]
295
+
296
+ # print('--------------------^')
297
+ # print()
298
+ # print('---------------------------------------------------------------------------------------------------^')
299
+
300
+ # write out the summary: for each label, the sorted list of sister taxa and their frequencies
301
+ outh = open(output_file, "w")
302
+ for label in summary:
303
+ num_groups = len(ML_groups[label])
304
+ size_str = ''
305
+ if num_groups == 1:
306
+ size_str = ML_groups[label][0]
307
+ else:
308
+ size_str = ','.join(str(x) for x in (sorted(ML_groups[label], reverse=True)))
309
+
310
+ avg_num_clades = float("{0:.4f}".format(numpy.mean(clades_per_group[label])))
311
+ total_num_clades = numpy.sum(clades_per_group[label])
312
+ sorted_sisters = sorted(summary[label].items(), key=itemgetter(1), reverse=True)
313
+
314
+ # if label == 'g__TA-20':
315
+ # print('ML_groups[label]:\t%s' % ML_groups[label])
316
+ # print('clades_per_group[label]:\t%s' % clades_per_group[label])
317
+ # print('avg_num_clades (mean of list):\t%s' % numpy.mean(clades_per_group[label]))
318
+ # print('total_num_clades (sum of list):\t%s' % total_num_clades)
319
+ # print('summary[label]:\t%s' % summary[label])
320
+ # print('sorted_sisters\t%s' % sorted_sisters)
321
+
322
+ for tup in sorted_sisters:
323
+ double_normalize = float(tup[1]) / float(total_num_clades) # normalize the frequencies by the total number of clades, to account for different bootstrap numbers/MCMC sample numbers
324
+ double_normalize = float("{0:.4f}".format(double_normalize))
325
+
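+ # output columns: query label, sister taxon, summed frequency of that sister, mean number of clades per bootstrap tree, frequency normalized by the total clade count, clade size(s) in the ML tree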
326
+ str_to_write = '%s\t%s\t%s\t%s\t%s\t%s' % (label, tup[0], float("{0:.4f}".format(tup[1])), avg_num_clades, double_normalize, size_str)
327
+ outh.write(str_to_write + '\n')
328
+ # if label == 'g__TA-20':
329
+ # print(str_to_write)
330
+ outh.close()
331
+
332
+
333
+ def count_sister_taxa_worker(arg_list):
334
+
335
+ mag_rename_dict = arg_list[0]
336
+ mag_cluster_dict = arg_list[1]
337
+ sponge_archaeal_MAG_tax_dict = arg_list[2]
338
+ gtdb_ar_gnm_tax_dict = arg_list[3]
339
+ tree_ml = arg_list[4]
340
+ ufboot_file = arg_list[5]
341
+ target_label = arg_list[6]
342
+ tree_ml_renamed = arg_list[7]
343
+ ufboot_file_renamed = arg_list[8]
344
+ count_sister_taxa_op_txt = arg_list[9]
345
+ gene_id = arg_list[10]
346
+ renamed_gnm_to_cluster_dir = arg_list[11]
347
+
348
+ # rename ml tree
349
+ tree_ml_renamed_handle = open(tree_ml_renamed, 'w')
350
+ current_tree_rename_dict = get_rename_dict(tree_ml, mag_rename_dict, mag_cluster_dict, sponge_archaeal_MAG_tax_dict, gtdb_ar_gnm_tax_dict)
351
+ tree_ml_str_renamed = rename_tree(tree_ml, current_tree_rename_dict)
352
+ tree_ml_renamed_handle.write(tree_ml_str_renamed + '\n')
353
+ tree_ml_renamed_handle.close()
354
+
355
+ current_renamed_gnm_to_cluster_txt = '%s/%s.txt' % (renamed_gnm_to_cluster_dir, gene_id)
356
+ current_renamed_gnm_to_cluster_txt_handle = open(current_renamed_gnm_to_cluster_txt, 'w')
357
+ for each_leaf in current_tree_rename_dict:
358
+ renamed_leaf = current_tree_rename_dict[each_leaf]
359
+ cluster_id = renamed_leaf.split('|')[0]
360
+ current_renamed_gnm_to_cluster_txt_handle.write('%s\t%s\n' % (renamed_leaf, cluster_id))
361
+ current_renamed_gnm_to_cluster_txt_handle.close()
362
+
363
+ # rename ufboot trees
364
+ ufboot_file_renamed_handle = open(ufboot_file_renamed, 'w')
365
+ for each_tree in open(ufboot_file):
366
+ tree_str = each_tree.strip()
367
+ current_tree_rename_dict = get_rename_dict(tree_str, mag_rename_dict, mag_cluster_dict, sponge_archaeal_MAG_tax_dict, gtdb_ar_gnm_tax_dict)
368
+ tree_str_renamed = rename_tree(tree_str, current_tree_rename_dict)
369
+ ufboot_file_renamed_handle.write(tree_str_renamed + '\n')
370
+ ufboot_file_renamed_handle.close()
371
+
372
+ # count_sister_taxa
373
+ count_sister_taxa(target_label, tree_ml_renamed, ufboot_file_renamed, count_sister_taxa_op_txt)
374
+
375
+
376
+ def run_count_sister_taxa(genome_metadata_ar53_r207_Mac, sponge_MAG_GTDB_archaea, hog_id_txt, contree_and_ufboot_dir, archaeal_mags_renamed_for_prokka_txt, gnm_cluster_txt, target_label, num_threads, output_dir):
377
+
378
+ # define file name
379
+ renamed_gnm_to_cluster_dir = '%s/renamed_genome_to_cluster' % output_dir
380
+ renamed_gnm_to_cluster_tmp_txt = '%s/renamed_genome_to_cluster_tmp.txt' % output_dir
381
+ renamed_gnm_to_cluster_txt = '%s/renamed_genome_to_cluster.txt' % output_dir
382
+ renamed_gnm_to_cluster_iTOL_txt = '%s/renamed_genome_to_cluster_iTOL.txt' % output_dir
383
+ renamed_contree_dir = '%s/renamed_contree' % output_dir
384
+ renamed_ufboot_dir = '%s/renamed_ufboot' % output_dir
385
+ count_sister_taxa_op_dir = '%s/count_sister_taxa_op' % output_dir
386
+
387
+ os.mkdir(output_dir)
388
+ os.mkdir(renamed_contree_dir)
389
+ os.mkdir(renamed_ufboot_dir)
390
+ os.mkdir(count_sister_taxa_op_dir)
391
+ os.mkdir(renamed_gnm_to_cluster_dir)
392
+
393
+ _, _, gtdb_ar_gnm_tax_dict, _ = gtdb_gnm_metadata_parser(genome_metadata_ar53_r207_Mac)
394
+
395
+ sponge_archaeal_MAG_tax_dict = {}
396
+ for each in open(sponge_MAG_GTDB_archaea):
397
+ if not each.startswith('user_genome'):
398
+ each_split = each.strip().split('\t')
399
+ sponge_archaeal_MAG_tax_dict[each_split[0]] = each_split[1]
400
+
401
+ hog_list = []
402
+ for each_hog in open(hog_id_txt):
403
+ hog_list.append(each_hog.strip())
404
+
405
+ mag_cluster_dict = {}
406
+ for each_gnm in open(gnm_cluster_txt):
407
+ each_gnm_split = each_gnm.strip().split('\t')
408
+ mag_cluster_dict[each_gnm_split[1]] = each_gnm_split[0]
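+ # gnm_cluster_txt is assumed to be tab-separated as cluster_id<tab>genome_id, one genome per line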
409
+
410
+ mag_rename_dict = {}
411
+ for each_mag in open(archaeal_mags_renamed_for_prokka_txt):
412
+ each_mag_split = each_mag.strip().split('\t')
413
+ before_rename = each_mag_split[0]
414
+ after_rename = each_mag_split[1]
415
+ mag_rename_dict[after_rename] = before_rename
416
+
417
+ argument_lol = []
418
+ for og_id in hog_list:
419
+
420
+ # define file name
421
+ tree_ml = '%s/%s_iqtree.contree' % (contree_and_ufboot_dir, og_id)
422
+ ufboot_file = '%s/%s_iqtree.ufboot' % (contree_and_ufboot_dir, og_id)
423
+ tree_ml_renamed = '%s/%s_iqtree_renamed.contree' % (renamed_contree_dir, og_id)
424
+ ufboot_file_renamed = '%s/%s_iqtree_renamed.ufboot' % (renamed_ufboot_dir, og_id)
425
+ count_sister_taxa_op_txt = '%s/%s_iqtree_count_sister_taxa.txt' % (count_sister_taxa_op_dir, og_id)
426
+
427
+ current_arg_list = [mag_rename_dict, mag_cluster_dict, sponge_archaeal_MAG_tax_dict, gtdb_ar_gnm_tax_dict, tree_ml, ufboot_file, target_label, tree_ml_renamed, ufboot_file_renamed, count_sister_taxa_op_txt, og_id, renamed_gnm_to_cluster_dir]
428
+ argument_lol.append(current_arg_list)
429
+
430
+ # run with multiprocessing
431
+ pool = mp.Pool(processes=num_threads)
432
+ pool.map(count_sister_taxa_worker, argument_lol)
433
+ pool.close()
434
+ pool.join()
435
+
436
+ # combine renamed_gnm_to_cluster files
437
+ os.system('cat %s/*.txt > %s' % (renamed_gnm_to_cluster_dir, renamed_gnm_to_cluster_tmp_txt))
438
+ os.system('cat %s | sort | uniq > %s' % (renamed_gnm_to_cluster_tmp_txt, renamed_gnm_to_cluster_txt))
439
+ BioSAK_iTOL_cmd = 'BioSAK iTOL -ColorRange -lg %s -lt Cluster -out %s' % (renamed_gnm_to_cluster_txt, renamed_gnm_to_cluster_iTOL_txt)
440
+ os.system(BioSAK_iTOL_cmd)
441
+
442
+
443
+ ####################################################### Test 2023-07-11 ########################################################
444
+
445
+ genome_metadata_ar53_r207_Mac = '/Users/songweizhi/DB/GTDB_r207/ar53_metadata_r207.tsv'
446
+ sponge_MAG_GTDB_archaea = '/Users/songweizhi/Documents/Research/Sponge_Hologenome/0_backup/0_metadata_mac/Sponge_MAGs_1677.ar53.summary.tsv'
447
+ hog_id_txt = '/Users/songweizhi/Documents/Research/Sponge_Hologenome/0_backup/5_Archaeal_tree_50_5_Markers_by_split_wd/HOG_id.txt'
448
+ contree_and_ufboot_dir = '/Users/songweizhi/Documents/Research/Sponge_Hologenome/0_backup/5_Archaeal_tree_50_5_Markers_by_split_wd/contree_and_ufboot_files'
449
+ archaeal_mags_renamed_for_prokka_txt = '/Users/songweizhi/Documents/Research/Sponge_Hologenome/0_backup/0_metadata_mac/Archaeal_mags_renamed_for_prokka.txt'
450
+ gnm_cluster_txt = '/Users/songweizhi/Documents/Research/Sponge_Hologenome/0_backup/5_Archaeal_tree_50_5_Markers_by_split_wd/genome_clusters_v1.txt'
451
+ target_label = 'cluster'
452
+ num_threads = 10
453
+ output_dir = '/Users/songweizhi/Desktop/count_sister_taxa_op_Test_2023_07_11'
454
+
455
+ run_count_sister_taxa(genome_metadata_ar53_r207_Mac, sponge_MAG_GTDB_archaea, hog_id_txt, contree_and_ufboot_dir, archaeal_mags_renamed_for_prokka_txt, gnm_cluster_txt, target_label, num_threads, output_dir)
TreeSAK/subsample_drep_gnms.py ADDED
@@ -0,0 +1,74 @@
1
+ import argparse
2
+
3
+
4
+ sample_drep_gnms_usage = '''
5
+ ============================= sample_drep_gnms example commands =============================
6
+
7
+ BioSAK sample_drep_gnms -c Cdb.csv -r rep_gnms.txt -k sponge_gnms.txt -o sampled_gnms.txt
8
+
9
+ =============================================================================================
10
+ '''
11
+
12
+
13
+ def cdb_2_gnm_cluster_file(Cdb_file):
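+ # dRep's Cdb.csv starts with a 'genome,secondary_cluster' header; e.g. (hypothetical) rows 'bin_1.fa,1_1' and 'bin_2.fa,1_1' yield {'1_1': ['bin_1.fa', 'bin_2.fa']}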
14
+ cluster_to_bin_dict = {}
15
+ for each_bin in open(Cdb_file):
16
+ if not each_bin.startswith('genome,secondary_cluster'):
17
+ each_bin_split = each_bin.strip().split(',')
18
+ bin_id = each_bin_split[0]
19
+ secondary_cluster = each_bin_split[1]
20
+ if secondary_cluster not in cluster_to_bin_dict:
21
+ cluster_to_bin_dict[secondary_cluster] = [bin_id]
22
+ else:
23
+ cluster_to_bin_dict[secondary_cluster].append(bin_id)
24
+
25
+ return cluster_to_bin_dict
26
+
27
+
28
+ def sample_drep_gnms(args):
29
+
30
+ drep_cdb = args['c']
31
+ drep_representative_gnm_txt = args['r']
32
+ gnm_to_keep_txt = args['k']
33
+ sampled_gnm_txt = args['o']
34
+
35
+ cluster_to_gnm_dict = cdb_2_gnm_cluster_file(drep_cdb)
36
+ drep_representative_gnm_list = [i.strip() for i in open(drep_representative_gnm_txt)]
37
+ to_keep_gnm_set = set(i.strip() for i in open(gnm_to_keep_txt))
38
+
39
+ subsampled_gnm_set = set()
40
+ for each_c in cluster_to_gnm_dict:
41
+ cluster_gnms = cluster_to_gnm_dict[each_c]
42
+ if len(cluster_gnms) == 1:
43
+ subsampled_gnm_set.add(cluster_gnms[0])
44
+ else:
45
+ rep_g = ''
46
+ g_in_to_keep_list = []
47
+ for each_g in cluster_gnms:
48
+ if each_g in drep_representative_gnm_list:
49
+ rep_g = each_g
50
+ if each_g in to_keep_gnm_set:
51
+ g_in_to_keep_list.append(each_g)
52
+
53
+ if len(g_in_to_keep_list) == 0:
54
+ subsampled_gnm_set.add(rep_g)
55
+ else:
56
+ for each_to_keep_g in g_in_to_keep_list:
57
+ subsampled_gnm_set.add(each_to_keep_g)
58
+
59
+ sampled_gnm_list_sorted = sorted([i for i in subsampled_gnm_set])
60
+ sampled_gnm_list_sorted_no_ext = ['.'.join(i.split('.')[:-1]) for i in sampled_gnm_list_sorted]
61
+
62
+ with open(sampled_gnm_txt, 'w') as sampled_gnm_txt_handle:
63
+ sampled_gnm_txt_handle.write('\n'.join(sampled_gnm_list_sorted_no_ext))
64
+
65
+
66
+ if __name__ == '__main__':
67
+
68
+ sample_drep_gnms_parser = argparse.ArgumentParser(usage=sample_drep_gnms_usage)
69
+ sample_drep_gnms_parser.add_argument('-c', required=True, help='Cdb.csv')
70
+ sample_drep_gnms_parser.add_argument('-r', required=True, help='IDs of dRep representative genomes, with file extension')
71
+ sample_drep_gnms_parser.add_argument('-k', required=True, help='IDs of genomes to keep, with file extension')
72
+ sample_drep_gnms_parser.add_argument('-o', required=True, help='output file for the IDs of subsampled genomes')
73
+ args = vars(sample_drep_gnms_parser.parse_args())
74
+ sample_drep_gnms(args)
TreeSAK/subset.py ADDED
@@ -0,0 +1,69 @@
1
+ import argparse
2
+ from ete3 import Tree
3
+
4
+
5
+ subset_usage = '''
6
+ =================== subset example commands ===================
7
+
8
+ TreeSAK subset -fmt 1 -i in.tree -k leaves.txt -o subset.tree
9
+ TreeSAK subset -fmt 1 -i in.tree -r leaves.txt -o subset.tree
10
+
11
+ ===============================================================
12
+ '''
13
+
14
+
15
+ def subset(args):
16
+
17
+ tree_file_in = args['i']
18
+ tree_file_out = args['o']
19
+ to_keep_txt = args['k']
20
+ to_remove_txt = args['r']
21
+ tree_fmt = args['fmt']
22
+
23
+
24
+ genomes_to_keep = []
25
+ if (to_keep_txt is None) and (to_remove_txt is None):
26
+ print('Please specify either -k or -r, program exited!')
27
+ exit()
28
+
29
+ elif (to_keep_txt is not None) and (to_remove_txt is not None):
30
+ print('Please do NOT specify -k and -r at the same time, program exited!')
31
+ exit()
32
+
33
+ elif (to_keep_txt is not None) and (to_remove_txt is None):
34
+ genomes_to_keep = [i.strip() for i in open(to_keep_txt)]
35
+
36
+ elif (to_keep_txt is None) and (to_remove_txt is not None):
37
+
38
+ genomes_to_remove = [i.strip() for i in open(to_remove_txt)]
39
+
40
+ leaf_list = []
41
+ for leaf in Tree(tree_file_in, quoted_node_names=True, format=tree_fmt):
42
+ leaf_name = leaf.name
43
+ leaf_list.append(leaf_name)
44
+
45
+ for each_leaf in leaf_list:
46
+ if each_leaf not in genomes_to_remove:
47
+ genomes_to_keep.append(each_leaf)
48
+
49
+ if len(leaf_list) == len(genomes_to_keep):
50
+ print('No leaf to remove, program exited!')
51
+ exit()
52
+
53
+ input_tree = Tree(tree_file_in, quoted_node_names=True, format=tree_fmt)
54
+ subset_tree = input_tree.copy()
55
+ subset_tree.prune(genomes_to_keep, preserve_branch_length=True)
56
+ subset_tree.write(outfile=tree_file_out)
57
+
58
+ print('Subset tree exported to: %s' % tree_file_out)
59
+
60
+ if __name__ == '__main__':
61
+
62
+ parser = argparse.ArgumentParser(usage=subset_usage)
63
+ parser.add_argument('-i', required=True, help='input tree file')
64
+ parser.add_argument('-o', required=True, help='output tree file')
65
+ parser.add_argument('-k', required=False, default=None, help='leaves to keep')
66
+ parser.add_argument('-r', required=False, default=None, help='leaves to remove')
67
+ parser.add_argument('-fmt', required=False, default=1, type=int, help='tree format, default: 1')
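+ # tree format numbers follow ete3's newick format table (format 1 is the flexible format with internal node names)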
68
+ args = vars(parser.parse_args())
69
+ subset(args)