treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/ALE4.py ADDED
@@ -0,0 +1,636 @@
1
+ import os
2
+ import glob
3
+ import math
4
+ import random
5
+ import argparse
6
+ import seaborn as sns
7
+ from ete3 import Tree
8
+ from itolapi import Itol
9
+ from PyPDF3.pdf import PageObject
10
+ from PyPDF3 import PdfFileWriter, PdfFileReader
11
+
12
+
13
# Example command lines for the ALE4 step, kept as a single multi-line string.
# NOTE(review): presumably printed as CLI help by the TreeSAK entry script - confirm against the caller.
ALE4_usage = '''
========================= ALE4 example commands =========================

TreeSAK ALE4 -1 ALE1_op_dir -2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -f -api your_own_itol_api -fc 0.3 -o ALE4_op_dir_0.3
TreeSAK ALE4 -1 ALE1_op_dir -2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -f -api your_own_itol_api -fc 0.5 -o ALE4_op_dir_0.5
TreeSAK ALE4 -1 ALE1_op_dir -2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -f -api your_own_itol_api -fc 0.8 -o ALE4_op_dir_0.8

# To do:
# add protein family to the top of the pdf file

=========================================================================
'''
25
+
26
+
27
def sep_path_basename_ext(file_in):
    """Split a file path into its directory, base name and extension.

    Args:
        file_in: path to split.

    Returns:
        A ``(directory, base_name, extension)`` tuple. The directory is ``'.'``
        when ``file_in`` has no directory component, and the extension keeps
        its leading dot (it is ``''`` when the file name has none).
    """
    f_path, file_name = os.path.split(file_in)
    if f_path == '':
        # a bare file name lives in the current directory
        f_path = '.'
    f_base, f_ext = os.path.splitext(file_name)
    return f_path, f_base, f_ext
33
+
34
+
35
def subset_tree(tree_file_in, leaves_to_keep_list, tree_file_out):
    """Prune a tree down to the given leaves.

    Args:
        tree_file_in: tree input accepted by ``ete3.Tree`` (default format).
        leaves_to_keep_list: leaves to retain; passed straight to ``Tree.prune``.
        tree_file_out: output file, or ``None`` to get the result back instead.

    Returns:
        The value of ``Tree.write()`` for the pruned tree when ``tree_file_out``
        is ``None``; otherwise ``None`` (the pruned tree is written to the file).
    """
    pruned_tree = Tree(tree_file_in).copy()
    pruned_tree.prune(leaves_to_keep_list, preserve_branch_length=True)

    if tree_file_out is not None:
        pruned_tree.write(outfile=tree_file_out)
        return None

    return pruned_tree.write()
44
+
45
+
46
def merge_pdf(pdf_1, pdf_2, margin_size, op_pdf):
    """Place the first pages of two PDFs side by side on one new page.

    Page 1 goes on the left, aligned to the top margin; page 2 goes to its
    right, aligned to the bottom margin. ``margin_size`` is used as the outer
    margin and as the gap between the two pages.

    Args:
        pdf_1: path to the PDF whose first page is placed on the left.
        pdf_2: path to the PDF whose first page is placed on the right.
        margin_size: margin/gap size, in the same units as the page media boxes.
        op_pdf: path of the single-page PDF to write.
    """
    # Both inputs stay open until the merged page has been written.
    # NOTE(review): presumably the reader resolves page content lazily from the
    # stream, which is why the handles are not closed earlier - confirm.
    with open(pdf_1, "rb") as pdf_1_handle, open(pdf_2, "rb") as pdf_2_handle:
        page1 = PdfFileReader(pdf_1_handle, strict=False).getPage(0)
        page2 = PdfFileReader(pdf_2_handle, strict=False).getPage(0)

        page1_width = page1.mediaBox.upperRight[0]
        page1_height = page1.mediaBox.upperRight[1]
        page2_width = page2.mediaBox.upperRight[0]
        page2_height = page2.mediaBox.upperRight[1]

        # three margins horizontally (left, middle, right), two vertically
        total_width = page1_width + page2_width + margin_size*3
        total_height = max([page1_height, page2_height]) + margin_size*2

        new_page = PageObject.createBlankPage(None, total_width, total_height)
        new_page.mergeTranslatedPage(page1, margin_size, (total_height - margin_size - page1_height))
        new_page.mergeTranslatedPage(page2, (page1_width + margin_size*2), margin_size)

        output = PdfFileWriter()
        output.addPage(new_page)
        # context manager guarantees the merged PDF is flushed and closed
        with open(op_pdf, "wb") as op_pdf_handle:
            output.write(op_pdf_handle)
61
+
62
+
63
def uts_to_itol_connections(genome_tree_file, ale_formatted_gnm_tree, interal_node_prefix, uts_file, freq_cutoff, ignore_leaf_hgt, ignore_vertical_hgt, donor_node_min_leaf_num, recipient_node_min_leaf_num, itol_connection_txt, dr_separator):
    """Turn the transfers listed in a uTs file into iTOL connection datasets.

    Each non-comment line of ``uts_file`` is read as tab-separated
    ``donor``, ``recipient``, ``frequency``. Transfers passing the filters are
    written to ``itol_connection_txt`` and, additionally, one per file to
    ``<dir>/<basename>_<donor><dr_separator><recipient>.txt``.

    Args:
        genome_tree_file: genome tree (read with ete3 format 3) whose leaf names
            decide whether a donor/recipient is a leaf or an internal node.
        ale_formatted_gnm_tree: tree file whose internal node names match the
            node ids used in ``uts_file``.
        interal_node_prefix: prefix added to ids that are not leaves.
        uts_file: transfer table to read.
        freq_cutoff: minimum frequency for a transfer to be kept.
        ignore_leaf_hgt: unless this is exactly ``False``, transfers involving a
            leaf are dropped and the two minimum-leaf-number thresholds apply.
        ignore_vertical_hgt: unless this is exactly ``False``, transfers between
            a node and one of its ancestors/descendants are dropped.
        donor_node_min_leaf_num: minimum number of leaves under the donor node.
        recipient_node_min_leaf_num: minimum number of leaves under the recipient node.
        itol_connection_txt: combined connection dataset to write.
        dr_separator: string joining donor and recipient in the returned keys.

    Returns:
        ``(internal_node_to_leaf_dict, paired_donor_to_recipient_leaf_dict, hgt_freq_dict)``;
        the last two are keyed by ``<donor><dr_separator><recipient>`` (prefixed ids).
    """
    connection_header = ('DATASET_CONNECTION\nSEPARATOR TAB\nDATASET_LABEL\tdemo_connections\n'
                         'COLOR\t#ff0ff0\nDRAW_ARROWS\t1\nARROW_SIZE\t60\nLOOP_SIZE\t100\n'
                         'MAXIMUM_LINE_WIDTH\t10\nCURVE_ANGLE\t45\nCENTER_CURVES\t1\nALIGN_TO_LABELS\t0\nDATA\n')

    # get internal_node_to_leaf_dict
    internal_node_to_leaf_dict = get_node_to_leaf_dict(ale_formatted_gnm_tree)

    # a set gives constant-time membership tests for every uTs line
    leaf_id_set = set()
    if os.path.isfile(genome_tree_file):
        leaf_id_set = {i.name for i in Tree(genome_tree_file, format=3).get_leaves()}
    else:
        print('%s not found!' % genome_tree_file)

    paired_donor_to_recipient_leaf_dict = dict()
    hgt_freq_dict = dict()
    connection_line_to_write_dict = dict()
    with open(itol_connection_txt, 'w') as itol_connection_txt_handle:
        itol_connection_txt_handle.write(connection_header)
        with open(uts_file) as uts_file_handle:
            for each_line in uts_file_handle:

                # skip comment lines and blank lines (the latter would otherwise raise IndexError)
                if each_line.startswith('#') or not each_line.strip():
                    continue

                each_line_split = each_line.strip().split('\t')
                donor = each_line_split[0]
                recipient = each_line_split[1]
                freq = float(each_line_split[2])

                if not freq >= freq_cutoff:
                    continue

                # add prefix to internal donor/recipient nodes, leaves keep their names
                donor_with_prefix = donor if donor in leaf_id_set else interal_node_prefix + donor
                recipient_with_prefix = recipient if recipient in leaf_id_set else interal_node_prefix + recipient
                key_str = '%s%s%s' % (donor_with_prefix, dr_separator, recipient_with_prefix)

                # filter 1: leaf involvement and clade size
                if ignore_leaf_hgt is False:
                    passed_leaf_filter = True
                else:
                    passed_leaf_filter = False
                    if (donor not in leaf_id_set) and (recipient not in leaf_id_set):
                        donor_node_leaf_num = len(internal_node_to_leaf_dict.get(donor, []))
                        recipient_node_leaf_num = len(internal_node_to_leaf_dict.get(recipient, []))
                        passed_leaf_filter = (donor_node_leaf_num >= donor_node_min_leaf_num) and (recipient_node_leaf_num >= recipient_node_min_leaf_num)
                if not passed_leaf_filter:
                    continue

                # filter 2: transfers along a single lineage
                if ignore_vertical_hgt is not False:
                    donor_is_ancestor_of_recipient = check_a_is_ancestor_of_b(ale_formatted_gnm_tree, donor, recipient)
                    donor_is_child_of_recipient = check_a_is_child_of_b(ale_formatted_gnm_tree, donor, recipient)
                    if donor_is_ancestor_of_recipient or donor_is_child_of_recipient:
                        continue

                    # NOTE(review): the leaf lists are recorded only when both filters are
                    # active. The original indentation was lost; this placement is inferred
                    # from the caller, which looks every recorded pair up in hgt_freq_dict
                    # and in the per-connection files - verify against the upstream source.
                    if ignore_leaf_hgt is not False:
                        paired_donor_to_recipient_leaf_dict[key_str] = [internal_node_to_leaf_dict.get(donor, []), internal_node_to_leaf_dict.get(recipient, [])]

                line_to_write = '%s\t%s\t%s\t%s\t%s\t%s->%s(%s)\n' % (donor_with_prefix, recipient_with_prefix, freq, '#EB984E', 'normal', donor_with_prefix, recipient_with_prefix, freq)
                itol_connection_txt_handle.write(line_to_write)
                connection_line_to_write_dict[key_str] = line_to_write
                hgt_freq_dict[key_str] = freq

    combined_connection_file_path, combined_connection_file_basename, _ = sep_path_basename_ext(itol_connection_txt)

    # write out connections separately
    for each_connection, connection_line in connection_line_to_write_dict.items():
        pwd_connection_txt = '%s/%s_%s.txt' % (combined_connection_file_path, combined_connection_file_basename, each_connection)
        with open(pwd_connection_txt, 'w') as pwd_connection_txt_handle:
            pwd_connection_txt_handle.write(connection_header)
            # the stored line already ends with a newline; the extra one is kept so output is unchanged
            pwd_connection_txt_handle.write(connection_line + '\n')

    return internal_node_to_leaf_dict, paired_donor_to_recipient_leaf_dict, hgt_freq_dict
150
+
151
+
152
def itol_tree(tree_file, annotation_file_list, project_name, APIkey, display_mode, op_plot):
    """Upload a tree with its annotation files to iTOL and export a plot.

    Args:
        tree_file: tree file to upload; also used as the iTOL tree name.
        annotation_file_list: annotation dataset files uploaded after the tree.
            All of them are made visible in the export.
        project_name: iTOL project receiving the upload.
        APIkey: iTOL API key.
        display_mode: value for iTOL's ``display_mode`` export parameter.
        op_plot: output file; the text after its last dot is the export format.

    Raises:
        AssertionError: if the upload reports ``False``.
    """
    # https://github.com/albertyw/itolapi
    # http://itol.embl.de/help.cgi#batch

    op_plot_ext = op_plot.split('.')[-1]

    # upload tree to iTOL
    itol_uploader = Itol()
    itol_uploader.params['projectName'] = project_name  # better to create a project with a unique name.
    itol_uploader.params['APIkey'] = APIkey             # since we use the same account, we can use the same APIkey
    itol_uploader.params['treeName'] = tree_file
    itol_uploader.add_file(tree_file)

    # upload annotation files to iTOL
    for annotation_file in annotation_file_list:
        itol_uploader.add_file(annotation_file)

    status = itol_uploader.upload()
    # Raised explicitly (same exception type as the former assert) so the check
    # is not stripped when Python runs with -O.
    if status == False:  # noqa: E712 - keep the original "== False" semantics
        raise AssertionError('iTOL upload failed for %s' % tree_file)

    # show every uploaded dataset: '0', '0,1', '0,1,2', ...
    datasets_visible_str = ','.join([str(i) for i in range(len(annotation_file_list))])

    # the following parameters are optional, refer to https://itol.embl.de/help.cgi#batchExp
    export_params = [('datasets_visible', datasets_visible_str),
                     ('display_mode', display_mode),
                     ('range_mode', '2'),
                     ('dashed_lines', '1'),
                     ('line_width', '3'),
                     ('vertical_shift_factor', '0.9'),
                     ('horizontal_scale_factor', '0.9'),
                     ('format', op_plot_ext)]

    itol_exporter = itol_uploader.get_itol_export()
    for param_name, param_value in export_params:
        itol_exporter.set_export_param_value(param_name, param_value)
    itol_exporter.export(op_plot)
194
+
195
+
196
def get_node_to_leaf_dict(tree_file):
    """Map every internal node name to the names of the leaves below it.

    The tree is read with ete3 format 1. Nodes sharing a name overwrite each
    other; the node visited last by ``traverse()`` wins.
    """
    return {
        each_node.name: each_node.get_leaf_names()
        for each_node in Tree(tree_file, format=1).traverse()
        if not each_node.is_leaf()
    }
204
+
205
+
206
def combine_trees(t1_with_len, t2_with_name, op_tree_with_both):
    """Write a copy of tree 1 whose nodes carry the names found in tree 2.

    Assumes t1 has branch lengths and t2 has internal node names. Nodes of the
    two trees are paired by their sorted set of leaf names, so a node of t1
    without a counterpart in t2 raises ``KeyError``.

    Args:
        t1_with_len: tree read with ete3 format 0.
        t2_with_name: tree read with ete3 format 1.
        op_tree_with_both: output file, written with ete3 format 3.
    """
    def leaf_signature(tree_node):
        # identifies a node by the leaves it contains
        return '__'.join(sorted(tree_node.get_leaf_names()))

    len_tree = Tree(t1_with_len, format=0)
    name_tree = Tree(t2_with_name, format=1)

    len_tree_nodes = {leaf_signature(n): n for n in len_tree.traverse()}
    name_tree_nodes = {leaf_signature(n): n for n in name_tree.traverse()}

    counterpart_of = {
        len_node: name_tree_nodes[signature]
        for signature, len_node in len_tree_nodes.items()
    }

    # the copy is traversed in lockstep with the tree it was copied from
    combined = len_tree.copy()
    for combined_node, len_node in zip(combined.traverse(), len_tree.traverse()):
        combined_node.name = counterpart_of[len_node].name

    combined.write(outfile=op_tree_with_both, format=3)
232
+
233
+
234
def prefix_internal_nodes(tree_in, prefix_str, tree_out):
    """Prepend ``prefix_str`` to the name of every non-leaf node.

    The tree is read from ``tree_in`` and written to ``tree_out``, both with
    ete3 format 3. Leaf names are left untouched.
    """
    renamed_tree = Tree(tree_in, format=3).copy()
    for each_node in renamed_tree.traverse():
        if each_node.is_leaf():
            continue
        each_node.name = '%s%s' % (prefix_str, each_node.name)
    renamed_tree.write(outfile=tree_out, format=3)
242
+
243
+
244
def check_a_is_ancestor_of_b(tree_file, node_a, node_b):
    """Return True if ``node_a`` names an ancestor of a node named ``node_b``.

    The tree is read with ete3 format 1. Returns False when no node is named
    ``node_b``.
    """
    return any(
        node_a in [ancestor.name for ancestor in candidate.get_ancestors()]
        for candidate in Tree(tree_file, format=1).traverse()
        if candidate.name == node_b
    )
254
+
255
+
256
def check_a_is_child_of_b(tree_file, node_a, node_b):
    """Return True if ``node_a`` names a descendant of a node named ``node_b``.

    The tree is read with ete3 format 1. All descendants are considered, not
    only direct children. Returns False when no node is named ``node_b``.
    """
    return any(
        node_a in [descendant.name for descendant in candidate.get_descendants()]
        for candidate in Tree(tree_file, format=1).traverse()
        if candidate.name == node_b
    )
266
+
267
+
268
def root_at_midpoint(tree_in, tree_out_rooted):
    """Re-root a tree on its midpoint outgroup and write it to ``tree_out_rooted``.

    Both reading and writing use ete3's default format.
    """
    tree_to_root = Tree(tree_in)
    tree_to_root.set_outgroup(tree_to_root.get_midpoint_outgroup())
    tree_to_root.write(outfile=tree_out_rooted)
273
+
274
+
275
def get_color_list(color_num):
    """Return ``color_num`` hex colours, randomly chosen but kept in palette order.

    Up to 8 colours come from a fixed 8-colour palette and up to 16 from a
    fixed 16-colour palette. Larger requests draw on eight seaborn colour
    ramps, each contributing ``ceil(color_num / 8)`` shades (the two lightest
    shades of every ramp are dropped).

    Args:
        color_num: number of colours wanted.

    Returns:
        A list of exactly ``color_num`` hex strings.

    Raises:
        ValueError: from ``random.sample`` when ``color_num`` is negative.
    """
    if color_num <= 8:
        color_list_combined = ['#3787c0', '#39399f', '#ffb939', '#399f39', '#9f399f', '#fb694a', '#9f9f39', '#959595']

    elif color_num <= 16:
        color_list_combined = ['#2b7bba', '#89bedc', '#2e2e99', '#8a8acc', '#ffa500', '#ffc55c', '#2e992e', '#8acc8a',
                               '#992e99', '#cc8acc', '#d52221', '#fc8161', '#99992e', '#cccc8a', '#5c5c5c', '#adadad']
    else:
        color_num_each = math.ceil(color_num / 8) + 2
        color_ramps = [sns.color_palette('Blues', n_colors=color_num_each).as_hex(),
                       sns.light_palette('navy', n_colors=color_num_each).as_hex(),
                       sns.light_palette('orange', n_colors=color_num_each).as_hex(),
                       sns.light_palette('green', n_colors=color_num_each).as_hex(),
                       sns.light_palette('purple', n_colors=color_num_each).as_hex(),
                       sns.color_palette('Reds', n_colors=color_num_each).as_hex(),
                       sns.light_palette('olive', n_colors=color_num_each).as_hex(),
                       sns.color_palette('Greys', n_colors=color_num_each).as_hex()]

        color_list_combined = []
        for color_ramp in color_ramps:
            # drop the two lightest shades, then go from dark to light
            color_list_combined.extend(color_ramp[2:][::-1])

    # Sample positions rather than values: filtering the palette by value would
    # return too many colours whenever two ramps share a hex code.
    picked_positions = sorted(random.sample(range(len(color_list_combined)), color_num))
    return [color_list_combined[position] for position in picked_positions]
307
+
308
+
309
def iTOL(Leaf_to_Group_dict, Group_to_Color_dict, FileOut):
    """Write an iTOL DATASET_COLORSTRIP file colouring leaves by group.

    Args:
        Leaf_to_Group_dict: leaf name -> group name.
        Group_to_Color_dict: group name -> colour. When empty, colours for all
            groups come from ``get_color_list`` and the caller's dict is left
            untouched. Otherwise colours for groups missing from it are
            generated and ADDED TO THE CALLER'S DICT in place.
        FileOut: path of the dataset file to write.
    """
    # sorted so that the group -> colour pairing does not depend on set/hash order
    group_list = sorted(set(Leaf_to_Group_dict.values()), key=str)

    if len(Group_to_Color_dict) == 0:
        Group_to_Color_dict = dict(zip(group_list, get_color_list(len(group_list))))
    else:
        group_without_color_list = [each_group for each_group in group_list if each_group not in Group_to_Color_dict]
        if len(group_without_color_list) > 0:
            color_list_unprovided = get_color_list(len(group_without_color_list))
            Group_to_Color_dict.update(zip(group_without_color_list, color_list_unprovided))

    with open(FileOut, 'w') as FileOut_handle:
        FileOut_handle.write('DATASET_COLORSTRIP\n')
        FileOut_handle.write('SEPARATOR TAB\n')
        FileOut_handle.write('DATASET_LABEL\tTaxonomy\n')
        FileOut_handle.write('\n# customize strip attributes here\n')
        FileOut_handle.write('STRIP_WIDTH\t100\n')
        FileOut_handle.write('MARGIN\t20\n')
        FileOut_handle.write('\n# provide data here\nDATA\n')
        for leaf, leaf_group in Leaf_to_Group_dict.items():
            FileOut_handle.write('%s\t%s\t%s\n' % (leaf, Group_to_Color_dict[leaf_group], leaf_group))
341
+
342
+
343
def ale_splitter(rec_file):
    """Split one ``uml_rec`` file into four sibling files.

    The outputs are written next to ``rec_file``; their names are the input
    file name with ``uml_rec`` replaced by ``stree``, ``info``, ``recs`` and
    ``rec_table`` respectively.

    The input is addressed purely by line position (0-based):
    line 2 species tree (last tab-separated field), line 6 log-likelihood
    (last whitespace-separated token), line 8 three rates (tab-separated,
    first field skipped), line 9 number of reconciled trees (first token),
    lines 11.. the reconciled trees, then, counted from the end of the tree
    block, line +1 four event counts (tab-separated, first field skipped) and
    lines +3.. the per-branch table.
    NOTE(review): presumably this is the layout produced by ALEml_undated - confirm.

    Args:
        rec_file: path to the ``uml_rec`` file.

    Raises:
        IndexError: if the file has fewer lines/fields than the layout above.
        ValueError: if the reconciled-tree count is not an integer.
    """
    with open(rec_file) as f:
        lines = f.readlines()

    stree = lines[2].strip()
    ll = lines[6].strip().split()[-1]
    rates = lines[8].strip().split("\t")[1:]
    n_reconciled_trees = int(lines[9].strip().split()[0])
    reconciled_trees = lines[11:n_reconciled_trees + 11]
    # not stripped: the last field keeps its newline, as in the files written so far
    n_of_events = lines[11 + n_reconciled_trees + 1].split("\t")[1:]
    table = lines[11 + n_reconciled_trees + 3:]

    with open(_ale_sibling_path(rec_file, "stree"), "w") as f:
        f.write(stree.split("\t")[-1])

    with open(_ale_sibling_path(rec_file, "info"), "w") as f:
        f.write("LL:" + "\t" + ll + "\n")
        f.write("Dp:" + "\t" + rates[0] + "\n")
        f.write("Tp:" + "\t" + rates[1] + "\n")
        f.write("Lp:" + "\t" + rates[2] + "\n")
        f.write("De:" + "\t" + n_of_events[0] + "\n")
        f.write("Te:" + "\t" + n_of_events[1] + "\n")
        f.write("Le:" + "\t" + n_of_events[2] + "\n")
        f.write("Se:" + "\t" + n_of_events[3] + "\n")

    with open(_ale_sibling_path(rec_file, "recs"), "w") as f:
        f.writelines(reconciled_trees)

    with open(_ale_sibling_path(rec_file, "rec_table"), "w") as f:
        f.writelines(table)


def _ale_sibling_path(rec_file, tag):
    """Return rec_file's path with 'uml_rec' replaced by tag in the file name only."""
    rec_dir, rec_name = os.path.split(rec_file)
    # the directory part is left alone so a folder containing 'uml_rec' is not renamed
    return os.path.join(rec_dir, rec_name.replace("uml_rec", tag))
381
+
382
+
383
def parse_ale_op_worker(arg_list):
    """Produce the iTOL annotation files and HGT plots for one orthologous group.

    For the given OG this splits its ``uml_rec`` file, extracts the ALE
    formatted genome tree, converts the uTs transfers into iTOL connection
    files, and then, for every donor/recipient pair returned by
    ``uts_to_itol_connections``, plots the genome tree and the gene tree
    through iTOL and merges the two PDFs into ``op_dir``.

    Args:
        arg_list: positional bundle of 23 values, in the order of the
            unpacking below. A single sequence is used so that the function
            takes one argument.
            NOTE(review): presumably to suit a worker pool's ``map`` - confirm against the caller.

    Returns:
        None. When the uTs file of the OG is missing a message is printed and
        the OG is skipped.
    """
    (qualified_og, gene_tree_dir, ale_wd, op_dir, interal_node_prefix, gnm_pco_dict,
     d_color, r_color, project_name, API_key, display_mode, hgt_freq_cutoff,
     ignore_leaf_hgt, ignore_vertical_hgt, donor_node_min_leaf_num, recipient_node_min_leaf_num,
     dr_separator, root_gene_tree_at_midpoint, p_color_dict, gnm_tree_no_underscore,
     pwd_itol_dir, pwd_itol_colorstrip_txt_gnm, pwd_itol_label_txt_gnm) = arg_list[:23]

    current_og_dir                               = '%s/%s'                                            % (pwd_itol_dir, qualified_og)
    pwd_genome_tree_file                         = '%s/%s'                                            % (ale_wd, gnm_tree_no_underscore)
    pwd_gene_tree_treefile                       = '%s/%s.treefile'                                   % (gene_tree_dir, qualified_og)
    pwd_uts_file                                 = '%s/%s_%s.ufboot.ale.uTs'                          % (ale_wd, gnm_tree_no_underscore, qualified_og)
    pwd_uml_rec_file                             = '%s/%s_%s.ufboot.ale.uml_rec'                      % (ale_wd, gnm_tree_no_underscore, qualified_og)
    pwd_gene_tree_treefile_midpoint_rooted       = '%s/%s_midpoint_rooted.treefile'                   % (current_og_dir, qualified_og)
    pwd_ale_formatted_gnm_tree                   = '%s/%s.ufboot_genome_tree.tree'                    % (current_og_dir, qualified_og)
    pwd_ale_formatted_gnm_tree_with_len          = '%s/%s.ufboot_genome_tree_with_len.tree'           % (current_og_dir, qualified_og)
    pwd_ale_formatted_gnm_tree_with_len_prefixed = '%s/%s.ufboot_genome_tree_with_len_prefixed.tree'  % (current_og_dir, qualified_og)
    pwd_itol_connection_txt_all                  = '%s/%s_iTOL_connection.txt'                        % (current_og_dir, qualified_og)
    pwd_gene_tree_itol_label_txt                 = '%s/%s_iTOL_gene_pco.txt'                          % (current_og_dir, qualified_og)
    pwd_gene_tree_itol_colorstrip_txt            = '%s/%s_iTOL_colorstrip_gene.txt'                   % (current_og_dir, qualified_og)

    os.mkdir(current_og_dir)

    # run ale_splitter
    ale_splitter(pwd_uml_rec_file)

    # Without the uTs file the ALE formatted genome tree is never written, so
    # the tree-combining step below could only fail; skip this OG instead.
    if os.path.isfile(pwd_uts_file) is not True:
        print('%s: uTs file not found, you need to run ALE first!' % qualified_og)
        return

    # write out ALE formatted genome tree (second tab-separated field of line 3 of the uml_rec file)
    with open(pwd_uml_rec_file) as uml_rec_handle:
        renamed_genome_tree_str = uml_rec_handle.readlines()[2].strip().split('\t')[1]
    with open(pwd_ale_formatted_gnm_tree, 'w') as ale_renamed_species_tree_handle:
        ale_renamed_species_tree_handle.write(renamed_genome_tree_str + '\n')

    _, paired_donor_to_recipient_leaf_dict, hgt_freq_dict = uts_to_itol_connections(pwd_genome_tree_file, pwd_ale_formatted_gnm_tree, interal_node_prefix, pwd_uts_file, hgt_freq_cutoff, ignore_leaf_hgt, ignore_vertical_hgt, donor_node_min_leaf_num, recipient_node_min_leaf_num, pwd_itol_connection_txt_all, dr_separator)

    # combine_trees
    combine_trees(pwd_genome_tree_file, pwd_ale_formatted_gnm_tree, pwd_ale_formatted_gnm_tree_with_len)

    # prefix_internal_nodes of combined tree
    prefix_internal_nodes(pwd_ale_formatted_gnm_tree_with_len, interal_node_prefix, pwd_ale_formatted_gnm_tree_with_len_prefixed)

    # the gene tree is parsed once and its leaf names reused for every plot
    gene_leaf_name_list = Tree(pwd_gene_tree_treefile).get_leaf_names()

    # write out iTOL label file for the gene tree, also collect gene -> first taxon field for the colorstrip
    gene_to_p_dict = dict()
    with open(pwd_gene_tree_itol_label_txt, 'w') as pwd_gene_tree_itol_label_txt_handle:
        pwd_gene_tree_itol_label_txt_handle.write('LABELS\nSEPARATOR TAB\n\nDATA\n')
        for each_gene in gene_leaf_name_list:
            # gene ids are '<genome>_<suffix>'; everything before the last underscore is the genome id
            gene_gnm = '_'.join(each_gene.split('_')[:-1])
            genome_with_taxon = gnm_pco_dict[gene_gnm]
            gene_to_p_dict[each_gene] = genome_with_taxon.split('__')[0]
            pwd_gene_tree_itol_label_txt_handle.write('%s\t%s_%s\n' % (each_gene, genome_with_taxon, each_gene.split('_')[-1]))

    iTOL(gene_to_p_dict, p_color_dict, pwd_gene_tree_itol_colorstrip_txt)

    # root gene tree at midpoint
    gene_tree_to_plot = pwd_gene_tree_treefile
    if root_gene_tree_at_midpoint is True:
        root_at_midpoint(pwd_gene_tree_treefile, pwd_gene_tree_treefile_midpoint_rooted)
        gene_tree_to_plot = pwd_gene_tree_treefile_midpoint_rooted

    style_header = 'DATASET_STYLE\nSEPARATOR TAB\nDATASET_LABEL\texample_style\nCOLOR\t#ffff00\n\nDATA\n'

    # plot separately
    for n, each_d2r in enumerate(paired_donor_to_recipient_leaf_dict, start=1):
        each_d2r_freq = hgt_freq_dict[each_d2r]
        each_d2r_d_set = set(paired_donor_to_recipient_leaf_dict[each_d2r][0])
        each_d2r_r_set = set(paired_donor_to_recipient_leaf_dict[each_d2r][1])
        donor_node, recipient_node = each_d2r.split(dr_separator)[0], each_d2r.split(dr_separator)[1]

        pwd_gnm_tree_label_color_txt                     = '%s/%s_iTOL_label_color_genome_%s.txt' % (current_og_dir, qualified_og, each_d2r)
        pwd_gene_tree_label_color_txt                    = '%s/%s_iTOL_label_color_gene_%s.txt'   % (current_og_dir, qualified_og, each_d2r)
        pwd_itol_connection_txt                          = '%s/%s_iTOL_connection_%s.txt'         % (current_og_dir, qualified_og, each_d2r)
        pwd_ale_formatted_gnm_tree_with_len_prefixed_pdf = '%s/%s_genome_tree_with_HGT_%s.pdf'    % (current_og_dir, qualified_og, each_d2r)
        pwd_gene_tree_treefile_subset_pdf                = '%s/%s_subset_%s.pdf'                  % (current_og_dir, qualified_og, each_d2r)
        pwd_combined_image_with_ale_hgts                 = '%s/%s_HGT_%s_%s_%s.pdf'               % (op_dir, qualified_og, n, each_d2r, each_d2r_freq)

        # genome tree: colour the donor and recipient clades
        with open(pwd_gnm_tree_label_color_txt, 'w') as pwd_gnm_tree_label_color_txt_handle:
            pwd_gnm_tree_label_color_txt_handle.write(style_header)
            pwd_gnm_tree_label_color_txt_handle.write('%s\tlabel\tclade\t%s\t1\tnormal\n' % (donor_node, d_color))
            pwd_gnm_tree_label_color_txt_handle.write('%s\tlabel\tclade\t%s\t1\tnormal\n' % (recipient_node, r_color))

        # gene tree: colour genes whose genome sits under the donor or the recipient node
        with open(pwd_gene_tree_label_color_txt, 'w') as pwd_gene_tree_label_color_txt_handle:
            pwd_gene_tree_label_color_txt_handle.write(style_header)
            for each_gene in gene_leaf_name_list:
                gene_name_for_ale = '_'.join(each_gene.strip().split('_')[:-1])
                # genome ids in the ALE tree carry no underscore after GCA/GCF
                gene_name_for_ale = gene_name_for_ale.replace('GCA_', 'GCA').replace('GCF_', 'GCF')
                if gene_name_for_ale in each_d2r_d_set:
                    pwd_gene_tree_label_color_txt_handle.write('%s\tlabel\tnode\t%s\t1\tnormal\n' % (each_gene, d_color))
                elif gene_name_for_ale in each_d2r_r_set:
                    pwd_gene_tree_label_color_txt_handle.write('%s\tlabel\tnode\t%s\t1\tnormal\n' % (each_gene, r_color))

        itol_tree(pwd_ale_formatted_gnm_tree_with_len_prefixed, [pwd_gnm_tree_label_color_txt, pwd_itol_label_txt_gnm, pwd_itol_connection_txt, pwd_itol_colorstrip_txt_gnm], project_name, API_key, display_mode, pwd_ale_formatted_gnm_tree_with_len_prefixed_pdf)
        itol_tree(gene_tree_to_plot, [pwd_gene_tree_itol_label_txt, pwd_gene_tree_label_color_txt, pwd_gene_tree_itol_colorstrip_txt], project_name, API_key, display_mode, pwd_gene_tree_treefile_subset_pdf)
        merge_pdf(pwd_ale_formatted_gnm_tree_with_len_prefixed_pdf, pwd_gene_tree_treefile_subset_pdf, 66, pwd_combined_image_with_ale_hgts)
503
+
504
+
505
def ALE4(args):
    """Parse ALE reconciliation output for every orthogroup and plot HGTs with iTOL.

    Orthogroups (OGs) are discovered by matching ``*.ufboot`` files in the ALE1
    output directory against ``*.ufboot.ale.uml_rec`` files in the ALE2 output
    directory; only OGs present in both are processed. Before processing, the
    iTOL label and colour-strip annotation files for the genome tree are
    written into ``<op_dir>/annotation_files``.

    Args:
        args: dict of settings, keyed by command-line flag name:
            '1'     ALE1 output directory (searched for ``*.ufboot``).
            '2'     ALE2 output directory (searched for ``*.ufboot.ale.uml_rec``).
            'c'     tab-separated genome taxonomy file; column 0 is the genome
                    id, column 1 a ';'-separated taxonomy string whose fields
                    1, 2 and 3 are read as phylum, class and order.
            'color' tab-separated phylum colour file; column 0 is the colour,
                    column 1 the phylum name.
            'o'     output directory.
            'f'     if truthy, an existing output directory is deleted first.
            'api'   iTOL API key (forwarded to ``parse_ale_op_worker``).
            'fc'    HGT frequency cutoff (forwarded).
            'mld'   minimum leaf number of a donor node (forwarded).
            'mlr'   minimum leaf number of a recipient node (forwarded).
            'itol'  iTOL project name (forwarded).

    Raises:
        SystemExit: if the output directory exists and 'f' is not set.
        IndexError: if a non-blank line of either input file has fewer
            columns / taxonomy ranks than described above.
        OSError: if the output directory cannot be removed or created, or an
            input file cannot be read.
    """
    # Local import so this block does not depend on a module-level import.
    import shutil

    ale1_op_dir = args['1']
    ale2_op_dir = args['2']
    genome_taxon_txt = args['c']
    ar_phylum_color_code_txt = args['color']
    op_dir = args['o']
    force_create_op_dir = args['f']
    API_key = args['api']
    hgt_freq_cutoff = args['fc']
    donor_node_min_leaf_num = args['mld']
    recipient_node_min_leaf_num = args['mlr']
    project_name = args['itol']

    # Fixed settings forwarded to parse_ale_op_worker.
    ignore_vertical_hgt = True          # filter ALE predicted HGTs
    ignore_leaf_hgt = True              # filter ALE predicted HGTs
    internal_node_prefix = 'IN'         # plot tree with HGT
    display_mode = '1'                  # plot tree with HGT, 1=rectangular, 2=circular, 3=unrooted
    d_color = '#FF0000'                 # plot tree with HGT
    r_color = '#0000FF'                 # plot tree with HGT
    dr_separator = '_to_'               # plot tree with HGT
    root_gene_tree_at_midpoint = True   # plot tree with HGT

    gnm_tree_no_underscore = 'genome_tree.newick'
    pwd_itol_dir = '%s/annotation_files' % op_dir
    pwd_gnm_tree_itol_colorstrip_txt = '%s/iTOL_colorstrip_genome.txt' % pwd_itol_dir
    pwd_gnm_tree_itol_label_txt = '%s/iTOL_genome_pco.txt' % pwd_itol_dir

    # Collect OG ids from the ALE1 output (*.ufboot).
    ufboot_base_list = []
    for each_ufboot in glob.glob('%s/*.ufboot' % ale1_op_dir):
        _, ufboot_base, _ = sep_path_basename_ext(each_ufboot)
        ufboot_base_list.append(ufboot_base)

    # Collect OG ids from the ALE2 output; the genome-tree prefix and the
    # '.ufboot.ale' infix are removed so the ids are comparable with the
    # ALE1 basenames.
    uml_rec_base_list = []
    for each_uml_rec in glob.glob('%s/*.ufboot.ale.uml_rec' % ale2_op_dir):
        _, uml_rec_base, _ = sep_path_basename_ext(each_uml_rec)
        uml_rec_base = uml_rec_base.replace(gnm_tree_no_underscore + '_', '')
        uml_rec_base = uml_rec_base.replace('.ufboot.ale', '')
        uml_rec_base_list.append(uml_rec_base)

    # Sets give O(1) membership tests; the lists keep the original ordering.
    ufboot_base_set = set(ufboot_base_list)
    uml_rec_base_set = set(uml_rec_base_list)
    found_in_ufboot_only = [og for og in ufboot_base_list if og not in uml_rec_base_set]
    found_in_uml_rec_only = [og for og in uml_rec_base_list if og not in ufboot_base_set]
    found_in_both = [og for og in ufboot_base_list if og in uml_rec_base_set]

    if found_in_ufboot_only:
        print('The following OGs will be ignored as they were not found in %s:' % ale2_op_dir)
        print(','.join(found_in_ufboot_only))
    if found_in_uml_rec_only:
        print('The following OGs will be ignored as they were not found in %s:' % ale1_op_dir)
        print(','.join(found_in_uml_rec_only))
    if found_in_ufboot_only or found_in_uml_rec_only:
        # Blank line separating the warnings from the progress output.
        print()

    # Prepare the output directory without going through a shell, so paths
    # containing spaces or shell metacharacters are handled correctly and
    # failures surface as exceptions instead of being ignored.
    if os.path.isdir(op_dir):
        if force_create_op_dir:
            shutil.rmtree(op_dir)
        else:
            print('Output folder detected, program exited!')
            raise SystemExit
    os.makedirs(pwd_itol_dir)  # also creates op_dir itself

    # Read the genome taxonomy and write the iTOL label file in one pass.
    gnm_pco_dict = dict()
    genome_to_p_dict = dict()
    with open(pwd_gnm_tree_itol_label_txt, 'w') as label_handle, open(genome_taxon_txt) as taxon_handle:
        label_handle.write('LABELS\nSEPARATOR TAB\n\nDATA\n')
        for each_gnm in taxon_handle:
            if not each_gnm.strip():
                continue  # tolerate blank / trailing empty lines
            each_gnm_split = each_gnm.strip().split('\t')
            gnm_id = each_gnm_split[0]
            # Leaf names used for ALE have the underscore after GCA/GCF removed.
            genome_name_for_ale = gnm_id.replace('GCA_', 'GCA').replace('GCF_', 'GCF')
            taxon_ranks = each_gnm_split[1].split(';')
            # [3:] drops the three-character rank prefix of each field
            # (presumably GTDB-style 'p__', 'c__', 'o__').
            gnm_phylum = taxon_ranks[1][3:]
            gnm_class = taxon_ranks[2][3:]
            gnm_order = taxon_ranks[3][3:]
            pco_str = '%s__%s__%s__%s' % (gnm_phylum, gnm_class, gnm_order, gnm_id)
            gnm_pco_dict[gnm_id] = pco_str
            genome_to_p_dict[genome_name_for_ale] = gnm_phylum
            label_handle.write('%s\t%s\n' % (genome_name_for_ale, pco_str))

    # Read the phylum -> colour mapping (file columns: colour, phylum).
    p_color_dict = dict()
    with open(ar_phylum_color_code_txt) as color_handle:
        for each_line in color_handle:
            if not each_line.strip():
                continue  # tolerate blank / trailing empty lines
            each_line_split = each_line.strip().split('\t')
            p_color_dict[each_line_split[1]] = each_line_split[0]

    iTOL(genome_to_p_dict, p_color_dict, pwd_gnm_tree_itol_colorstrip_txt)

    # Parse the ALE output of every OG found in both directories.
    og_total = len(found_in_both)
    for n, qualified_og in enumerate(found_in_both, 1):
        print('Processing (%s/%s): %s' % (n, og_total, qualified_og))
        # The worker takes one positional list; the element order is its contract.
        current_arg_list = [qualified_og, ale1_op_dir, ale2_op_dir, op_dir, internal_node_prefix, gnm_pco_dict, d_color,
                            r_color, project_name, API_key, display_mode, hgt_freq_cutoff, ignore_leaf_hgt, ignore_vertical_hgt,
                            donor_node_min_leaf_num, recipient_node_min_leaf_num, dr_separator, root_gene_tree_at_midpoint,
                            p_color_dict, gnm_tree_no_underscore, pwd_itol_dir, pwd_gnm_tree_itol_colorstrip_txt, pwd_gnm_tree_itol_label_txt]
        parse_ale_op_worker(current_arg_list)

    print('Done!')
619
+
620
+
621
if __name__ == '__main__':

    # Command-line interface: one (flag, add_argument keyword options) pair
    # per option, registered in the order shown in the help output.
    option_specs = [
        ('-1',     dict(required=True, help='ALE1 output directory')),
        ('-2',     dict(required=True, help='ALE2 output directory')),
        ('-c',     dict(required=True, help='genome_taxon, GTDB format')),
        ('-color', dict(required=True, help='phylum color code')),
        ('-o',     dict(required=True, help='output dir, i.e., ALE4_op_dir')),
        ('-f',     dict(required=False, action="store_true", help='force overwrite')),
        ('-api',   dict(required=True, help='iTOL API key')),
        ('-fc',    dict(required=False, type=float, default=0.5, help='hgt_freq_cutoff, default: 0.5')),
        ('-mld',   dict(required=False, type=int, default=5, help='donor_node_min_leaf_num, default: 5')),
        ('-mlr',   dict(required=False, type=int, default=5, help='recipient_node_min_leaf_num, default: 5')),
        ('-itol',  dict(required=False, default='batch_access_tmp', help='iTOL project_name, default: batch_access_tmp')),
    ]

    ALE4_parser = argparse.ArgumentParser()
    for option_flag, option_settings in option_specs:
        ALE4_parser.add_argument(option_flag, **option_settings)

    # ALE4 expects a plain dict keyed by flag name (without the dash).
    args = vars(ALE4_parser.parse_args())
    ALE4(args)