treesak 1.53.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +113 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/C60SR4.nex +127 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +299 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +115 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB.py +371 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +21 -0
- TreeSAK/SplitScore1.py +177 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +608 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +164 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/batch_itol.py +171 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/filter_rename_ar53.py +118 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_gene_tree_by_gnm.py +34 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +78 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/tmp_4.py +43 -0
- TreeSAK/tmp_5.py +12 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.53.3.data/scripts/TreeSAK +955 -0
- treesak-1.53.3.dist-info/LICENSE +674 -0
- treesak-1.53.3.dist-info/METADATA +27 -0
- treesak-1.53.3.dist-info/RECORD +131 -0
- treesak-1.53.3.dist-info/WHEEL +5 -0
- treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/ALE4.py
ADDED
|
@@ -0,0 +1,636 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import math
|
|
4
|
+
import random
|
|
5
|
+
import argparse
|
|
6
|
+
import seaborn as sns
|
|
7
|
+
from ete3 import Tree
|
|
8
|
+
from itolapi import Itol
|
|
9
|
+
from PyPDF3.pdf import PageObject
|
|
10
|
+
from PyPDF3 import PdfFileWriter, PdfFileReader
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
ALE4_usage = '''
|
|
14
|
+
========================= ALE4 example commands =========================
|
|
15
|
+
|
|
16
|
+
TreeSAK ALE4 -1 ALE1_op_dir -2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -f -api your_own_itol_api -fc 0.3 -o ALE4_op_dir_0.3
|
|
17
|
+
TreeSAK ALE4 -1 ALE1_op_dir -2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -f -api your_own_itol_api -fc 0.5 -o ALE4_op_dir_0.5
|
|
18
|
+
TreeSAK ALE4 -1 ALE1_op_dir -2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -f -api your_own_itol_api -fc 0.8 -o ALE4_op_dir_0.8
|
|
19
|
+
|
|
20
|
+
# To do:
|
|
21
|
+
# add protein family to the top of the pdf file
|
|
22
|
+
|
|
23
|
+
=========================================================================
|
|
24
|
+
'''
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def sep_path_basename_ext(file_in):
    """Split a file path into (directory, base name, extension).

    The directory is reported as '.' when file_in has no directory part.
    The extension is whatever os.path.splitext returns, i.e. it keeps its
    leading dot and is '' when the file name has none.
    """
    directory, name_with_ext = os.path.split(file_in)
    stem, extension = os.path.splitext(name_with_ext)
    return (directory if directory != '' else '.'), stem, extension
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def subset_tree(tree_file_in, leaves_to_keep_list, tree_file_out):
    """Prune a tree down to the requested leaves, keeping branch lengths.

    tree_file_in is loaded with ete3's Tree and a copy of it is pruned to
    leaves_to_keep_list. When tree_file_out is None the pruned tree is
    returned as the string produced by Tree.write(); otherwise it is written
    to tree_file_out and None is returned.
    """
    pruned_tree = Tree(tree_file_in).copy()
    pruned_tree.prune(leaves_to_keep_list, preserve_branch_length=True)

    if tree_file_out is not None:
        pruned_tree.write(outfile=tree_file_out)
        return None
    return pruned_tree.write()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def merge_pdf(pdf_1, pdf_2, margin_size, op_pdf):
    """Place the first pages of two PDFs side by side on one new page.

    Parameters:
        pdf_1: path of the PDF whose first page goes on the left.
        pdf_2: path of the PDF whose first page goes on the right.
        margin_size: margin (in PDF units) used around and between the pages.
        op_pdf: path of the single-page PDF to write.

    Layout: the new page is as wide as both pages plus three margins and as
    tall as the taller page plus two margins. Page 1 is aligned to the top
    margin, page 2 is placed one margin above the bottom edge.
    """
    # Keep both input files open until the merged page has been written; the
    # handles are closed deterministically instead of being left to the
    # garbage collector.
    with open(pdf_1, "rb") as pdf_1_handle, open(pdf_2, "rb") as pdf_2_handle:
        page1 = PdfFileReader(pdf_1_handle, strict=False).getPage(0)
        page2 = PdfFileReader(pdf_2_handle, strict=False).getPage(0)

        total_width = page1.mediaBox.upperRight[0] + page2.mediaBox.upperRight[0] + margin_size*3
        total_height = max([page1.mediaBox.upperRight[1], page2.mediaBox.upperRight[1]]) + margin_size*2

        new_page = PageObject.createBlankPage(None, total_width, total_height)
        new_page.mergeTranslatedPage(page1, margin_size, (total_height-margin_size-page1.mediaBox.upperRight[1]))
        new_page.mergeTranslatedPage(page2, (page1.mediaBox.upperRight[0] + margin_size*2), margin_size)

        output = PdfFileWriter()
        output.addPage(new_page)
        with open(op_pdf, "wb") as op_pdf_handle:
            output.write(op_pdf_handle)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def uts_to_itol_connections(genome_tree_file, ale_formatted_gnm_tree, interal_node_prefix, uts_file, freq_cutoff, ignore_leaf_hgt, ignore_vertical_hgt, donor_node_min_leaf_num, recipient_node_min_leaf_num, itol_connection_txt, dr_separator):
    """Turn the transfers listed in a uTs file into iTOL connection datasets.

    Each non-comment line of uts_file is read as tab-separated
    donor, recipient, frequency. Donor/recipient ids that are not leaves of
    genome_tree_file get interal_node_prefix prepended. A transfer is kept
    when its frequency is >= freq_cutoff and it passes the filters:

    * ignore_leaf_hgt is False: every transfer is a candidate.
      Otherwise both ends must be internal nodes whose leaf counts reach
      donor_node_min_leaf_num / recipient_node_min_leaf_num.
    * ignore_vertical_hgt is False: no topology check.
      Otherwise the donor must be neither an ancestor nor a descendant of
      the recipient in ale_formatted_gnm_tree.

    Kept transfers are written to itol_connection_txt (one combined dataset)
    and additionally to one file per connection named
    '<dir>/<basename>_<donor><dr_separator><recipient>.txt'.

    Returns:
        (internal_node_to_leaf_dict, paired_donor_to_recipient_leaf_dict,
        hgt_freq_dict). The last two are keyed by
        '<donor><dr_separator><recipient>' (prefixed ids).

    NOTE(review): the source this was reconstructed from had lost its
    indentation. paired_donor_to_recipient_leaf_dict is filled only in the
    branch where both ignore_leaf_hgt and ignore_vertical_hgt are active and
    the transfer is kept - that is the only placement under which every key
    also exists in hgt_freq_dict, which the caller requires. Confirm against
    the packaged file.
    """
    connection_header = ('DATASET_CONNECTION\nSEPARATOR TAB\nDATASET_LABEL\tdemo_connections\n'
                         'COLOR\t#ff0ff0\nDRAW_ARROWS\t1\nARROW_SIZE\t60\nLOOP_SIZE\t100\n'
                         'MAXIMUM_LINE_WIDTH\t10\nCURVE_ANGLE\t45\nCENTER_CURVES\t1\nALIGN_TO_LABELS\t0\nDATA\n')

    # get internal_node_to_leaf_dict
    internal_node_to_leaf_dict = get_node_to_leaf_dict(ale_formatted_gnm_tree)

    # a set, because it is only used for membership tests
    leaf_id_set = set()
    if os.path.isfile(genome_tree_file):
        leaf_id_set = {leaf.name for leaf in Tree(genome_tree_file, format=3).get_leaves()}
    else:
        print('%s not found!' % genome_tree_file)

    def is_vertical(donor_id, recipient_id):
        # True when donor and recipient lie on the same root-to-tip lineage
        donor_is_ancestor_of_recipient = check_a_is_ancestor_of_b(ale_formatted_gnm_tree, donor_id, recipient_id)
        donor_is_child_of_recipient = check_a_is_child_of_b(ale_formatted_gnm_tree, donor_id, recipient_id)
        return not ((donor_is_ancestor_of_recipient is False) and (donor_is_child_of_recipient is False))

    paired_donor_to_recipient_leaf_dict = dict()
    hgt_freq_dict = dict()
    connection_line_to_write_dict = dict()
    with open(itol_connection_txt, 'w') as itol_connection_txt_handle, open(uts_file) as uts_file_handle:
        itol_connection_txt_handle.write(connection_header)
        for each_line in uts_file_handle:
            if each_line.startswith('#'):
                continue

            each_line_split = each_line.strip().split('\t')
            if len(each_line_split) < 3:
                # blank or truncated line: nothing to parse
                continue

            donor = each_line_split[0]
            recipient = each_line_split[1]
            freq = float(each_line_split[2])

            # written this way so that a NaN frequency is rejected as well
            if not freq >= freq_cutoff:
                continue

            # add prefix to internal donor/recipient nodes
            donor_with_prefix = donor if donor in leaf_id_set else interal_node_prefix + donor
            recipient_with_prefix = recipient if recipient in leaf_id_set else interal_node_prefix + recipient
            key_str = '%s%s%s' % (donor_with_prefix, dr_separator, recipient_with_prefix)

            keep_connection = False
            if ignore_leaf_hgt is False:
                keep_connection = (ignore_vertical_hgt is False) or (not is_vertical(donor, recipient))
            elif (donor not in leaf_id_set) and (recipient not in leaf_id_set):
                donor_leaf_list = internal_node_to_leaf_dict.get(donor, [])
                recipient_leaf_list = internal_node_to_leaf_dict.get(recipient, [])
                if (len(donor_leaf_list) >= donor_node_min_leaf_num) and (len(recipient_leaf_list) >= recipient_node_min_leaf_num):
                    if ignore_vertical_hgt is False:
                        keep_connection = True
                    elif not is_vertical(donor, recipient):
                        keep_connection = True
                        paired_donor_to_recipient_leaf_dict[key_str] = [donor_leaf_list, recipient_leaf_list]

            if keep_connection:
                line_to_write = '%s\t%s\t%s\t%s\t%s\t%s->%s(%s)\n' % (donor_with_prefix, recipient_with_prefix, freq, '#EB984E', 'normal', donor_with_prefix, recipient_with_prefix, freq)
                itol_connection_txt_handle.write(line_to_write)
                connection_line_to_write_dict[key_str] = line_to_write
                hgt_freq_dict[key_str] = freq

    combined_connection_file_path, combined_connection_file_basename, combined_connection_file_ext = sep_path_basename_ext(itol_connection_txt)

    # write out connections separately
    for each_connection, connection_line in connection_line_to_write_dict.items():
        pwd_connection_txt = '%s/%s_%s.txt' % (combined_connection_file_path, combined_connection_file_basename, each_connection)
        with open(pwd_connection_txt, 'w') as pwd_connection_txt_handle:
            pwd_connection_txt_handle.write(connection_header)
            pwd_connection_txt_handle.write(connection_line + '\n')

    return internal_node_to_leaf_dict, paired_donor_to_recipient_leaf_dict, hgt_freq_dict
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def itol_tree(tree_file, annotation_file_list, project_name, APIkey, display_mode, op_plot):
    """Upload a tree plus annotation files to iTOL and export a plot.

    Parameters:
        tree_file: tree file to upload; its path is also used as the tree name.
        annotation_file_list: iTOL dataset files uploaded along with the tree;
            all of them are made visible in the export.
        project_name: iTOL project that receives the tree.
        APIkey: iTOL API key of the account owning the project.
        display_mode: value passed through as iTOL's 'display_mode'.
        op_plot: output file; the text after its last '.' selects the export
            format.

    Raises:
        AssertionError: when the upload reports failure.
    """
    # https://github.com/albertyw/itolapi
    # http://itol.embl.de/help.cgi#batch

    op_plot_ext = op_plot.split('.')[-1]

    # upload tree to iTOL
    itol_uploader = Itol()
    itol_uploader.params['projectName'] = project_name  # better to create a project with a unique name.
    itol_uploader.params['APIkey'] = APIkey  # the same account can reuse the same APIkey
    itol_uploader.params['treeName'] = tree_file
    itol_uploader.add_file(tree_file)

    # upload annotation files to iTOL
    for annotation_file in annotation_file_list:
        itol_uploader.add_file(annotation_file)

    status = itol_uploader.upload()
    # Explicit raise instead of `assert` so the check survives `python -O`;
    # the exception type is unchanged for callers.
    if status == False:
        raise AssertionError('iTOL upload failed for %s' % tree_file)

    # the following parameters are optional, refer to https://itol.embl.de/help.cgi#batchExp
    # dataset indices 0..n-1, i.e. every uploaded annotation file is shown
    datasets_visible_str = ','.join(str(i) for i in range(len(annotation_file_list)))

    itol_exporter = itol_uploader.get_itol_export()
    itol_exporter.set_export_param_value('datasets_visible', datasets_visible_str)
    itol_exporter.set_export_param_value('display_mode', display_mode)
    itol_exporter.set_export_param_value('range_mode', '2')
    itol_exporter.set_export_param_value('dashed_lines', '1')
    # itol_exporter.set_export_param_value('current_font_size', '96')
    itol_exporter.set_export_param_value('line_width', '3')
    itol_exporter.set_export_param_value('vertical_shift_factor', '0.9')
    itol_exporter.set_export_param_value('horizontal_scale_factor', '0.9')
    itol_exporter.set_export_param_value('format', op_plot_ext)
    itol_exporter.export(op_plot)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def get_node_to_leaf_dict(tree_file):
    """Map every internal node name to the names of the leaves below it.

    The tree is read with ete3 format=1. Leaves themselves are not included
    as keys; if several internal nodes share a name, the one visited last by
    traverse() wins.
    """
    return {
        internal_node.name: internal_node.get_leaf_names()
        for internal_node in Tree(tree_file, format=1).traverse()
        if not internal_node.is_leaf()
    }
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def combine_trees(t1_with_len, t2_with_name, op_tree_with_both):
    """Write a tree with t1's branch lengths and t2's node names.

    t1_with_len is read with ete3 format=0 and t2_with_name with format=1.
    Nodes of the two trees are matched by their set of descendant leaf names,
    so both trees must contain the same clades; a clade of t1 that is absent
    from t2 raises KeyError. The result is written to op_tree_with_both in
    format=3.
    """
    def clade_key(tree_node):
        # identify a node by the sorted names of the leaves below it
        return '__'.join(sorted(tree_node.get_leaf_names()))

    length_tree = Tree(t1_with_len, format=0)
    named_tree = Tree(t2_with_name, format=1)

    length_node_by_clade = {clade_key(each_node): each_node for each_node in length_tree.traverse()}
    named_node_by_clade = {clade_key(each_node): each_node for each_node in named_tree.traverse()}

    # node of the length tree -> matching node of the named tree
    named_counterpart = {
        length_node: named_node_by_clade[clade]
        for clade, length_node in length_node_by_clade.items()
    }

    # the copy is traversed in lockstep with the tree it was copied from
    merged_tree = length_tree.copy()
    for merged_node, source_node in zip(merged_tree.traverse(), length_tree.traverse()):
        merged_node.name = named_counterpart[source_node].name
    merged_tree.write(outfile=op_tree_with_both, format=3)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def prefix_internal_nodes(tree_in, prefix_str, tree_out):
    """Prepend prefix_str to the name of every internal node of a tree.

    tree_in is read and tree_out is written with ete3 format=3; leaf names
    are left untouched.
    """
    renamed_tree = Tree(tree_in, format=3).copy()
    for each_node in renamed_tree.traverse():
        if each_node.is_leaf():
            continue
        each_node.name = '%s%s' % (prefix_str, each_node.name)
    renamed_tree.write(outfile=tree_out, format=3)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def check_a_is_ancestor_of_b(tree_file, node_a, node_b):
    """Return True if a node named node_a is an ancestor of a node named node_b.

    The tree is read from tree_file with ete3 format=1. Returns False when
    node_b does not occur in the tree.
    """
    for candidate in Tree(tree_file, format=1).traverse():
        if candidate.name != node_b:
            continue
        if any(ancestor.name == node_a for ancestor in candidate.get_ancestors()):
            return True
    return False
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def check_a_is_child_of_b(tree_file, node_a, node_b):
    """Return True if a node named node_a is a descendant of a node named node_b.

    Despite the name, any descendant counts, not only direct children. The
    tree is read from tree_file with ete3 format=1. Returns False when node_b
    does not occur in the tree.
    """
    for candidate in Tree(tree_file, format=1).traverse():
        if candidate.name != node_b:
            continue
        if any(descendant.name == node_a for descendant in candidate.get_descendants()):
            return True
    return False
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def root_at_midpoint(tree_in, tree_out_rooted):
    """Re-root the tree in tree_in at its midpoint and write it to tree_out_rooted."""
    tree_to_root = Tree(tree_in)
    tree_to_root.set_outgroup(tree_to_root.get_midpoint_outgroup())
    tree_to_root.write(outfile=tree_out_rooted)
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def get_color_list(color_num):
    """Return color_num hex colour codes drawn from a fixed-order palette.

    Up to 8 colours come from a built-in 8-colour palette, up to 16 from a
    built-in 16-colour palette; beyond that the palette is built from eight
    seaborn colour ramps. The colours are picked at random from the palette
    but returned in palette order.
    """
    if color_num <= 8:
        palette = ['#3787c0', '#39399f', '#ffb939', '#399f39', '#9f399f', '#fb694a', '#9f9f39', '#959595']
    elif color_num <= 16:
        palette = ['#2b7bba', '#89bedc', '#2e2e99', '#8a8acc', '#ffa500', '#ffc55c', '#2e992e', '#8acc8a',
                   '#992e99', '#cc8acc', '#d52221', '#fc8161', '#99992e', '#cccc8a', '#5c5c5c', '#adadad']
    else:
        # two extra shades per ramp, because the two lightest are dropped below
        shades_per_ramp = math.ceil(color_num / 8) + 2
        color_ramps = [
            sns.color_palette('Blues', n_colors=shades_per_ramp).as_hex(),
            sns.light_palette('navy', n_colors=shades_per_ramp).as_hex(),
            sns.light_palette('orange', n_colors=shades_per_ramp).as_hex(),
            sns.light_palette('green', n_colors=shades_per_ramp).as_hex(),
            sns.light_palette('purple', n_colors=shades_per_ramp).as_hex(),
            sns.color_palette('Reds', n_colors=shades_per_ramp).as_hex(),
            sns.light_palette('olive', n_colors=shades_per_ramp).as_hex(),
            sns.color_palette('Greys', n_colors=shades_per_ramp).as_hex(),
        ]
        # skip the first two shades of each ramp and reverse the remainder
        palette = [shade for ramp in color_ramps for shade in ramp[2:][::-1]]

    picked_colors = random.sample(palette, color_num)

    # report the picked colours in palette order rather than sample order
    return [shade for shade in palette if shade in picked_colors]
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def iTOL(Leaf_to_Group_dict, Group_to_Color_dict, FileOut):
    """Write an iTOL DATASET_COLORSTRIP file colouring leaves by group.

    Parameters:
        Leaf_to_Group_dict: leaf id -> group label.
        Group_to_Color_dict: group label -> colour code. When empty, colours
            for all groups come from get_color_list(). When non-empty, groups
            missing from it get colours from get_color_list() and are added
            to the dict in place, so the caller sees the new entries.
        FileOut: path of the dataset file to write.
    """
    Group_set = set(Leaf_to_Group_dict.values())

    if len(Group_to_Color_dict) == 0:
        Group_to_Color_dict = dict(zip(Group_set, get_color_list(len(Group_set))))
    else:
        group_without_color_list = [each_group for each_group in Group_set if each_group not in Group_to_Color_dict]
        if len(group_without_color_list) > 0:
            color_list_unprovided = get_color_list(len(group_without_color_list))
            Group_to_Color_dict.update(zip(group_without_color_list, color_list_unprovided))

    # `with` closes the file even if a write or a lookup below fails
    with open(FileOut, 'w') as FileOut_handle:
        FileOut_handle.write('DATASET_COLORSTRIP\n')
        FileOut_handle.write('SEPARATOR TAB\n')
        FileOut_handle.write('DATASET_LABEL\tTaxonomy\n')
        FileOut_handle.write('\n# customize strip attributes here\n')
        FileOut_handle.write('STRIP_WIDTH\t100\n')
        FileOut_handle.write('MARGIN\t20\n')
        FileOut_handle.write('\n# provide data here\nDATA\n')
        for leaf, leaf_group in Leaf_to_Group_dict.items():
            leaf_color = Group_to_Color_dict[leaf_group]
            FileOut_handle.write('%s\t%s\t%s\n' % (leaf, leaf_color, leaf_group))
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def ale_splitter(rec_file):
    """Split an ALE .uml_rec file into four sibling files.

    Output paths are rec_file with every 'uml_rec' replaced by:
      'stree'     - last tab-separated field of line 3 (the species tree)
      'info'      - log-likelihood, the three rates and the four event counts
      'recs'      - the reconciled trees, one per line
      'rec_table' - everything from the third line after the trees onwards

    The layout is positional: line 10 starts with the number of reconciled
    trees and the trees themselves start at line 12 (1-based).
    """
    with open(rec_file) as rec_handle:
        rec_lines = rec_handle.readlines()

    species_tree_line = rec_lines[2].strip()
    log_likelihood = rec_lines[6].strip().split()[-1]
    rate_values = rec_lines[8].strip().split("\t")[1:]
    tree_count = int(rec_lines[9].strip().split()[0])

    first_tree_index = 11
    after_trees_index = first_tree_index + tree_count
    tree_lines = rec_lines[first_tree_index:after_trees_index]
    # not stripped, so the last count keeps the line's trailing newline
    event_counts = rec_lines[after_trees_index + 1].split("\t")[1:]
    table_lines = rec_lines[after_trees_index + 3:]

    def sibling_path(suffix):
        return rec_file.replace("uml_rec", suffix)

    with open(sibling_path("stree"), "w") as stree_out:
        stree_out.write(species_tree_line.split("\t")[-1])

    with open(sibling_path("info"), "w") as info_out:
        info_out.write("LL:" + "\t" + log_likelihood + "\n")
        for label, position in (("Dp:", 0), ("Tp:", 1), ("Lp:", 2)):
            info_out.write(label + "\t" + rate_values[position] + "\n")
        for label, position in (("De:", 0), ("Te:", 1), ("Le:", 2), ("Se:", 3)):
            info_out.write(label + "\t" + event_counts[position] + "\n")

    with open(sibling_path("recs"), "w") as recs_out:
        recs_out.writelines(tree_lines)

    with open(sibling_path("rec_table"), "w") as table_out:
        table_out.writelines(table_lines)
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def parse_ale_op_worker(arg_list):
    """Plot the HGTs predicted by ALE for one orthologous group.

    arg_list holds, in this order: qualified_og, gene_tree_dir, ale_wd,
    op_dir, interal_node_prefix, gnm_pco_dict, d_color, r_color,
    project_name, API_key, display_mode, hgt_freq_cutoff, ignore_leaf_hgt,
    ignore_vertical_hgt, donor_node_min_leaf_num,
    recipient_node_min_leaf_num, dr_separator, root_gene_tree_at_midpoint,
    p_color_dict, gnm_tree_no_underscore, pwd_itol_dir,
    pwd_itol_colorstrip_txt_gnm, pwd_itol_label_txt_gnm.

    Steps: create '<pwd_itol_dir>/<qualified_og>', split the .uml_rec file,
    convert the .uTs transfers into iTOL connection files, build the
    annotated genome tree, write iTOL label/colour files for the gene tree,
    then for every retained donor->recipient pair export both trees through
    iTOL and merge the two PDFs into
    '<op_dir>/<og>_HGT_<n>_<donor_to_recipient>_<freq>.pdf'.

    If the uTs file is missing, a message is printed and the function
    returns without plotting; the following steps need the ALE-formatted
    genome tree, which is only written when the uTs file exists.
    """
    (qualified_og, gene_tree_dir, ale_wd, op_dir, interal_node_prefix, gnm_pco_dict,
     d_color, r_color, project_name, API_key, display_mode, hgt_freq_cutoff,
     ignore_leaf_hgt, ignore_vertical_hgt, donor_node_min_leaf_num,
     recipient_node_min_leaf_num, dr_separator, root_gene_tree_at_midpoint,
     p_color_dict, gnm_tree_no_underscore, pwd_itol_dir,
     pwd_itol_colorstrip_txt_gnm, pwd_itol_label_txt_gnm) = arg_list[:23]

    current_og_dir = '%s/%s' % (pwd_itol_dir, qualified_og)
    pwd_genome_tree_file = '%s/%s' % (ale_wd, gnm_tree_no_underscore)
    pwd_gene_tree_treefile = '%s/%s.treefile' % (gene_tree_dir, qualified_og)
    pwd_uts_file = '%s/%s_%s.ufboot.ale.uTs' % (ale_wd, gnm_tree_no_underscore, qualified_og)
    pwd_uml_rec_file = '%s/%s_%s.ufboot.ale.uml_rec' % (ale_wd, gnm_tree_no_underscore, qualified_og)
    pwd_gene_tree_treefile_midpoint_rooted = '%s/%s_midpoint_rooted.treefile' % (current_og_dir, qualified_og)
    pwd_ale_formatted_gnm_tree = '%s/%s.ufboot_genome_tree.tree' % (current_og_dir, qualified_og)
    pwd_ale_formatted_gnm_tree_with_len = '%s/%s.ufboot_genome_tree_with_len.tree' % (current_og_dir, qualified_og)
    pwd_ale_formatted_gnm_tree_with_len_prefixed = '%s/%s.ufboot_genome_tree_with_len_prefixed.tree' % (current_og_dir, qualified_og)
    pwd_itol_connection_txt_all = '%s/%s_iTOL_connection.txt' % (current_og_dir, qualified_og)
    pwd_gene_tree_itol_label_txt = '%s/%s_iTOL_gene_pco.txt' % (current_og_dir, qualified_og)
    pwd_gene_tree_itol_colorstrip_txt = '%s/%s_iTOL_colorstrip_gene.txt' % (current_og_dir, qualified_og)

    os.mkdir(current_og_dir)

    # run ale_splitter
    ale_splitter(pwd_uml_rec_file)

    if not os.path.isfile(pwd_uts_file):
        print('%s: uTs file not found, you need to run ALE first!' % qualified_og)
        return

    # write out ALE formatted genome tree (second field of the third line of the uml_rec file)
    with open(pwd_uml_rec_file) as uml_rec_handle:
        renamed_genome_tree_str = uml_rec_handle.readlines()[2].strip().split('\t')[1]
    with open(pwd_ale_formatted_gnm_tree, 'w') as ale_renamed_species_tree_handle:
        ale_renamed_species_tree_handle.write(renamed_genome_tree_str + '\n')

    internal_node_to_leaf_dict, paired_donor_to_recipient_leaf_dict, hgt_freq_dict = uts_to_itol_connections(pwd_genome_tree_file, pwd_ale_formatted_gnm_tree, interal_node_prefix, pwd_uts_file, hgt_freq_cutoff, ignore_leaf_hgt, ignore_vertical_hgt, donor_node_min_leaf_num, recipient_node_min_leaf_num, pwd_itol_connection_txt_all, dr_separator)

    # combine_trees
    combine_trees(pwd_genome_tree_file, pwd_ale_formatted_gnm_tree, pwd_ale_formatted_gnm_tree_with_len)

    # prefix_internal_nodes of combined tree
    prefix_internal_nodes(pwd_ale_formatted_gnm_tree_with_len, interal_node_prefix, pwd_ale_formatted_gnm_tree_with_len_prefixed)

    # the gene tree file is never modified here, so read its leaves once
    gene_leaf_name_list = Tree(pwd_gene_tree_treefile).get_leaf_names()

    # write out iTOL label file for gene and genome tree, also colorstrip for taxonomy
    gene_to_p_dict = dict()
    with open(pwd_gene_tree_itol_label_txt, 'w') as pwd_gene_tree_itol_label_txt_handle:
        pwd_gene_tree_itol_label_txt_handle.write('LABELS\nSEPARATOR TAB\n\nDATA\n')
        for each_gene in gene_leaf_name_list:
            # gene id = <genome id>_<suffix>; the genome id may itself contain '_'
            gene_gnm = '_'.join(each_gene.split('_')[:-1])
            genome_with_taxon = gnm_pco_dict[gene_gnm]
            gene_to_p_dict[each_gene] = genome_with_taxon.split('__')[0]
            pwd_gene_tree_itol_label_txt_handle.write('%s\t%s_%s\n' % (each_gene, genome_with_taxon, each_gene.split('_')[-1]))

    iTOL(gene_to_p_dict, p_color_dict, pwd_gene_tree_itol_colorstrip_txt)

    # root gene tree at midpoint
    gene_tree_to_plot = pwd_gene_tree_treefile
    if root_gene_tree_at_midpoint is True:
        root_at_midpoint(pwd_gene_tree_treefile, pwd_gene_tree_treefile_midpoint_rooted)
        gene_tree_to_plot = pwd_gene_tree_treefile_midpoint_rooted

    # genome id of every gene as it is compared with the ALE tree leaves
    # (underscore after GCA/GCF removed); independent of the HGT being plotted
    gene_to_ale_gnm_list = []
    for each_gene in gene_leaf_name_list:
        gene_name_for_ale = '_'.join(each_gene.strip().split('_')[:-1])
        gene_name_for_ale = gene_name_for_ale.replace('GCA_', 'GCA').replace('GCF_', 'GCF')
        gene_to_ale_gnm_list.append((each_gene, gene_name_for_ale))

    # plot separately
    for n, each_d2r in enumerate(paired_donor_to_recipient_leaf_dict, start=1):
        each_d2r_freq = hgt_freq_dict[each_d2r]
        each_d2r_d_set = set(paired_donor_to_recipient_leaf_dict[each_d2r][0])
        each_d2r_r_set = set(paired_donor_to_recipient_leaf_dict[each_d2r][1])
        pwd_gnm_tree_label_color_txt = '%s/%s_iTOL_label_color_genome_%s.txt' % (current_og_dir, qualified_og, each_d2r)
        pwd_gene_tree_label_color_txt = '%s/%s_iTOL_label_color_gene_%s.txt' % (current_og_dir, qualified_og, each_d2r)
        pwd_itol_connection_txt = '%s/%s_iTOL_connection_%s.txt' % (current_og_dir, qualified_og, each_d2r)
        pwd_ale_formatted_gnm_tree_with_len_prefixed_pdf = '%s/%s_genome_tree_with_HGT_%s.pdf' % (current_og_dir, qualified_og, each_d2r)
        pwd_gene_tree_treefile_subset_pdf = '%s/%s_subset_%s.pdf' % (current_og_dir, qualified_og, each_d2r)
        pwd_combined_image_with_ale_hgts = '%s/%s_HGT_%s_%s_%s.pdf' % (op_dir, qualified_og, n, each_d2r, each_d2r_freq)

        # write out gnm_tree_label_color_txt: donor and recipient clades in their own colours
        with open(pwd_gnm_tree_label_color_txt, 'w') as pwd_gnm_tree_label_color_txt_handle:
            pwd_gnm_tree_label_color_txt_handle.write('DATASET_STYLE\nSEPARATOR TAB\nDATASET_LABEL\texample_style\nCOLOR\t#ffff00\n\nDATA\n')
            pwd_gnm_tree_label_color_txt_handle.write('%s\tlabel\tclade\t%s\t1\tnormal\n' % (each_d2r.split(dr_separator)[0], d_color))
            pwd_gnm_tree_label_color_txt_handle.write('%s\tlabel\tclade\t%s\t1\tnormal\n' % (each_d2r.split(dr_separator)[1], r_color))

        # colour gene tree leaves by whether their genome is under the donor or the recipient node
        with open(pwd_gene_tree_label_color_txt, 'w') as pwd_gene_tree_label_color_txt_handle:
            pwd_gene_tree_label_color_txt_handle.write('DATASET_STYLE\nSEPARATOR TAB\nDATASET_LABEL\texample_style\nCOLOR\t#ffff00\n\nDATA\n')
            for each_gene, gene_name_for_ale in gene_to_ale_gnm_list:
                if gene_name_for_ale in each_d2r_d_set:
                    pwd_gene_tree_label_color_txt_handle.write('%s\tlabel\tnode\t%s\t1\tnormal\n' % (each_gene, d_color))
                elif gene_name_for_ale in each_d2r_r_set:
                    pwd_gene_tree_label_color_txt_handle.write('%s\tlabel\tnode\t%s\t1\tnormal\n' % (each_gene, r_color))

        itol_tree(pwd_ale_formatted_gnm_tree_with_len_prefixed, [pwd_gnm_tree_label_color_txt, pwd_itol_label_txt_gnm, pwd_itol_connection_txt, pwd_itol_colorstrip_txt_gnm], project_name, API_key, display_mode, pwd_ale_formatted_gnm_tree_with_len_prefixed_pdf)
        itol_tree(gene_tree_to_plot, [pwd_gene_tree_itol_label_txt, pwd_gene_tree_label_color_txt, pwd_gene_tree_itol_colorstrip_txt], project_name, API_key, display_mode, pwd_gene_tree_treefile_subset_pdf)
        merge_pdf(pwd_ale_formatted_gnm_tree_with_len_prefixed_pdf, pwd_gene_tree_treefile_subset_pdf, 66, pwd_combined_image_with_ale_hgts)
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def ALE4(args):
    """Parse the ALE results of every orthologous group (OG) and plot its HGTs.

    The function pairs the ``*.ufboot`` files found in the ALE1 output directory
    with the ``*.ufboot.ale.uml_rec`` files found in the ALE2 output directory,
    prepares the output folder and the genome-level iTOL annotation files, and
    then calls ``parse_ale_op_worker`` once for every OG present in both
    directories.

    Args:
        args: dict of settings, normally ``vars(parser.parse_args())``, with keys
            '1'     ALE1 output directory (searched for ``*.ufboot``)
            '2'     ALE2 output directory (searched for ``*.ufboot.ale.uml_rec``)
            'c'     tab-separated genome taxonomy file: genome id, then a
                    ';'-separated taxonomy string (GTDB format per the CLI help)
            'color' tab-separated phylum colour file: colour, then phylum
            'o'     output directory
            'f'     True to delete and recreate an existing output directory
            'api'   iTOL API key (passed through to the worker)
            'fc'    HGT frequency cutoff (passed through to the worker)
            'mld'   minimum leaf number of a donor node (passed through)
            'mlr'   minimum leaf number of a recipient node (passed through)
            'itol'  iTOL project name (passed through to the worker)

    Raises:
        SystemExit: if the output directory exists and 'f' is not True.
        OSError: if the output directory cannot be removed or created.
    """
    # Imported locally so the block does not depend on the module's import list.
    import shutil
    import sys

    ale1_op_dir                 = args['1']
    ale2_op_dir                 = args['2']
    genome_taxon_txt            = args['c']
    ar_phylum_color_code_txt    = args['color']
    op_dir                      = args['o']
    force_create_op_dir         = args['f']
    API_key                     = args['api']
    hgt_freq_cutoff             = args['fc']
    donor_node_min_leaf_num     = args['mld']
    recipient_node_min_leaf_num = args['mlr']
    project_name                = args['itol']

    # Fixed settings forwarded to parse_ale_op_worker.
    ignore_vertical_hgt         = True          # filter ALE predicted HGTs
    ignore_leaf_hgt             = True          # filter ALE predicted HGTs
    internal_node_prefix        = 'IN'          # plot tree with HGT
    display_mode                = '1'           # plot tree with HGT, 1=rectangular, 2=circular, 3=unrooted
    d_color                     = '#FF0000'     # plot tree with HGT
    r_color                     = '#0000FF'     # plot tree with HGT
    dr_separator                = '_to_'        # plot tree with HGT
    root_gene_tree_at_midpoint  = True          # plot tree with HGT

    ####################################################################################################################

    gnm_tree_no_underscore           = 'genome_tree.newick'
    pwd_itol_dir                     = '%s/annotation_files'         % op_dir
    pwd_gnm_tree_itol_colorstrip_txt = '%s/iTOL_colorstrip_genome.txt' % pwd_itol_dir
    pwd_gnm_tree_itol_label_txt      = '%s/iTOL_genome_pco.txt'        % pwd_itol_dir

    ####################################################################################################################

    ufboot_file_list  = glob.glob('%s/*.ufboot' % ale1_op_dir)
    uml_rec_file_list = glob.glob('%s/*.ufboot.ale.uml_rec' % ale2_op_dir)

    # The second value returned by sep_path_basename_ext is used as the OG id.
    ufboot_base_list = []
    for each_ufboot in ufboot_file_list:
        _, ufboot_base, _ = sep_path_basename_ext(each_ufboot)
        ufboot_base_list.append(ufboot_base)

    # Strip the genome-tree prefix and the '.ufboot.ale' part so that the
    # uml_rec names become comparable with the ufboot names.
    uml_rec_base_list = []
    for each_uml_rec in uml_rec_file_list:
        _, uml_rec_base, _ = sep_path_basename_ext(each_uml_rec)
        uml_rec_base = uml_rec_base.replace((gnm_tree_no_underscore + '_'), '')
        uml_rec_base = uml_rec_base.replace('.ufboot.ale', '')
        uml_rec_base_list.append(uml_rec_base)

    # Sets give constant-time membership tests; the comprehensions still walk
    # the lists, so the original ordering of the OGs is preserved.
    ufboot_base_set  = set(ufboot_base_list)
    uml_rec_base_set = set(uml_rec_base_list)
    found_in_ufboot_only  = [og for og in ufboot_base_list if og not in uml_rec_base_set]
    found_in_uml_rec_only = [og for og in uml_rec_base_list if og not in ufboot_base_set]
    found_in_both         = [og for og in ufboot_base_list if og in uml_rec_base_set]

    if found_in_ufboot_only:
        print('The following OGs will be ignored as they were not found in %s:' % ale2_op_dir)
        print(','.join(found_in_ufboot_only))
    if found_in_uml_rec_only:
        print('The following OGs will be ignored as they were not found in %s:' % ale1_op_dir)
        print(','.join(found_in_uml_rec_only))
    print()

    # Prepare the output folder. An existing folder is only removed when the
    # force flag is exactly True, so a destructive delete is never triggered
    # by some other truthy value.
    if os.path.isdir(op_dir):
        if force_create_op_dir is True:
            shutil.rmtree(op_dir)
        else:
            print('Output folder detected, program exited!')
            sys.exit()
    # Creates op_dir and its annotation_files sub-folder without going through
    # a shell, so paths containing spaces or metacharacters are handled.
    os.makedirs(pwd_itol_dir)

    # read in genome taxonomy and write the iTOL label file for the genome tree
    gnm_pco_dict = dict()
    genome_to_p_dict = dict()
    with open(genome_taxon_txt) as taxon_handle, open(pwd_gnm_tree_itol_label_txt, 'w') as label_handle:
        label_handle.write('LABELS\nSEPARATOR TAB\n\nDATA\n')
        for each_gnm in taxon_handle:
            each_gnm = each_gnm.strip()
            if not each_gnm:
                continue  # tolerate blank lines (e.g. a trailing newline at the end of the file)
            each_gnm_split = each_gnm.split('\t')
            gnm_id = each_gnm_split[0]
            # genome ids are written without the underscore after GCA/GCF
            genome_name_for_ale = gnm_id.replace('GCA_', 'GCA').replace('GCF_', 'GCF')
            taxon_ranks = each_gnm_split[1].split(';')
            # [3:] drops the three-character rank prefix (e.g. 'p__') of each rank
            gnm_phylum = taxon_ranks[1][3:]
            gnm_class  = taxon_ranks[2][3:]
            gnm_order  = taxon_ranks[3][3:]
            pco_str = '%s__%s__%s__%s' % (gnm_phylum, gnm_class, gnm_order, gnm_id)
            gnm_pco_dict[gnm_id] = pco_str
            genome_to_p_dict[genome_name_for_ale] = gnm_phylum
            label_handle.write('%s\t%s\n' % (genome_name_for_ale, pco_str))

    # read in phylum color (column 1: color, column 2: phylum)
    p_color_dict = dict()
    with open(ar_phylum_color_code_txt) as color_handle:
        for each_line in color_handle:
            each_line = each_line.strip()
            if not each_line:
                continue  # tolerate blank lines
            each_line_split = each_line.split('\t')
            p_color_dict[each_line_split[1]] = each_line_split[0]

    # presumably writes the phylum colorstrip annotation to the given path - defined elsewhere in this file
    iTOL(genome_to_p_dict, p_color_dict, pwd_gnm_tree_itol_colorstrip_txt)

    # parse ALE output, one OG at a time
    og_total = len(found_in_both)
    for og_index, qualified_og in enumerate(found_in_both, start=1):
        print('Processing (%s/%s): %s' % (og_index, og_total, qualified_og))
        # The worker takes one positional list; the element order must not change.
        current_arg_list = [qualified_og, ale1_op_dir, ale2_op_dir, op_dir, internal_node_prefix, gnm_pco_dict, d_color,
                            r_color, project_name, API_key, display_mode, hgt_freq_cutoff, ignore_leaf_hgt, ignore_vertical_hgt,
                            donor_node_min_leaf_num, recipient_node_min_leaf_num, dr_separator, root_gene_tree_at_midpoint,
                            p_color_dict, gnm_tree_no_underscore, pwd_itol_dir, pwd_gnm_tree_itol_colorstrip_txt, pwd_gnm_tree_itol_label_txt]
        parse_ale_op_worker(current_arg_list)

    print('Done!')
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
if __name__ == '__main__':

    # Command-line entry point.
    # Each entry is (flag, keyword options for add_argument); the entries are
    # registered in this order, which is also the order shown in the help text.
    ale4_option_table = [
        ('-1',     dict(required=True,  help='ALE1 output directory')),
        ('-2',     dict(required=True,  help='ALE2 output directory')),
        ('-c',     dict(required=True,  help='genome_taxon, GTDB format')),
        ('-color', dict(required=True,  help='phylum color code')),
        ('-o',     dict(required=True,  help='output dir, i.e., ALE4_op_dir')),
        ('-f',     dict(required=False, action="store_true", help='force overwrite')),
        ('-api',   dict(required=True,  help='iTOL API key')),
        ('-fc',    dict(required=False, type=float, default=0.5, help='hgt_freq_cutoff, default: 0.5')),
        ('-mld',   dict(required=False, type=int, default=5, help='donor_node_min_leaf_num, default: 5')),
        ('-mlr',   dict(required=False, type=int, default=5, help='recipient_node_min_leaf_num, default: 5')),
        ('-itol',  dict(required=False, default='batch_access_tmp', help='iTOL project_name, default: batch_access_tmp')),
    ]

    ALE4_parser = argparse.ArgumentParser()
    for ale4_option_flag, ale4_option_kwargs in ale4_option_table:
        ALE4_parser.add_argument(ale4_option_flag, **ale4_option_kwargs)

    # ALE4 expects a plain dict keyed by the flag names without the dash.
    args = vars(ALE4_parser.parse_args())
    ALE4(args)
|