treesak 1.51.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of treesak might be problematic. Click here for more details.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +130 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +290 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +106 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/RootTreeGTDB226.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +19 -0
- TreeSAK/SplitScore1.py +178 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +597 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +158 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +77 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.51.2.data/scripts/TreeSAK +950 -0
- treesak-1.51.2.dist-info/LICENSE +674 -0
- treesak-1.51.2.dist-info/METADATA +27 -0
- treesak-1.51.2.dist-info/RECORD +125 -0
- treesak-1.51.2.dist-info/WHEEL +5 -0
- treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/ALE_backup.py
ADDED
|
@@ -0,0 +1,1081 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import math
|
|
4
|
+
import random
|
|
5
|
+
import argparse
|
|
6
|
+
import seaborn as sns
|
|
7
|
+
from Bio import SeqIO
|
|
8
|
+
from ete3 import Tree
|
|
9
|
+
from PIL import Image
|
|
10
|
+
from itolapi import Itol
|
|
11
|
+
import multiprocessing as mp
|
|
12
|
+
from PyPDF3.pdf import PageObject
|
|
13
|
+
from PyPDF3 import PdfFileWriter, PdfFileReader
|
|
14
|
+
from ete3 import TextFace, TreeStyle, NodeStyle
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
ALE_usage = '''
|
|
18
|
+
========================= ALE example commands =========================
|
|
19
|
+
|
|
20
|
+
TreeSAK ALE1 -> step 1: get gene tree
|
|
21
|
+
TreeSAK ALE2 -> step 2: run ALE
|
|
22
|
+
TreeSAK ALE3 -> step 3: parse ALE output
|
|
23
|
+
TreeSAK ALE4 -> Infer ancestral genome
|
|
24
|
+
|
|
25
|
+
cd /Users/songweizhi/Desktop/demo
|
|
26
|
+
TreeSAK ALE1 -i OrthologousGroups.txt -s combined_d__Archaea_o_rs.faa -p oma -c genome_taxon.txt -m 50 -n 2 -t 6 -jt 3 -f -o ALE1_op_dir
|
|
27
|
+
TreeSAK ALE2 -i ALE1_op_dir -s Marker_set_1_PA_75_C60_PMSF_concatenated_rooted.treefile -c genome_taxon.txt -t 6 -f -o ALE2_op_dir
|
|
28
|
+
TreeSAK ALE3 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color ar_phylum_color_code.txt -o ALE3_op_dir_0.3 -fc 0.3 -f -api S1kZZuDHc0d5M7J5vLnUNQ
|
|
29
|
+
TreeSAK ALE3 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color ar_phylum_color_code.txt -o ALE3_op_dir_0.5 -fc 0.5 -f -api S1kZZuDHc0d5M7J5vLnUNQ
|
|
30
|
+
TreeSAK ALE3 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color ar_phylum_color_code.txt -o ALE3_op_dir_0.8 -fc 0.8 -f -api S1kZZuDHc0d5M7J5vLnUNQ
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
ALEobserve OMA00003_for_ALE.ufboot
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
cd /Users/songweizhi/Desktop/demo/subset_gnm_tree_no
|
|
37
|
+
docker run -v $PWD:$PWD -w $PWD gregmich/alesuite_new ALEobserve OMA00003_for_ALE.ufboot
|
|
38
|
+
docker run -v $PWD:$PWD -w $PWD gregmich/alesuite_new ALEml_undated genome_tree_rooted.treefile OMA00003_for_ALE.ufboot.ale
|
|
39
|
+
|
|
40
|
+
cd /Users/songweizhi/Desktop/demo/subset_gnm_tree_yes
|
|
41
|
+
docker run -v $PWD:$PWD -w $PWD gregmich/alesuite_new ALEobserve OMA00003_for_ALE.ufboot
|
|
42
|
+
docker run -v $PWD:$PWD -w $PWD gregmich/alesuite_new ALEml_undated OMA00003_genome_tree_for_ALE.treefile OMA00003_for_ALE.ufboot.ale
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
========================================================================
|
|
47
|
+
'''
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def sep_path_basename_ext(file_in):
    """Split a file path into its directory, base name and extension.

    Returns a tuple ``(directory, basename, extension)``. The directory is
    reported as ``'.'`` when ``file_in`` has no directory component, and the
    extension keeps its leading dot (it is whatever ``os.path.splitext``
    returns, so it is ``''`` for a name without an extension).
    """
    directory, file_name = os.path.split(file_in)
    basename, extension = os.path.splitext(file_name)

    # a bare file name is taken to live in the current directory
    if directory == '':
        return '.', basename, extension
    return directory, basename, extension
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def select_seq(arg_list):
    """Copy the FASTA records whose IDs are listed in a file to a new file.

    ``arg_list`` is indexed positionally (so the function can be used as a
    single-argument pool worker):

    * ``arg_list[0]`` - input FASTA file
    * ``arg_list[1]`` - text file with one sequence ID per line
    * ``arg_list[2]`` - output FASTA file (written in 'fasta-2line' format)

    Records are written in the order they occur in the input FASTA file.
    """
    seq_file = arg_list[0]
    id_file = arg_list[1]
    output_file = arg_list[2]

    # read the wanted IDs first and close the handle straight away
    with open(id_file) as id_file_handle:
        seq_id_set = {each_id.strip() for each_id in id_file_handle}

    # the context manager closes the output even if parsing fails halfway
    with open(output_file, 'w') as output_file_handle:
        for seq_record in SeqIO.parse(seq_file, 'fasta'):
            if seq_record.id in seq_id_set:
                SeqIO.write(seq_record, output_file_handle, 'fasta-2line')
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def root_with_out_group(tree_file, out_group_txt, tree_file_rooted):
    """Root a tree on the common ancestor of the listed outgroup leaves.

    ``tree_file`` is parsed with ete3 ``format=1``, ``out_group_txt`` holds
    one outgroup leaf name per line, and the re-rooted tree is written to
    ``tree_file_rooted``.
    """
    # blank lines are skipped so that an empty string is never treated as a
    # leaf name; the handle is closed as soon as the names are read
    with open(out_group_txt) as out_group_handle:
        out_group_set = {each_og.strip() for each_og in out_group_handle if each_og.strip()}

    tre = Tree(tree_file, format=1)
    out_group_lca = tre.get_common_ancestor(out_group_set)
    tre.set_outgroup(out_group_lca)
    tre.write(outfile=tree_file_rooted)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def subset_tree(tree_file_in, leaves_to_keep_list, tree_file_out):
    """Prune a tree down to the given leaves, preserving branch lengths.

    ``tree_file_in`` is handed to ``ete3.Tree`` unchanged (elsewhere in this
    file it is called both with a file path and with a Newick string).

    Returns the pruned tree as a Newick string when ``tree_file_out`` is
    ``None``; otherwise writes it to ``tree_file_out`` and returns ``None``.
    """
    # the tree is parsed fresh on every call, so it can be pruned in place;
    # the local is deliberately not called ``subset_tree`` to avoid shadowing
    # this function's own name
    pruned_tree = Tree(tree_file_in)
    pruned_tree.prune(leaves_to_keep_list, preserve_branch_length=True)

    if tree_file_out is None:
        return pruned_tree.write()

    pruned_tree.write(outfile=tree_file_out)
    return None
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def plot_tree(input_tree, tree_title, node_label_dict, node_label_color_dict, align_leaf_label, show_scale, output_plot):
    """Render a tree file to an image with customised leaf labels.

    * ``node_label_dict`` maps leaf name -> label text (the leaf name itself
      is used when a leaf is missing from the dict).
    * ``node_label_color_dict`` maps leaf name -> label colour (default
      'black').
    * ``align_leaf_label`` - when it is exactly ``True`` labels are placed in
      the 'aligned' position, otherwise 'branch-right'.
    * ``show_scale`` is passed straight to ``TreeStyle.show_scale``.

    Prints a message and exits the interpreter if ``input_tree`` is not an
    existing file.
    """
    if not os.path.isfile(input_tree):
        print('Tree file not found, program exited!')
        print(input_tree)
        exit()

    tree = Tree(input_tree)

    # tree-wide style and layout
    style = TreeStyle()
    style.mode = "r"                    # 'r' for rectangular, 'c' for circular
    style.show_border = False           # no border around the image
    style.show_leaf_name = False        # default names hidden, custom faces are added below
    style.title.add_face(TextFace(tree_title, fsize=9, fgcolor='black', ftype='Arial', tight_text=False), column=0)
    style.rotation = 0                  # from 0 to 360
    style.margin_top = 10
    style.margin_bottom = 10
    style.margin_left = 10
    style.margin_right = 10
    style.branch_vertical_margin = 3    # pixels between adjacent branches
    style.show_scale = show_scale

    # the label position is the same for every leaf
    label_position = 'aligned' if align_leaf_label is True else 'branch-right'

    for node in tree.traverse():
        node_style = NodeStyle()
        node_style["shape"] = "circle"      # circle, square or sphere
        node_style["fgcolor"] = "black"     # colour of the shape (not the label)
        node_style['size'] = 0              # node shape size
        node_style['hz_line_type'] = 0      # 0 solid, 1 dashed, 2 dotted
        node_style['vt_line_type'] = 0      # 0 solid, 1 dashed, 2 dotted
        node_style['hz_line_width'] = 0.5
        node_style['vt_line_width'] = 0.5

        if node.is_leaf():
            leaf_id = node.name
            label_color = node_label_color_dict.get(leaf_id, 'black')
            label_text = node_label_dict.get(leaf_id, leaf_id)
            label_face = TextFace(label_text, fsize=8, fgcolor=label_color, tight_text=False, bold=False)
            node.add_face(label_face, column=0, position=label_position)

        node.set_style(node_style)

    # write out tree
    tree.render(output_plot, w=1200, units="px", tree_style=style)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def merge_image(image_file_list, output_image):
    """Paste several images side by side (left to right) into one image.

    The merged canvas is white RGB, as wide as the sum of the input widths
    and as tall as the tallest input; every input is pasted at the top
    (y = 0). The result is saved to ``output_image``.

    Raises ``ValueError`` when ``image_file_list`` is empty.
    """
    images = []
    try:
        for image_file in image_file_list:
            images.append(Image.open(image_file))

        if not images:
            raise ValueError('image_file_list is empty, nothing to merge')

        total_width = sum(im.size[0] for im in images)
        max_height = max(im.size[1] for im in images)
        new_im = Image.new('RGB', (total_width, max_height), color='white')

        x_offset = 0
        for im in images:
            new_im.paste(im, (x_offset, 0))
            x_offset += im.size[0]

        new_im.save(output_image)
    finally:
        # release the underlying file handles, including on failure
        for im in images:
            im.close()
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def merge_pdf(pdf_1, pdf_2, margin_size, op_pdf):
    """Place the first pages of two PDFs side by side on one new page.

    The new page is ``margin_size`` wider than the two pages plus a margin on
    each side and between them (three margins in total), and as tall as the
    taller page plus a top and bottom margin. Page 1 goes on the left, aligned
    to the top margin; page 2 goes to its right, sitting on the bottom margin.
    The merged single-page document is written to ``op_pdf``.
    """
    # the source files stay open until the output has been written, because
    # the page objects are still backed by these streams at write time
    with open(pdf_1, "rb") as pdf_1_handle, open(pdf_2, "rb") as pdf_2_handle:
        page1 = PdfFileReader(pdf_1_handle, strict=False).getPage(0)
        page2 = PdfFileReader(pdf_2_handle, strict=False).getPage(0)

        page1_width = page1.mediaBox.upperRight[0]
        page1_height = page1.mediaBox.upperRight[1]
        page2_width = page2.mediaBox.upperRight[0]
        page2_height = page2.mediaBox.upperRight[1]

        total_width = page1_width + page2_width + margin_size*3
        total_height = max([page1_height, page2_height]) + margin_size*2

        new_page = PageObject.createBlankPage(None, total_width, total_height)
        new_page.mergeTranslatedPage(page1, margin_size, (total_height-margin_size-page1_height))
        new_page.mergeTranslatedPage(page2, (page1_width + margin_size*2), margin_size)

        output = PdfFileWriter()
        output.addPage(new_page)
        with open(op_pdf, "wb") as op_pdf_handle:
            output.write(op_pdf_handle)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def uts_to_itol_connections(genome_tree_file, ale_formatted_gnm_tree, interal_node_prefix, uts_file, freq_cutoff, ignore_leaf_hgt, ignore_vertical_hgt, donor_node_min_leaf_num, recipient_node_min_leaf_num, itol_connection_txt, dr_separator):
    """Turn a donor/recipient/frequency table into iTOL connection datasets.

    ``uts_file`` is read as tab-separated lines ``donor<TAB>recipient<TAB>freq``
    (lines starting with '#' and blank lines are skipped). A transfer is kept
    when all of the following hold:

    * ``freq >= freq_cutoff``;
    * unless ``ignore_leaf_hgt is False``: neither donor nor recipient is a
      leaf of ``genome_tree_file``, and the donor/recipient internal nodes
      have at least ``donor_node_min_leaf_num`` /
      ``recipient_node_min_leaf_num`` leaves in ``ale_formatted_gnm_tree``;
    * unless ``ignore_vertical_hgt is False``: the donor is neither an
      ancestor nor a descendant of the recipient in ``ale_formatted_gnm_tree``.

    Node names that are not leaves of ``genome_tree_file`` get
    ``interal_node_prefix`` prepended. All kept transfers are written to
    ``itol_connection_txt``; in addition each kept transfer is written to its
    own file ``<dir>/<basename>_<donor><dr_separator><recipient>.txt`` next to
    it.

    Returns ``(internal_node_to_leaf_dict, paired_donor_to_recipient_leaf_dict,
    hgt_freq_dict)``; the last two are keyed by
    ``<donor><dr_separator><recipient>`` (prefixed names).
    """
    itol_connection_header = ('DATASET_CONNECTION\nSEPARATOR TAB\nDATASET_LABEL\tdemo_connections\n'
                              'COLOR\t#ff0ff0\nDRAW_ARROWS\t1\nARROW_SIZE\t60\nLOOP_SIZE\t100\n'
                              'MAXIMUM_LINE_WIDTH\t10\nCURVE_ANGLE\t45\nCENTER_CURVES\t1\nALIGN_TO_LABELS\t0\nDATA\n')

    def _is_vertical(donor_node, recipient_node):
        # True when donor and recipient are on the same lineage
        if check_a_is_ancestor_of_b(ale_formatted_gnm_tree, donor_node, recipient_node) is not False:
            return True
        return check_a_is_child_of_b(ale_formatted_gnm_tree, donor_node, recipient_node) is not False

    # get internal_node_to_leaf_dict
    internal_node_to_leaf_dict = get_node_to_leaf_dict(ale_formatted_gnm_tree)

    # a set, because it is only ever used for membership tests
    leaf_id_set = set()
    if os.path.isfile(genome_tree_file):
        leaf_id_set = {leaf.name for leaf in Tree(genome_tree_file, format=3).get_leaves()}
    else:
        print('%s not found!' % genome_tree_file)

    paired_donor_to_recipient_leaf_dict = dict()
    hgt_freq_dict = dict()
    connection_line_to_write_dict = dict()
    with open(itol_connection_txt, 'w') as itol_connection_txt_handle:
        itol_connection_txt_handle.write(itol_connection_header)
        with open(uts_file) as uts_file_handle:
            for each_line in uts_file_handle:
                if each_line.startswith('#') or not each_line.strip():
                    continue

                each_line_split = each_line.strip().split('\t')
                donor = each_line_split[0]
                recipient = each_line_split[1]
                freq = float(each_line_split[2])

                # add prefix to internal donor/recipient nodes
                donor_with_prefix = donor if donor in leaf_id_set else interal_node_prefix + donor
                recipient_with_prefix = recipient if recipient in leaf_id_set else interal_node_prefix + recipient
                key_str = '%s%s%s' % (donor_with_prefix, dr_separator, recipient_with_prefix)

                if not (freq >= freq_cutoff):
                    continue

                # leaf / clade-size filter
                leaf_filter_on = ignore_leaf_hgt is not False
                if leaf_filter_on:
                    if (donor in leaf_id_set) or (recipient in leaf_id_set):
                        continue
                    donor_node_leaf_num = len(internal_node_to_leaf_dict.get(donor, []))
                    recipient_node_leaf_num = len(internal_node_to_leaf_dict.get(recipient, []))
                    if not ((donor_node_leaf_num >= donor_node_min_leaf_num) and (recipient_node_leaf_num >= recipient_node_min_leaf_num)):
                        continue

                # vertical-transfer filter
                vertical_filter_on = ignore_vertical_hgt is not False
                if vertical_filter_on and _is_vertical(donor, recipient):
                    continue

                # NOTE(review): the donor/recipient leaf lists are recorded only
                # when both filters are active, which is how the original
                # layout reads - confirm whether every kept transfer should be
                # recorded here.
                if leaf_filter_on and vertical_filter_on:
                    paired_donor_to_recipient_leaf_dict[key_str] = [internal_node_to_leaf_dict.get(donor, []), internal_node_to_leaf_dict.get(recipient, [])]

                line_to_write = '%s\t%s\t%s\t%s\t%s\t%s->%s(%s)\n' % (donor_with_prefix, recipient_with_prefix, freq, '#EB984E', 'normal', donor_with_prefix, recipient_with_prefix, freq)
                itol_connection_txt_handle.write(line_to_write)
                connection_line_to_write_dict[key_str] = line_to_write
                hgt_freq_dict[key_str] = freq

    combined_connection_file_path, combined_connection_file_basename, combined_connection_file_ext = sep_path_basename_ext(itol_connection_txt)

    # write out connections separately, one dataset file per kept transfer
    for each_connection, connection_line in connection_line_to_write_dict.items():
        pwd_connection_txt = '%s/%s_%s.txt' % (combined_connection_file_path, combined_connection_file_basename, each_connection)
        with open(pwd_connection_txt, 'w') as pwd_connection_txt_handle:
            pwd_connection_txt_handle.write(itol_connection_header)
            pwd_connection_txt_handle.write(connection_line + '\n')

    return internal_node_to_leaf_dict, paired_donor_to_recipient_leaf_dict, hgt_freq_dict
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def itol_tree(tree_file, annotation_file_list, project_name, APIkey, display_mode, op_plot):
    """Upload a tree plus annotation files to iTOL and export a plot.

    * ``tree_file`` - tree to upload; also used as the iTOL tree name.
    * ``annotation_file_list`` - dataset files uploaded with the tree; all of
      them are made visible in the export.
    * ``project_name`` / ``APIkey`` - iTOL batch-upload credentials.
    * ``display_mode`` - passed to the iTOL 'display_mode' export parameter.
    * ``op_plot`` - output file; the text after its last '.' is used as the
      export format.

    Raises ``AssertionError`` when the upload reports failure.

    See https://github.com/albertyw/itolapi and
    http://itol.embl.de/help.cgi#batch
    """
    op_plot_ext = op_plot.split('.')[-1]

    # upload tree to iTOL
    itol_uploader = Itol()
    itol_uploader.params['projectName'] = project_name
    itol_uploader.params['APIkey'] = APIkey
    itol_uploader.params['treeName'] = tree_file
    itol_uploader.add_file(tree_file)

    # upload annotation files to iTOL
    for annotation_file in annotation_file_list:
        itol_uploader.add_file(annotation_file)

    status = itol_uploader.upload()
    # explicit check instead of ``assert`` so it still runs under ``python -O``;
    # the exception type is kept for callers that already catch it
    if status == False:  # noqa: E712 - same comparison as the original assert
        raise AssertionError('iTOL upload failed for %s' % tree_file)

    # indices of all uploaded datasets, e.g. '0,1,2' ('' when there are none)
    datasets_visible_str = ','.join(str(i) for i in range(len(annotation_file_list)))

    # the following parameters are optional, refer to https://itol.embl.de/help.cgi#batchExp
    itol_exporter = itol_uploader.get_itol_export()
    itol_exporter.set_export_param_value('datasets_visible', datasets_visible_str)
    itol_exporter.set_export_param_value('display_mode', display_mode)
    itol_exporter.set_export_param_value('range_mode', '2')
    itol_exporter.set_export_param_value('dashed_lines', '1')
    itol_exporter.set_export_param_value('line_width', '3')
    itol_exporter.set_export_param_value('vertical_shift_factor', '0.9')
    itol_exporter.set_export_param_value('horizontal_scale_factor', '0.9')
    itol_exporter.set_export_param_value('format', op_plot_ext)
    itol_exporter.export(op_plot)
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def get_node_to_leaf_dict(tree_file):
    """Map every internal node name to the list of leaf names beneath it.

    The tree is parsed with ete3 ``format=1``. If several internal nodes share
    a name, the one visited last by ``traverse()`` wins.
    """
    tree = Tree(tree_file, format=1)
    return {
        each_node.name: each_node.get_leaf_names()
        for each_node in tree.traverse()
        if not each_node.is_leaf()
    }
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def combine_trees(t1_with_len, t2_with_name, op_tree_with_both):
    """Write a copy of tree 1 whose nodes carry the names used in tree 2.

    Tree 1 (parsed with ``format=0``) is assumed to carry the branch lengths
    and tree 2 (``format=1``) the internal node names. Nodes of the two trees
    are matched by their set of descendant leaf names, so every leaf set found
    in tree 1 must also exist in tree 2 (a ``KeyError`` is raised otherwise).
    The result is written to ``op_tree_with_both`` with ``format=3``.
    """

    def leaf_signature(tree_node):
        # sorted leaf names joined into one string identify a clade
        return '__'.join(sorted(list(tree_node.get_leaf_names())))

    tree_1 = Tree(t1_with_len, format=0)
    tree_2 = Tree(t2_with_name, format=1)

    signature_to_t1_node = {leaf_signature(n): n for n in tree_1.traverse()}
    signature_to_t2_node = {leaf_signature(n): n for n in tree_2.traverse()}

    # pair each tree 1 node with the tree 2 node covering the same leaves
    t1_node_to_t2_node = {
        t1_node: signature_to_t2_node[signature]
        for signature, t1_node in signature_to_t1_node.items()
    }

    # the copy is traversed in step with tree 1, so the nodes line up
    merged_tree = tree_1.copy()
    for merged_node, t1_node in zip(merged_tree.traverse(), tree_1.traverse()):
        merged_node.name = t1_node_to_t2_node[t1_node].name

    merged_tree.write(outfile=op_tree_with_both, format=3)
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def prefix_internal_nodes(tree_in, prefix_str, tree_out):
    """Prepend ``prefix_str`` to the name of every internal node of a tree.

    The tree is read from ``tree_in`` and written to ``tree_out``, both with
    ete3 ``format=3``; leaf names are left untouched.
    """
    renamed_tree = Tree(tree_in, format=3).copy()

    for each_node in renamed_tree.traverse():
        if each_node.is_leaf():
            continue
        each_node.name = '{}{}'.format(prefix_str, each_node.name)

    renamed_tree.write(outfile=tree_out, format=3)
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def check_a_is_ancestor_of_b(tree_file, node_a, node_b):
    """Return True if a node named ``node_a`` is an ancestor of ``node_b``.

    The tree is parsed from ``tree_file`` with ete3 ``format=1`` and nodes are
    matched by name. Returns False when no node is named ``node_b``.
    """
    tree = Tree(tree_file, format=1)
    return any(
        node_a in [ancestor.name for ancestor in candidate.get_ancestors()]
        for candidate in tree.traverse()
        if candidate.name == node_b
    )
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def check_a_is_child_of_b(tree_file, node_a, node_b):
    """Return True if a node named ``node_a`` lies anywhere below ``node_b``.

    Despite the name, all descendants of ``node_b`` are considered, not only
    its direct children. The tree is parsed from ``tree_file`` with ete3
    ``format=1`` and nodes are matched by name. Returns False when no node is
    named ``node_b``.
    """
    tree = Tree(tree_file, format=1)
    return any(
        node_a in [descendant.name for descendant in candidate.get_descendants()]
        for candidate in tree.traverse()
        if candidate.name == node_b
    )
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def root_at_midpoint(tree_in, tree_in_rooted):
    """Re-root the tree in ``tree_in`` at its midpoint outgroup.

    The outgroup is whatever ete3's ``get_midpoint_outgroup()`` returns; the
    re-rooted tree is written to ``tree_in_rooted``.
    """
    tree = Tree(tree_in)
    midpoint_outgroup = tree.get_midpoint_outgroup()
    tree.set_outgroup(midpoint_outgroup)
    tree.write(outfile=tree_in_rooted)
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def get_color_list(color_num):
    """Return ``color_num`` hex colours drawn at random from a fixed palette.

    A hand-picked palette of 8 colours is used for up to 8 colours and one of
    16 colours for 9-16. For larger requests the palette is built from eight
    seaborn colour families, dropping the two lightest shades of each and
    ordering the rest from dark to light. The selection is random
    (``random.sample``), but the colours are returned in palette order.
    """
    if color_num <= 8:
        palette = ['#3787c0', '#39399f', '#ffb939', '#399f39', '#9f399f', '#fb694a', '#9f9f39', '#959595']
    elif color_num <= 16:
        palette = ['#2b7bba', '#89bedc', '#2e2e99', '#8a8acc', '#ffa500', '#ffc55c', '#2e992e', '#8acc8a',
                   '#992e99', '#cc8acc', '#d52221', '#fc8161', '#99992e', '#cccc8a', '#5c5c5c', '#adadad']
    else:
        shades_per_family = math.ceil(color_num / 8) + 2
        color_families = [
            sns.color_palette('Blues', n_colors=shades_per_family),
            sns.light_palette('navy', n_colors=shades_per_family),
            sns.light_palette('orange', n_colors=shades_per_family),
            sns.light_palette('green', n_colors=shades_per_family),
            sns.light_palette('purple', n_colors=shades_per_family),
            sns.color_palette('Reds', n_colors=shades_per_family),
            sns.light_palette('olive', n_colors=shades_per_family),
            sns.color_palette('Greys', n_colors=shades_per_family),
        ]
        palette = []
        for each_family in color_families:
            # skip the two lightest shades, then go from dark to light
            palette.extend(each_family.as_hex()[2:][::-1])

    picked_colors = set(random.sample(palette, color_num))

    # report the picked colours in palette order rather than sample order
    return [each_color for each_color in palette if each_color in picked_colors]
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def scale_str_to_size_list(scale_str):
    """Convert a dash-separated scale string into relative shape sizes.

    ``scale_str`` looks like ``'0-5-10'`` or ``'1-5-10'``; each field is
    parsed with ``float``. The returned list starts at 0 (scale starts at 0)
    or 0.1 (otherwise) and always ends with 1:

    * scale starting at 0 - every intermediate value is divided by the last
      value, e.g. ``'0-5-10'`` gives ``[0, 0.5, 1]``;
    * any other scale - intermediate sizes are evenly spaced between 0.1 and
      1 regardless of the actual values, e.g. ``'1-5-10'`` gives
      ``[0.1, 0.55, 1]``.

    A single value gives ``[0, 1]`` or ``[0.1, 1]``.
    """
    scale_list = [float(i) for i in scale_str.split('-')]
    inner_values = scale_list[1:-1]

    if scale_list[0] == 0:
        inner_sizes = [each_value / scale_list[-1] for each_value in inner_values]
        return [0] + inner_sizes + [1]

    inner_sizes = []
    if inner_values:
        # the step is only needed (and only well defined) when there is at
        # least one intermediate value; computing it unconditionally divided
        # by zero for a single-value scale such as '5'
        interval_num = len(scale_list) - 1
        interval_value = (1 - 0.1) / interval_num
        inner_sizes = [interval_value * n + 0.1 for n in range(1, len(inner_values) + 1)]
    return [0.1] + inner_sizes + [1]
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def get_ortho_to_gene_dict(ortho_groups_txt, og_program):
    """Parse an orthogroup table into ``{orthogroup_id: [gene_id, ...]}``.

    Lines starting with '#' and blank lines are skipped. Two layouts are
    understood, selected by ``og_program``:

    * ``'orthofinder'`` - space-separated; the first field is the group id
      followed by one trailing character (dropped), the rest are gene ids,
      e.g. ``OG0000001: geneA geneB``;
    * ``'oma'`` - tab-separated; the first field is the group id, every other
      field looks like ``<label>:<gene_id> <anything>`` and only the gene id
      is kept.

    For any other ``og_program`` each line is stored under the key ``''`` with
    an empty gene list.
    """
    ortho_to_gene_dict = dict()
    with open(ortho_groups_txt) as ortho_groups_handle:
        for each_og in ortho_groups_handle:
            # comments, and blank lines that would otherwise create a bogus
            # '' orthogroup, carry no data
            if each_og.startswith('#') or not each_og.strip():
                continue

            og_id = ''
            gene_list = []
            if og_program == 'orthofinder':
                each_og_split = each_og.strip().split(' ')
                og_id = each_og_split[0][:-1]
                gene_list = each_og_split[1:]
            elif og_program == 'oma':
                each_og_split = each_og.strip().split('\t')
                og_id = each_og_split[0]
                gene_list = [each_protein.split(' ')[0].split(':')[1] for each_protein in each_og_split[1:]]
            ortho_to_gene_dict[og_id] = gene_list

    return ortho_to_gene_dict
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def prepare_ale_ip_worker(arg_list):
    """Prepare the genome tree and gene-tree bootstrap file for one orthogroup.

    ``arg_list`` is indexed positionally (single-argument worker):

    * ``[0]`` qualified_og - orthogroup id; ``<og>.treefile`` and
      ``<og>.ufboot`` are read from ``gene_tree_dir``
    * ``[1]`` gene_tree_dir - directory with the gene trees (also receives
      ``<og>_genome_tree.treefile`` and ``<og>_subset.treefile``)
    * ``[2]`` ale_wd - directory receiving the two ALE input files
    * ``[3]`` genome_tree_file_rooted - rooted genome tree
    * ``[4]`` gnm_pco_dict - not used by this function; kept so the argument
      layout stays the same for callers
    * ``[5]`` gene_tree_ufboot_for_ale - file name (inside ``ale_wd``) of the
      subset bootstrap trees
    * ``[6]`` genome_tree_file_subset_for_ale - file name (inside ``ale_wd``)
      of the subset genome tree

    The genome id of a gene is its leaf name with the last '_'-separated
    field removed. Both trees are pruned to the genomes present in both, and
    in the files written for ALE every 'GCA_' / 'GCF_' is replaced by
    'GCA' / 'GCF' (presumably so the only remaining underscore separates
    genome from gene - confirm against the ALE naming rules).
    """
    qualified_og = arg_list[0]
    gene_tree_dir = arg_list[1]
    ale_wd = arg_list[2]
    genome_tree_file_rooted = arg_list[3]
    gene_tree_ufboot_for_ale = arg_list[5]
    genome_tree_file_subset_for_ale = arg_list[6]

    pwd_genome_tree_file_subset = '%s/%s_genome_tree.treefile' % (gene_tree_dir, qualified_og)
    pwd_genome_tree_file_subset_for_ale = '%s/%s' % (ale_wd, genome_tree_file_subset_for_ale)
    pwd_gene_tree_ufboot = '%s/%s.ufboot' % (gene_tree_dir, qualified_og)
    pwd_gene_tree_ufboot_for_ale = '%s/%s' % (ale_wd, gene_tree_ufboot_for_ale)
    pwd_gene_tree_treefile = '%s/%s.treefile' % (gene_tree_dir, qualified_og)
    pwd_gene_tree_treefile_subset = '%s/%s_subset.treefile' % (gene_tree_dir, qualified_og)

    # group the genes on the gene tree by genome
    gnm_to_gene_dict = dict()
    for each_gene in Tree(pwd_gene_tree_treefile).get_leaf_names():
        gene_gnm = '_'.join(each_gene.split('_')[:-1])
        gnm_to_gene_dict.setdefault(gene_gnm, set()).add(each_gene)

    # subset genome tree to the genomes found in both trees
    genome_tree_leaf_set = set(Tree(genome_tree_file_rooted).get_leaf_names())
    gnms_in_both_trees = genome_tree_leaf_set.intersection(gnm_to_gene_dict)
    gnm_tree_subset_str = subset_tree(genome_tree_file_rooted, gnms_in_both_trees, None)
    gnm_tree_subset_str_for_ale = gnm_tree_subset_str.replace('GCA_', 'GCA').replace('GCF_', 'GCF')

    # write out genome tree subset
    with open(pwd_genome_tree_file_subset, 'w') as pwd_genome_tree_file_subset_handle:
        pwd_genome_tree_file_subset_handle.write(gnm_tree_subset_str)

    # write out genome tree subset for running ALE
    with open(pwd_genome_tree_file_subset_for_ale, 'w') as pwd_genome_tree_file_subset_for_ale_handle:
        pwd_genome_tree_file_subset_for_ale_handle.write(gnm_tree_subset_str_for_ale)

    # get genes to keep in gene tree
    gene_set_to_keep = set()
    for each_gnm in gnms_in_both_trees:
        gene_set_to_keep.update(gnm_to_gene_dict.get(each_gnm, set()))

    # subset gene_tree.treefile
    subset_tree(pwd_gene_tree_treefile, gene_set_to_keep, pwd_gene_tree_treefile_subset)

    # subset gene_tree.ufboot (one Newick tree per line) and rename leaves for
    # running ALE; both handles are closed even if a tree fails to parse
    with open(pwd_gene_tree_ufboot_for_ale, 'w') as pwd_gene_tree_ufboot_for_ale_handle, open(pwd_gene_tree_ufboot) as pwd_gene_tree_ufboot_handle:
        for each_gene_tree in pwd_gene_tree_ufboot_handle:
            gene_tree_str = each_gene_tree.strip()
            gene_tree_str_subset_for_ale = subset_tree(gene_tree_str, gene_set_to_keep, None)
            gene_tree_str_subset_for_ale = gene_tree_str_subset_for_ale.replace('GCA_', 'GCA').replace('GCF_', 'GCF')
            pwd_gene_tree_ufboot_for_ale_handle.write(gene_tree_str_subset_for_ale + '\n')
|
|
558
|
+
|
|
559
|
+
|
|
560
|
+
def ALE1(args):
    """Select orthogroups, extract their sequences and write gene-tree commands.

    Keys read from ``args``:
        i  -- orthogroup table, parsed by ``get_ortho_to_gene_dict``
        s  -- combined protein sequence file handed to ``select_seq``
        p  -- name of the program that produced the orthogroup table
        c  -- tab-separated genome taxonomy file (genome id, ';'-joined taxon string)
        m  -- minimum number of genomes an orthogroup must span
        n  -- minimum number of phyla an orthogroup must span
        t  -- number of worker processes used to extract sequences
        jt -- thread count written into the mafft/iqtree commands
        f  -- overwrite the output directory if it already exists
        o  -- output directory; the command file is written next to it as '<o>_cmds.txt'

    The process exits if the output directory exists and ``f`` is not True.
    """
    import shutil

    orthogroups_op_txt = args['i']
    combined_faa = args['s']
    og_program = args['p']
    genome_taxon_txt = args['c']
    min_og_genome_num = args['m']
    min_og_phylum_num = args['n']
    num_threads = args['t']
    js_num_threads = args['jt']
    force_create_op_dir = args['f']
    op_dir = args['o']
    designate_ogs = []          # hard-coded hook: non-empty list overrides the qualified OGs
    to_ignore_ogs_list = []     # hard-coded hook: OGs to leave out

    # define output file name
    get_gene_tree_cmds_txt = '%s_cmds.txt' % op_dir

    # (re)create the output directory without going through a shell, so that
    # paths containing spaces or shell metacharacters are handled correctly
    if os.path.isdir(op_dir) is True:
        if force_create_op_dir is True:
            shutil.rmtree(op_dir)
        else:
            print('Output folder detected, program exited!')
            exit()
    os.makedirs(op_dir, exist_ok=True)

    # read in genome taxonomy (only the phylum, the second ';'-separated rank, is needed here)
    gnm_p_dict = dict()
    with open(genome_taxon_txt) as genome_taxon_handle:
        for each_gnm in genome_taxon_handle:
            if not each_gnm.strip():
                continue  # tolerate blank lines
            each_gnm_split = each_gnm.strip().split('\t')
            gnm_p_dict[each_gnm_split[0]] = each_gnm_split[1].split(';')[1]

    # get ortho_to_gene_dict
    ortho_to_gene_dict = get_ortho_to_gene_dict(orthogroups_op_txt, og_program)

    # get qualified orthogroups
    qualified_og_set = set()
    for each_ortho, ortho_gene_set in ortho_to_gene_dict.items():
        ortho_p_set = set()
        ortho_gnm_set = set()
        for each_gene in ortho_gene_set:
            # the genome id is everything before the last '_' of the gene id
            gene_gnm = '_'.join(each_gene.split('_')[:-1])
            ortho_gnm_set.add(gene_gnm)
            ortho_p_set.add(gnm_p_dict[gene_gnm])
        if (len(ortho_gnm_set) >= min_og_genome_num) and (len(ortho_p_set) >= min_og_phylum_num):
            qualified_og_set.add(each_ortho)
    print('The total number of identified orthogroups is %s.' % len(ortho_to_gene_dict))
    print('The number of orthogroups spanning >= %s genomes and >= %s phyla is %s.' % (min_og_genome_num, min_og_phylum_num, len(qualified_og_set)))

    # process qualified OG
    og_to_process = sorted(qualified_og_set)
    if len(designate_ogs) > 0:
        print('The number of designated OGs to process: %s' % len(designate_ogs))
        og_to_process = designate_ogs
    og_to_process_no_ignored = {og for og in og_to_process if og not in to_ignore_ogs_list}

    # extract gene sequences and prepare commands for building gene tree
    print('Preparing commands for building gene trees')
    extract_seq_arg_lol = []
    with open(get_gene_tree_cmds_txt, 'w') as get_gene_tree_cmds_txt_handle:
        for qualified_og in sorted(og_to_process_no_ignored):
            qualified_og_gene_txt = '%s/%s.txt' % (op_dir, qualified_og)
            qualified_og_gene_faa = '%s/%s.faa' % (op_dir, qualified_og)

            # write out the id of genes
            with open(qualified_og_gene_txt, 'w') as qualified_og_gene_txt_handle:
                qualified_og_gene_txt_handle.write('\n'.join(ortho_to_gene_dict[qualified_og]))

            # add to mp lol
            extract_seq_arg_lol.append([combined_faa, qualified_og_gene_txt, qualified_og_gene_faa])

            # write out js for mafft and iqtree; the file names are relative,
            # so the commands resolve only when run from inside op_dir
            mafft_cmd = 'mafft-einsi --thread %s --quiet %s.faa > %s.aln' % (js_num_threads, qualified_og, qualified_og)
            iqtree_cmd = 'iqtree -m LG+G+I -bb 1000 --wbtl -nt %s -s %s.aln -pre %s' % (js_num_threads, qualified_og, qualified_og)
            get_gene_tree_cmds_txt_handle.write('%s; %s\n' % (mafft_cmd, iqtree_cmd))

    # extract gene sequences with multiprocessing
    print('Extracting gene sequences with %s cores' % num_threads)
    pool = mp.Pool(processes=num_threads)
    try:
        pool.map(select_seq, extract_seq_arg_lol)
    finally:
        # always release the worker processes, even if a worker raised
        pool.close()
        pool.join()
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
def ALE2(args):
    """Prepare per-orthogroup ALE inputs and write the ALE command file.

    Keys read from ``args``:
        i -- directory holding the '<OG>.ufboot' gene-tree bootstrap files
        s -- rooted genome tree file, passed on to ``prepare_ale_ip_worker``
        c -- tab-separated genome taxonomy file (genome id, ';'-joined taxon string)
        f -- remove an existing working directory before creating it
        t -- number of worker processes
        o -- ALE working directory; commands go to '<o>_cmds.txt'

    One 'ALEobserve ...; ALEml_undated ...' line is written per orthogroup and
    ``prepare_ale_ip_worker`` is mapped over the orthogroups in a process pool.
    """
    import shutil

    gene_tree_dir = args['i']
    genome_tree_file_rooted = args['s']
    genome_taxon_txt = args['c']
    force_create_ale_wd = args['f']
    num_threads = args['t']
    ale_wd = args['o']
    run_ale_cmds_txt = '%s_cmds.txt' % ale_wd

    # one orthogroup per ufboot file; sorted so the command file order is reproducible
    og_to_process_list = []
    for each_ufboot in sorted(glob.glob('%s/*.ufboot' % gene_tree_dir)):
        _, ufboot_base, _ = sep_path_basename_ext(each_ufboot)
        og_to_process_list.append(ufboot_base)

    # read in genome taxonomy: label is phylum__class__order__genome, with the
    # first three characters of each rank name dropped
    gnm_pco_dict = dict()
    with open(genome_taxon_txt) as genome_taxon_handle:
        for each_gnm in genome_taxon_handle:
            if not each_gnm.strip():
                continue  # tolerate blank lines
            each_gnm_split = each_gnm.strip().split('\t')
            gnm_id = each_gnm_split[0]
            taxon_ranks = each_gnm_split[1].split(';')
            gnm_phylum = taxon_ranks[1]
            gnm_class = taxon_ranks[2]
            gnm_order = taxon_ranks[3]
            gnm_pco_dict[gnm_id] = '%s__%s__%s__%s' % (gnm_phylum[3:], gnm_class[3:], gnm_order[3:], gnm_id)

    # create ale_wd without a shell; an existing directory is reused unless
    # overwriting was requested
    if force_create_ale_wd is True:
        if os.path.isdir(ale_wd) is True:
            shutil.rmtree(ale_wd)
    os.makedirs(ale_wd, exist_ok=True)

    prepare_ale_ip_worker_arg_lol = []
    with open(run_ale_cmds_txt, 'w') as run_ale_cmds_txt_handle:
        for qualified_og in og_to_process_list:
            pwd_gene_tree_ufboot = '%s/%s.ufboot' % (gene_tree_dir, qualified_og)
            if os.path.isfile(pwd_gene_tree_ufboot) is False:
                print('%s not found, please build gene tree first!' % pwd_gene_tree_ufboot)
                continue

            # file names in the commands are relative, so the commands resolve
            # only when run from inside ale_wd
            gene_tree_ufboot_for_ale = '%s_for_ALE.ufboot' % qualified_og
            genome_tree_file_subset_for_ale = '%s_genome_tree_for_ALE.treefile' % qualified_og
            obtain_ale_file_cmd = 'ALEobserve %s' % gene_tree_ufboot_for_ale
            reconciliation_cmd = 'ALEml_undated %s %s_for_ALE.ufboot.ale' % (genome_tree_file_subset_for_ale, qualified_og)
            run_ale_cmds_txt_handle.write('%s; %s\n' % (obtain_ale_file_cmd, reconciliation_cmd))
            prepare_ale_ip_worker_arg_lol.append([qualified_og, gene_tree_dir, ale_wd, genome_tree_file_rooted, gnm_pco_dict, gene_tree_ufboot_for_ale, genome_tree_file_subset_for_ale])

    # prepare input files and job script for running ALE with multiprocessing
    print('Preparing files for running ALE with %s cores for %s OGs' % (num_threads, len(prepare_ale_ip_worker_arg_lol)))
    pool = mp.Pool(processes=num_threads)
    try:
        pool.map(prepare_ale_ip_worker, prepare_ale_ip_worker_arg_lol)
    finally:
        # always release the worker processes, even if a worker raised
        pool.close()
        pool.join()
|
|
732
|
+
|
|
733
|
+
|
|
734
|
+
def iTOL(Leaf_to_Group_dict, Group_to_Color_dict, FileOut):
    """Write a tab-separated DATASET_COLORSTRIP annotation file.

    Args:
        Leaf_to_Group_dict: maps each leaf name to its group label.
        Group_to_Color_dict: maps group label to colour. If empty, colours for
            all groups are requested from ``get_color_list``. If non-empty,
            groups without a colour get one from ``get_color_list`` and are
            added to this dict in place, so a later call with the same dict
            sees the same colours.
        FileOut: path of the annotation file to write.
    """
    # Sort the group labels so the group -> colour pairing does not depend on
    # set iteration order, which varies between runs for strings.
    group_list = sorted(set(Leaf_to_Group_dict.values()), key=str)

    if not Group_to_Color_dict:
        Group_to_Color_dict = dict(zip(group_list, get_color_list(len(group_list))))
    else:
        group_without_color_list = [g for g in group_list if g not in Group_to_Color_dict]
        if group_without_color_list:
            color_list_unprovided = get_color_list(len(group_without_color_list))
            for each_group, each_color in zip(group_without_color_list, color_list_unprovided):
                Group_to_Color_dict[each_group] = each_color

    with open(FileOut, 'w') as FileOut_handle:
        FileOut_handle.write('DATASET_COLORSTRIP\n')
        FileOut_handle.write('SEPARATOR TAB\n')
        FileOut_handle.write('DATASET_LABEL\tTaxonomy\n')
        FileOut_handle.write('\n# customize strip attributes here\n')
        FileOut_handle.write('STRIP_WIDTH\t100\n')
        FileOut_handle.write('MARGIN\t20\n')
        FileOut_handle.write('\n# provide data here\nDATA\n')
        for leaf, leaf_group in Leaf_to_Group_dict.items():
            FileOut_handle.write('%s\t%s\t%s\n' % (leaf, Group_to_Color_dict[leaf_group], leaf_group))
|
|
766
|
+
|
|
767
|
+
|
|
768
|
+
def parse_ale_op_worker(arg_list):
    """Turn the ALE output of one orthogroup into iTOL annotations and HGT plots.

    ``arg_list`` holds 20 positional items, in this order: orthogroup id,
    gene tree directory, ALE working directory, ALE output directory, plot
    directory, internal node prefix, genome -> 'phylum__class__order__genome'
    dict, donor colour, recipient colour, iTOL project name, iTOL API key,
    display mode, HGT frequency cutoff, ignore-leaf-HGT flag,
    ignore-vertical-HGT flag, minimum donor leaf number, minimum recipient
    leaf number, donor/recipient separator, midpoint-rooting flag and the
    phylum colour file (colour in column 1, phylum in column 2).

    For every donor/recipient pair returned by ``uts_to_itol_connections`` a
    genome tree plot and a gene tree plot are requested through ``itol_tree``
    and merged with ``merge_pdf``. Intermediate files end up in
    '<plot directory>/annotation_files'.
    """
    import shutil

    (qualified_og, gene_tree_dir, ale_wd, ale_op_dir, ale_hgt_plot_dir,
     interal_node_prefix, gnm_pco_dict, d_color, r_color, project_name,
     API_key, display_mode, hgt_freq_cutoff, ignore_leaf_hgt,
     ignore_vertical_hgt, donor_node_min_leaf_num, recipient_node_min_leaf_num,
     dr_separator, root_gene_tree_at_midpoint, p_color_txt) = arg_list[:20]

    annotation_dir = '%s/annotation_files' % ale_hgt_plot_dir

    def _move_to_annotation_dir(pwd_file):
        """Move pwd_file into annotation_dir; a missing file is reported, not fatal."""
        if os.path.exists(pwd_file):
            shutil.move(pwd_file, os.path.join(annotation_dir, os.path.basename(pwd_file)))
        else:
            print('%s not found, not moved to %s' % (pwd_file, annotation_dir))

    gene_tree_ufboot_for_ale = '%s_for_ALE.ufboot' % qualified_og

    ale_uml_rec_file = '%s/%s_for_ALE.ufboot.ale.uml_rec' % (ale_wd, qualified_og)
    pwd_gene_tree_treefile = '%s/%s.treefile' % (gene_tree_dir, qualified_og)
    pwd_gene_tree_treefile_subset = '%s/%s_subset.treefile' % (gene_tree_dir, qualified_og)
    pwd_gene_tree_treefile_subset_midpoint_rooted = '%s/%s_subset_midpoint_rooted.treefile' % (ale_op_dir, qualified_og)
    pwd_genome_tree_file_subset_for_ale = '%s/%s_genome_tree_for_ALE.treefile' % (ale_op_dir, qualified_og)
    pwd_uts_file = '%s/%s.ale.uTs' % (ale_op_dir, gene_tree_ufboot_for_ale)
    pwd_uml_rec_file = '%s/%s.ale.uml_rec' % (ale_op_dir, gene_tree_ufboot_for_ale)
    pwd_ale_formatted_gnm_tree = '%s/%s_ALE_formatted_genome_tree.tree' % (ale_op_dir, gene_tree_ufboot_for_ale)
    pwd_ale_formatted_gnm_tree_with_len = '%s/%s_ALE_formatted_genome_tree_with_len.tree' % (ale_op_dir, gene_tree_ufboot_for_ale)
    pwd_ale_formatted_gnm_tree_with_len_prefixed = '%s/%s_ALE_formatted_genome_tree_with_len_prefixed.tree' % (ale_op_dir, gene_tree_ufboot_for_ale)
    pwd_itol_label_txt = '%s/%s_iTOL_genome_pco.txt' % (ale_op_dir, qualified_og)
    pwd_itol_connection_txt_all = '%s/%s_iTOL_connection.txt' % (ale_hgt_plot_dir, qualified_og)
    pwd_gene_tree_itol_label_txt = '%s/%s_iTOL_gene_pco.txt' % (ale_hgt_plot_dir, qualified_og)
    pwd_gene_tree_itol_colorstrip_txt = '%s/%s_iTOL_colorstrip_gene.txt' % (ale_hgt_plot_dir, qualified_og)
    pwd_genome_tree_itol_colorstrip_txt = '%s/%s_iTOL_colorstrip_genome.txt' % (ale_hgt_plot_dir, qualified_og)

    # run ale_splitter
    ale_splitter(ale_uml_rec_file)

    # read in phylum color
    p_color_dict = dict()
    with open(p_color_txt) as p_color_txt_handle:
        for each_line in p_color_txt_handle:
            if not each_line.strip():
                continue  # tolerate blank lines
            each_line_split = each_line.strip().split('\t')
            p_color_dict[each_line_split[1]] = each_line_split[0]

    paired_donor_to_recipient_leaf_dict = dict()
    hgt_freq_dict = dict()
    if os.path.isfile(pwd_uts_file) is True:

        # write out ALE formatted genome tree (second tab field of the third line of the uml_rec file)
        with open(pwd_uml_rec_file) as pwd_uml_rec_file_handle:
            renamed_genome_tree_str = pwd_uml_rec_file_handle.readlines()[2].strip().split('\t')[1]
        with open(pwd_ale_formatted_gnm_tree, 'w') as ale_renamed_species_tree_handle:
            ale_renamed_species_tree_handle.write(renamed_genome_tree_str + '\n')

        _, paired_donor_to_recipient_leaf_dict, hgt_freq_dict = uts_to_itol_connections(pwd_genome_tree_file_subset_for_ale, pwd_ale_formatted_gnm_tree, interal_node_prefix, pwd_uts_file, hgt_freq_cutoff, ignore_leaf_hgt, ignore_vertical_hgt, donor_node_min_leaf_num, recipient_node_min_leaf_num, pwd_itol_connection_txt_all, dr_separator)
    else:
        print('%s: uTs file not found, you need to run ALE first!' % qualified_og)
        # NOTE(review): processing continues as before; the ALE formatted genome
        # tree was not written in this case - confirm combine_trees copes with that.

    # combine_trees
    combine_trees(pwd_genome_tree_file_subset_for_ale, pwd_ale_formatted_gnm_tree, pwd_ale_formatted_gnm_tree_with_len)

    # prefix_internal_nodes of combined tree
    prefix_internal_nodes(pwd_ale_formatted_gnm_tree_with_len, interal_node_prefix, pwd_ale_formatted_gnm_tree_with_len_prefixed)

    # the gene tree leaves are needed for the labels and again for every HGT pair; parse once
    gene_leaf_names = Tree(pwd_gene_tree_treefile).get_leaf_names()

    # write out iTOL label file for gene and genome tree, also colorstrip for taxonomy
    wrote_gnm_set = set()
    gene_to_p_dict = dict()
    genome_to_p_dict = dict()
    with open(pwd_itol_label_txt, 'w') as pwd_itol_label_txt_handle, open(pwd_gene_tree_itol_label_txt, 'w') as pwd_gene_tree_itol_label_txt_handle:
        pwd_itol_label_txt_handle.write('LABELS\nSEPARATOR TAB\n\nDATA\n')
        pwd_gene_tree_itol_label_txt_handle.write('LABELS\nSEPARATOR TAB\n\nDATA\n')
        for each_gene in gene_leaf_names:
            gene_gnm = '_'.join(each_gene.split('_')[:-1])
            # genome names in the ALE trees have the '_' after GCA/GCF removed
            genome_name_for_ale = gene_gnm.replace('GCA_', 'GCA').replace('GCF_', 'GCF')
            genome_with_taxon = gnm_pco_dict[gene_gnm]
            genome_phylum = genome_with_taxon.split('__')[0]
            gene_to_p_dict[each_gene] = genome_phylum
            if gene_gnm not in wrote_gnm_set:
                genome_to_p_dict[genome_name_for_ale] = genome_phylum
                pwd_itol_label_txt_handle.write('%s\t%s\n' % (genome_name_for_ale, genome_with_taxon))
                wrote_gnm_set.add(gene_gnm)
            pwd_gene_tree_itol_label_txt_handle.write('%s\t%s_%s\n' % (each_gene, genome_with_taxon, each_gene.split('_')[-1]))

    iTOL(gene_to_p_dict, p_color_dict, pwd_gene_tree_itol_colorstrip_txt)
    iTOL(genome_to_p_dict, p_color_dict, pwd_genome_tree_itol_colorstrip_txt)

    # root gene tree at midpoint
    gene_tree_to_plot = pwd_gene_tree_treefile_subset
    if root_gene_tree_at_midpoint is True:
        root_at_midpoint(pwd_gene_tree_treefile_subset, pwd_gene_tree_treefile_subset_midpoint_rooted)
        gene_tree_to_plot = pwd_gene_tree_treefile_subset_midpoint_rooted

    os.makedirs(annotation_dir, exist_ok=True)

    # plot separately
    style_header = 'DATASET_STYLE\nSEPARATOR TAB\nDATASET_LABEL\texample_style\nCOLOR\t#ffff00\n\nDATA\n'
    for n, each_d2r in enumerate(paired_donor_to_recipient_leaf_dict, start=1):
        each_d2r_freq = hgt_freq_dict[each_d2r]
        each_d2r_d_list = paired_donor_to_recipient_leaf_dict[each_d2r][0]
        each_d2r_r_list = paired_donor_to_recipient_leaf_dict[each_d2r][1]
        pwd_gnm_tree_label_color_txt = '%s/%s_iTOL_label_color_genome_%s.txt' % (ale_hgt_plot_dir, qualified_og, each_d2r)
        pwd_gene_tree_label_color_txt = '%s/%s_iTOL_label_color_gene_%s.txt' % (ale_hgt_plot_dir, qualified_og, each_d2r)
        pwd_itol_connection_txt = '%s/%s_iTOL_connection_%s.txt' % (ale_hgt_plot_dir, qualified_og, each_d2r)
        pwd_ale_formatted_gnm_tree_with_len_prefixed_pdf = '%s/%s_genome_tree_with_HGT_%s.pdf' % (ale_wd, qualified_og, each_d2r)
        pwd_gene_tree_treefile_subset_pdf = '%s/%s_subset_%s.pdf' % (ale_hgt_plot_dir, qualified_og, each_d2r)
        pwd_combined_image_with_ale_hgts = '%s/%s_HGT_%s_%s_%s.pdf' % (ale_hgt_plot_dir, qualified_og, n, each_d2r, each_d2r_freq)

        # write out gnm_tree_label_color_txt: donor and recipient clades get their own colour
        with open(pwd_gnm_tree_label_color_txt, 'w') as pwd_gnm_tree_label_color_txt_handle:
            pwd_gnm_tree_label_color_txt_handle.write(style_header)
            pwd_gnm_tree_label_color_txt_handle.write('%s\tlabel\tclade\t%s\t1\tnormal\n' % (each_d2r.split(dr_separator)[0], d_color))
            pwd_gnm_tree_label_color_txt_handle.write('%s\tlabel\tclade\t%s\t1\tnormal\n' % (each_d2r.split(dr_separator)[1], r_color))

        # write out gene_tree_label_color_txt: colour genes by the side of the transfer their genome is on
        with open(pwd_gene_tree_label_color_txt, 'w') as pwd_gene_tree_label_color_txt_handle:
            pwd_gene_tree_label_color_txt_handle.write(style_header)
            for each_gene in gene_leaf_names:
                gene_name_for_ale = '_'.join(each_gene.strip().split('_')[:-1])
                gene_name_for_ale = gene_name_for_ale.replace('GCA_', 'GCA').replace('GCF_', 'GCF')
                if gene_name_for_ale in each_d2r_d_list:
                    pwd_gene_tree_label_color_txt_handle.write('%s\tlabel\tnode\t%s\t1\tnormal\n' % (each_gene, d_color))
                elif gene_name_for_ale in each_d2r_r_list:
                    pwd_gene_tree_label_color_txt_handle.write('%s\tlabel\tnode\t%s\t1\tnormal\n' % (each_gene, r_color))

        itol_tree(pwd_ale_formatted_gnm_tree_with_len_prefixed, [pwd_gnm_tree_label_color_txt, pwd_itol_label_txt, pwd_itol_connection_txt, pwd_genome_tree_itol_colorstrip_txt], project_name, API_key, display_mode, pwd_ale_formatted_gnm_tree_with_len_prefixed_pdf)
        itol_tree(gene_tree_to_plot, [pwd_gene_tree_itol_label_txt, pwd_gene_tree_label_color_txt, pwd_gene_tree_itol_colorstrip_txt], project_name, API_key, display_mode, pwd_gene_tree_treefile_subset_pdf)
        merge_pdf(pwd_ale_formatted_gnm_tree_with_len_prefixed_pdf, pwd_gene_tree_treefile_subset_pdf, 66, pwd_combined_image_with_ale_hgts)

        # files that belong to this pair only can be put away immediately
        for pwd_per_pair_file in (pwd_ale_formatted_gnm_tree_with_len_prefixed_pdf,
                                  pwd_gene_tree_treefile_subset_pdf,
                                  pwd_gnm_tree_label_color_txt,
                                  pwd_gene_tree_label_color_txt,
                                  pwd_itol_connection_txt):
            _move_to_annotation_dir(pwd_per_pair_file)

    # Files shared by all pairs are moved once, after the last plot, so they are
    # still in place for every itol_tree call above.
    # NOTE(review): with no donor/recipient pair these files are moved as well - confirm this is wanted.
    for pwd_shared_file in (pwd_itol_label_txt,
                            pwd_gene_tree_itol_label_txt,
                            pwd_itol_connection_txt_all,
                            pwd_gene_tree_itol_colorstrip_txt,
                            pwd_genome_tree_itol_colorstrip_txt):
        _move_to_annotation_dir(pwd_shared_file)
|
|
935
|
+
|
|
936
|
+
|
|
937
|
+
def ale_splitter(rec_file):
    """Split an ALE '.uml_rec' file into four sibling files.

    The sections are located by fixed line positions in ``rec_file`` and
    written next to it, with the 'uml_rec' part of the file name replaced:

        stree     -- last tab field of line 3 (no trailing newline)
        info      -- LL, Dp/Tp/Lp rates and De/Te/Le/Se event totals, one per line
        recs      -- the reconciled trees, starting at line 13
        rec_table -- everything from the third line after the trees onwards

    Only the file name is rewritten, never the directory part of the path. If
    the file name does not contain 'uml_rec', the suffix is appended instead,
    so the input file is never overwritten.

    Raises IndexError if the file is shorter than the layout requires and
    ValueError if the reconciled tree count is not an integer.
    """
    import os

    def _sibling_path(suffix):
        """Return the output path for one section, next to rec_file."""
        folder, file_name = os.path.split(rec_file)
        stem, marker, tail = file_name.rpartition('uml_rec')
        if marker:
            file_name = stem + suffix + tail
        else:
            file_name = '%s.%s' % (file_name, suffix)
        return os.path.join(folder, file_name)

    with open(rec_file) as rec_handle:
        lines = rec_handle.readlines()

    stree = lines[2].strip().split("\t")[-1]
    ll = lines[6].strip().split()[-1]
    rates = lines[8].strip().split("\t")[1:]
    n_reconciled_trees = int(lines[10].strip().split()[0])
    trees_end = 12 + n_reconciled_trees
    reconciled_trees = lines[12:trees_end]
    # drop the line ending so the last total does not carry a newline into the info file
    n_of_events = lines[trees_end + 1].rstrip("\r\n").split("\t")[1:]
    table = lines[trees_end + 3:]

    info_rows = [("LL:", ll),
                 ("Dp:", rates[0]), ("Tp:", rates[1]), ("Lp:", rates[2]),
                 ("De:", n_of_events[0]), ("Te:", n_of_events[1]),
                 ("Le:", n_of_events[2]), ("Se:", n_of_events[3])]

    with open(_sibling_path("stree"), "w") as f:
        f.write(stree)
    with open(_sibling_path("info"), "w") as f:
        for label, value in info_rows:
            f.write(label + "\t" + value + "\n")
    with open(_sibling_path("recs"), "w") as f:
        f.writelines(reconciled_trees)
    with open(_sibling_path("rec_table"), "w") as f:
        f.writelines(table)
|
|
971
|
+
|
|
972
|
+
|
|
973
|
+
def ALE3(args):
    """Parse the ALE output of every orthogroup and plot the predicted HGTs.

    Keys read from ``args``:
        i1    -- directory holding the '<OG>.ufboot' files (one orthogroup each)
        i2    -- ALE working directory, used both as working and output directory
        c     -- tab-separated genome taxonomy file (genome id, ';'-joined taxon string)
        color -- phylum colour file, passed on to ``parse_ale_op_worker``
        o     -- plot directory; 'annotation_files' is created inside it
        f     -- overwrite the plot directory if it already exists
        api   -- iTOL API key
        fc    -- HGT frequency cutoff
        mld   -- minimum leaf number of a donor node
        mlr   -- minimum leaf number of a recipient node
        itol  -- iTOL project name

    The process exits if the plot directory exists and ``f`` is not True.
    Orthogroups are handled one after another by ``parse_ale_op_worker``.
    """
    import shutil

    gene_tree_dir = args['i1']
    ale_wd = args['i2']
    genome_taxon_txt = args['c']
    ar_phylum_color_code_txt = args['color']
    ale_hgt_plot_dir = args['o']
    force_create_op_dir = args['f']
    API_key = args['api']
    hgt_freq_cutoff = args['fc']
    donor_node_min_leaf_num = args['mld']
    recipient_node_min_leaf_num = args['mlr']
    project_name = args['itol']

    ignore_vertical_hgt = True          # filter ALE predicted HGTs
    ignore_leaf_hgt = True              # filter ALE predicted HGTs
    interal_node_prefix = 'IN'          # plot tree with HGT
    display_mode = '1'                  # plot tree with HGT, 1=rectangular, 2=circular, 3=unrooted
    d_color = '#FF0000'                 # plot tree with HGT
    r_color = '#0000FF'                 # plot tree with HGT
    dr_separator = '_to_'               # plot tree with HGT
    root_gene_tree_at_midpoint = True   # plot tree with HGT

    ####################################################################################################################

    # one orthogroup per ufboot file; sorted so the processing order is reproducible
    og_to_process_list = []
    for each_ufboot in sorted(glob.glob('%s/*.ufboot' % gene_tree_dir)):
        _, ufboot_base, _ = sep_path_basename_ext(each_ufboot)
        og_to_process_list.append(ufboot_base)

    # read in genome taxonomy: label is phylum__class__order__genome, with the
    # first three characters of each rank name dropped
    gnm_pco_dict = dict()
    with open(genome_taxon_txt) as genome_taxon_handle:
        for each_gnm in genome_taxon_handle:
            if not each_gnm.strip():
                continue  # tolerate blank lines
            each_gnm_split = each_gnm.strip().split('\t')
            gnm_id = each_gnm_split[0]
            taxon_ranks = each_gnm_split[1].split(';')
            gnm_phylum = taxon_ranks[1]
            gnm_class = taxon_ranks[2]
            gnm_order = taxon_ranks[3]
            gnm_pco_dict[gnm_id] = '%s__%s__%s__%s' % (gnm_phylum[3:], gnm_class[3:], gnm_order[3:], gnm_id)

    # (re)create the plot directory without going through a shell, so that
    # paths containing spaces or shell metacharacters are handled correctly
    if os.path.isdir(ale_hgt_plot_dir) is True:
        if force_create_op_dir is True:
            shutil.rmtree(ale_hgt_plot_dir)
        else:
            print('Output folder detected, program exited!')
            exit()
    os.makedirs('%s/annotation_files' % ale_hgt_plot_dir, exist_ok=True)

    # parse ALE output
    for n, qualified_og in enumerate(og_to_process_list, start=1):

        print('%s (%s/%s): Parsing ALE outputs' % (qualified_og, n, len(og_to_process_list)))
        # ale_wd is passed twice: as working directory and as ALE output directory
        current_arg_list = [qualified_og, gene_tree_dir, ale_wd, ale_wd, ale_hgt_plot_dir, interal_node_prefix,
                            gnm_pco_dict, d_color, r_color, project_name, API_key, display_mode, hgt_freq_cutoff,
                            ignore_leaf_hgt, ignore_vertical_hgt, donor_node_min_leaf_num, recipient_node_min_leaf_num,
                            dr_separator, root_gene_tree_at_midpoint, ar_phylum_color_code_txt]
        parse_ale_op_worker(current_arg_list)

    print('Done!')
|
|
1040
|
+
|
|
1041
|
+
|
|
1042
|
+
def ALE4(args):
    """Placeholder entry point: accepts ``args`` and does nothing."""
    return None
|
|
1044
|
+
|
|
1045
|
+
|
|
1046
|
+
if __name__ == '__main__':
    # Running this module directly does nothing; ALE1, ALE2 and ALE3 are
    # called with an ``args`` dict built elsewhere.
    pass

    # The disabled parsers below are kept as a record of the option keys each
    # entry point reads from ``args``.

    # ALE1_parser = argparse.ArgumentParser()
    # ALE1_parser.add_argument('-i', required=True, help='orthologous groups, either from orthofinder or oma')
    # ALE1_parser.add_argument('-s', required=True, help='sequence file, e.g., combined.faa')
    # ALE1_parser.add_argument('-p', required=True, help='orthologous identification program, orthofinder or oma')
    # ALE1_parser.add_argument('-m', required=False, type=int, default=50, help='min_og_genome_num, default: 50')
    # ALE1_parser.add_argument('-n', required=False, type=int, default=2, help='min_og_phylum_num, default: 2')
    # ALE1_parser.add_argument('-o', required=True, help='output dir, i.e., OMA working directory')
    # ALE1_parser.add_argument('-t', required=False, type=int, default=6, help='number of threads, default: 6')
    # ALE1_parser.add_argument('-jt', required=False, type=int, default=3, help='number of threads for job script, default: 3')
    # ALE1_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    # ALE1_parser.add_argument('-c', required=True, help='genome_taxon_txt')
    # args = vars(ALE1_parser.parse_args())
    # ALE1(args)

    # ALE2_parser = argparse.ArgumentParser()
    # ALE2_parser.add_argument('-i', required=True, help='ALE1 output directory')
    # ALE2_parser.add_argument('-s', required=True, help='rooted species tree')
    # ALE2_parser.add_argument('-c', required=True, help='genome_taxon_txt')
    # ALE2_parser.add_argument('-o', required=True, help='output dir, i.e., OMA working directory')
    # ALE2_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    # ALE2_parser.add_argument('-t', required=False, type=int, default=6, help='number of threads, default: 6')
    # args = vars(ALE2_parser.parse_args())
    # ALE2(args)

    # NOTE(review): ALE3 also reads args['api'], args['fc'], args['mld'],
    # args['mlr'] and args['itol'], which this draft parser does not define.
    # ALE3_parser = argparse.ArgumentParser()
    # ALE3_parser.add_argument('-i1', required=True, help='ALE1 output directory')
    # ALE3_parser.add_argument('-i2', required=True, help='ALE2 output directory')
    # ALE3_parser.add_argument('-c', required=True, help='genome_taxon_txt')
    # ALE3_parser.add_argument('-color', required=True, help='phylum_color_code.txt')
    # ALE3_parser.add_argument('-o', required=True, help='output dir, i.e., ALE3_op_dir')
    # ALE3_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    # args = vars(ALE3_parser.parse_args())
    # ALE3(args)
|