treesak 1.53.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +113 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/C60SR4.nex +127 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +299 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +115 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB.py +371 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +21 -0
- TreeSAK/SplitScore1.py +177 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +608 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +164 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/batch_itol.py +171 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/filter_rename_ar53.py +118 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_gene_tree_by_gnm.py +34 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +78 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/tmp_4.py +43 -0
- TreeSAK/tmp_5.py +12 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.53.3.data/scripts/TreeSAK +955 -0
- treesak-1.53.3.dist-info/LICENSE +674 -0
- treesak-1.53.3.dist-info/METADATA +27 -0
- treesak-1.53.3.dist-info/RECORD +131 -0
- treesak-1.53.3.dist-info/WHEEL +5 -0
- treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/ALE5.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import operator
|
|
4
|
+
from ete3 import Tree
|
|
5
|
+
from itertools import chain
|
|
6
|
+
from itertools import combinations
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def sep_path_basename_ext(file_in):
    """Split a file path into (directory, base name, extension).

    The directory falls back to '.' when the path has no directory part.
    The extension keeps its leading dot (e.g. '.pdf').
    """
    directory, leaf = os.path.split(file_in)
    stem, suffix = os.path.splitext(leaf)
    return (directory or '.'), stem, suffix
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def powerset(iterable):
    """Return every non-empty combination of the input as a list of lists.

    powerset([1,2,3]) --> [1] [2] [3] [1,2] [1,3] [2,3] [1,2,3]

    Combinations are ordered by size first, then by input order. The empty
    combination is not part of the result. Duplicate elements are allowed.
    """
    pool = list(iterable)
    return [list(subset)
            for size in range(1, len(pool) + 1)
            for subset in combinations(pool, size)]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def lca_to_two_leaves(species_tree_from_ale, internal_node_id):
    """Return two leaf names whose last common ancestor is the given node.

    Args:
        species_tree_from_ale: tree file readable by ete3 with format=1.
        internal_node_id: name of the node to look up in that tree.

    Returns:
        A (leaf_1, leaf_2) tuple of leaf names.

    Raises:
        IndexError: if no node has that name, or if no pair of its leaves has
            a common ancestor with that name (e.g. the node is itself a leaf).
            The exception type matches what the previous implementation
            raised; only the message is new.
    """
    # read in ale species tree
    stree_ale = Tree(species_tree_from_ale, format=1)

    matched_nodes = stree_ale.search_nodes(name=internal_node_id)
    if not matched_nodes:
        raise IndexError('node %s not found in %s' % (internal_node_id, species_tree_from_ale))

    # Keep leaves in tree order (duplicates dropped) instead of putting them in
    # a set: set iteration order for strings varies between interpreter runs,
    # which made the returned pair non-reproducible.
    leaf_names = list(dict.fromkeys(leaf.name for leaf in matched_nodes[0].get_leaves()))

    # The common ancestor is symmetric, so unordered pairs are sufficient;
    # return as soon as the first qualifying pair is found.
    for leaf_1, leaf_2 in combinations(leaf_names, 2):
        if stree_ale.get_common_ancestor(leaf_1, leaf_2).name == internal_node_id:
            return leaf_1, leaf_2

    raise IndexError('no two leaves with %s as their last common ancestor' % internal_node_id)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def keep_highest_rrtc(rrtc_in, rrtc_out):
    """Collapse duplicated recipient/donor pairs, keeping the highest value.

    Each input line has the form '<recipient>\\t<donor>:<value>'. For every
    distinct (recipient, donor) pair only the largest value is kept. The
    output uses the same line format, sorted by value in descending order.

    Args:
        rrtc_in: path of the file to read.
        rrtc_out: path of the file to write (overwritten).
    """
    # Keyed by a (recipient, donor) tuple rather than a joined string, so names
    # that happen to contain the old '___' separator are written back intact.
    rrtc_highest_prob_dict = dict()
    with open(rrtc_in) as rrtc_in_handle:
        for each_rrtc in rrtc_in_handle:
            each_rrtc = each_rrtc.strip()
            if not each_rrtc:
                # tolerate blank lines (e.g. a trailing empty line)
                continue
            colon_split = each_rrtc.split(':')
            pair_split = colon_split[0].split('\t')
            rrtc_key = (pair_split[0], pair_split[1])
            rrtc_v = float(colon_split[1])
            if (rrtc_key not in rrtc_highest_prob_dict) or (rrtc_v > rrtc_highest_prob_dict[rrtc_key]):
                rrtc_highest_prob_dict[rrtc_key] = rrtc_v

    with open(rrtc_out, 'w') as rrtc_out_handle:
        # ascending stable sort followed by a reversal, so ties keep the same
        # relative order as before
        for (rrtc_r, rrtc_d), rrtc_v in sorted(rrtc_highest_prob_dict.items(), key=operator.itemgetter(1))[::-1]:
            rrtc_out_handle.write('%s\t%s:%s\n' % (rrtc_r, rrtc_d, rrtc_v))
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
########################################################################################################################

# Stand-alone analysis script: everything below runs at module level (there is
# no __main__ guard) and works on the hard-coded paths defined here.

# file in
ip_dir = '/Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA'  # holds one ALE4_op_dir_<round>_0.3 folder per round
species_tree_from_ale = '/Users/songweizhi/Desktop/DateArTree/05_pRTC_wd/genome_tree.newick.ufboot.ale.stree'  # tree passed to lca_to_two_leaves()
round_list = [1, 2, 3, 4, 5]  # round ids; also used as the Venn category labels
color_list = ['dodgerblue', 'goldenrod1', 'darkorange1', 'seagreen3', 'orchid3']  # fill and label colours for the Venn plot
min_detected_times = 2  # an HGT is reported when it has at least this many occurrences

# file out
op_dir = '/Users/songweizhi/Desktop/DateArTree/05_pRTC_wd/HGTs_5_rds_ALE'

########################################################################################################################

rscript = '%s/rscript.R' % op_dir
plot_file = '%s/Venn.pdf' % op_dir
rtc_txt = '%s/rrtc.txt' % op_dir
rtc_txt_highest = '%s/rrtc_uniq_by_highest_prob.txt' % op_dir

# start from an empty output folder: an existing one is deleted first
if os.path.isdir(op_dir):
    os.system('rm -r %s' % op_dir)
os.system('mkdir %s' % op_dir)

########################################################################################################################

# hgt_dict:       '<donor>_to_<recipient>' -> list of '<round>_<og>_<value>' strings (one per pdf file)
# rd_to_hgt_dict: round id -> set of '<donor>_to_<recipient>' ids seen in that round
hgt_dict = dict()
rd_to_hgt_dict= dict()
for each_rd in round_list:

    rd_id = each_rd
    current_rd_op_dir = '%s/ALE4_op_dir_%s_0.3' % (ip_dir, each_rd)
    pdf_file_re = '%s/*.%s' % (current_rd_op_dir, 'pdf')
    pdf_file_list = glob.glob(pdf_file_re)

    rd_to_hgt_dict[rd_id] = set()

    for each_pdf in pdf_file_list:
        f_path, f_base, f_ext = sep_path_basename_ext(each_pdf)
        # The '_'-separated fields 0, 3, 5 and 6 of the file name are used as
        # OG id, donor, recipient and value; this matches the
        # '<og>_HGT_*_<donor>_to_<recipient>_<value>.pdf' pattern built for the
        # copy step further down.
        f_base_split = f_base.split('_')
        id_by_d_to_r = '%s_to_%s' % (f_base_split[3], f_base_split[5])
        rd_og = '%s_%s' % (each_rd, f_base_split[0])  # not used afterwards
        rd_og_value = '%s_%s_%s' % (each_rd, f_base_split[0], f_base_split[6])

        rd_to_hgt_dict[rd_id].add(id_by_d_to_r)

        if id_by_d_to_r not in hgt_dict:
            hgt_dict[id_by_d_to_r] = []
        hgt_dict[id_by_d_to_r].append(rd_og_value)

################################################### get Venn diagram ###################################################

# every non-empty combination of rounds
combination_list = powerset(round_list)

# Build the 'area<i>=<size>' (single round) and 'n<ij...>=<shared>' (several
# rounds) arguments that are pasted into the draw.quintuple.venn() call below.
value_str_list = []
for each_cmbo in combination_list:
    current_str = ''
    if len(each_cmbo) == 1:
        current_value = rd_to_hgt_dict[each_cmbo[0]]
        current_str = 'area%s=%s' % (each_cmbo[0], len(current_value))
        value_str_list.append(current_str)
    else:
        value_lol = []
        for each_element in each_cmbo:
            ele_value = rd_to_hgt_dict[each_element]
            value_lol.append(ele_value)
        # HGT ids present in every round of this combination
        shared = set(value_lol[0]).intersection(*value_lol)
        current_str = 'n%s=%s' % (''.join([str(i) for i in each_cmbo]), len(shared))
        value_str_list.append(current_str)

value_str = ', '.join(value_str_list)
label_str = '"' + '", "'.join([str(i) for i in round_list]) + '"'
color_str = '"' + '", "'.join([str(i) for i in color_list]) + '"'
font_size_str = ', '.join(['1.2']*len(combination_list))  # one cex value per combination

# write the R script that draws the Venn diagram into plot_file
# NOTE(review): draw.quintuple.venn is used, so this presumably only works with exactly five rounds - confirm
rscript_handle = open(rscript, 'w')
rscript_handle.write('library(futile.logger)\n')
rscript_handle.write('library(gridBase)\n')
rscript_handle.write('library(VennDiagram)\n')
rscript_handle.write('pdf(file="%s")\n' % plot_file)
rscript_handle.write('venn.plot <- draw.quintuple.venn(%s, category=c(%s), fill=c(%s), cat.col=c(%s), cat.cex=1.2, cat.dist=0.3, margin=0.05, cex=c(%s), ind=TRUE)\n' % (value_str, label_str, color_str, color_str, font_size_str))
rscript_handle.write('dev.off()\n')
rscript_handle.close()

os.system('Rscript %s' % rscript)

########################################################################################################################

rtc_txt_handle = open(rtc_txt, 'w')
qualified_hgt_num = 0
for each_hgt in hgt_dict:

    occurence_list = hgt_dict[each_hgt]
    pdf_dir = '%s/%s_%s' % (op_dir, each_hgt, len(occurence_list))
    # NOTE(review): this counts pdf files, so several OGs from the same round
    # also count towards the threshold - confirm this is intended
    if len(occurence_list) >= min_detected_times:

        #################### prepare rtc file ####################

        # the first two characters of the donor and recipient ids are dropped
        # (presumably a fixed prefix in the pdf file names - confirm)
        donor_id = each_hgt.split('_to_')[0][2:]
        recipient_id = each_hgt.split('_to_')[1][2:]
        d_leaf_1, d_leaf_2 = lca_to_two_leaves(species_tree_from_ale, donor_id)
        r_leaf_1, r_leaf_2 = lca_to_two_leaves(species_tree_from_ale, recipient_id)

        # one line per occurrence: '<recipient leaves>\t<donor leaves>:<value>'
        for each_occurence in occurence_list:
            value = each_occurence.split('_')[-1]
            rtc_str = '%s,%s\t%s,%s:%s' % (r_leaf_1, r_leaf_2, d_leaf_1, d_leaf_2, value)
            rtc_txt_handle.write(rtc_str + '\n')

        ##########################################################

        # collect the pdf files supporting this HGT in its own folder
        qualified_hgt_num += 1
        os.system('mkdir %s' % pdf_dir)
        for each_h in occurence_list:
            rd_id = each_h.split('_')[0]
            og_id = each_h.split('_')[1]
            value = each_h.split('_')[2]
            # the '*' in the source path is left for the shell to expand in the cp command
            pwd_input_pdf_in = '%s/ALE4_op_dir_%s_0.3/%s_HGT_*_%s_%s.pdf' % (ip_dir, rd_id, og_id, each_hgt,value)
            pwd_input_pdf_out = '%s/%s_%s_%s_%s.pdf' % (pdf_dir, rd_id, og_id, each_hgt,value)
            os.system('cp %s %s' % (pwd_input_pdf_in, pwd_input_pdf_out))
rtc_txt_handle.close()

# remove redundant HGTs, keep the one with the highest probability
keep_highest_rrtc(rtc_txt, rtc_txt_highest)

print('The number of HGTs detected in >= %s runs is %s.' % (min_detected_times, qualified_hgt_num))
print('Done!')
|
TreeSAK/ALE6.py
ADDED
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import argparse
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
from ete3 import Tree
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
ALE6_usage = '''
|
|
9
|
+
====================================== ALE6 example commands ======================================
|
|
10
|
+
|
|
11
|
+
# This module is developed to faa ancestral genomes based on ALE outputs
|
|
12
|
+
TreeSAK ALE6 -1 ALE1_op_dir -3 ALE3_op_dir_30 -s species_tree.rooted.treefile -o ALE6_op_dir_30 -n 380 -cog BioSAK_arCOG_wd -kegg BioSAK_KEGG_wd
|
|
13
|
+
TreeSAK ALE6 -1 ALE1_op_dir -3 ALE3_op_dir_30 -s species_tree.rooted.treefile -o ALE6_op_dir_30 -n 294,309,380,404
|
|
14
|
+
TreeSAK ALE6 -1 ALE1_op_dir -3 ALE3_op_dir_30 -s species_tree.rooted.treefile -o ALE6_op_dir_30 -n interested_nodes.txt
|
|
15
|
+
|
|
16
|
+
# Needed input files:
|
|
17
|
+
-1: faa files
|
|
18
|
+
-3: GeneContent.txt, SpeciesTreeRef.newick and Transfer_propensity.txt
|
|
19
|
+
-s: the tree used as input for ALE2, to rename leafs back
|
|
20
|
+
|
|
21
|
+
# To be added:
|
|
22
|
+
1. A dereplication step to the produced faa file.
|
|
23
|
+
|
|
24
|
+
===================================================================================================
|
|
25
|
+
'''
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def sep_path_basename_ext(file_in):
    """Split a path into (file name, directory, base name, extension).

    The directory falls back to '.' when the path has no directory part.
    The extension is returned without its leading dot.
    """
    directory, leaf = os.path.split(file_in)
    stem, dotted_suffix = os.path.splitext(leaf)
    return leaf, (directory or '.'), stem, dotted_suffix[1:]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def get_internal_node_leaves(ale_species_tree_file, internal_node_id):
    """Return the set of leaf names found under the named node.

    The tree file is parsed with ete3 (format=1) and the first node whose name
    equals internal_node_id is used.
    """
    tree = Tree(ale_species_tree_file, format=1)
    target_node = tree.search_nodes(name=internal_node_id)[0]
    return {tip.name for tip in target_node.get_leaves()}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def rename_leaves(tree_file_in, rename_dict, tree_format, tree_file_out):
    """Rename the leaves of a tree and write the result to a new file.

    Leaves whose name is a key of rename_dict get the mapped name; all other
    leaves keep their name. The same ete3 format code is used for reading and
    writing.
    """
    tree = Tree(tree_file_in, format=tree_format)
    for tip in tree.iter_leaves():
        tip.name = rename_dict.get(tip.name, tip.name)
    tree.write(format=tree_format, outfile=tree_file_out)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _write_function_matrix(matrix_txt, matrix_desc_txt, fun_id_list_sorted, fun_id_to_desc_dict, branch_to_fun_dict):
    """Write the branch x function presence/absence table twice: once with
    function ids as column headers and once with '<id>__<description>' headers."""
    header_with_desc = [('%s__%s' % (i, fun_id_to_desc_dict[i])) for i in fun_id_list_sorted]
    with open(matrix_txt, 'w') as matrix_handle, open(matrix_desc_txt, 'w') as matrix_desc_handle:
        matrix_handle.write('\t%s\n' % '\t'.join(fun_id_list_sorted))
        matrix_desc_handle.write('\t%s\n' % '\t'.join(header_with_desc))
        for each_branch in sorted(branch_to_fun_dict):
            branch_funs = branch_to_fun_dict[each_branch]
            pa_list = [each_branch] + [('1' if each_fun in branch_funs else '0') for each_fun in fun_id_list_sorted]
            row_str = '\t'.join(pa_list) + '\n'
            matrix_handle.write(row_str)
            matrix_desc_handle.write(row_str)
    print('Annotation matrix exported to: %s' % matrix_txt)


def ALE6(args):
    """Export the protein content of selected ALE nodes, with optional annotation.

    Steps, in order:
      1. Read GeneContent.txt from the ALE3 folder to find the nodes to process.
      2. Write SpeciesTreeRef.newick with its leaves renamed back to the names
         used in the rooted species tree.
      3. For every selected node, write a faa file with the sequences of the
         protein families marked as present, restricted to genomes under it.
      4. If COG and/or KEGG annotation folders are given, write per-node
         function lists and node x function presence/absence tables.
      5. Write a weighted transfer propensity per annotated function.

    Args:
        args: dict of command line options with the keys
            '1'    ALE1 output directory (one <family>.faa per protein family),
            '3'    ALE3 output directory (GeneContent.txt, SpeciesTreeRef.newick,
                   Transfer_propensity.txt),
            'o'    output directory,
            's'    rooted species tree,
            'f'    True to delete an existing output directory first,
            'n'    None (all nodes), a node id, comma separated node ids, or a
                   file with one node id per line,
            'cog'  COG annotation folder or None,
            'kegg' KEGG annotation folder or None.
    """
    import shutil  # local import: only needed to clear an existing output folder

    ale1_op_dir = args['1']
    ale3_op_dir = args['3']
    op_dir = args['o']
    genome_tree_file_rooted = args['s']
    force_create_op_dir = args['f']
    interested_internal_nodes = args['n']
    cog_annotation_wd = args['cog']
    kegg_annotation_wd = args['kegg']

    GeneContent_txt = '%s/GeneContent.txt' % ale3_op_dir
    SpeciesTreeRef = '%s/SpeciesTreeRef.newick' % ale3_op_dir
    transfer_propensity_txt = '%s/Transfer_propensity.txt' % ale3_op_dir

    ################################################## define op files #################################################

    _, _, tree_base, tree_ext = sep_path_basename_ext(genome_tree_file_rooted)

    faa_dir = '%s/faa_files' % op_dir
    cog_dir = '%s/annotation_COG' % op_dir
    kegg_dir = '%s/annotation_KEGG' % op_dir
    cog_df_txt = '%s/annotation_COG.txt' % op_dir
    cog_df_desc_txt = '%s/annotation_COG_desc.txt' % op_dir
    kegg_df_txt = '%s/annotation_KEGG.txt' % op_dir
    kegg_df_desc_txt = '%s/annotation_KEGG_desc.txt' % op_dir
    fun_transfer_propensity_txt = '%s/function_transfer_propensity_weighted.txt' % op_dir
    genome_tree_file_rooted_with_ale_internal_node = '%s/%s_with_ALE_internal_nodes.%s' % (op_dir, tree_base, tree_ext)

    ########################################## get the id of nodes to process ##########################################

    # map leaf names without underscores (the form looked up for leaves of
    # SpeciesTreeRef below) back to the names used in the rooted species tree
    gnm_name_dict_ale_fmt_to_original_fmt = dict()
    for leaf in Tree(genome_tree_file_rooted, format=1):
        gnm_name_dict_ale_fmt_to_original_fmt[leaf.name.replace('_', '')] = leaf.name

    # every row id of GeneContent.txt (header excluded) that has no '(' in it
    # NOTE(review): ids containing '(' are presumably leaf rows - confirm
    overall_internal_node_set = set()
    with open(GeneContent_txt) as gene_content_handle:
        for line_num_index, each_line in enumerate(gene_content_handle, 1):
            if line_num_index > 1:
                node_id = each_line.strip().split('\t')[0]
                if '(' not in node_id:
                    overall_internal_node_set.add(node_id)

    internal_nodes_to_process = set()
    if interested_internal_nodes is None:
        internal_nodes_to_process = overall_internal_node_set
    elif os.path.isfile(interested_internal_nodes) is False:
        # a single id, or several ids separated by commas
        internal_nodes_to_process = set(interested_internal_nodes.split(','))
    else:
        with open(interested_internal_nodes) as node_file_handle:
            for each_node in node_file_handle:
                internal_nodes_to_process.add(each_node.strip())

    #################### an additional step (add ALE added internal node names to the rooted tree) #####################

    # create output directory; an existing one is only removed when forced
    if (force_create_op_dir is True) and (os.path.isdir(op_dir) is True):
        shutil.rmtree(op_dir)
    os.makedirs(op_dir, exist_ok=True)

    rename_leaves(SpeciesTreeRef, gnm_name_dict_ale_fmt_to_original_fmt, 1, genome_tree_file_rooted_with_ale_internal_node)

    ####################################################################################################################

    os.makedirs(faa_dir, exist_ok=True)

    if cog_annotation_wd is not None:
        os.makedirs(cog_dir, exist_ok=True)

    if kegg_annotation_wd is not None:
        os.makedirs(kegg_dir, exist_ok=True)

    # branch_to_leaf_dict:    node id -> leaf names under that node in SpeciesTreeRef
    # branch_to_content_dict: node id -> column headers whose value is not '0'
    branch_to_leaf_dict = dict()
    branch_to_content_dict = dict()
    col_header_list = []
    with open(GeneContent_txt) as gene_content_handle:
        for line_num_index, each_line in enumerate(gene_content_handle, 1):
            line_split = each_line.strip().split('\t')
            if line_num_index == 1:
                col_header_list = line_split
            else:
                branch_id = line_split[0]
                if branch_id in internal_nodes_to_process:
                    branch_to_leaf_dict[branch_id] = get_internal_node_leaves(SpeciesTreeRef, branch_id)
                    branch_to_content_dict[branch_id] = [family_id for (family_id, pa) in zip(col_header_list[1:], line_split[1:]) if pa != '0']

    # write one faa file per node and remember which sequences went into it
    branch_to_gene_dict = dict()
    for each_branch in branch_to_content_dict:
        branch_faa = '%s/%s.faa' % (faa_dir, each_branch)
        branch_content = branch_to_content_dict[each_branch]
        branch_child_set = branch_to_leaf_dict[each_branch]
        branch_child_set_original_name = {gnm_name_dict_ale_fmt_to_original_fmt[i] for i in branch_child_set}
        branch_to_gene_dict[each_branch] = set()
        with open(branch_faa, 'w') as branch_faa_handle:
            for each_prot_family in branch_content:
                each_prot_family_faa = '%s/%s.faa' % (ale1_op_dir, each_prot_family)
                for each_seq in SeqIO.parse(each_prot_family_faa, 'fasta'):
                    # genome id = sequence id without its last '_'-separated field
                    seq_gnm = '_'.join(each_seq.id.split('_')[:-1])
                    if seq_gnm in branch_child_set_original_name:
                        branch_faa_handle.write('>%s %s\n' % (each_seq.id, each_prot_family))
                        branch_faa_handle.write('%s\n' % each_seq.seq)
                        branch_to_gene_dict[each_branch].add(each_seq.id)

    ####################################################################################################################

    # Read in COG annotation results
    fun_to_gene_dict = dict()       # function id -> ids of genes annotated with it (COG and KEGG together)
    annotation_dict_cog = dict()    # genome id -> {gene id: COG id}
    fun_id_to_desc_dict = dict()    # function id -> description (COG and KEGG together)
    if cog_annotation_wd is not None:

        print('Reading in COG annotation results')
        file_re = '%s/*COG_wd/*_query_to_cog.txt' % (cog_annotation_wd)
        file_list = glob.glob(file_re)

        if len(file_list) == 0:
            print('COG annotation file not detected, program exited!')
            exit()

        for each_file in file_list:
            gnm_id = each_file.split('/')[-1].split('_query_to_cog')[0]
            if gnm_id not in annotation_dict_cog:
                annotation_dict_cog[gnm_id] = dict()
            with open(each_file) as annotation_handle:
                for line_index, each_line in enumerate(annotation_handle):
                    if line_index > 0:
                        each_line_split = each_line.strip().split('\t')
                        # only rows with exactly four columns are used
                        if len(each_line_split) == 4:
                            gene_id = each_line_split[0]
                            cog_id = each_line_split[1]
                            cog_desc = each_line_split[3]
                            annotation_dict_cog[gnm_id][gene_id] = cog_id
                            fun_id_to_desc_dict[cog_id] = cog_desc
                            if cog_id not in fun_to_gene_dict:
                                fun_to_gene_dict[cog_id] = set()
                            fun_to_gene_dict[cog_id].add(gene_id)

    # Read in KEGG annotation results
    annotation_dict_kegg = dict()   # genome id -> {gene id: KO id}
    if kegg_annotation_wd is not None:

        print('Reading in KEGG annotation results')
        file_re = '%s/*KEGG_wd/*_ko_assignment_ABCD.txt' % (kegg_annotation_wd)
        file_list = glob.glob(file_re)

        if len(file_list) == 0:
            print('KEGG annotation file not detected, program exited!')
            exit()

        for each_file in file_list:
            gnm_id = each_file.split('/')[-1].split('_ko_assignment_ABCD')[0]
            if gnm_id not in annotation_dict_kegg:
                annotation_dict_kegg[gnm_id] = dict()

            with open(each_file) as annotation_handle:
                for line_index, each_line in enumerate(annotation_handle):
                    if line_index > 0:
                        each_line_split = each_line.strip().split('\t')
                        # only rows with exactly nine columns are used
                        if len(each_line_split) == 9:
                            gene_id = each_line_split[0]
                            ko_d_id = each_line_split[4][2:]  # fifth column without its first two characters
                            ko_d_desc = each_line_split[8]
                            annotation_dict_kegg[gnm_id][gene_id] = ko_d_id
                            fun_id_to_desc_dict[ko_d_id] = ko_d_desc
                            if ko_d_id not in fun_to_gene_dict:
                                fun_to_gene_dict[ko_d_id] = set()
                            fun_to_gene_dict[ko_d_id].add(gene_id)

    cog_dod = dict()    # node id -> set of COG ids
    kegg_dod = dict()   # node id -> set of KO ids
    all_identified_cog_set = set()
    all_identified_kegg_set = set()
    if (cog_annotation_wd is not None) or (kegg_annotation_wd is not None):
        for each_branch in branch_to_gene_dict:
            branch_gene_content = branch_to_gene_dict[each_branch]
            branch_cog_set = set()
            branch_kegg_set = set()
            for each_gene in branch_gene_content:
                gnm_id = '_'.join(each_gene.split('_')[:-1])
                # A genome may be missing from either dict: only one of
                # -cog/-kegg was given, or it has no annotation file. Treat that
                # as "no annotation" instead of failing with a KeyError.
                cog_fun = annotation_dict_cog.get(gnm_id, {}).get(each_gene, 'na')
                kegg_fun = annotation_dict_kegg.get(gnm_id, {}).get(each_gene, 'na')
                if cog_fun != 'na':
                    branch_cog_set.add(cog_fun)
                    all_identified_cog_set.add(cog_fun)
                if kegg_fun != 'na':
                    branch_kegg_set.add(kegg_fun)
                    all_identified_kegg_set.add(kegg_fun)

            cog_dod[each_branch] = branch_cog_set
            kegg_dod[each_branch] = branch_kegg_set

            # write out annotation
            if len(branch_cog_set) > 0:
                cog_annotation_txt = '%s/%s_COG.txt' % (cog_dir, each_branch)
                with open(cog_annotation_txt, 'w') as cog_annotation_txt_handle:
                    for each_cog in sorted(branch_cog_set):
                        cog_annotation_txt_handle.write('%s\t%s\n' % (each_cog, fun_id_to_desc_dict[each_cog]))

            if len(branch_kegg_set) > 0:
                kegg_annotation_txt = '%s/%s_KEGG.txt' % (kegg_dir, each_branch)
                with open(kegg_annotation_txt, 'w') as kegg_annotation_txt_handle:
                    for each_kegg in sorted(branch_kegg_set):
                        kegg_annotation_txt_handle.write('%s\t%s\n' % (each_kegg, fun_id_to_desc_dict[each_kegg]))

        # write out COG dataframe
        if len(all_identified_cog_set) > 0:
            _write_function_matrix(cog_df_txt, cog_df_desc_txt, sorted(all_identified_cog_set), fun_id_to_desc_dict, cog_dod)

        # write out KEGG dataframe
        if len(all_identified_kegg_set) > 0:
            _write_function_matrix(kegg_df_txt, kegg_df_desc_txt, sorted(all_identified_kegg_set), fun_id_to_desc_dict, kegg_dod)

    ################################# get transfer propensity of individual functions ##################################

    print('Getting transfer propensity of individual function')

    # get gene_to_oma_dict (sequence id -> base name of the faa file it is in)
    faa_file_re = '%s/*.faa' % ale1_op_dir
    faa_file_list = glob.glob(faa_file_re)
    gene_to_oma_dict = dict()
    for faa_file in faa_file_list:
        _, _, faa_base, _ = sep_path_basename_ext(faa_file)
        for each_seq in SeqIO.parse(faa_file, 'fasta'):
            gene_to_oma_dict[each_seq.id] = faa_base

    # get oma_to_transfer_propensity_dict (first column -> float of second column, header skipped)
    oma_to_transfer_propensity_dict = dict()
    with open(transfer_propensity_txt) as transfer_propensity_handle:
        for line_index, each_oma in enumerate(transfer_propensity_handle):
            if line_index > 0:
                each_oma_split = each_oma.strip().split('\t')
                oma_to_transfer_propensity_dict[each_oma_split[0]] = float(each_oma_split[1])

    with open(fun_transfer_propensity_txt, 'w') as fun_transfer_propensity_txt_handle:
        fun_transfer_propensity_txt_handle.write('ID\tWeighted_transfer_propensity\tDescription\n')
        for fun_id in sorted(fun_to_gene_dict):

            # number of genes with this function per protein family
            # ('na' collects genes not found in any faa file)
            current_fun_oma_stats_dict = dict()
            for gene_id in fun_to_gene_dict[fun_id]:
                gene_oma = gene_to_oma_dict.get(gene_id, 'na')
                current_fun_oma_stats_dict[gene_oma] = current_fun_oma_stats_dict.get(gene_oma, 0) + 1

            # gene-count weighted mean over the families that have a propensity
            total_transfer_propensity = 0
            total_oma_num = 0
            for oma_id in current_fun_oma_stats_dict:
                oma_num = current_fun_oma_stats_dict[oma_id]
                oma_transfer_propensity = oma_to_transfer_propensity_dict.get(oma_id, 'na')
                if oma_transfer_propensity != 'na':
                    total_transfer_propensity += (oma_num*oma_transfer_propensity)
                    total_oma_num += oma_num

            # NOTE(review): the source this was rebuilt from had lost its
            # indentation; the write is assumed to sit under the "value
            # available" condition, so functions without a value get no row - confirm
            if total_oma_num != 0:
                oma_transfer_propensity_weighted = total_transfer_propensity/total_oma_num
                oma_transfer_propensity_weighted = float("{0:.3f}".format(oma_transfer_propensity_weighted))
                fun_transfer_propensity_txt_handle.write('%s\t%s\t%s\n' % (fun_id, oma_transfer_propensity_weighted, fun_id_to_desc_dict[fun_id]))

    print('Done!')
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
if __name__ == '__main__':

    # command line options, in the order they are registered with the parser
    ALE6_parser = argparse.ArgumentParser()
    for option_flag, option_settings in (
            ('-1',    dict(required=True,                       help='ALE1 output directory')),
            ('-3',    dict(required=True,                       help='ALE3 output directory')),
            ('-s',    dict(required=True,                       help='rooted species tree')),
            ('-n',    dict(required=False, default=None,        help='interested internal node(s)')),
            ('-cog',  dict(required=False, default=None,        help='COG annotation results')),
            ('-kegg', dict(required=False, default=None,        help='KEGG annotation results')),
            ('-o',    dict(required=True,                       help='output directory')),
            ('-f',    dict(required=False, action="store_true", help='force overwrite')),
    ):
        ALE6_parser.add_argument(option_flag, **option_settings)

    # ALE6() expects a plain dict keyed by option name
    args = vars(ALE6_parser.parse_args())
    ALE6(args)
|