treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/ALE5.py ADDED
@@ -0,0 +1,210 @@
1
+ import os
2
+ import glob
3
+ import operator
4
+ from ete3 import Tree
5
+ from itertools import chain
6
+ from itertools import combinations
7
+
8
+
9
def sep_path_basename_ext(file_in):
    """Split a file path into its directory, base name and extension.

    Args:
        file_in: Path to a file. The file does not need to exist.

    Returns:
        A ``(f_path, f_base, f_ext)`` tuple. ``f_path`` is ``'.'`` when
        ``file_in`` has no directory component; ``f_ext`` keeps its leading
        dot and is ``''`` when the file name has no extension.
    """
    # separate path and file name
    f_path, file_name = os.path.split(file_in)
    if f_path == '':
        f_path = '.'

    # separate file basename and extension
    f_base, f_ext = os.path.splitext(file_name)

    return f_path, f_base, f_ext
20
+
21
+
22
def powerset(iterable):
    """Return every non-empty combination of the items in *iterable*.

    powerset([1,2,3]) --> [1] [2] [3] [1,2] [1,3] [2,3] [1,2,3]

    Unlike a mathematical power set, the empty combination is left out.

    Args:
        iterable: Items to combine. Duplicates are treated as distinct items.

    Returns:
        A list of lists, ordered by combination size and, within one size,
        in the order produced by ``itertools.combinations``.
    """
    s = list(iterable)  # allows duplicate elements

    # Start at size 1 so the empty combination is never generated.
    sized_combos = (combinations(s, r) for r in range(1, len(s) + 1))

    return [list(combo) for combo in chain.from_iterable(sized_combos)]
34
+
35
+
36
def lca_to_two_leaves(species_tree_from_ale, internal_node_id):
    """Find two leaves whose last common ancestor is the given internal node.

    Args:
        species_tree_from_ale: Newick tree (read with ete3 ``format=1``) in
            which internal nodes carry names.
        internal_node_id: Name of the internal node of interest.

    Returns:
        A ``(leaf_1, leaf_2)`` tuple of leaf names. Leaf names are tried in
        sorted order, so the same pair is returned on every run.

    Raises:
        IndexError: If no node is named ``internal_node_id`` or no pair of
            its leaves has it as common ancestor (for example when the node
            is itself a leaf). The original code raised the same exception
            type implicitly, without a message.
    """
    # read in ale species tree
    stree_ale = Tree(species_tree_from_ale, format=1)

    # get all leaves of the internal node
    matched_nodes = stree_ale.search_nodes(name=internal_node_id)
    if not matched_nodes:
        raise IndexError('node %s not found in %s' % (internal_node_id, species_tree_from_ale))
    leaf_names = sorted({each_leaf.name for each_leaf in matched_nodes[0].get_leaves()})

    # get the two leaves needed, stopping at the first pair that qualifies
    for leaf_1, leaf_2 in combinations(leaf_names, 2):
        if stree_ale.get_common_ancestor(leaf_1, leaf_2).name == internal_node_id:
            return leaf_1, leaf_2

    raise IndexError('no two leaves have node %s as their common ancestor' % internal_node_id)
62
+
63
+
64
def keep_highest_rrtc(rrtc_in, rrtc_out):
    """Deduplicate constraints, keeping the highest value for each pair.

    Each input line has the form ``<recipient>\\t<donor>:<value>``. Lines that
    share the same recipient and donor are collapsed to the one with the
    highest value, and the result is written in descending order of value.

    Args:
        rrtc_in: Path of the file to read.
        rrtc_out: Path of the file to write (overwritten if present).
    """
    rrtc_highest_prob_dict = dict()
    with open(rrtc_in) as rrtc_in_handle:
        for each_rrtc in rrtc_in_handle:
            each_rrtc = each_rrtc.strip()
            if not each_rrtc:
                continue  # tolerate blank lines, e.g. a trailing newline

            # The value follows the last colon on the line.
            rrtc_pair, _, rrtc_v = each_rrtc.rpartition(':')
            rrtc_pair_split = rrtc_pair.split('\t')
            # A tuple key cannot be broken by separators inside the names.
            rrtc_key = (rrtc_pair_split[0], rrtc_pair_split[1])
            rrtc_v = float(rrtc_v)

            if rrtc_key not in rrtc_highest_prob_dict:
                rrtc_highest_prob_dict[rrtc_key] = rrtc_v
            elif rrtc_v > rrtc_highest_prob_dict[rrtc_key]:
                rrtc_highest_prob_dict[rrtc_key] = rrtc_v

    # Ascending sort followed by a reversal keeps the original tie order.
    rrtc_sorted = sorted(rrtc_highest_prob_dict.items(), key=operator.itemgetter(1))
    with open(rrtc_out, 'w') as rrtc_out_handle:
        for (rrtc_r, rrtc_d), rrtc_v in reversed(rrtc_sorted):
            rrtc_out_handle.write('%s\t%s:%s\n' % (rrtc_r, rrtc_d, rrtc_v))
84
+
85
+
86
+ ########################################################################################################################
87
+
88
import shutil  # used below for directory removal and PDF copies

# file in
ip_dir = '/Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA'
species_tree_from_ale = '/Users/songweizhi/Desktop/DateArTree/05_pRTC_wd/genome_tree.newick.ufboot.ale.stree'
round_list = [1, 2, 3, 4, 5]
color_list = ['dodgerblue', 'goldenrod1', 'darkorange1', 'seagreen3', 'orchid3']
min_detected_times = 2

# file out
op_dir = '/Users/songweizhi/Desktop/DateArTree/05_pRTC_wd/HGTs_5_rds_ALE'

########################################################################################################################

rscript = '%s/rscript.R' % op_dir
plot_file = '%s/Venn.pdf' % op_dir
rtc_txt = '%s/rrtc.txt' % op_dir
rtc_txt_highest = '%s/rrtc_uniq_by_highest_prob.txt' % op_dir

# always start from an empty output directory
if os.path.isdir(op_dir):
    shutil.rmtree(op_dir)
os.mkdir(op_dir)

########################################################################################################################

# hgt_dict:        "<donor>_to_<recipient>" -> list of "<round>_<og>_<value>" strings
# rd_to_hgt_dict:  round id -> set of "<donor>_to_<recipient>" seen in that round
hgt_dict = dict()
rd_to_hgt_dict = dict()
for each_rd in round_list:

    current_rd_op_dir = '%s/ALE4_op_dir_%s_0.3' % (ip_dir, each_rd)
    pdf_file_list = glob.glob('%s/*.%s' % (current_rd_op_dir, 'pdf'))

    rd_to_hgt_dict[each_rd] = set()

    for each_pdf in pdf_file_list:
        # File names are read as <og>_HGT_<n>_<donor>_to_<recipient>_<value>.pdf,
        # the same layout the copy step further down globs for.
        _, f_base, _ = sep_path_basename_ext(each_pdf)
        f_base_split = f_base.split('_')
        id_by_d_to_r = '%s_to_%s' % (f_base_split[3], f_base_split[5])
        rd_og_value = '%s_%s_%s' % (each_rd, f_base_split[0], f_base_split[6])

        rd_to_hgt_dict[each_rd].add(id_by_d_to_r)
        hgt_dict.setdefault(id_by_d_to_r, []).append(rd_og_value)

################################################### get Venn diagram ###################################################

combination_list = powerset(round_list)

# one "area<i>=<size>" per round, one "n<ij..>=<shared>" per combination of rounds
value_str_list = []
for each_cmbo in combination_list:
    if len(each_cmbo) == 1:
        current_value = rd_to_hgt_dict[each_cmbo[0]]
        value_str_list.append('area%s=%s' % (each_cmbo[0], len(current_value)))
    else:
        shared = set.intersection(*[rd_to_hgt_dict[each_element] for each_element in each_cmbo])
        value_str_list.append('n%s=%s' % (''.join([str(i) for i in each_cmbo]), len(shared)))

value_str = ', '.join(value_str_list)
label_str = '"' + '", "'.join([str(i) for i in round_list]) + '"'
color_str = '"' + '", "'.join([str(i) for i in color_list]) + '"'
font_size_str = ', '.join(['1.2']*len(combination_list))

# NOTE: draw.quintuple.venn takes exactly five sets, so round_list and
# color_list must both have five entries.
with open(rscript, 'w') as rscript_handle:
    rscript_handle.write('library(futile.logger)\n')
    rscript_handle.write('library(gridBase)\n')
    rscript_handle.write('library(VennDiagram)\n')
    rscript_handle.write('pdf(file="%s")\n' % plot_file)
    rscript_handle.write('venn.plot <- draw.quintuple.venn(%s, category=c(%s), fill=c(%s), cat.col=c(%s), cat.cex=1.2, cat.dist=0.3, margin=0.05, cex=c(%s), ind=TRUE)\n' % (value_str, label_str, color_str, color_str, font_size_str))
    rscript_handle.write('dev.off()\n')

os.system('Rscript %s' % rscript)

########################################################################################################################

qualified_hgt_num = 0
with open(rtc_txt, 'w') as rtc_txt_handle:
    for each_hgt, occurence_list in hgt_dict.items():

        # only keep HGTs detected in enough rounds / gene families
        if len(occurence_list) < min_detected_times:
            continue

        #################### prepare rtc file ####################

        # The first two characters of each id are dropped before the tree lookup.
        # NOTE(review): presumably a fixed two-character prefix added upstream - confirm.
        donor_id = each_hgt.split('_to_')[0][2:]
        recipient_id = each_hgt.split('_to_')[1][2:]
        d_leaf_1, d_leaf_2 = lca_to_two_leaves(species_tree_from_ale, donor_id)
        r_leaf_1, r_leaf_2 = lca_to_two_leaves(species_tree_from_ale, recipient_id)

        for each_occurence in occurence_list:
            value = each_occurence.split('_')[-1]
            rtc_str = '%s,%s\t%s,%s:%s' % (r_leaf_1, r_leaf_2, d_leaf_1, d_leaf_2, value)
            rtc_txt_handle.write(rtc_str + '\n')

        ##########################################################

        qualified_hgt_num += 1

        # collect the supporting PDFs of this HGT in one folder
        pdf_dir = '%s/%s_%s' % (op_dir, each_hgt, len(occurence_list))
        os.mkdir(pdf_dir)
        for each_h in occurence_list:
            # always three fields: none of them can contain '_' by construction
            rd_id, og_id, value = each_h.split('_')
            pwd_input_pdf_in = '%s/ALE4_op_dir_%s_0.3/%s_HGT_*_%s_%s.pdf' % (ip_dir, rd_id, og_id, each_hgt, value)
            pwd_input_pdf_out = '%s/%s_%s_%s_%s.pdf' % (pdf_dir, rd_id, og_id, each_hgt, value)
            for matched_pdf in glob.glob(pwd_input_pdf_in):
                shutil.copy(matched_pdf, pwd_input_pdf_out)

# remove redundant HGTs, keep the one with the highest probability
keep_highest_rrtc(rtc_txt, rtc_txt_highest)

print('The number of HGTs detected in >= %s runs is %s.' % (min_detected_times, qualified_hgt_num))
print('Done!')
TreeSAK/ALE6.py ADDED
@@ -0,0 +1,401 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ from ete3 import Tree
6
+
7
+
8
+ ALE6_usage = '''
9
+ ====================================== ALE6 example commands ======================================
10
+
11
+ # This module is developed to faa ancestral genomes based on ALE outputs
12
+ TreeSAK ALE6 -1 ALE1_op_dir -3 ALE3_op_dir_30 -s species_tree.rooted.treefile -o ALE6_op_dir_30 -n 380 -cog BioSAK_arCOG_wd -kegg BioSAK_KEGG_wd
13
+ TreeSAK ALE6 -1 ALE1_op_dir -3 ALE3_op_dir_30 -s species_tree.rooted.treefile -o ALE6_op_dir_30 -n 294,309,380,404
14
+ TreeSAK ALE6 -1 ALE1_op_dir -3 ALE3_op_dir_30 -s species_tree.rooted.treefile -o ALE6_op_dir_30 -n interested_nodes.txt
15
+
16
+ # Needed input files:
17
+ -1: faa files
18
+ -3: GeneContent.txt, SpeciesTreeRef.newick and Transfer_propensity.txt
19
+ -s: the tree used as input for ALE2, to rename leafs back
20
+
21
+ # To be added:
22
+ 1. A dereplication step to the produced faa file.
23
+
24
+ ===================================================================================================
25
+ '''
26
+
27
+
28
def sep_path_basename_ext(file_in):
    """Split a file path into file name, directory, base name and extension.

    Args:
        file_in: Path to a file. The file does not need to exist.

    Returns:
        A ``(f_name, f_path, f_base, f_ext)`` tuple. ``f_path`` is ``'.'``
        when ``file_in`` has no directory component; ``f_ext`` has no leading
        dot and is ``''`` when the file name has no extension.
    """
    f_path, f_name = os.path.split(file_in)
    if f_path == '':
        f_path = '.'
    f_base, f_ext = os.path.splitext(f_name)

    # f_ext[1:] drops the dot that os.path.splitext leaves on the extension
    return f_name, f_path, f_base, f_ext[1:]
36
+
37
+
38
def get_internal_node_leaves(ale_species_tree_file, internal_node_id):
    """Return the names of all leaves below a named node of a tree.

    Args:
        ale_species_tree_file: Newick tree (read with ete3 ``format=1``) in
            which internal nodes carry names.
        internal_node_id: Name of the node whose leaves are wanted.

    Returns:
        A set of leaf names.

    Raises:
        IndexError: If no node is named ``internal_node_id``. The original
            code raised the same exception type implicitly, without a message.
    """
    ale_species_tree = Tree(ale_species_tree_file, format=1)
    matched_nodes = ale_species_tree.search_nodes(name=internal_node_id)
    if not matched_nodes:
        raise IndexError('node %s not found in %s' % (internal_node_id, ale_species_tree_file))

    return {each_leaf.name for each_leaf in matched_nodes[0].get_leaves()}
48
+
49
+
50
def rename_leaves(tree_file_in, rename_dict, tree_format, tree_file_out):
    """Write a copy of a tree with its leaves renamed.

    Args:
        tree_file_in: Tree to read.
        rename_dict: Maps current leaf names to new ones. Leaves without an
            entry keep their name.
        tree_format: ete3 format code, used for both reading and writing.
        tree_file_out: Path of the tree file to write.
    """
    t = Tree(tree_file_in, format=tree_format)
    for leaf in t:  # iterating an ete3 tree visits its leaves
        leaf.name = rename_dict.get(leaf.name, leaf.name)
    t.write(format=tree_format, outfile=tree_file_out)
57
+
58
+
59
def ALE6(args):
    """Export sequences and functional annotation for ALE internal nodes.

    For every selected node listed in ``GeneContent.txt`` this writes a faa
    file holding the sequences of the protein families present at that node,
    restricted to genomes that descend from it. When COG and/or KEGG
    annotation is supplied, per-node function lists and presence/absence
    matrices are written too. Finally, a weighted transfer propensity is
    written for every annotated function.

    Args:
        args: Dict of command line options with keys
            ``'1'`` (ALE1 output directory holding the faa files),
            ``'3'`` (ALE3 output directory holding GeneContent.txt,
            SpeciesTreeRef.newick and Transfer_propensity.txt),
            ``'s'`` (rooted species tree),
            ``'o'`` (output directory),
            ``'f'`` (True to remove an existing output directory first),
            ``'n'`` (node id, comma separated node ids, a file with one node
            id per line, or None for all nodes),
            ``'cog'`` and ``'kegg'`` (annotation directories, or None).
    """
    import shutil  # local import: only needed to clear the output directory

    ale1_op_dir = args['1']
    ale3_op_dir = args['3']
    op_dir = args['o']
    genome_tree_file_rooted = args['s']
    force_create_op_dir = args['f']
    interested_internal_nodes = args['n']
    cog_annotation_wd = args['cog']
    kegg_annotation_wd = args['kegg']

    GeneContent_txt = '%s/GeneContent.txt' % ale3_op_dir
    SpeciesTreeRef = '%s/SpeciesTreeRef.newick' % ale3_op_dir
    transfer_propensity_txt = '%s/Transfer_propensity.txt' % ale3_op_dir

    ################################################## define op files #################################################

    _, _, tree_base, tree_ext = sep_path_basename_ext(genome_tree_file_rooted)

    faa_dir = '%s/faa_files' % op_dir
    cog_dir = '%s/annotation_COG' % op_dir
    kegg_dir = '%s/annotation_KEGG' % op_dir
    cog_df_txt = '%s/annotation_COG.txt' % op_dir
    cog_df_desc_txt = '%s/annotation_COG_desc.txt' % op_dir
    kegg_df_txt = '%s/annotation_KEGG.txt' % op_dir
    kegg_df_desc_txt = '%s/annotation_KEGG_desc.txt' % op_dir
    fun_transfer_propensity_txt = '%s/function_transfer_propensity_weighted.txt' % op_dir
    genome_tree_file_rooted_with_ale_internal_node = '%s/%s_with_ALE_internal_nodes.%s' % (op_dir, tree_base, tree_ext)

    ########################################## get the id of nodes to process ##########################################

    # Map each leaf name with every '_' removed back to the original name.
    gnm_name_dict_ale_fmt_to_original_fmt = dict()
    for leaf in Tree(genome_tree_file_rooted, format=1):
        gnm_name_dict_ale_fmt_to_original_fmt[leaf.name.replace('_', '')] = leaf.name

    # First column of every non-header row, skipping ids that contain '('.
    # NOTE(review): ids with '(' are presumably leaf rows - confirm.
    overall_internal_node_set = set()
    with open(GeneContent_txt) as gene_content_handle:
        next(gene_content_handle, None)  # skip the header line
        for each_line in gene_content_handle:
            node_id = each_line.strip().split('\t')[0]
            if '(' not in node_id:
                overall_internal_node_set.add(node_id)

    if interested_internal_nodes is None:
        internal_nodes_to_process = overall_internal_node_set
    elif os.path.isfile(interested_internal_nodes):
        with open(interested_internal_nodes) as node_handle:
            internal_nodes_to_process = {each_node.strip() for each_node in node_handle if each_node.strip()}
    else:
        # a single id or a comma separated list; always stored as a set
        internal_nodes_to_process = {each_node.strip() for each_node in interested_internal_nodes.split(',') if each_node.strip()}

    #################### an addiitonal step (add ALE added internal node names to the rooted tree) #####################

    # create output directory
    # NOTE(review): indentation was lost in SOURCE; this assumes the directory
    # is created whether or not -f is given, and only its removal needs -f.
    if (force_create_op_dir is True) and os.path.isdir(op_dir):
        shutil.rmtree(op_dir)
    os.makedirs(op_dir, exist_ok=True)

    rename_leaves(SpeciesTreeRef, gnm_name_dict_ale_fmt_to_original_fmt, 1, genome_tree_file_rooted_with_ale_internal_node)

    ####################################################################################################################

    os.makedirs(faa_dir, exist_ok=True)

    if cog_annotation_wd is not None:
        os.makedirs(cog_dir, exist_ok=True)

    if kegg_annotation_wd is not None:
        os.makedirs(kegg_dir, exist_ok=True)

    # branch_to_leaf_dict:    node id -> names of the leaves below it in SpeciesTreeRef
    # branch_to_content_dict: node id -> column headers whose value is not '0'
    branch_to_leaf_dict = dict()
    branch_to_content_dict = dict()
    with open(GeneContent_txt) as gene_content_handle:
        col_header_list = next(gene_content_handle, '').strip().split('\t')
        for each_line in gene_content_handle:
            line_split = each_line.strip().split('\t')
            branch_id = line_split[0]
            if branch_id in internal_nodes_to_process:
                branch_to_leaf_dict[branch_id] = get_internal_node_leaves(SpeciesTreeRef, branch_id)
                branch_to_content_dict[branch_id] = [family_id for (family_id, pa) in zip(col_header_list[1:], line_split[1:]) if pa != '0']

    # write one faa file per node
    branch_to_gene_dict = dict()
    for each_branch in branch_to_content_dict:
        branch_faa = '%s/%s.faa' % (faa_dir, each_branch)
        branch_child_set = branch_to_leaf_dict[each_branch]
        branch_child_set_original_name = {gnm_name_dict_ale_fmt_to_original_fmt[i] for i in branch_child_set}
        branch_to_gene_dict[each_branch] = set()
        with open(branch_faa, 'w') as branch_faa_handle:
            for each_prot_family in branch_to_content_dict[each_branch]:
                each_prot_family_faa = '%s/%s.faa' % (ale1_op_dir, each_prot_family)
                for each_seq in SeqIO.parse(each_prot_family_faa, 'fasta'):
                    # the genome id is the sequence id without its last '_' field
                    seq_gnm = '_'.join(each_seq.id.split('_')[:-1])
                    if seq_gnm in branch_child_set_original_name:
                        branch_faa_handle.write('>%s %s\n' % (each_seq.id, each_prot_family))
                        branch_faa_handle.write('%s\n' % each_seq.seq)
                        branch_to_gene_dict[each_branch].add(each_seq.id)

    ####################################################################################################################

    # Read in COG annotation results
    fun_to_gene_dict = dict()
    annotation_dict_cog = dict()
    fun_id_to_desc_dict = dict()
    if cog_annotation_wd is not None:

        print('Reading in COG annotation results')
        file_list = glob.glob('%s/*COG_wd/*_query_to_cog.txt' % (cog_annotation_wd))

        if len(file_list) == 0:
            print('COG annotation file not detected, program exited!')
            raise SystemExit

        for each_file in file_list:
            gnm_id = each_file.split('/')[-1].split('_query_to_cog')[0]
            if gnm_id not in annotation_dict_cog:
                annotation_dict_cog[gnm_id] = dict()
            with open(each_file) as cog_handle:
                next(cog_handle, None)  # skip the header line
                for each_line in cog_handle:
                    each_line_split = each_line.strip().split('\t')
                    # only rows with exactly four columns are used
                    if len(each_line_split) == 4:
                        gene_id = each_line_split[0]
                        cog_id = each_line_split[1]
                        cog_desc = each_line_split[3]
                        annotation_dict_cog[gnm_id][gene_id] = cog_id
                        fun_id_to_desc_dict[cog_id] = cog_desc
                        fun_to_gene_dict.setdefault(cog_id, set()).add(gene_id)

    # Read in KEGG annotation results
    annotation_dict_kegg = dict()
    if kegg_annotation_wd is not None:

        print('Reading in KEGG annotation results')
        file_list = glob.glob('%s/*KEGG_wd/*_ko_assignment_ABCD.txt' % (kegg_annotation_wd))

        if len(file_list) == 0:
            print('KEGG annotation file not detected, program exited!')
            raise SystemExit

        for each_file in file_list:
            gnm_id = each_file.split('/')[-1].split('_ko_assignment_ABCD')[0]
            if gnm_id not in annotation_dict_kegg:
                annotation_dict_kegg[gnm_id] = dict()
            with open(each_file) as kegg_handle:
                next(kegg_handle, None)  # skip the header line
                for each_line in kegg_handle:
                    each_line_split = each_line.strip().split('\t')
                    # only rows with exactly nine columns are used
                    if len(each_line_split) == 9:
                        gene_id = each_line_split[0]
                        ko_d_id = each_line_split[4][2:]  # first two characters of column 5 are dropped
                        ko_d_desc = each_line_split[8]
                        annotation_dict_kegg[gnm_id][gene_id] = ko_d_id
                        fun_id_to_desc_dict[ko_d_id] = ko_d_desc
                        fun_to_gene_dict.setdefault(ko_d_id, set()).add(gene_id)

    # collect the functions found at each node
    cog_dod = dict()
    kegg_dod = dict()
    all_identified_cog_set = set()
    all_identified_kegg_set = set()
    if (cog_annotation_wd is not None) or (kegg_annotation_wd is not None):
        for each_branch in branch_to_gene_dict:
            branch_cog_set = set()
            branch_kegg_set = set()
            for each_gene in branch_to_gene_dict[each_branch]:
                gnm_id = '_'.join(each_gene.split('_')[:-1])
                # .get(gnm_id, {}) keeps this working when only one of the two
                # annotation types was supplied, or a genome has no annotation file
                cog_fun = annotation_dict_cog.get(gnm_id, {}).get(each_gene, 'na')
                kegg_fun = annotation_dict_kegg.get(gnm_id, {}).get(each_gene, 'na')
                if cog_fun != 'na':
                    branch_cog_set.add(cog_fun)
                    all_identified_cog_set.add(cog_fun)
                if kegg_fun != 'na':
                    branch_kegg_set.add(kegg_fun)
                    all_identified_kegg_set.add(kegg_fun)

            cog_dod[each_branch] = branch_cog_set
            kegg_dod[each_branch] = branch_kegg_set

            # write out annotation
            if len(branch_cog_set) > 0:
                cog_annotation_txt = '%s/%s_COG.txt' % (cog_dir, each_branch)
                with open(cog_annotation_txt, 'w') as cog_annotation_txt_handle:
                    for each_cog in sorted(branch_cog_set):
                        cog_annotation_txt_handle.write('%s\t%s\n' % (each_cog, fun_id_to_desc_dict[each_cog]))

            if len(branch_kegg_set) > 0:
                kegg_annotation_txt = '%s/%s_KEGG.txt' % (kegg_dir, each_branch)
                with open(kegg_annotation_txt, 'w') as kegg_annotation_txt_handle:
                    for each_kegg in sorted(branch_kegg_set):
                        kegg_annotation_txt_handle.write('%s\t%s\n' % (each_kegg, fun_id_to_desc_dict[each_kegg]))

    all_identified_cog_list_sorted = sorted(all_identified_cog_set)
    all_identified_kegg_list_sorted = sorted(all_identified_kegg_set)
    all_identified_cog_list_sorted_desc = [('%s__%s' % (i, fun_id_to_desc_dict[i])) for i in all_identified_cog_list_sorted]
    all_identified_kegg_list_sorted_desc = [('%s__%s' % (i, fun_id_to_desc_dict[i])) for i in all_identified_kegg_list_sorted]

    # write out COG dataframe (the two files differ only in their header row)
    if len(all_identified_cog_set) > 0:
        with open(cog_df_txt, 'w') as cog_df_txt_handle, open(cog_df_desc_txt, 'w') as cog_df_desc_txt_handle:
            cog_df_txt_handle.write('\t%s\n' % '\t'.join(all_identified_cog_list_sorted))
            cog_df_desc_txt_handle.write('\t%s\n' % '\t'.join(all_identified_cog_list_sorted_desc))
            for each_branch in sorted(cog_dod):
                branch_cogs = cog_dod[each_branch]
                cog_pa_list = [each_branch] + [('1' if each_cog in branch_cogs else '0') for each_cog in all_identified_cog_list_sorted]
                cog_df_txt_handle.write('\t'.join(cog_pa_list) + '\n')
                cog_df_desc_txt_handle.write('\t'.join(cog_pa_list) + '\n')
        print('Annotation matrix exported to: %s' % cog_df_txt)

    # write out KEGG dataframe (the two files differ only in their header row)
    if len(all_identified_kegg_set) > 0:
        with open(kegg_df_txt, 'w') as kegg_df_txt_handle, open(kegg_df_desc_txt, 'w') as kegg_df_desc_txt_handle:
            kegg_df_txt_handle.write('\t%s\n' % '\t'.join(all_identified_kegg_list_sorted))
            kegg_df_desc_txt_handle.write('\t%s\n' % '\t'.join(all_identified_kegg_list_sorted_desc))
            for each_branch in sorted(kegg_dod):
                branch_keggs = kegg_dod[each_branch]
                kegg_pa_list = [each_branch] + [('1' if each_kegg in branch_keggs else '0') for each_kegg in all_identified_kegg_list_sorted]
                kegg_df_txt_handle.write('\t'.join(kegg_pa_list) + '\n')
                kegg_df_desc_txt_handle.write('\t'.join(kegg_pa_list) + '\n')
        print('Annotation matrix exported to: %s' % kegg_df_txt)

    ################################# get transfer propensity of individual functions ##################################

    print('Getting transfer propensity of individual function')

    # get gene_to_oma_dict (sequence id -> base name of the faa file holding it)
    gene_to_oma_dict = dict()
    for faa_file in glob.glob('%s/*.faa' % ale1_op_dir):
        _, _, faa_base, _ = sep_path_basename_ext(faa_file)
        for each_seq in SeqIO.parse(faa_file, 'fasta'):
            gene_to_oma_dict[each_seq.id] = faa_base

    # get oma_to_transfer_propensity_dict (first column -> float of second column)
    oma_to_transfer_propensity_dict = dict()
    with open(transfer_propensity_txt) as transfer_propensity_handle:
        next(transfer_propensity_handle, None)  # skip the header line
        for each_oma in transfer_propensity_handle:
            each_oma_split = each_oma.strip().split('\t')
            oma_to_transfer_propensity_dict[each_oma_split[0]] = float(each_oma_split[1])

    with open(fun_transfer_propensity_txt, 'w') as fun_transfer_propensity_txt_handle:
        fun_transfer_propensity_txt_handle.write('ID\tWeighted_transfer_propensity\tDescription\n')
        for fun_id in sorted(fun_to_gene_dict):

            # number of genes of this function in each family ('na' = no family)
            current_fun_oma_stats_dict = dict()
            for gene_id in fun_to_gene_dict[fun_id]:
                gene_oma = gene_to_oma_dict.get(gene_id, 'na')
                current_fun_oma_stats_dict[gene_oma] = current_fun_oma_stats_dict.get(gene_oma, 0) + 1

            # NOTE(review): indentation was lost in SOURCE; this assumes only
            # families with a known propensity count towards the denominator.
            total_transfer_propensity = 0
            total_oma_num = 0
            for oma_id, oma_num in current_fun_oma_stats_dict.items():
                oma_transfer_propensity = oma_to_transfer_propensity_dict.get(oma_id)
                if oma_transfer_propensity is not None:
                    total_transfer_propensity += (oma_num*oma_transfer_propensity)
                    total_oma_num += oma_num

            # NOTE(review): assumes functions without any usable propensity are
            # left out of the output instead of being written as 'na' - confirm.
            if total_oma_num != 0:
                oma_transfer_propensity_weighted = total_transfer_propensity/total_oma_num
                oma_transfer_propensity_weighted = float("{0:.3f}".format(oma_transfer_propensity_weighted))
                fun_transfer_propensity_txt_handle.write('%s\t%s\t%s\n' % (fun_id, oma_transfer_propensity_weighted, fun_id_to_desc_dict[fun_id]))

    print('Done!')
387
+
388
+
389
if __name__ == '__main__':

    # Command line entry point: parse the options into a dict and run ALE6.
    ALE6_parser = argparse.ArgumentParser()
    ALE6_parser.add_argument('-1', required=True, help='ALE1 output directory')
    ALE6_parser.add_argument('-3', required=True, help='ALE3 output directory')
    ALE6_parser.add_argument('-s', required=True, help='rooted species tree')
    ALE6_parser.add_argument('-n', required=False, default=None, help='interested internal node(s)')
    ALE6_parser.add_argument('-cog', required=False, default=None, help='COG annotation results')
    ALE6_parser.add_argument('-kegg', required=False, default=None, help='KEGG annotation results')
    ALE6_parser.add_argument('-o', required=True, help='output directory')
    ALE6_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    # vars() turns the Namespace into the dict that ALE6 indexes by option name
    args = vars(ALE6_parser.parse_args())
    ALE6(args)