treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,198 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from ete3 import Tree
5
+
6
+
7
+ VisHPD95_usage = '''
8
+ ============================ VisHPD95 example command ============================
9
+
10
+ TreeSAK VisHPD95 -i mcmc_out -o HPD95.pdf -n nodes.txt -label label.txt
11
+ TreeSAK VisHPD95 -i mcmc_out -o HPD95.pdf -n nodes.txt -label label.txt -x 9 -y 6
12
+
13
+ # Example data
14
+ https://github.com/songweizhi/TreeSAK/tree/master/example_data/VisHPD95
15
+
16
+ ==================================================================================
17
+ '''
18
+
19
def mcmctree_out_to_tree_str(mamctree_out):
    """Extract the FigTree species tree from an MCMCTree out file and rename its nodes.

    The tree is the line that directly follows the line containing
    'Species tree for FigTree. Branch lengths = posterior mean times; 95% CIs = labels'.
    If that header occurs more than once, the tree after the last occurrence is used.

    Renaming rules:
      * leaves: everything up to and including the first '_' is dropped
        (presumably the numeric index MCMCTree prepends -- confirm against real output);
      * internal nodes: the name is prefixed with 't_n'.

    :param mamctree_out: path to the MCMCTree out file
    :return: the renamed tree as a newick string (ete3 format 8)
    :raises ValueError: if no tree line could be found in the file
    """
    figtree_header = 'Species tree for FigTree. Branch lengths = posterior mean times; 95% CIs = labels'

    # get tree string from mamctree_out
    tree_str = ''
    take_next_line = False
    with open(mamctree_out) as mcmctree_out_handle:
        for each_line in mcmctree_out_handle:
            if figtree_header in each_line:
                take_next_line = True
            elif take_next_line:
                tree_str = each_line.strip()
                take_next_line = False

    tree_str_no_space = tree_str.replace(' ', '')
    if tree_str_no_space == '':
        raise ValueError('FigTree species tree not found in: %s' % mamctree_out)

    # rename tree nodes
    t = Tree(tree_str_no_space, format=1)
    for each_node in t.traverse():
        if each_node.is_leaf():
            each_node.name = '_'.join(each_node.name.split('_')[1:])
        else:
            each_node.name = 't_n%s' % each_node.name

    return t.write(format=8)
46
+
47
+
48
def get_internal_node_to_plot(node_txt, mo_file):
    """Work out which internal nodes should be plotted.

    :param node_txt: either a path to a tab-separated file or, if no such file
        exists, a comma-separated string of node ids. In the file, column 1 is
        a node id or a comma-separated list of leaves whose common ancestor is
        looked up in the tree; an optional column 2 is the display name.
    :param mo_file: path to an MCMCTree out file; when it exists its renamed
        tree is loaded via mcmctree_out_to_tree_str().
    :return: (nodes, node_rename_dict, tree_str). ``nodes`` is a set when
        node_txt is a file and a list (input order kept) when it is a
        comma-separated string; ``tree_str`` is '' when mo_file does not exist.
    :raises SystemExit: if a leaf list is given but no tree is available
    """
    tree_str = ''
    if os.path.isfile(mo_file):
        tree_str = mcmctree_out_to_tree_str(mo_file)

    node_rename_dict = dict()

    # node_txt is not a file: interpret it as a comma-separated list of node ids
    if not os.path.isfile(node_txt):
        return node_txt.split(','), node_rename_dict, tree_str

    # get nodes to plot
    node_set = set()
    parsed_tree = None  # parsed lazily and only once, not once per line
    with open(node_txt) as node_txt_handle:
        for each in node_txt_handle:
            each_split = each.strip().split('\t')
            node_str = each_split[0]

            # get internal_node_to_plot
            if ',' not in node_str:
                internal_node_to_plot = node_str
            else:
                if tree_str == '':
                    print('MCMCTree out file not found, program exited!')
                    # non-zero status so calling scripts can detect the failure
                    raise SystemExit(1)
                if parsed_tree is None:
                    parsed_tree = Tree(tree_str, format=1)
                internal_node_to_plot = parsed_tree.get_common_ancestor(node_str.split(',')).name

            # add internal_node_to_plot to node_set (blank lines yield '')
            if internal_node_to_plot != '':
                node_set.add(internal_node_to_plot)

            # read in name to show in plot
            if (len(each_split) == 2) and (each_split[1] != ''):
                node_rename_dict[internal_node_to_plot] = each_split[1]

    return node_set, node_rename_dict, tree_str
86
+
87
+
88
def read_in_posterior_mean(mcmctree_out):
    """Read posterior means and 95% HPD bounds from an MCMCTree out file.

    Only lines after the header
    'Posterior mean (95% Equal-tail CI) (95% HPD CI) HPD-CI-width' are
    considered. Each line is split on single spaces, brackets and commas are
    removed, and only lines that yield exactly nine values are kept.

    :param mcmctree_out: path to the MCMCTree out file
    :return: dict mapping the first value of a line (the node id) to
        [mean, hpd95_lower, hpd95_upper], all kept as strings
    """
    posterior_mean_header = 'Posterior mean (95% Equal-tail CI) (95% HPD CI) HPD-CI-width'

    node_to_mean_hpd95_dict = dict()
    header_seen = False
    with open(mcmctree_out) as mcmctree_out_handle:
        for each_line in mcmctree_out_handle:

            # the header line itself carries no values
            if posterior_mean_header in each_line:
                header_seen = True
                continue
            if not header_seen:
                continue

            each_line_split_no_empty = [
                each_element.replace('(', '').replace(')', '').replace(',', '')
                for each_element in each_line.strip().split(' ')
                if each_element not in ['', '(']]

            # id, mean, equal-tail low/high, HPD low/high, width + two trailing values
            if len(each_line_split_no_empty) == 9:
                node_id = each_line_split_no_empty[0]
                value_mean = each_line_split_no_empty[1]
                value_hpd95_small = each_line_split_no_empty[4]
                value_hpd95_big = each_line_split_no_empty[5]
                node_to_mean_hpd95_dict[node_id] = [value_mean, value_hpd95_small, value_hpd95_big]

    return node_to_mean_hpd95_dict
115
+
116
+
117
def VisHPD95(args):
    """Collect posterior means and 95% HPD intervals and plot them with VisHPD95.R.

    :param args: dict with the keys
        'i'     MCMCTree out file, or a folder searched for '*_out.txt' files
        'n'     node file or comma-separated node ids (see get_internal_node_to_plot)
        'label' optional three-column, tab-separated file keyed by out-file name
        'o'     output plot; the intermediate table is written to '<o>.txt'
        'x'     plot width
        'y'     plot height
    :raises SystemExit: if no out file is found or the label file is malformed
    """
    # local import: the module-level import block is left untouched
    import subprocess

    mcmc_in = args['i']
    node_txt = args['n']
    label_txt = args['label']
    plot_out = args['o']
    plot_width = args['x']
    plot_height = args['y']

    # the R script is expected to sit next to this file
    current_file_path = os.path.dirname(os.path.realpath(__file__))
    VisHPD95_R = os.path.join(current_file_path, 'VisHPD95.R')

    dm_out = '%s.txt' % plot_out

    # check MCMCTree output file/dir
    if os.path.isfile(mcmc_in):
        mcmc_out_file_list = [mcmc_in]
    else:
        mcmc_out_file_list = glob.glob('%s/*_out.txt' % mcmc_in)

    if not mcmc_out_file_list:
        print('MCMCTree out file not found, program exited!')
        raise SystemExit(1)

    # read in label file: column 1 = out-file name, column 2 = value written to
    # the first output column, column 3 = value written to the 'Shape' column
    color_dict = dict()
    shape_dict = dict()
    if label_txt is not None:
        with open(label_txt) as label_handle:
            for each_sample in label_handle:
                if each_sample.strip() == '':
                    continue  # tolerate blank lines instead of aborting
                each_sample_split = each_sample.strip().split('\t')
                if len(each_sample_split) != 3:
                    print('Format error: %s' % label_txt)
                    raise SystemExit(1)
                color_dict[each_sample_split[0]] = each_sample_split[1]
                shape_dict[each_sample_split[0]] = each_sample_split[2]

    with open(dm_out, 'w') as dm_out_handle:
        dm_out_handle.write('Test\tShape\tVar\tMean\tLow\tHigh\n')
        for mcmc_out_file in mcmc_out_file_list:
            mcmc_out_file_no_path = os.path.basename(mcmc_out_file)

            # fall back to the file name when the label file has no entry
            color_col_to_write = color_dict.get(mcmc_out_file_no_path, mcmc_out_file_no_path)
            shape_col_to_write = shape_dict.get(mcmc_out_file_no_path, mcmc_out_file_no_path)
            node_set, node_rename_dict, tree_str = get_internal_node_to_plot(node_txt, mcmc_out_file)
            node_to_mean_95_hpd_dict = read_in_posterior_mean(mcmc_out_file)

            for each_node in node_set:
                mean_95_hpd_list = node_to_mean_95_hpd_dict.get(each_node)
                if mean_95_hpd_list is None:
                    # previously this crashed in '\t'.join(None)
                    print('Node %s not found in %s, skipped!' % (each_node, mcmc_out_file))
                    continue
                node_name_to_write = node_rename_dict.get(each_node, each_node)
                dm_out_handle.write('%s\t%s\t%s\t%s\n' % (color_col_to_write, shape_col_to_write, node_name_to_write, '\t'.join(mean_95_hpd_list)))

    # argument list instead of a shell string, so paths with spaces or shell
    # metacharacters are passed to Rscript unchanged
    plot_cmd = ['Rscript', VisHPD95_R, '-i', dm_out, '-x', str(plot_width), '-y', str(plot_height), '-o', plot_out]
    plot_run = subprocess.run(plot_cmd)
    if plot_run.returncode == 0:
        print('Plot exported to: %s' % plot_out)
    else:
        print('Rscript failed with exit code %s, plot not exported: %s' % (plot_run.returncode, plot_out))
179
+
180
+
181
if __name__ == '__main__':

    # Stand-alone entry point: parse the command-line options and pass them
    # to VisHPD95() as a plain dict keyed by option name.
    cli = argparse.ArgumentParser()
    cli.add_argument('-i', required=True, help='mcmc.txt file or folder')
    cli.add_argument('-n', required=True, help='Nodes to plot')
    cli.add_argument('-label', required=False, default=None, help='labels on y axis')
    cli.add_argument('-x', required=False, default=8, type=int, help='plot width, default: 8')
    cli.add_argument('-y', required=False, default=5, type=int, help='plot height, default: 5')
    cli.add_argument('-o', required=True, help='Output plot')
    parsed_options = cli.parse_args()
    VisHPD95(vars(parsed_options))
192
+
193
+ '''
194
+
195
+ cd /Users/songweizhi/Desktop/777
196
+ python3 ~/PycharmProjects/TreeSAK/TreeSAK/VisHPD95.py -i M1_mcmc_txt -o M1_HPD95.pdf -n nodes_five.txt -label y_label_out.txt
197
+
198
+ '''
@@ -0,0 +1,141 @@
1
+ import os
2
+ import argparse
3
+
4
+
5
+ parse_reltime_usage = '''
6
+ ==================== parse_reltime example commands ====================
7
+
8
+ TreeSAK parse_reltime -i RelTime.txt -n dbscc_lca.txt -o dbscc_age.txt
9
+
10
+ ========================================================================
11
+ '''
12
+
13
def sep_path_basename_ext(file_in):
    """Split a path into (file name, directory, base name, extension).

    The directory is '.' when file_in has no directory part, and the
    extension is returned without its leading dot.
    """
    directory, file_name = os.path.split(file_in)
    stem, dotted_ext = os.path.splitext(file_name)
    return file_name, directory or '.', stem, dotted_ext[1:]
21
+
22
+
23
def get_lca(reltime_txt, leaf_1_name, leaf_2_name):
    """Return the id of the last common ancestor of two nodes in a RelTime table.

    The table is tab-separated; the first four columns are read as
    node label, node id, first descendant id and second descendant id, with
    '-' marking "no descendant". Lines starting with 'NodeLabel' are skipped.
    Spaces in node labels are replaced by '_' before matching.

    :param reltime_txt: path to the RelTime output table
    :param leaf_1_name: label of the first node (spaces written as '_')
    :param leaf_2_name: label of the second node (spaces written as '_')
    :return: node id of the last common ancestor, or '' if the two nodes
        share no ancestor in the table
    :raises KeyError: if either label is not present in the table
    """
    child_to_parent_dict = dict()
    name_to_id_dict = dict()
    with open(reltime_txt) as reltime_handle:
        for each_line in reltime_handle:
            if each_line.startswith('NodeLabel'):
                continue

            # only the line ending is removed before splitting: stripping the
            # whole line would eat the leading tab of rows whose label is
            # empty and shift every column by one
            each_line_split = [i.strip() for i in each_line.rstrip('\r\n').split('\t')]
            if (len(each_line_split) < 4) or (each_line_split[1] == ''):
                continue

            node_name = each_line_split[0].replace(' ', '_')
            node_id, des1, des2 = each_line_split[1:4]
            if node_name != '':
                name_to_id_dict[node_name] = node_id
            for each_des in (des1, des2):
                if each_des != '-':
                    child_to_parent_dict[each_des] = node_id

    for each_name in (leaf_1_name, leaf_2_name):
        if each_name not in name_to_id_dict:
            raise KeyError('%s not found in %s' % (each_name, reltime_txt))

    def lineage_of(node_id):
        # node -> parent -> ... -> root; stops if the table contains a cycle
        lineage_list = [node_id]
        visited = {node_id}
        while node_id in child_to_parent_dict:
            node_id = child_to_parent_dict[node_id]
            if node_id in visited:
                break
            visited.add(node_id)
            lineage_list.append(node_id)
        return lineage_list

    leaf_1_linage = lineage_of(name_to_id_dict[leaf_1_name])
    leaf_2_linage = set(lineage_of(name_to_id_dict[leaf_2_name]))

    # walking upwards from the first node, the first shared node is the LCA
    for each_p in leaf_1_linage:
        if each_p in leaf_2_linage:
            return each_p
    return ''
65
+
66
+
67
def parse_reltime(args):
    """Extract divergence times of selected nodes from a RelTime output table.

    :param args: dict with the keys
        'i'  RelTime output table (tab-separated, first line is the header)
        'n'  node file: column 1 is 'leaf_1,leaf_2', optional column 2 is a
             description used as the node name in the output
        'o'  output table with the columns Node, DivTime, CI_Lower, CI_Upper

    Two files are written: '<o>' and '<dir>/<base>_all_info.<ext>', the latter
    holding the complete RelTime rows of the selected nodes prefixed with the
    leaf pair and its description.

    :raises ValueError: if a node line does not hold two comma-separated leaves
    :raises IndexError: if a selected row has no divergence-time columns
    """
    reltime_txt = args['i']
    interested_nodes_txt = args['n']
    op_txt = args['o']

    f_name, f_path, f_base, f_ext = sep_path_basename_ext(op_txt)
    op_txt_all_info = '%s/%s_all_info.%s' % (f_path, f_base, f_ext)

    # LCA id -> leaf pairs; a list, because several pairs may share one LCA
    lca_to_leaves_dict = dict()
    interested_node_desc_dict = dict()
    with open(interested_nodes_txt) as interested_nodes_handle:
        for interested_node in interested_nodes_handle:
            interested_node_split = interested_node.strip().split('\t')
            paired_leaves = interested_node_split[0].strip()
            if paired_leaves == '':
                continue  # blank line

            leaf_list = [i.strip() for i in paired_leaves.split(',')]
            if len(leaf_list) < 2:
                raise ValueError('Two comma-separated leaves expected, got: %s' % paired_leaves)

            interested_node_desc = paired_leaves
            if len(interested_node_split) > 1:
                interested_node_desc = interested_node_split[1]
            interested_node_desc_dict[paired_leaves] = interested_node_desc

            lca_id = get_lca(reltime_txt, leaf_list[0], leaf_list[1])
            if lca_id == '':
                print('No common ancestor found for %s, skipped!' % paired_leaves)
                continue
            leaves_of_lca = lca_to_leaves_dict.setdefault(lca_id, [])
            if paired_leaves not in leaves_of_lca:
                leaves_of_lca.append(paired_leaves)

    # positions used when the header does not name the three columns
    col_div_time, col_ci_lower, col_ci_upper = 7, 8, 9
    time_columns = ('DivTime', 'CI_Lower', 'CI_Upper')

    with open(reltime_txt) as reltime_handle:
        with open(op_txt_all_info, 'w') as op_txt_all_info_handle:
            with open(op_txt, 'w') as op_txt_handle:
                op_txt_handle.write('Node\tDivTime\tCI_Lower\tCI_Upper\n')
                for line_num_index, each_line in enumerate(reltime_handle):

                    # keep leading empty fields so columns of unlabeled rows do not shift
                    each_line_split = [i.strip() for i in each_line.rstrip('\r\n').split('\t')]

                    if line_num_index == 0:
                        op_txt_all_info_handle.write('Leaves\tDescription\t%s\n' % ('\t'.join(each_line_split)))
                        if all(i in each_line_split for i in time_columns):
                            col_div_time, col_ci_lower, col_ci_upper = [each_line_split.index(i) for i in time_columns]
                        continue

                    if len(each_line_split) < 2:
                        continue
                    node_id = each_line_split[1]
                    if node_id not in lca_to_leaves_dict:
                        continue
                    if len(each_line_split) <= max(col_div_time, col_ci_lower, col_ci_upper):
                        raise IndexError('Divergence time columns missing for node %s in %s' % (node_id, reltime_txt))

                    for corresponding_leaves in lca_to_leaves_dict[node_id]:
                        interested_node_desc = interested_node_desc_dict[corresponding_leaves]
                        op_txt_all_info_handle.write('%s\t%s\t%s\n' % (corresponding_leaves, interested_node_desc, '\t'.join(each_line_split)))
                        op_txt_handle.write('%s\t%s\t%s\t%s\n' % (interested_node_desc, each_line_split[col_div_time], each_line_split[col_ci_lower], each_line_split[col_ci_upper]))
121
+
122
+
123
if __name__ == '__main__':

    # Stand-alone entry point: parse the three required options and pass them
    # to parse_reltime() as a dict keyed by option name.
    cli = argparse.ArgumentParser()
    cli.add_argument('-i', required=True, help='reltime output file')
    cli.add_argument('-n', required=True, help='interested node txt')
    cli.add_argument('-o', required=True, help='output txt file')
    parsed_options = cli.parse_args()
    parse_reltime(vars(parsed_options))
131
+
132
+
133
+ '''
134
+
135
+ cd /Users/songweizhi/Desktop
136
+ python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/parse_reltime.py -i /Users/songweizhi/Desktop/Sponge_r220/6_dating/RelTime/topo2_p30_RelTime_JTT_Gamma4/topo2_p30_RelTime_Gamma4.txt -n yang_7.txt -o dbscc_age.txt
137
+
138
+ cd /Users/songweizhi/Desktop
139
+ TreeSAK parse_reltime -i /Users/songweizhi/Desktop/Sponge_r220/6_dating/RelTime/topo2_p30_RelTime_JTT_Gamma4/topo2_p30_RelTime_Gamma4.txt -n yang_7.txt -o dbscc_age.txt
140
+
141
+ '''
TreeSAK/phy2fa.py ADDED
@@ -0,0 +1,37 @@
1
+ import argparse
2
+ from Bio import SeqIO
3
+ from Bio import AlignIO
4
+
5
+
6
+ phy2fa_usage = '''
7
+ ======= phy2fa example commands =======
8
+
9
+ TreeSAK phy2fa -i msa.phy -o msa.fa
10
+
11
+ =======================================
12
+ '''
13
+
14
+
15
def phy2fa(args):
    """Convert a multiple sequence alignment from phylip to fasta format.

    :param args: dict with the keys
        'i'  input MSA in phylip format
        'o'  output MSA in fasta format
    """
    phylip_in = args['i']
    fasta_out = args['o']

    # The output file used to be ignored: the alignment was only printed to
    # the screen. Every alignment found in the input is now written out.
    alignments = list(AlignIO.parse(phylip_in, "phylip"))
    AlignIO.write(alignments, fasta_out, "fasta")

    record_count = sum(len(aln) for aln in alignments)
    print("Converted %i records" % record_count)
28
+
29
+
30
if __name__ == '__main__':

    # Stand-alone entry point: read the input/output options and run the
    # conversion with them packed into a dict.
    cli = argparse.ArgumentParser()
    cli.add_argument('-i', required=True, help='input MSA in phylip format')
    cli.add_argument('-o', required=True, help='output MSA in fasta format')
    parsed_options = cli.parse_args()
    phy2fa(vars(parsed_options))
@@ -0,0 +1,165 @@
1
+ import io
2
+ import pandas as pd
3
+ from tqdm import tqdm
4
+ from ete3 import Tree
5
+ from glob import glob
6
+ from os.path import *
7
+ import plotly.express as px
8
+ import plotly.graph_objects as go
9
+ import plotly.figure_factory as ff
10
+
11
+
12
def read_mcmc(mcmc, all_col=False):
    """Load a tab-separated MCMC sample file into a DataFrame.

    :param mcmc: path to the sample file; any non-string value is returned
        unchanged, so an already loaded DataFrame can be passed through
    :param all_col: if False (default), columns whose header starts with
        'r_g' are dropped before parsing
    :return: DataFrame indexed by the first remaining column
    """
    if not isinstance(mcmc, str):
        return mcmc

    if all_col:
        return pd.read_csv(mcmc, sep='\t', index_col=0)

    with open(mcmc) as f1:
        header = next(f1).strip().split('\t')
        keep_flags = [not h.startswith('r_g') for h in header]

        # filter the columns row by row so the dropped ones are never parsed
        kept_rows = ['\t'.join(h for h, keep in zip(header, keep_flags) if keep)]
        for row in f1:
            kept_rows.append('\t'.join(r for r, keep in zip(row.strip().split('\t'), keep_flags) if keep))

    text = '\n'.join(kept_rows) + '\n'
    return pd.read_csv(io.StringIO(text), sep='\t', index_col=0)
28
+
29
+
30
def get_node_name_from_log(f):
    """Build a tree with named internal nodes from an MCMCTree log file.

    The function looks for the row 'Species tree', reads the table that
    starts three rows below it and ends at the next empty row, and loads the
    tree found two rows after that empty row. Each table row is split on
    spaces; its first value is stored as the father of the row's node, keyed
    by the third value when the row has four values and by the second value
    otherwise. Unnamed parents in the tree are then named from that mapping.

    :param f: path to the *.log file
    :return: the ete3 tree, or None (after printing 'prior not complete')
        when the 'Species tree' section is missing or truncated
    """
    with open(f) as log_handle:
        rows = log_handle.read().split('\n')

    idx = [_ for _, r in enumerate(rows) if r == 'Species tree']
    if not idx:
        print("prior not complete")
        return
    idx = idx[0]
    start_idx = idx + 3

    # first empty row after the section title marks the end of the table
    end_idx = next((i for i in range(idx, len(rows)) if rows[i] == ''), None)
    if (end_idx is None) or (end_idx + 2 >= len(rows)):
        print("prior not complete")
        return
    tree_idx2 = end_idx + 2

    # find the index
    n2father = {}
    for i in range(start_idx, end_idx):
        row = [_ for _ in rows[i].split(' ') if _]
        father, n, name = row[0], row[1], row[2]
        n2father[name if len(row) == 4 else n] = father

    t = Tree(rows[tree_idx2], format=8)
    for node in t.traverse('postorder'):
        if node.up is None:
            break  # only the root has no parent
        if not node.up.name:
            node.up.name = n2father[node.name]
    return t
60
+
61
+
62
# Input/output locations (hard-coded for the original analysis)
indir = '/Users/songweizhi/Desktop/DateArTree/plot_distruibution/stepwise'
tree_dir = '/Users/songweizhi/Desktop/DateArTree/plot_distruibution/treefile_dir'
plot_dir = '/Users/songweizhi/Desktop/DateArTree/plot_distruibution'

gene_names = ['M24', 'COG25']
M24_gene_list = ['MitoCOG0043', 'MitoCOG0040', 'MitoCOG0055', 'MitoCOG0052', 'MitoCOG0053', 'MitoCOG0133', 'MitoCOG0008', 'MitoCOG0009', 'MitoCOG0027', 'MitoCOG0031', 'MitoCOG0030', 'MitoCOG0001', 'MitoCOG0003', 'MitoCOG0012', 'MitoCOG0010', 'MitoCOG0004', 'MitoCOG0005', 'MitoCOG0011', 'MitoCOG0039', 'MitoCOG0060', 'MitoCOG0071', 'MitoCOG0059', 'MitoCOG0067', 'MitoCOG0066']
COG25_gene_list = ['223163', '223176', '223175', '223607', '223159', '223165', '223170', '223164', '223158', '223172', '223128', '223665', '223275', '223328', '223280', '223127', '223279', '273102', '223130', '223181', '223180', '223168', '223178', '223596', '223556']

setname2genes = dict()
setname2genes['M24'] = M24_gene_list
setname2genes['COG25'] = COG25_gene_list

# Per gene: number of leaves in its tree and a deltaL value read from the
# table in its .iqtree report whose header contains "deltaL bp-RELL"
gene2num = {}
gene2dl = {}
for gene_id in (M24_gene_list + COG25_gene_list):
    pwd_tree_file = '%s/%s.treefile' % (tree_dir, gene_id)
    pwd_iqtree_log = '%s/%s.iqtree' % (tree_dir, gene_id)
    with open(pwd_iqtree_log) as iqtree_handle:
        rows = iqtree_handle.read().strip().split("\n")

    header_hits = [i for i, v in enumerate(rows) if "deltaL bp-RELL" in v]
    if not header_hits:
        raise ValueError('"deltaL bp-RELL" table not found in: %s' % pwd_iqtree_log)
    idx = header_hits[0]

    # the two table rows sit two and three lines below the header
    r1, r2 = rows[idx + 2], rows[idx + 3]
    r1 = [_ for _ in r1.strip().split(" ") if _]
    r2 = [_ for _ in r2.strip().split(" ") if _]

    # take the third value of whichever row is not reported as "0"
    if r2[2] == "0":
        gene2dl[gene_id] = float(r1[2])
    else:
        gene2dl[gene_id] = float(r2[2])
    gene2num[gene_id] = len(Tree(pwd_tree_file).get_leaf_names())

# plot 1: per gene set, deltaL values as a bar chart in descending order
for setname, genes in setname2genes.items():
    dl_list = sorted((gene2dl[_] for _ in genes), reverse=True)
    fig = go.Figure()
    fig.add_bar(y=dl_list)
    fig.update_layout(title_text=setname, title_x=0.5, title_y=1, width=700, height=100, template='simple_white',
                      margin_b=10, margin_l=10, margin_r=10, margin_t=10)
    fig.write_image('%s/Plot_1_%s.pdf' % (plot_dir, setname))
101
+
102
# Leaf pairs whose common ancestor is tracked, with the group they belong to
lca_groups = [('GCA_001828545.1,GCA_005524015.1', 'Anammox'), ('GCA_013697045.1,GCA_002356115.1', 'Gamma-AOB'),
              ('GCA_001772005.1,GCA_013521015.1', 'Beta-AOB'), ('GCA_017879665.1,GCA_013140535.1', 'Comammox'),
              ('Acanthamoeba_castellanii,Andalucia_godoyi', 'Euk'), ('Andalucia_godoyi,Ostreococcus_tauri', 'Euk'),
              ('Cyanophora_paradoxa,NC_002186.1', 'Euk')]

for gene_set in gene_names:
    for _model in ['LG']:  # C60

        # collect finished runs (FigTree.tre present), ordered by the number
        # in their 'r<number>' folder name
        t = []
        for f in glob(f'{indir}/{gene_set}/r*/1pf_{_model}/mcmctree/mcmc.txt'):
            if exists(f.replace('mcmc.txt', 'FigTree.tre')):
                t.append((f.split('/')[-4] + ' MCMC', f))
        t = sorted(t, key=lambda x: int(x[0].split(' ')[0][1:]))

        dfs = []
        targets = []
        for cal, mcmc in tqdm(t):
            tre = get_node_name_from_log(mcmc.replace('mcmc.txt', '03_mcmctree.log'))
            df = read_mcmc(mcmc)
            try:
                df = df.sample(5000)
            except ValueError:
                # fewer than 5000 samples: report the file and keep all rows
                print(mcmc)

            for lca, name in lca_groups:
                try:
                    n = tre.get_common_ancestor(lca.split(',')).name
                    targets.append(str(n))
                    n = 't_n' + str(n)
                    times = df[[n]].copy()
                except Exception:
                    # log not parsed, leaves not in this tree or node not
                    # sampled: this group is skipped for this run
                    continue

                # work on the copy so the column assignments do not touch df
                times.columns = ['time']
                times['group name'] = name
                times['cal'] = cal
                dfs.append(times)

        if not dfs:
            print('No node ages collected for %s %s, plots skipped' % (gene_set, _model))
            continue

        # plot 2
        _df = pd.concat(dfs, axis=0)
        g2color = {"Gamma-AOB": "#78fce0", "Beta-AOB": "#956bb4", "Comammox": "#edc21a", "Anammox": "#ff8000"}
        _df = _df.loc[_df["group name"].isin(list(g2color)), :]
        _fig = px.violin(_df, y="cal", x="time", color="group name", color_discrete_map=g2color, points=False, orientation="h")
        _fig.update_traces(side="positive", fillcolor='rgba(0,0,0,0)', width=1.8)
        _fig.update_traces(showlegend=False)
        num_y = len(_df["cal"].unique())
        _fig.layout.template = "simple_white"
        _fig.layout.width = 700
        _fig.layout.height = 750
        _fig.update_xaxes(range=[40, 0])
        _fig.update_layout(margin_t=10, title_text=f'{gene_set} {_model}', title_x=0.5)
        _fig.write_image(f'{plot_dir}/Plot_2_{gene_set}_gradient_{_model}.pdf')

        # plot 3: difference between the median Anammox and Gamma-AOB times per run
        xs = []
        ys = []
        for ng, subdf in sorted(_df.groupby('cal'), key=lambda x: int(x[0].split(' ')[0].replace('r', ''))):
            t1 = subdf.loc[subdf['group name'] == 'Gamma-AOB', 'time'].median()
            t2 = subdf.loc[subdf['group name'] == 'Anammox', 'time'].median()
            deltaT = t2 - t1
            ys.append(deltaT)
            xs.append(int(ng.split(' ')[0].replace('r', '')))
        fig = go.Figure()
        fig.add_scatter(x=xs, y=ys, mode='markers+lines', showlegend=False)
        fig.update_layout(width=300, height=300, margin_t=30, margin_l=10, margin_b=10, margin_r=10,
                          template='simple_white', title_text=f'{gene_set} {_model}', title_x=0.5)
        fig.write_image('%s/Plot_3_%s_%s.pdf' % (plot_dir, gene_set, _model))
165
+
@@ -0,0 +1,92 @@
1
+ import itertools
2
+
3
+
4
def prep_mcmctree_ctl(ctl_para_dict, mcmctree_ctl_file):
    """Write an MCMCTree control file.

    :param ctl_para_dict: settings keyed by control-file variable name.
        'seqfile', 'treefile', 'mcmcfile', 'outfile', 'seqtype', 'usedata'
        and 'clock' are required; all others fall back to the defaults below.
    :param mcmctree_ctl_file: path of the control file to write
    :raises KeyError: if a required setting is missing (raised before the
        output file is opened, so no partial file is left behind)
    """
    required = object()  # marks settings without a default

    # (variable name, default value), in the order they are written
    settings = (
        # this line used to be written under the name 'finetune', which left
        # the file with two 'finetune' entries and no 'seed'
        ('seed', '-1'),
        ('seqfile', required),
        ('treefile', required),
        ('mcmcfile', required),
        ('outfile', required),
        ('ndata', 1),
        ('seqtype', required),
        ('usedata', required),
        ('clock', required),
        ('RootAge', '<1.0'),
        ('model', 0),
        ('alpha', 0.5),
        ('ncatG', 4),
        ('cleandata', 0),
        ('BDparas', '1 1 0.1'),
        ('kappa_gamma', '6 2'),
        ('alpha_gamma', '1 1'),
        ('rgene_gamma', '1 50 1'),
        ('sigma2_gamma', '1 10 1'),
        ('finetune', '1: .1 .1 .1 .1 .1 .1'),
        ('print', 1),
        ('burnin', 50000),
        ('sampfreq', 5),
        ('nsample', 50000),
    )

    ctl_lines = []
    for para_name, para_default in settings:
        if para_default is required:
            para_value = ctl_para_dict[para_name]
        else:
            para_value = ctl_para_dict.get(para_name, para_default)
        ctl_lines.append(' %s = %s\n' % (para_name, para_value))

    with open(mcmctree_ctl_file, 'w') as ctl_file_handle:
        ctl_file_handle.writelines(ctl_lines)
31
+
32
+
33
# Example settings; only the required variables are given, the rest use the
# defaults of prep_mcmctree_ctl()
mcmctree_ctl_dict = {'seqfile' : 'concatenated.phy',
                     'treefile': 'deltall75_pa75_rooted_with_calibrations.nwk',
                     'mcmcfile': 'mcmc.txt',
                     'outfile' : 'DateArTree_out.txt',
                     'seqtype' : 2,
                     'usedata' : 3,
                     'clock'   : 3}


# The demo writes to a developer-specific path, so it only runs when this file
# is executed directly; importing the module no longer touches the file system.
if __name__ == '__main__':
    prep_mcmctree_ctl(mcmctree_ctl_dict, '/Users/songweizhi/Desktop/aaa.txt')
43
+
44
+
45
def get_parameter_combinations(para_to_test_dict):
    """Enumerate every combination of the given parameter settings.

    :param para_to_test_dict: dict mapping a parameter name to the list of
        values to test for it
    :return: dict mapping a combination label to a {parameter: value} dict.
        Parameters are taken in sorted order and so are the values of each
        parameter; a label joins '<parameter><value>' pieces with '_', with
        spaces inside a piece replaced by '_' (e.g. 'clock2_kappa_gamma5_1').
    """
    para_names = sorted(para_to_test_dict)
    para_value_lists = [sorted(para_to_test_dict[each_para]) for each_para in para_names]

    para_dod = dict()
    for value_combination in itertools.product(*para_value_lists):
        combination_label = '_'.join(
            ('%s%s' % (each_para, each_setting)).replace(' ', '_')
            for each_para, each_setting in zip(para_names, value_combination))
        para_dod[combination_label] = dict(zip(para_names, value_combination))

    return para_dod
80
+
81
+
82
# Example: all combinations of four MCMCTree settings
para_to_test_dict = {'clock': [2, 3], 'nsample': [20000, 50000], 'model': [0, 4], 'kappa_gamma': ['6 2', '5 1']}
para_dod = get_parameter_combinations(para_to_test_dict)

# only print when run as a script, so importing the module stays silent
if __name__ == '__main__':
    print(para_dod)
89
+
90
+
91
+
92
+
@@ -0,0 +1,32 @@
1
+ import argparse
2
+ from ete3 import Tree
3
+
4
+
5
+ print_leaves_usage = '''
6
+ ======= print_leaves example commands =======
7
+
8
+ TreeSAK print_leaves -i in.tree
9
+
10
+ =============================================
11
+ '''
12
+
13
+
14
def print_leaves(args):
    """Print the leaf names of a tree, one per line, in sorted order.

    :param args: dict whose key 'i' holds the input tree file
    """
    input_tree = Tree(args['i'], format=1)
    sorted_names = sorted(input_tree.get_leaf_names())
    print('\n'.join(sorted_names))
24
+
25
+
26
if __name__ == '__main__':

    # Stand-alone entry point: read the input tree option and print its leaves.
    cli = argparse.ArgumentParser()
    cli.add_argument('-i', required=True, help='input tree file')
    parsed_options = cli.parse_args()
    print_leaves(vars(parsed_options))