treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/dating.py ADDED
@@ -0,0 +1,264 @@
1
+ import os
2
+ import argparse
3
+ import itertools
4
+ from distutils.spawn import find_executable
5
+
6
+
7
# Usage text for the dating module: lists the external requirement (PAML), an
# example command, the layout of the parameter file given with -s, and a note
# on judging the results by ESS.
dating_usage = '''
============================ dating example commands ============================

# Requirement: PAML

TreeSAK dating -i gnm.tree -m msa.phy -p topo1 -o dating_wd -f -s parameter.txt

# parameter.txt file format (tab separated)
clock 2,3
nsample 50000

# assess dating results
ESS of at least 200 is commonly recommended, although ESS higher than 100 is
also often seen in literature.

=================================================================================
'''
24
+
25
+
26
def check_dependencies(program_list):
    """Stop the program when a required executable cannot be found on PATH.

    program_list: iterable of executable names to look up.

    Returns None when every program is found. Otherwise prints the
    comma-separated names of the missing programs and raises SystemExit with
    a non-zero status so that calling shells can detect the failure.
    """
    # shutil.which is used instead of distutils.spawn.find_executable because
    # distutils was removed from the standard library in Python 3.12.
    import shutil

    not_detected_programs = [program for program in program_list
                             if shutil.which(program) is None]

    if not_detected_programs:
        print('%s not found, program exited!' % ','.join(not_detected_programs))
        raise SystemExit(1)
36
+
37
+
38
def sep_path_basename_ext(file_in):
    """Split a path into (file name, directory, base name, extension).

    The directory is reported as '.' when file_in has no directory part, and
    the extension is returned without its leading dot.
    """
    directory, file_name = os.path.split(file_in)
    stem, dotted_ext = os.path.splitext(file_name)
    return file_name, directory or '.', stem, dotted_ext[1:]
46
+
47
+
48
def prep_mcmctree_ctl(ctl_para_dict, mcmctree_ctl_file):
    """Write an mcmctree control file.

    ctl_para_dict: settings to write. The keys 'seqfile', 'treefile',
        'mcmcfile', 'outfile', 'seqtype' and 'usedata' are required (KeyError
        if absent); every other setting falls back to the default given below.
    mcmctree_ctl_file: path of the control file to create (overwritten).
    """
    # The with-block guarantees the handle is closed even when a required key
    # is missing and a KeyError interrupts the writes.
    with open(mcmctree_ctl_file, 'w') as ctl_file_handle:
        ctl_file_handle.write(' seed = %s\n' % ctl_para_dict.get('seed', '-1'))
        ctl_file_handle.write(' seqfile = %s\n' % ctl_para_dict['seqfile'])
        ctl_file_handle.write(' treefile = %s\n' % ctl_para_dict['treefile'])
        ctl_file_handle.write(' mcmcfile = %s\n' % ctl_para_dict['mcmcfile'])
        ctl_file_handle.write(' outfile = %s\n' % ctl_para_dict['outfile'])
        ctl_file_handle.write(' ndata = %s\n' % ctl_para_dict.get('ndata', 1))
        ctl_file_handle.write(' seqtype = %s * 0: nucleotides; 1:codons; 2:AAs\n' % ctl_para_dict['seqtype'])
        ctl_file_handle.write(' usedata = %s * 0: no data; 1:seq like; 2:normal approximation; 3:out.BV (in.BV)\n' % ctl_para_dict['usedata'])
        ctl_file_handle.write(' clock = %s * 1: global clock; 2: independent rates; 3: correlated rates\n' % ctl_para_dict.get('clock', 2))
        ctl_file_handle.write(' RootAge = %s * safe constraint on root age, used if no fossil for root.\n' % ctl_para_dict.get('RootAge', '<1.0'))
        ctl_file_handle.write(' model = %s * 0:JC69, 1:K80, 2:F81, 3:F84, 4:HKY85\n' % ctl_para_dict.get('model', 0))
        ctl_file_handle.write(' alpha = %s * alpha for gamma rates at sites\n' % ctl_para_dict.get('alpha', 0.5))
        ctl_file_handle.write(' ncatG = %s * No. categories in discrete gamma\n' % ctl_para_dict.get('ncatG', 4))
        ctl_file_handle.write(' cleandata = %s * remove sites with ambiguity data (1:yes, 0:no)?\n' % ctl_para_dict.get('cleandata', 0))
        ctl_file_handle.write(' BDparas = %s * birth, death, sampling\n' % ctl_para_dict.get('BDparas', '1 1 0.1'))
        ctl_file_handle.write(' kappa_gamma = %s * gamma prior for kappa\n' % ctl_para_dict.get('kappa_gamma', '6 2'))
        ctl_file_handle.write(' alpha_gamma = %s * gamma prior for alpha\n' % ctl_para_dict.get('alpha_gamma', '1 1'))
        ctl_file_handle.write(' rgene_gamma = %s * gammaDir prior for rate for genes\n' % ctl_para_dict.get('rgene_gamma', '1 50 1'))
        ctl_file_handle.write(' sigma2_gamma = %s * gammaDir prior for sigma^2 (for clock=2 or 3)\n' % ctl_para_dict.get('sigma2_gamma', '1 10 1'))
        ctl_file_handle.write(' finetune = %s * auto (0 or 1): times, musigma2, rates, mixing, paras, FossilErr\n' % ctl_para_dict.get('finetune', '1: .1 .1 .1 .1 .1 .1'))
        ctl_file_handle.write(' print = %s * 0: no mcmc sample; 1: everything except branch rates 2: everything\n' % ctl_para_dict.get('print', 1))
        ctl_file_handle.write(' burnin = %s\n' % ctl_para_dict.get('burnin', 50000))
        ctl_file_handle.write(' sampfreq = %s\n' % ctl_para_dict.get('sampfreq', 50))
        ctl_file_handle.write(' nsample = %s\n' % ctl_para_dict.get('nsample', 10000))
76
+
77
+
78
def get_parameter_combinations(para_to_test_dict):
    """Expand per-parameter setting lists into every combination.

    para_to_test_dict maps a parameter name to a list of settings. Parameters
    are taken in sorted name order and settings in sorted order. The result
    maps a label (each "<name><setting>" with spaces turned into '_', joined
    by '_') to a dict giving the chosen setting of every parameter.
    """
    para_names = sorted(para_to_test_dict)
    settings_per_para = [sorted(para_to_test_dict[name]) for name in para_names]

    para_dod = {}
    for chosen_settings in itertools.product(*settings_per_para):
        label_parts = [('%s%s' % (name, setting)).replace(' ', '_')
                       for name, setting in zip(para_names, chosen_settings)]
        para_dod['_'.join(label_parts)] = dict(zip(para_names, chosen_settings))

    return para_dod
113
+
114
+
115
def dating(args):
    """Run step one of an mcmctree dating analysis and prepare step two.

    Step one runs ``mcmctree`` with usedata=3 inside <o>/get_bv_wd to produce
    out.BV. Step two creates, for every parameter combination read from the
    settings file, two folders (<combination>_run1 and <combination>_run2)
    holding the tree, the alignment, a control file with usedata=2 and a copy
    of out.BV named in.BV. The commands that start those runs are written to
    <o>/dating_cmds.txt; they are not executed here.

    args: dict with the keys 'i' (tree file), 'm' (alignment file),
        'o' (output folder), 'p' (output prefix), 'st' (sequence type),
        's' (settings file: one parameter per line, its name followed by
        comma-separated values), 'srun' (wrap the commands with
        "BioSAK srun") and 'f' (overwrite an existing output folder).
    """
    # Local imports: these modules are not imported at the top of the file.
    import shutil
    import subprocess

    tree_file = args['i']
    msa_file = args['m']
    op_dir = args['o']
    op_prefix = args['p']
    seq_type = args['st']
    settings_to_compare = args['s']
    wrap_with_srun = args['srun']
    force_overwrite = args['f']

    check_dependencies(['mcmctree'])

    # read in the settings to compare; blank lines are ignored
    para_to_test_dict = dict()
    with open(settings_to_compare) as settings_handle:
        for each_para in settings_handle:
            each_para_split = each_para.strip().split()
            if not each_para_split:
                continue
            para_to_test_dict[each_para_split[0]] = each_para_split[1].split(',')

    ####################################################################################################################

    tree_f_name = sep_path_basename_ext(tree_file)[0]
    msa_f_name = sep_path_basename_ext(msa_file)[0]

    get_bv_wd = '%s/get_bv_wd' % op_dir
    mcmctree_ctl_bv = '%s/mcmctree.ctl' % get_bv_wd
    get_BV_cmd_txt = '%s/get_BV_cmd.txt' % get_bv_wd
    out_bv = '%s/out.BV' % get_bv_wd
    dating_cmds_txt = '%s/dating_cmds.txt' % op_dir

    # create output folder
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('Output folder exist, program exited!')
            exit()

    # creates op_dir as well; filesystem calls instead of shell strings keep
    # paths containing spaces or shell metacharacters working
    os.makedirs(get_bv_wd)

    ############################################# write out step 1 command #############################################

    # prepare files for getting bv file
    shutil.copy(tree_file, get_bv_wd)
    shutil.copy(msa_file, get_bv_wd)

    get_bv_para_dict = dict()
    get_bv_para_dict['seqfile'] = msa_f_name
    get_bv_para_dict['treefile'] = tree_f_name
    get_bv_para_dict['mcmcfile'] = 'mcmc.txt'
    get_bv_para_dict['outfile'] = 'out.txt'
    get_bv_para_dict['seqtype'] = seq_type
    get_bv_para_dict['usedata'] = '3'

    prep_mcmctree_ctl(get_bv_para_dict, mcmctree_ctl_bv)

    # write out get bv command
    with open(get_BV_cmd_txt, 'w') as get_BV_cmd_txt_handle:
        get_BV_cmd_txt_handle.write('mcmctree\n')

    # run command to get bv file; cwd= replaces the chdir/chdir-back pair so
    # the process working directory is never left changed
    print('Running step one command to get the BV file.')
    with open('%s/log.txt' % get_bv_wd, 'w') as log_handle:
        subprocess.run(['mcmctree'], cwd=get_bv_wd, stdout=log_handle)
    print('Step one finished.')

    # every step two run needs out.BV, so stop here if step one did not produce it
    if os.path.isfile(out_bv) is False:
        print('%s not found, program exited!' % out_bv)
        exit()

    ############################################# write out step 2 command #############################################

    print('Preparing files for dating estimation')

    para_comb_dict = get_parameter_combinations(para_to_test_dict)
    print('para_comb_dict')
    print(para_comb_dict)

    with open(dating_cmds_txt, 'w') as dating_cmds_txt_handle:
        for para_comb in sorted(para_comb_dict):
            # two independent runs per parameter combination
            for run_id in ('run1', 'run2'):

                # create dir, copy tree and msa file
                current_dating_wd = '%s/%s_%s' % (op_dir, para_comb, run_id)
                os.mkdir(current_dating_wd)
                shutil.copy(tree_file, current_dating_wd)
                shutil.copy(msa_file, current_dating_wd)

                # prepare mcmctree.ctl file
                current_para_dict = para_comb_dict[para_comb].copy()
                current_para_dict['seqfile'] = msa_f_name
                current_para_dict['treefile'] = tree_f_name
                current_para_dict['mcmcfile'] = '%s_%s_%s_mcmc.txt' % (op_prefix, para_comb, run_id)
                current_para_dict['outfile'] = '%s_%s_%s_out.txt' % (op_prefix, para_comb, run_id)
                current_para_dict['seqtype'] = seq_type
                current_para_dict['usedata'] = '2'
                prep_mcmctree_ctl(current_para_dict, '%s/mcmctree.ctl' % current_dating_wd)

                # copy BV file generated in step one
                shutil.copy(out_bv, '%s/in.BV' % current_dating_wd)

                # write out command; abspath keeps the cd target valid when
                # the output folder was given as an absolute path
                cmd_run = 'cd %s; mcmctree' % os.path.abspath(current_dating_wd)
                if wrap_with_srun is True:
                    cmd_run = 'BioSAK srun -c "%s"' % cmd_run
                dating_cmds_txt_handle.write(cmd_run + '\n')

    print('Job script for performing dating exported to: %s' % dating_cmds_txt)
249
+
250
+
251
if __name__ == '__main__':

    # Command-line entry point: declare the options, then pass them to
    # dating() as a plain dict keyed by option name.
    dating_parser = argparse.ArgumentParser()

    for flag, description in (('-i', 'input tree file'),
                              ('-m', 'sequence alignments'),
                              ('-o', 'output directory'),
                              ('-p', 'output prefix'),
                              ('-s', 'settings to compare')):
        dating_parser.add_argument(flag, required=True, help=description)

    dating_parser.add_argument('-st', required=False, default='2', help='sequence type, 0 for nucleotides, 1 for codons, 2 for AAs, default: 2')

    for flag, description in (('-srun', 'wrap commands with BioSAK srun'),
                              ('-f', 'force overwrite')):
        dating_parser.add_argument(flag, required=False, action="store_true", help=description)

    args = vars(dating_parser.parse_args())
    dating(args)
264
+
TreeSAK/dating_ss.py ADDED
@@ -0,0 +1,361 @@
1
+ import os
2
+ import argparse
3
+ import itertools
4
+ from ete3 import Tree
5
+ from Bio import AlignIO
6
+
7
+
8
# Usage text for the Dating_ss module: a single example command.
Dating_usage = '''
============================= Dating example commands =============================

# example commands
TreeSAK Dating_ss -deltall DeltaLL_stdout.txt -aod s11_marker_sets_by_DeltaLL -o s12_dating_wd -c 25-50-75-100 -mmn 20 -f

===================================================================================
'''
16
+
17
+
18
def sep_path_basename_ext(file_in):
    """Split a path into (directory, base name, extension).

    The directory is reported as '.' when file_in has no directory part; the
    extension keeps its leading dot.
    """
    folder, name = os.path.split(file_in)
    base, ext = os.path.splitext(name)
    return (folder if folder != '' else '.'), base, ext
24
+
25
+
26
def submit_js(js):
    """Submit a job script with qsub from inside the script's own folder.

    The working directory is switched to the folder holding js (or '.' when
    js has no directory part) for the qsub call and switched back afterwards.
    """
    launch_dir = os.getcwd()
    script_dir, script_name = os.path.split(js)
    os.chdir(script_dir if script_dir != '' else '.')
    os.system('qsub %s' % script_name)
    os.chdir(launch_dir)
32
+
33
+
34
def root_with_out_group(tree_file, out_group_txt, tree_file_rooted):
    """Root a tree on the common ancestor of the listed outgroup leaves.

    tree_file: tree file, read with ete3 ``format=1``.
    out_group_txt: text file with one outgroup leaf name per line.
    tree_file_rooted: path the re-rooted tree is written to.
    """
    # Close the file promptly and skip blank lines, which would otherwise be
    # passed on as a leaf named ''.
    with open(out_group_txt) as out_group_handle:
        out_group_set = {each_og.strip() for each_og in out_group_handle if each_og.strip()}

    tre = Tree(tree_file, format=1)
    out_group_lca = tre.get_common_ancestor(out_group_set)
    tre.set_outgroup(out_group_lca)
    tre.write(outfile=tree_file_rooted)
44
+
45
+
46
def replace_clades(main_tree, sub_tree, tree_out, quote_node_name):
    """Swap a clade of the main tree for the given subtree and write the result.

    The clade replaced is the common ancestor, in the main tree, of all leaves
    of the subtree. The program exits when that clade does not have the same
    number of leaves as the subtree. The edited main tree is written to
    tree_out with ete3 ``format=8``.
    """
    replacement = Tree(sub_tree, format=1, quoted_node_names=quote_node_name)
    replacement_leaves = replacement.get_leaf_names()

    host_tree = Tree(main_tree)
    target_clade = host_tree.get_common_ancestor(replacement_leaves)

    # the clade being replaced must hold exactly as many leaves as the subtree
    if len(target_clade.get_leaf_names()) != len(replacement_leaves):
        print('LCA of subtree leaves in main tree contain extra leaves, program exited!')
        exit()

    # detach the old clade from its parent and attach the subtree in its place
    parent_node = target_clade.up
    parent_node.remove_child(target_clade)
    parent_node.add_child(replacement)

    host_tree.write(outfile=tree_out, format=8, quoted_node_names=quote_node_name)
61
+
62
+
63
def prep_mcmctree_ctl(ctl_para_dict, mcmctree_ctl_file):
    """Write an mcmctree control file.

    ctl_para_dict: settings to write. The keys 'seqfile', 'treefile',
        'mcmcfile', 'outfile', 'seqtype', 'usedata' and 'clock' are required
        (KeyError if absent); every other setting falls back to the default
        given below.
    mcmctree_ctl_file: path of the control file to create (overwritten).
    """
    with open(mcmctree_ctl_file, 'w') as ctl_file_handle:
        # The seed value belongs under the "seed" key. It used to be written as
        # "finetune", which left the file with no seed entry and two finetune entries.
        ctl_file_handle.write(' seed = %s\n' % ctl_para_dict.get('seed', '-1'))
        ctl_file_handle.write(' seqfile = %s\n' % ctl_para_dict['seqfile'])
        ctl_file_handle.write(' treefile = %s\n' % ctl_para_dict['treefile'])
        ctl_file_handle.write(' mcmcfile = %s\n' % ctl_para_dict['mcmcfile'])
        ctl_file_handle.write(' outfile = %s\n' % ctl_para_dict['outfile'])
        ctl_file_handle.write(' ndata = %s\n' % ctl_para_dict.get('ndata', 1))
        ctl_file_handle.write(' seqtype = %s * 0: nucleotides; 1:codons; 2:AAs\n' % ctl_para_dict['seqtype'])
        ctl_file_handle.write(' usedata = %s * 0: no data; 1:seq like; 2:normal approximation; 3:out.BV (in.BV)\n' % ctl_para_dict['usedata'])
        ctl_file_handle.write(' clock = %s * 1: global clock; 2: independent rates; 3: correlated rates\n' % ctl_para_dict['clock'])
        ctl_file_handle.write(' RootAge = %s * safe constraint on root age, used if no fossil for root.\n' % ctl_para_dict.get('RootAge', '<1.0'))
        ctl_file_handle.write(' model = %s * 0:JC69, 1:K80, 2:F81, 3:F84, 4:HKY85\n' % ctl_para_dict.get('model', 0))
        ctl_file_handle.write(' alpha = %s * alpha for gamma rates at sites\n' % ctl_para_dict.get('alpha', 0.5))
        ctl_file_handle.write(' ncatG = %s * No. categories in discrete gamma\n' % ctl_para_dict.get('ncatG', 4))
        ctl_file_handle.write(' cleandata = %s * remove sites with ambiguity data (1:yes, 0:no)?\n' % ctl_para_dict.get('cleandata', 0))
        ctl_file_handle.write(' BDparas = %s * birth, death, sampling\n' % ctl_para_dict.get('BDparas', '1 1 0.1'))
        ctl_file_handle.write(' kappa_gamma = %s * gamma prior for kappa\n' % ctl_para_dict.get('kappa_gamma', '6 2'))
        ctl_file_handle.write(' alpha_gamma = %s * gamma prior for alpha\n' % ctl_para_dict.get('alpha_gamma', '1 1'))
        ctl_file_handle.write(' rgene_gamma = %s * gammaDir prior for rate for genes\n' % ctl_para_dict.get('rgene_gamma', '1 50 1'))
        ctl_file_handle.write(' sigma2_gamma = %s * gammaDir prior for sigma^2 (for clock=2 or 3)\n' % ctl_para_dict.get('sigma2_gamma', '1 10 1'))
        ctl_file_handle.write(' finetune = %s * auto (0 or 1): times, musigma2, rates, mixing, paras, FossilErr\n' % ctl_para_dict.get('finetune', '1: .1 .1 .1 .1 .1 .1'))
        ctl_file_handle.write(' print = %s * 0: no mcmc sample; 1: everything except branch rates 2: everything\n' % ctl_para_dict.get('print', 1))
        ctl_file_handle.write(' burnin = %s\n' % ctl_para_dict.get('burnin', 50000))
        ctl_file_handle.write(' sampfreq = %s\n' % ctl_para_dict.get('sampfreq', 5))
        ctl_file_handle.write(' nsample = %s\n' % ctl_para_dict.get('nsample', 150000))
90
+
91
+
92
def get_parameter_combinations(para_to_test_dict):
    """Build every combination of the settings listed for each parameter.

    para_to_test_dict maps a parameter name to a list of settings. Parameters
    are visited in sorted name order and their settings in sorted order.
    Returns a dict keyed by a combination label (each "<name><setting>" with
    spaces replaced by '_', joined with '_') whose value maps every parameter
    name to the setting chosen in that combination.
    """
    ordered_names = sorted(para_to_test_dict.keys())

    # one list of (name, setting) pairs per parameter
    paired_settings = [[(name, setting) for setting in sorted(para_to_test_dict[name])]
                       for name in ordered_names]

    para_dod = dict()
    for combo in itertools.product(*paired_settings):
        combo_label = '_'.join(('%s%s' % pair).replace(' ', '_') for pair in combo)
        para_dod[combo_label] = dict(combo)

    return para_dod
127
+
128
+
129
def fa2phy(fasta_in, phy_out):
    """Convert a FASTA alignment into a one-line-per-sequence phylip file.

    The first line holds the number of sequences and the alignment length.
    Each following line holds a sequence id padded with spaces to the length
    of the longest id plus two, immediately followed by the whole sequence.
    """
    alignment = AlignIO.read(fasta_in, 'fasta')

    # width of the id column: longest id plus two separating spaces
    id_column_width = max([len(record.id) for record in alignment] + [0]) + 2

    with open(phy_out, 'w') as phy_handle:
        phy_handle.write('%s %s\n' % (len(alignment), alignment.get_alignment_length()))
        for record in alignment:
            phy_handle.write('%s%s\n' % (record.id.ljust(id_column_width), str(record.seq)))
145
+
146
+
147
+ def dating_ss(args):
148
+
149
+ deltall_stdout_txt = args['deltall']
150
+ aod = args['aod']
151
+ out_group_txt = args['og']
152
+ eu_tree = args['eu']
153
+ op_dir = args['o']
154
+ deltall_keep_pct_str = args['c']
155
+ min_marker_num = args['mmn']
156
+ force_overwrite = args['f']
157
+ root_age = args['ra']
158
+ submit_job = args['qsub']
159
+ para_to_test = args['to_test']
160
+ js_cpu_num = 1
161
+ quote_node_name = False
162
+
163
+ para_to_test_dict = dict()
164
+ for each_para in open(para_to_test):
165
+ each_para_split = each_para.strip().split()
166
+ para_list = each_para_split[1].split(',')
167
+ para_to_test_dict[each_para_split[0]] = para_list
168
+ print('Parameters to test: %s' % para_to_test_dict)
169
+
170
+ if os.path.isfile(eu_tree) is False:
171
+ print('%s not found, program exited!' % eu_tree)
172
+ exit()
173
+
174
+ deltall_keep_pct_list = [int(i) for i in deltall_keep_pct_str.split('-')]
175
+ deltall_stdout_path, deltall_stdout_basename, deltall_stdout_ext = sep_path_basename_ext(deltall_stdout_txt)
176
+
177
+ # create dir
178
+ if os.path.isdir(op_dir) is True:
179
+ if force_overwrite is True:
180
+ os.system('rm -r %s' % op_dir)
181
+ else:
182
+ print('output folder detected, program exited!')
183
+ exit()
184
+ os.system('mkdir %s' % op_dir)
185
+
186
+ # read in deltall_stdout_txt
187
+ deltall_op_dict = dict()
188
+ for each_line in open(deltall_stdout_txt):
189
+ if not ((each_line.startswith('WARNING:')) or (each_line.startswith('awk:'))):
190
+ each_line_split = each_line.strip().split('\t')
191
+ marker_id = each_line_split[0]
192
+ value = float(each_line_split[1])
193
+ if marker_id not in deltall_op_dict:
194
+ deltall_op_dict[marker_id] = [value]
195
+ else:
196
+ deltall_op_dict[marker_id].append(value)
197
+
198
+ # assigned score to marker
199
+ metric_1_dict = dict()
200
+ metric_2_dict = dict()
201
+ for each_marker in deltall_op_dict:
202
+ metric_1_value = float("{0:.2f}".format(deltall_op_dict[each_marker][0]))
203
+ metric_2_value = float("{0:.2f}".format(deltall_op_dict[each_marker][1]))
204
+ metric_1_dict[each_marker] = metric_1_value
205
+ metric_2_dict[each_marker] = metric_2_value
206
+
207
+ metric_1_dict_sorted = {k: v for k, v in sorted(metric_1_dict.items(), key=lambda item: item[1])[::-1]}
208
+ metric_2_dict_sorted = {k: v for k, v in sorted(metric_2_dict.items(), key=lambda item: item[1])}
209
+
210
+ metric_1_score_dict = dict()
211
+ metric_1_score = 1
212
+ for each_marker_1 in metric_1_dict_sorted:
213
+ metric_1_score_dict[each_marker_1] = metric_1_score
214
+ metric_1_score += 1
215
+
216
+ metric_2_score_dict = dict()
217
+ metric_2_score = 1
218
+ for each_marker_2 in metric_2_dict_sorted:
219
+ metric_2_score_dict[each_marker_2] = metric_2_score
220
+ metric_2_score += 1
221
+
222
+ overall_score_dict = dict()
223
+ for each_marker in deltall_op_dict:
224
+ metric_score_1 = metric_1_score_dict[each_marker]
225
+ metric_score_2 = metric_2_score_dict[each_marker]
226
+ metric_score_overall = metric_score_1 + metric_score_2
227
+ overall_score_dict[each_marker] = metric_score_overall
228
+ marker_list_sorted_by_deltall = [k for k, v in sorted(overall_score_dict.items(), key=lambda item: item[1])]
229
+
230
+ # get qualified marker list
231
+ for each_keep_pct in deltall_keep_pct_list:
232
+ marker_num_to_keep = round(len(marker_list_sorted_by_deltall)*each_keep_pct/100)
233
+
234
+ if marker_num_to_keep < min_marker_num:
235
+ print('Ignored DeltaLL cutoff at %s , the number of qualified markers (%s) less than %s' % (each_keep_pct, marker_num_to_keep, min_marker_num))
236
+ else:
237
+ prefix_base = '%s_DeltaLL_%s' % (deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)
238
+ aln_concatenated = '%s_DeltaLL_%s_concatenated.phy' % (deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)
239
+ aln_concatenated_in_aod_wd_fasta = '%s_DeltaLL_%s_concatenated.phy.fasta' % (deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)
240
+ c60_tree_file_rooted_with_time_final = '%s_DeltaLL_%s_rooted_with_time_final.treefile' % (deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)
241
+ pwd_c60_tree_file = '%s/%s_DeltaLL_%s_iqtree_C60_PMSF/concatenated.treefile' % (aod, deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)
242
+ pwd_c60_tree_file_renamed = '%s/%s_DeltaLL_%s_raw.treefile' % (op_dir, deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)
243
+ pwd_c60_tree_file_rooted = '%s/%s_DeltaLL_%s_rooted.treefile' % (op_dir, deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)
244
+ pwd_c60_tree_file_rooted_with_time = '%s/%s_DeltaLL_%s_rooted_with_time.treefile' % (op_dir, deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)
245
+ pwd_aln_concatenated_in_aod_wd_fasta = '%s/%s' % (aod, aln_concatenated_in_aod_wd_fasta)
246
+ pwd_aln_concatenated_in_op_wd_phylip = '%s/%s' % (op_dir, aln_concatenated)
247
+ pwd_c60_tree_file_rooted_with_time_final = '%s/%s' % (op_dir, c60_tree_file_rooted_with_time_final)
248
+ get_BV_wd = '%s/%s_get_BV_wd' % (op_dir, prefix_base)
249
+ pwd_aln_concatenated_in_bv_wd_phylip = '%s/%s' % (get_BV_wd, aln_concatenated)
250
+
251
+ fa2phy(pwd_aln_concatenated_in_aod_wd_fasta, pwd_aln_concatenated_in_op_wd_phylip)
252
+ os.system('cp %s %s' % (pwd_c60_tree_file, pwd_c60_tree_file_renamed))
253
+
254
+ # root genome tree with outgroup
255
+ root_with_out_group(pwd_c60_tree_file_renamed, out_group_txt, pwd_c60_tree_file_rooted)
256
+
257
+ # add time constraints
258
+ replace_clades(pwd_c60_tree_file_rooted, eu_tree, pwd_c60_tree_file_rooted_with_time, quote_node_name)
259
+
260
+ # remove "NoName" from the rooted tree with time constraints
261
+ tree_str = open(pwd_c60_tree_file_rooted_with_time).readline().strip().replace('NoName', '')
262
+
263
+ # add root age
264
+ tree_str = tree_str.replace(';', '<%s;' % root_age)
265
+ tre_object = Tree(tree_str, format=8, quoted_node_names=quote_node_name)
266
+ with open(pwd_c60_tree_file_rooted_with_time_final, 'w') as pwd_c60_tree_file_rooted_with_time_final_hanlde:
267
+ pwd_c60_tree_file_rooted_with_time_final_hanlde.write('%s\t1\n' % len(tre_object.get_leaf_names()))
268
+ pwd_c60_tree_file_rooted_with_time_final_hanlde.write(tree_str.replace('""', '') + '\n')
269
+ #pwd_c60_tree_file_rooted_with_time_final_hanlde.write(tree_str + '\n')
270
+
271
+ # rm tmp tree files
272
+ os.system('rm %s' % pwd_c60_tree_file_renamed)
273
+ os.system('rm %s' % pwd_c60_tree_file_rooted)
274
+ os.system('rm %s' % pwd_c60_tree_file_rooted_with_time)
275
+
276
+ # get BV file
277
+ os.mkdir(get_BV_wd)
278
+ fa2phy(pwd_aln_concatenated_in_aod_wd_fasta, pwd_aln_concatenated_in_bv_wd_phylip) # sequence in phylip format need to be in one line
279
+ os.system('cp %s %s/' % (pwd_c60_tree_file_rooted_with_time_final, get_BV_wd))
280
+
281
+ get_BV_js = '%s/%s_get_BV.sh' % (op_dir, prefix_base)
282
+ get_BV_mcmctree_ctl = '%s_get_BV_mcmctree.ctl' % (prefix_base)
283
+ pwd_get_BV_mcmctree_ctl = '%s/%s' % (get_BV_wd, get_BV_mcmctree_ctl)
284
+
285
+ get_BV_para_dict = dict()
286
+ get_BV_para_dict['seqfile'] = aln_concatenated
287
+ get_BV_para_dict['treefile'] = c60_tree_file_rooted_with_time_final
288
+ get_BV_para_dict['mcmcfile'] = '%s_mcmc.txt' % prefix_base
289
+ get_BV_para_dict['outfile'] = '%s_out.txt' % prefix_base
290
+ get_BV_para_dict['seqtype'] = '2'
291
+ get_BV_para_dict['usedata'] = '3'
292
+ get_BV_para_dict['clock'] = '3'
293
+ prep_mcmctree_ctl(get_BV_para_dict, pwd_get_BV_mcmctree_ctl)
294
+
295
+ with open(get_BV_js, 'w') as get_BV_js_handle:
296
+ get_BV_js_handle.write('#!/bin/bash\n#SBATCH --ntasks 1\n#SBATCH --cpus-per-task %s\n\n' % js_cpu_num)
297
+ get_BV_js_handle.write('cd %s/%s\n' % (os.getcwd(), get_BV_wd))
298
+ get_BV_js_handle.write('mcmctree %s\n' % get_BV_mcmctree_ctl)
299
+
300
+ # prepare files for dating
301
+ para_dod = get_parameter_combinations(para_to_test_dict)
302
+ for para_combination in para_dod:
303
+ mcmctree_ctl = '%s_%s_mcmctree.ctl' % (prefix_base, para_combination)
304
+ current_dating_wd = '%s/%s_DeltaLL_%s_%s_dating_wd' % (op_dir, deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct, para_combination)
305
+ pwd_mcmctree_ctl = '%s/%s_%s_mcmctree.ctl' % (current_dating_wd, prefix_base, para_combination)
306
+ js_mcmctree = '%s/js_%s_DeltaLL_%s_%s.sh' % (op_dir, deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct, para_combination)
307
+ pwd_aln_in_dating_wd = '%s/%s' % (current_dating_wd, aln_concatenated)
308
+
309
+ # create dating wd and copy tree and alignment files into it
310
+ os.mkdir(current_dating_wd)
311
+ fa2phy(pwd_aln_concatenated_in_aod_wd_fasta, pwd_aln_in_dating_wd) # sequence in phylip format need to be in one line
312
+ os.system('cp %s %s/' % (pwd_c60_tree_file_rooted_with_time_final, current_dating_wd))
313
+
314
+ current_para_dict = para_dod[para_combination]
315
+ current_para_dict['seqfile'] = aln_concatenated
316
+ current_para_dict['treefile'] = c60_tree_file_rooted_with_time_final
317
+ current_para_dict['mcmcfile'] = '%s_%s_mcmc.txt' % (prefix_base, para_combination)
318
+ current_para_dict['outfile'] = '%s_%s_out.txt' % (prefix_base, para_combination)
319
+ current_para_dict['seqtype'] = '2'
320
+ current_para_dict['usedata'] = '2'
321
+
322
+ prep_mcmctree_ctl(current_para_dict, pwd_mcmctree_ctl)
323
+
324
+ with open(js_mcmctree, 'w') as js_mcmctree_handle:
325
+ js_mcmctree_handle.write('#!/bin/bash\n\n')
326
+ js_mcmctree_handle.write('cd %s/%s\n' % (os.getcwd(), current_dating_wd))
327
+ js_mcmctree_handle.write('cp ../%s_get_BV_wd/out.BV in.BV\n' % prefix_base)
328
+ js_mcmctree_handle.write('mcmctree %s\n' % mcmctree_ctl)
329
+ print('Job script for performing dating exported to %s' % js_mcmctree)
330
+
331
+ if submit_job is True:
332
+ submit_js(get_BV_js)
333
+
334
+
335
if __name__ == '__main__':

    # Command-line options, declared as (flag, add_argument settings) pairs.
    # They are registered in exactly this order so the generated help text
    # lists them the same way as before.
    option_specs = (
        ('-deltall', dict(required=True, help='DeltaLL stdout')),
        ('-aod', dict(required=True, help='AssessMarkerDeltaLL output dir')),
        ('-og', dict(required=True, help='outgroup leaves, one id per line')),
        ('-eu', dict(required=True, help='EU tree with time constraints')),
        ('-o', dict(required=True, help='dating wd')),
        ('-c', dict(required=False, default='25-50-75-100', help='cutoffs, default: 25-50-75-100')),
        ('-mmn', dict(required=False, default=20, type=int, help='minimal marker number, default: 20')),
        ('-ra', dict(required=False, default=45, type=int, help='root age, default: 45')),
        ('-qsub', dict(required=False, action="store_true", help='submit job scripts for getting in.BV')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
        ('-to_test', dict(required=True, help='Settings to test')),
    )

    parser = argparse.ArgumentParser()
    for option_flag, option_settings in option_specs:
        parser.add_argument(option_flag, **option_settings)

    # dating_ss() receives the parsed options as a plain dict keyed by option name.
    args = vars(parser.parse_args())
    dating_ss(args)
351
+
352
+
353
+ '''
354
+
355
+ cd /Users/songweizhi/Desktop/dating_test
356
+ python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/dating_ss.py -deltall Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30/s10_assess_marker_deltaLL/PA_75_DeltaLL_stdout.txt -aod Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30/s11_marker_sets_by_DeltaLL -og out_group.txt -eu 27.nwk -o Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30/s12_dating_wd -c 25-50-75-100 -mmn 20 -f -to_test <settings_to_test>
357
+
358
+ cd /home-user/wzsong/DateArTree
359
+ python3 dating_ss.py -deltall Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30/s10_assess_marker_deltaLL/PA_75_DeltaLL_stdout.txt -aod Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30/s11_marker_sets_by_DeltaLL -og out_group.txt -eu 27.nwk -o Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30/s12_dating_wd -c 25-50-75-100 -mmn 20 -f -to_test <settings_to_test>
360
+
361
+ '''