treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,263 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ import multiprocessing as mp
6
+ from distutils.spawn import find_executable
7
+
8
+
9
# Usage banner with an example command line for this module.
ExtractMarkerSeq_usage = '''
============================ ExtractMarkerSeq example commands ============================

Dependencies: blastp

TreeSAK ExtractMarkerSeq -m marker_ref_seq -mx fa -aa faa_files -aax faa -o op_dir -e "1e-30" -t 6

===========================================================================================
'''
18
+
19
+
20
def check_dependencies(program_list):
    """Abort the program unless every required executable can be found.

    Args:
        program_list: iterable of executable names (e.g. ``['blastp']``).

    Raises:
        SystemExit: with exit status 1, after printing the names of the
            missing programs, when at least one executable is not found.
    """
    # shutil.which is used instead of distutils.spawn.find_executable because
    # distutils was removed from the standard library in Python 3.12
    import shutil

    not_detected_programs = [needed_program for needed_program in program_list
                             if shutil.which(needed_program) is None]

    if not_detected_programs:
        print('%s not found, program exited!' % ','.join(not_detected_programs))
        # a missing dependency is a failure, so report a non-zero exit status
        raise SystemExit(1)
29
+
30
+
31
def exe_cmds(cmd_list, num_threads):
    """Run shell commands in parallel, one ``os.system`` call per command.

    Args:
        cmd_list: list of shell command strings.
        num_threads: number of worker processes in the pool.

    The exit statuses returned by ``os.system`` are not inspected.
    """
    print('Running %s commands with %s cores' % (len(cmd_list), num_threads))

    # nothing to run: avoid spawning worker processes for no work
    if not cmd_list:
        return

    # the context manager guarantees the workers are released even if map() raises
    with mp.Pool(processes=num_threads) as pool:
        pool.map(os.system, cmd_list)
        pool.close()
        pool.join()
37
+
38
+
39
def sep_path_basename_ext(file_in):
    """Split a file path into its directory, base name and extension.

    Args:
        file_in: path to a file.

    Returns:
        Tuple ``(file_path, file_basename, file_extension)``. ``file_path`` is
        ``'.'`` when ``file_in`` has no directory component, and
        ``file_extension`` keeps its leading dot (``''`` if there is none).
    """
    file_path, file_name = os.path.split(file_in)
    file_basename, file_extension = os.path.splitext(file_name)
    return file_path or '.', file_basename, file_extension
45
+
46
+
47
def select_seq(seq_file, id_file, select_option, output_file, one_line, in_fastq):
    """Copy the records of a sequence file that are (or are not) in an id list.

    Args:
        seq_file: input sequence file, parsed with ``SeqIO.parse``.
        id_file: text file with one sequence id per line (lines are stripped).
        select_option: ``1`` keeps records whose id is listed in ``id_file``,
            ``0`` keeps records whose id is not listed. With any other value
            no record is written (the output file is still created).
        output_file: path of the file to write.
        one_line: for FASTA input, ``False`` writes with the ``'fasta'``
            format, anything else with ``'fasta-2line'``.
        in_fastq: ``True`` parses the input as FASTQ; ``False`` writes FASTA,
            anything else writes FASTQ.
    """
    # get provided id list
    with open(id_file) as id_file_handle:
        seq_id_list = {each_id.strip() for each_id in id_file_handle}

    seq_in_format = 'fastq' if in_fastq is True else 'fasta'

    # the output format does not depend on the record, so pick it once
    if in_fastq is False:
        seq_out_format = 'fasta' if one_line is False else 'fasta-2line'
    else:
        seq_out_format = 'fastq'

    # extract sequences
    with open(output_file, 'w') as output_file_handle:
        for seq_record in SeqIO.parse(seq_file, seq_in_format):
            is_listed = seq_record.id in seq_id_list
            if (select_option == 1 and is_listed) or (select_option == 0 and not is_listed):
                SeqIO.write(seq_record, output_file_handle, seq_out_format)
84
+
85
+
86
def _best_blast_hit(blast_op):
    """Return the query id with the highest bit score in a BLAST outfmt 6 file ('' if no hit)."""
    best_hit_gene = ''
    best_hit_score = 0
    with open(blast_op) as blast_op_handle:
        for each_line in blast_op_handle:
            each_line_split = each_line.strip().split('\t')
            # skip blank or truncated lines instead of failing with IndexError
            if len(each_line_split) < 12:
                continue
            bit_score = float(each_line_split[11])
            if bit_score > best_hit_score:
                best_hit_score = bit_score
                best_hit_gene = each_line_split[0]
    return best_hit_gene


def ExtractMarkerSeq(args):
    """Find the best blastp hit of every marker in every genome and export the sequences.

    Each genome protein file is used as blastp query against each marker
    sequence file; for every genome/marker pair the query with the highest
    bit score is kept. For each marker, the hit ids, the hit sequences and the
    hit sequences renamed after their genome are written under the output dir.

    Args:
        args: dict with the keys
            'm'   marker sequence dir,
            'mx'  marker sequence file extension (without leading dot),
            'aa'  genome protein (faa) file dir,
            'aax' protein file extension (without leading dot),
            'o'   output dir,
            'e'   e-value cutoff passed to blastp,
            't'   number of parallel blastp jobs,
            'f'   True to delete an existing output dir and rerun everything.
    """
    # local imports: these modules are only needed by this function
    import shlex
    import shutil

    marker_seq_dir  = args['m']
    marker_seq_ext  = args['mx']
    faa_file_dir    = args['aa']
    faa_file_ext    = args['aax']
    op_dir          = args['o']
    e_value         = args['e']
    num_of_threads  = args['t']
    force_overwrite = args['f']

    overwrite = force_overwrite is True

    # check dependencies
    check_dependencies(['blastp'])

    # get marker ids, remembering the real path of each file so that the
    # user-provided extension is honoured when building the blastp commands
    marker_seq_re = '%s/*.%s' % (marker_seq_dir, marker_seq_ext)
    marker_path_dict = {os.path.splitext(os.path.basename(each_file))[0]: each_file
                        for each_file in glob.glob(marker_seq_re)}
    # sorted so that the command file and the processing order are reproducible
    marker_id_list_sorted = sorted(marker_path_dict)

    # get gnm id list
    faa_file_re = '%s/*.%s' % (faa_file_dir, faa_file_ext)
    faa_path_dict = {os.path.splitext(os.path.basename(each_file))[0]: each_file
                     for each_file in glob.glob(faa_file_re)}
    gnm_id_list_sorted = sorted(faa_path_dict)

    # define output dir
    blastp_cmd_txt                      = '%s/blastp_cmds_%s.txt'                   % (op_dir, (len(gnm_id_list_sorted)*len(marker_id_list_sorted)))
    pwd_combined_protein                = '%s/combined.faa'                         % op_dir
    blast_op_dir                        = '%s/s01_blast_op'                         % op_dir
    best_hit_id_by_marker_dir           = '%s/s02_identified_marker_id'             % op_dir
    best_hit_seq_by_marker_dir          = '%s/s03_identified_marker_seq'            % op_dir
    best_hit_seq_by_marker_dir_renamed  = '%s/s04_identified_marker_seq_renamed'    % op_dir

    # create folder (blast_op_dir lives inside op_dir, so this creates both)
    if overwrite and os.path.isdir(op_dir):
        shutil.rmtree(op_dir)
    os.makedirs(blast_op_dir, exist_ok=True)

    # combine all protein files into one file
    combined_protein_abs = os.path.abspath(pwd_combined_protein)
    with open(pwd_combined_protein, 'wb') as combined_protein_handle:
        for gnm_id in gnm_id_list_sorted:
            pwd_faa = faa_path_dict[gnm_id]
            # never read the file that is being written
            if os.path.abspath(pwd_faa) == combined_protein_abs:
                continue
            with open(pwd_faa, 'rb') as faa_handle:
                shutil.copyfileobj(faa_handle, combined_protein_handle)

    # get blastp command (paths are shell-quoted because the commands are run with os.system)
    blast_op_to_cmd_dict = dict()
    for gnm_id in gnm_id_list_sorted:
        for each_cog in marker_id_list_sorted:
            pwd_blast_op = '%s/%s_vs_%s_blastp.txt' % (blast_op_dir, gnm_id, each_cog)
            blastp_cmd = 'blastp -subject %s -evalue %s -outfmt 6 -query %s -out %s' % (
                shlex.quote(marker_path_dict[each_cog]),
                shlex.quote(str(e_value)),
                shlex.quote(faa_path_dict[gnm_id]),
                shlex.quote(pwd_blast_op))
            blast_op_to_cmd_dict[pwd_blast_op] = blastp_cmd
    blast_cmd_list = list(blast_op_to_cmd_dict.values())

    with open(blastp_cmd_txt, 'w') as blastp_cmd_txt_handle:
        for blastp_cmd in blast_cmd_list:
            blastp_cmd_txt_handle.write(blastp_cmd + '\n')

    # run blastp
    if overwrite:
        exe_cmds(blast_cmd_list, num_of_threads)
    else:
        # only rerun the comparisons whose output file does not exist yet
        cmds_to_rerun = [blastp_cmd for (each_blast_op, blastp_cmd) in blast_op_to_cmd_dict.items()
                         if not os.path.isfile(each_blast_op)]
        num_of_good_ones = len(blast_op_to_cmd_dict) - len(cmds_to_rerun)
        print('Detected blastp outputs: %s' % num_of_good_ones)
        exe_cmds(cmds_to_rerun, num_of_threads)

    # get best_hit_dict_by_marker
    best_hit_to_gnm_dict = dict()
    best_hit_dict_by_marker = dict()
    for gnm_id in gnm_id_list_sorted:
        for each_cog in marker_id_list_sorted:
            current_blastp_op = '%s/%s_vs_%s_blastp.txt' % (blast_op_dir, gnm_id, each_cog)
            if not os.path.isfile(current_blastp_op):
                continue

            # get best hit
            best_hit_gene = _best_blast_hit(current_blastp_op)

            # NOTE(review): the original indentation was not recoverable here;
            # the marker list is assumed to be updated only when a hit exists
            if best_hit_gene != '':
                best_hit_to_gnm_dict[best_hit_gene] = gnm_id
                best_hit_dict_by_marker.setdefault(each_cog, []).append(best_hit_gene)

    # create output dir
    for each_dir in (best_hit_id_by_marker_dir, best_hit_seq_by_marker_dir, best_hit_seq_by_marker_dir_renamed):
        os.makedirs(each_dir, exist_ok=True)

    # write out best hits and extract sequences
    for processing_index, each_marker in enumerate(best_hit_dict_by_marker, start=1):
        print('Extracting marker sequence %s/%s: %s' % (processing_index, len(best_hit_dict_by_marker), each_marker))

        current_m_hit_list      = best_hit_dict_by_marker[each_marker]
        marker_hits_txt         = ('%s/%s.txt' % (best_hit_id_by_marker_dir, each_marker)).replace(':', '')
        marker_hits_seq         = ('%s/%s.fa'  % (best_hit_seq_by_marker_dir, each_marker)).replace(':', '')
        marker_hits_seq_renamed = ('%s/%s.fa'  % (best_hit_seq_by_marker_dir_renamed, each_marker)).replace(':', '')

        with open(marker_hits_txt, 'w') as marker_hits_txt_handle:
            marker_hits_txt_handle.write('\n'.join(current_m_hit_list))

        # extract sequences
        select_seq(pwd_combined_protein, marker_hits_txt, 1, marker_hits_seq, True, False)

        # rename sequences: each hit is named after the genome it came from
        with open(marker_hits_seq_renamed, 'w') as marker_hits_seq_renamed_handle:
            for each_seq in SeqIO.parse(marker_hits_seq, 'fasta'):
                seq_gnm = best_hit_to_gnm_dict[each_seq.id]
                marker_hits_seq_renamed_handle.write('>%s\n' % seq_gnm)
                marker_hits_seq_renamed_handle.write('%s\n' % str(each_seq.seq))

    print('Done!')
232
+
233
+
234
if __name__ == '__main__':

    # initialize the options parser; each option name is the key that
    # ExtractMarkerSeq() reads from the args dict
    parser = argparse.ArgumentParser()
    parser.add_argument('-m',   required=True,                          help='marker seq dir')
    parser.add_argument('-mx',  required=True,                          help='marker seq ext')
    parser.add_argument('-aa',  required=True,                          help='faa file dir')
    parser.add_argument('-aax', required=True,                          help='faa file ext')
    parser.add_argument('-o',   required=True,                          help='output dir')
    # not required: a required option can never fall back to its default
    parser.add_argument('-e',   required=False, default=1e-30,          help='e-value cutoff, default: 1e-30')
    parser.add_argument('-t',   required=True,  type=int,               help='num of threads')
    parser.add_argument('-f',   required=False, action="store_true",    help='force overwrite')
    args = vars(parser.parse_args())
    ExtractMarkerSeq(args)
248
+
249
+
250
+ '''
251
+
252
+ conda activate mypy3env
253
+ cd /home-user/wzsong/DateArTree
254
+ python3 MarkerRef2Tree.py -m Marker_set_2_Betts_2018_29_arCOG -mx fa -aa /home-user/wzsong/DateArTree/01_genome_selection_Prokka/d__Archaea_o_rs_133_gnms_plus_27_mito_faa_files -aax faa -o Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30_demo -e 30 -t 24 -pl /home-user/wzsong/Scripts/catfasta2phyml.pl
255
+ submitHPC.sh --cmd "python3 MarkerRef2Tree.py -m Marker_set_2_Betts_2018_29_arCOG -mx fa -aa /home-user/wzsong/DateArTree/01_genome_selection_Prokka/d__Archaea_o_rs_133_gnms_plus_27_mito_faa_files -aax faa -g /home-user/wzsong/DateArTree/gnm_group.txt -o Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30_demo -e 30 -t 24 -pl /home-user/wzsong/Scripts/catfasta2phyml.pl" -n 24 -c Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30_demo
256
+
257
+ cd /home-user/wzsong/DateArTree
258
+ python3 MarkerRef2Tree.py -m Marker_set_2_Betts_2018_29_arCOG -mx fa -aa /home-user/wzsong/DateArTree/01_genome_selection_Prokka/d__Archaea_o_rs_133_gnms_plus_27_mito_faa_files -aax faa -g /home-user/wzsong/DateArTree/gnm_group.txt -o Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30_demo -e 30 -t 12 -pl /home-user/wzsong/Scripts/catfasta2phyml.pl -g gnm_group.txt -skip_align_trim -jst 6 -qsub
259
+
260
+ cd /Users/songweizhi/Desktop/demo
261
+ python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/MarkerRef2Tree.py -m Marker_set_2_Betts_2018_29_arCOG -mx fa -aa d__Archaea_o_rs_133_gnms_plus_27_mito_faa_files -aax faa -g gnm_group.txt -o Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30_demo -e 30 -t 10 -pl /Users/songweizhi/Scripts/catfasta2phyml.pl -g gnm_group.txt -skip_align_trim -jst 6 -qsub
262
+
263
+ '''