treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/assessOG.py ADDED
@@ -0,0 +1,45 @@
1
+ import os
2
+ import glob
3
+ from Bio import SeqIO
4
+
5
+
6
def sep_path_basename_ext(file_in):
    """Break a file path into its name, folder, base name and extension.

    Returns a tuple ``(file name, folder, base name, extension)``. The folder
    is ``'.'`` when ``file_in`` has no directory part, and the extension is
    returned without its leading dot.
    """
    folder = os.path.dirname(file_in) or '.'
    file_name = os.path.basename(file_in)
    stem, dotted_ext = os.path.splitext(file_name)

    return file_name, folder, stem, dotted_ext[1:]
14
+
15
+
16
def get_gnm_og_cov(og_dir, og_ext, og_cov_txt):
    """Write, for every genome, the percentage of orthologous groups (OGs) it occurs in.

    Every file matching ``<og_dir>/*.<og_ext>`` is parsed as FASTA and counted
    as one OG, identified by its file name without extension. The genome id of
    a sequence is its id with the last ``_``-separated field removed (an empty
    string when the id contains no ``_``).

    Args:
        og_dir: folder holding one FASTA file per OG.
        og_ext: extension of the OG files, without the dot.
        og_cov_txt: output file; one ``genome<TAB>coverage`` line per genome,
            sorted by genome id, coverage being a percentage with at most two
            decimals.
    """
    og_file_list = glob.glob('%s/*.%s' % (og_dir, og_ext))

    # genome id -> set of OG ids in which the genome has at least one sequence
    gnm_to_og_dict = dict()
    for og_file in og_file_list:
        og_id = os.path.splitext(os.path.basename(og_file))[0]
        for each_seq in SeqIO.parse(og_file, 'fasta'):
            gnm_id = each_seq.id.rpartition('_')[0]
            gnm_to_og_dict.setdefault(gnm_id, set()).add(og_id)

    # the context manager closes the output file even if a write fails;
    # og_file_list cannot be empty inside the loop (no OG file -> no genome)
    with open(og_cov_txt, 'w') as og_cov_txt_handle:
        for each_gnm in sorted(gnm_to_og_dict):
            og_cov = len(gnm_to_og_dict[each_gnm]) * 100 / len(og_file_list)
            og_cov = float("{0:.2f}".format(og_cov))
            og_cov_txt_handle.write('%s\t%s\n' % (each_gnm, og_cov))
38
+
39
+
40
# Hard-coded example inputs: folder of OG FASTA files, their extension and the
# output coverage table.
# NOTE(review): these are machine-specific paths and the call below appears to
# sit at module level, so it would run on import — confirm this is intended.
og_dir = '/Users/songweizhi/Desktop/OrthologousGroupsFasta_cov95'
og_ext = 'fa'
og_cov_txt = '/Users/songweizhi/Desktop/gnm_og_cov.txt'

get_gnm_og_cov(og_dir, og_ext, og_cov_txt)
45
+
TreeSAK/batch_itol.py ADDED
@@ -0,0 +1,171 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from pathlib import Path
5
+ from itolapi import Itol
6
+
7
+
8
# Usage text handed to argparse (usage=...) in the __main__ block of this module.
batch_itol_usage = '''
======================================= batch_itol example commands =======================================

TreeSAK batch_itol -f -api API_key -ip batch_access_tmp -a annotation_files.txt -i input.tree -o out.pdf
TreeSAK batch_itol -f -api API_key -ip batch_access_tmp -a annotation_files.txt -i tree_dir -x tree -o out_pdf

Manual
https://github.com/albertyw/itolapi
http://itol.embl.de/help.cgi#batch

# An example of the parameter file is available here
# to be added

===========================================================================================================
'''
23
+
24
def sep_path_basename_ext(file_in):
    """Split a file path into (file name, folder, base name, extension).

    The folder is '.' when file_in has no directory part; the extension is
    returned without its leading dot.
    """
    parent_dir, file_name = os.path.split(file_in)
    base_name, dotted_ext = os.path.splitext(file_name)

    return file_name, (parent_dir if parent_dir != '' else '.'), base_name, dotted_ext[1:]
33
+
34
+
35
def itol_single_tree(tree_file, annotation_files_txt, project_name, APIkey, parameter_dict, op_plot):
    """Upload one tree (plus optional annotation files) to iTOL and export a plot.

    Args:
        tree_file: path of the tree file to upload; also used as the tree name.
        annotation_files_txt: text file listing one annotation file path per
            line, or None when no annotation file should be uploaded.
        project_name: iTOL project the tree is uploaded to.
        APIkey: iTOL API key.
        parameter_dict: export parameter overrides (parameter name -> value);
            parameters missing from it use the defaults given below.
        op_plot: output plot file; the text after its last '.' is sent to iTOL
            as the export format.

    Raises:
        AssertionError: if iTOL reports that the upload failed.
    """
    # -a is optional (default None); blank lines in the list file are ignored
    annotation_file_list = []
    if annotation_files_txt is not None:
        with open(annotation_files_txt) as annotation_files_handle:
            for each_file in annotation_files_handle:
                each_file = each_file.strip()
                if each_file:
                    annotation_file_list.append(each_file)

    op_plot_ext = op_plot.split('.')[-1]

    # upload tree to iTOL
    itol_uploader = Itol()
    itol_uploader.params['projectName'] = project_name  # better to create a project with a unique name
    itol_uploader.params['APIkey'] = APIkey
    itol_uploader.params['treeName'] = tree_file
    itol_uploader.add_file(Path(tree_file))

    # upload annotation files to iTOL
    for annotation_file in annotation_file_list:
        itol_uploader.add_file(Path(annotation_file))

    status = itol_uploader.upload()
    # explicit raise instead of `assert`, which is stripped when Python runs with -O;
    # the comparison and the exception type are the same as before
    if status == False:  # noqa: E712
        raise AssertionError('uploading %s to iTOL failed' % tree_file)

    # indices of all uploaded datasets: '0', '0,1', '0,1,2', ...
    datasets_visible_str = ','.join(str(i) for i in range(len(annotation_file_list)))

    # for a full list of options, go to https://itol.embl.de/help.cgi#batchExp
    itol_exporter = itol_uploader.get_itol_export()

    def set_param(para_id, default_value):
        # use the value from the parameter file if provided, otherwise the default
        itol_exporter.set_export_param_value(para_id, parameter_dict.get(para_id, default_value))

    set_param('internal_scale', '0')
    itol_exporter.set_export_param_value('datasets_visible', datasets_visible_str)
    set_param('display_mode', '1')
    set_param('vertical_shift_factor', '1')
    set_param('horizontal_scale_factor', '0.9')

    # range
    set_param('range_mode', '2')  # possible values: 0, 1 or 2 (0=off, 1=cover labels only, 2=cover full clades)
    set_param('include_ranges_legend', '0')

    # label (current_font_size is left at the iTOL default)
    set_param('current_font_name', 'Courier')
    set_param('default_label_color', '#000000')

    # branch
    set_param('line_width', '2')
    set_param('dashed_lines', '1')
    set_param('default_branch_color', '#000000')

    # bootstrap
    set_param('metadata_source', 'bootstrap')  # which metadata source to use for bootstrap display options
    set_param('bootstrap_display', '1')  # possible values: 0 or 1
    set_param('bootstrap_type', '2')  # possible values: 1, 2, 3 or 4 (1=Symbol, 2=Text label, 3=Branch color and 4=Branch width)
    set_param('bootstrap_label_size', '15')  # in pixels, integer >= 9
    set_param('bootstrap_label_percent_factor', '10')

    # write out
    itol_exporter.set_export_param_value('format', op_plot_ext)
    itol_exporter.export(op_plot)
104
+
105
+
106
def batch_itol(args):
    """Plot one tree, or every tree in a folder, with iTOL.

    Args:
        args: dict of command line options with the keys
            'i' (tree file or folder), 'x' (tree file extension, used when 'i'
            is a folder), 'a' (txt file listing annotation files, or None),
            'o' (output plot when 'i' is a file, output folder when it is a
            folder), 'f' (overwrite an existing output folder), 'api' (iTOL
            API key), 'ip' (iTOL project name) and 'para' (parameter file, or
            None).

    The parameter file holds one ``name<TAB>value`` pair per line. Lines that
    start with '#' are comments and anything after a '#' that follows the value
    is ignored. A value may itself start with '#' (e.g. a colour such as
    #000000).

    The program exits when the parameter file is missing, when no tree file is
    found, when the output folder exists and 'f' is not set, or when 'i' is
    neither a file nor a folder.
    """
    # local imports: these modules are only needed by this function
    import shutil
    import sys

    tree_file_dir = args['i']
    tree_file_ext = args['x']
    annotation_files_txt = args['a']
    op_file_dir = args['o']
    force_overwrite = args['f']
    API_key = args['api']
    project_name = args['ip']
    para_txt = args['para']

    para_dict = dict()
    if para_txt is not None:
        # check the parameter file itself (the input tree path used to be tested here,
        # which made every "tree folder + parameter file" run exit)
        if os.path.isfile(para_txt) is False:
            print('The specified parameter file does not exist, program exited!')
            sys.exit()

        with open(para_txt) as para_txt_handle:
            for each_line in para_txt_handle:
                each_line = each_line.strip()
                if (not each_line) or each_line.startswith('#'):
                    continue

                para_fields = each_line.split('\t')
                if (len(para_fields) < 2) or ('#' in para_fields[0]):
                    print('Ignored malformed line in parameter file: %s' % each_line)
                    continue

                # a '#' at the first position belongs to the value (hex colour),
                # a later one starts an inline comment
                para_value = para_fields[1]
                comment_pos = para_value.find('#', 1)
                if comment_pos != -1:
                    para_value = para_value[:comment_pos]
                para_dict[para_fields[0].strip()] = para_value.strip()

    if os.path.isfile(tree_file_dir) is True:
        itol_single_tree(tree_file_dir, annotation_files_txt, project_name, API_key, para_dict, op_file_dir)
    elif os.path.isdir(tree_file_dir) is True:
        file_re = '%s/*.%s' % (tree_file_dir, tree_file_ext)
        file_list = glob.glob(file_re)

        if len(file_list) == 0:
            print('no file found in %s, please check file extension, program exited!' % tree_file_dir)
            sys.exit()

        # create output folder (without going through a shell, so that paths
        # containing spaces or shell metacharacters are handled correctly)
        if os.path.isdir(op_file_dir) is True:
            if force_overwrite is True:
                shutil.rmtree(op_file_dir)
            else:
                print('Output folder detected, program exited!')
                sys.exit()
        os.mkdir(op_file_dir)

        for each_file in file_list:
            f_base = os.path.splitext(os.path.basename(each_file))[0]
            op_pdf = '%s/%s.pdf' % (op_file_dir, f_base)

            itol_single_tree(each_file, annotation_files_txt, project_name, API_key, para_dict, op_pdf)
    else:
        print('please provide input file with -i, program exited!')
        sys.exit()
157
+
158
+
159
if __name__ == '__main__':

    # (option flag, add_argument keyword arguments), listed in help order
    cli_options = [
        ('-i', dict(required=True, help='input tree file or folder')),
        ('-x', dict(required=False, default=None, help='file extension')),
        ('-o', dict(required=True, help='output file or folder')),
        ('-a', dict(required=False, default=None, help='a txt file contain absolute to all annotation files')),
        ('-para', dict(required=False, default=None, help='parameter file')),
        ('-api', dict(required=True, help='iTOL API key')),
        ('-ip', dict(required=False, default='batch_access_tmp', help='iTOL project name, default: batch_access_tmp')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
    ]

    cli_parser = argparse.ArgumentParser(usage=batch_itol_usage)
    for option_flag, option_kwargs in cli_options:
        cli_parser.add_argument(option_flag, **option_kwargs)

    # hand the parsed options over as a plain dict
    batch_itol(vars(cli_parser.parse_args()))
@@ -0,0 +1,140 @@
1
+ import os
2
+ import glob
3
+ from Bio import SeqIO
4
+ from Bio import AlignIO
5
+
6
+
7
def catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file):
    """Concatenate all alignments in a folder and write the result in FASTA and PHYLIP.

    Alignments are the files matching ``<msa_dir>/*.<msa_ext>``, read as FASTA
    and concatenated in sorted file name order. A sequence id missing from an
    alignment is filled with gaps ('-') of that alignment's length.

    Args:
        msa_dir: folder holding the individual alignments.
        msa_ext: extension of the alignment files, without the dot.
        concatenated_msa_phy: output alignment in relaxed PHYLIP format; an
            intermediate FASTA file is written to ``<concatenated_msa_phy>.fasta``.
        partition_file: output file with one ``<file name> = <start>-<end>``
            line per alignment (1-based, inclusive columns).

    The program exits if an alignment is empty or holds sequences of different
    lengths.
    """
    # local import: only needed by this function
    import sys

    concatenated_msa_fasta = '%s.fasta' % concatenated_msa_phy
    msa_file_re = '%s/*.%s' % (msa_dir, msa_ext)
    msa_file_list_sorted = sorted(os.path.basename(file_name) for file_name in glob.glob(msa_file_re))

    # read every alignment once; the two lists are parallel to msa_file_list_sorted,
    # so files whose names only differ after the extension cannot overwrite each other
    complete_gnm_set = set()
    msa_seq_dict_list = []
    msa_len_list = []
    for each_msa_file in msa_file_list_sorted:
        pwd_current_msa = '%s/%s' % (msa_dir, each_msa_file)
        current_msa_seq_dict = dict()
        current_msa_len_set = set()
        for each_seq in SeqIO.parse(pwd_current_msa, 'fasta'):
            complete_gnm_set.add(each_seq.id)
            current_msa_seq_dict[each_seq.id] = str(each_seq.seq)
            current_msa_len_set.add(len(each_seq.seq))

        if len(current_msa_len_set) == 0:
            print('No sequence was found in %s, program exited!' % each_msa_file)
            sys.exit()
        if len(current_msa_len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
            sys.exit()

        msa_seq_dict_list.append(current_msa_seq_dict)
        msa_len_list.append(current_msa_len_set.pop())

    complete_gnm_list_sorted = sorted(complete_gnm_set)

    # write out concatenated msa
    with open(concatenated_msa_fasta, 'w') as concatenated_msa_handle:
        for each_gnm in complete_gnm_list_sorted:
            concatenated_seq = ''.join(
                seq_dict.get(each_gnm, msa_len * '-')
                for seq_dict, msa_len in zip(msa_seq_dict_list, msa_len_list))
            concatenated_msa_handle.write('>%s\n' % each_gnm)
            concatenated_msa_handle.write('%s\n' % concatenated_seq)

    # write out partition file
    end_pos = 0
    with open(partition_file, 'w') as partition_file_handle:
        for each_m, current_m_len in zip(msa_file_list_sorted, msa_len_list):
            partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
            end_pos += current_m_len

    # convert msa in fasta to phy
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
69
+
70
+
71
# Hard-coded example inputs for catfasta2phy(); the call itself is commented out.
# NOTE(review): machine-specific paths — confirm they are meant to ship.
msa_dir = '/Users/songweizhi/Desktop/s06_identified_marker_aln_trimmed'
msa_ext = 'aln'
concatenated_msa_phy = '/Users/songweizhi/Desktop/s06_identified_marker_aln_trimmed_concatenated.phy'
partition_file = '/Users/songweizhi/Desktop/s06_identified_marker_aln_trimmed_concatenated_partition.txt'
# catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file)



# Alignment that the slicing loop further down cuts into column ranges.
# msa_file_subset is not referenced anywhere in the visible code.
msa_file = '/Users/songweizhi/Desktop/PA_75_DeltaLL_75_concatenated.phy'
msa_file_subset = '/Users/songweizhi/Desktop/PA_75_DeltaLL_75_concatenated_subset.phy'
81
+
82
+ from Bio import AlignIO
83
+
84
def slice_msa_by_col(msa_in, range_str, msa_out):
    """Extract a column range from a relaxed PHYLIP alignment.

    range_str is 'start-end' with 1-based, inclusive column positions; the
    selected columns are written to msa_out in relaxed PHYLIP format.
    """
    full_aln = AlignIO.read(msa_in, 'phylip-relaxed')

    bounds = range_str.split('-')
    first_col = int(bounds[0]) - 1  # convert to a 0-based slice start
    last_col = int(bounds[1])

    AlignIO.write(full_aln[:, first_col:last_col], msa_out, 'phylip-relaxed')
92
+
93
+
94
def slice_msa_by_col_manual(msa_in, range_str, msa_out):
    """Extract a column range from a relaxed PHYLIP alignment and write it by hand.

    range_str is 'start-end' with 1-based, inclusive column positions. The
    output starts with a '<number of sequences> <alignment length>' line,
    followed by one line per sequence: the id padded with spaces to the
    longest id plus two, then the sequence. The longest id length and every
    padded id are also printed.
    """
    full_aln = AlignIO.read(msa_in, 'phylip-relaxed')

    bounds = range_str.split('-')
    first_col = int(bounds[0]) - 1  # convert to a 0-based slice start
    last_col = int(bounds[1])
    aln_subset = full_aln[:, first_col:last_col]

    longest_id = max((len(record.id) for record in aln_subset), default=0)
    print(longest_id)

    id_column_width = longest_id + 2
    with open(msa_out, 'w') as msa_out_handle:
        msa_out_handle.write('%s %s\n' % (len(aln_subset), aln_subset.get_alignment_length()))
        for record in aln_subset:
            padded_id = record.id.ljust(id_column_width)
            print(padded_id)
            msa_out_handle.write('%s%s\n' % (padded_id, str(record.seq)))
115
+
116
+
117
+ # AlignIO.write(aln_subset, msa_out, 'phylip-relaxed')
118
+
119
+
120
# Column ranges ('start-end', 1-based and inclusive) to cut out of msa_file.
slice_range = ['1-500', '501-1000', '1001-1500', '1501-2000', '2001-2500', '2501-3000', '3001-3500', '3501-4000', '4001-4500', '4501-4879']

# Write one PHYLIP file per range, named after the range.
# NOTE(review): this loop appears to sit at module level, so it would run on
# import and write to a machine-specific folder — confirm this is intended.
for each_range in slice_range:
    pwd_msa_op = '/Users/songweizhi/Desktop/%s.phy' % each_range
    slice_msa_by_col_manual(msa_file, each_range, pwd_msa_op)
125
+
126
+
127
def fa2phy(fasta_in, phy_out):
    """Convert a FASTA alignment to a hand-written PHYLIP-style file.

    The output starts with a '<number of sequences> <alignment length>' line,
    followed by one line per sequence: the id padded with spaces to the
    longest id plus two, then the sequence.
    """
    aln = AlignIO.read(fasta_in, 'fasta')

    # width of the id column: longest id plus two separating spaces
    id_column_width = max((len(record.id) for record in aln), default=0) + 2

    with open(phy_out, 'w') as phy_out_handle:
        phy_out_handle.write('%s %s\n' % (len(aln), aln.get_alignment_length()))
        for record in aln:
            phy_out_handle.write('%s%s\n' % (record.id.ljust(id_column_width), str(record.seq)))
TreeSAK/cogTree.py ADDED
@@ -0,0 +1,185 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ import multiprocessing as mp
6
+
7
+
8
# Example commands for the cogTree module; -fun accepts a file of function ids,
# a single id, or a comma-separated list of ids.
cogTree_usage = '''
================================ cogTree example commands ================================

TreeSAK cogTree -i combined.faa -cog arCOG_wd -o op_dir -bmge -t 12 -f -fun arCOG_id.txt
TreeSAK cogTree -i combined.faa -cog arCOG_wd -o op_dir -bmge -t 12 -f -fun arCOG00724
TreeSAK cogTree -i combined.faa -cog arCOG_wd -o op_dir -bmge -t 12 -f -fun arCOG00724,arCOG02271

==========================================================================================
'''
17
+
18
+
19
def select_seq(seq_file, seq_id_set, output_file):
    """Copy the FASTA records whose id is in seq_id_set to output_file.

    Args:
        seq_file: input FASTA file.
        seq_id_set: ids of the sequences to keep.
        output_file: output file; records are written in input order, in
            'fasta-2line' format.
    """
    # the context manager closes the output file even if parsing or writing fails
    with open(output_file, 'w') as output_file_handle:
        for seq_record in SeqIO.parse(seq_file, 'fasta'):
            if seq_record.id in seq_id_set:
                SeqIO.write(seq_record, output_file_handle, 'fasta-2line')
26
+
27
+
28
def _run_cmds_in_parallel(cmd_list, num_of_threads):
    """Run shell commands with os.system in a pool of num_of_threads processes."""
    if not cmd_list:
        # nothing to run, no need to start worker processes
        return

    pool = mp.Pool(processes=num_of_threads)
    try:
        pool.map(os.system, cmd_list)
    finally:
        pool.close()
        pool.join()


def cogTree(args):
    """Build one gene tree per function of interest from COG annotation results.

    For every function id found in the annotation files, the matching
    sequences are extracted from the combined sequence file, aligned with
    mafft-einsi, trimmed with trimal (or BMGE) and passed to iqtree2. The
    commands are also written to cmd_1_mafft.txt, cmd_2_trim.txt and
    cmd_3_tree.txt in the output folder.

    Args:
        args: dict of command line options with the keys
            'i' (combined sequence file), 'cog' (folder with COG annotation
            results, or None), 'fun' (a file with one function id per line, a
            single id, or a comma-separated list of ids), 'o' (output folder),
            'bmge' (trim with BMGE instead of trimal), 'bmge_m' (BMGE model),
            'bmge_esc' (BMGE entropy score cutoff), 'iqtree_m' (iqtree model),
            'f' (overwrite an existing output folder) and 't' (number of
            threads).

    The program exits when the output folder exists and 'f' is not set, or
    when no annotation file is found in the 'cog' folder.
    """
    # local imports: these modules are only needed by this function
    import shutil
    import sys

    combined_faa = args['i']
    cog_annotation_wd = args['cog']
    interested_fun_txt = args['fun']
    op_dir = args['o']
    trim_with_bmge = args['bmge']
    trim_model = args['bmge_m']
    entropy_score_cutoff = args['bmge_esc']
    iqtree_model = args['iqtree_m']
    force_overwrite = args['f']
    num_of_threads = args['t']

    # specify path to BMGE.jar (next to this file)
    pwd_bmge_jar = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'BMGE.jar')

    # function ids of interest, always kept as a set
    interested_fun_set = set()
    if os.path.isfile(interested_fun_txt) is False:
        # a single id or a comma-separated list of ids
        interested_fun_set.update(interested_fun_txt.split(','))
    else:
        with open(interested_fun_txt) as interested_fun_handle:
            for each_fun in interested_fun_handle:
                each_fun_split = each_fun.strip().split()
                if each_fun_split:  # blank lines are ignored
                    interested_fun_set.add(each_fun_split[0])

    ################################################################################

    faa_dir = '%s/dir_1_faa' % op_dir
    aln_dir = '%s/dir_2_msa' % op_dir
    trimmed_aln_dir = '%s/dir_3_trimmed_msa' % op_dir
    tree_dir = '%s/dir_4_tree' % op_dir
    cmd_1_mafft_txt = '%s/cmd_1_mafft.txt' % op_dir
    cmd_2_trim_txt = '%s/cmd_2_trim.txt' % op_dir
    cmd_3_tree_txt = '%s/cmd_3_tree.txt' % op_dir

    ################################################################################

    # create output folder
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            sys.exit()

    os.mkdir(op_dir)
    os.mkdir(faa_dir)
    os.mkdir(aln_dir)
    os.mkdir(trimmed_aln_dir)
    os.mkdir(tree_dir)

    ################################################################################

    # function id -> ids of the genes annotated with it
    fun_to_gene_dict = dict()
    if cog_annotation_wd is not None:

        print('Reading in COG annotation results')
        file_re = '%s/*COG_wd/*_query_to_cog.txt' % (cog_annotation_wd)
        file_list = glob.glob(file_re)
        if len(file_list) == 0:
            print('COG annotation file not detected, program exited!')
            sys.exit()

        for each_file in file_list:
            with open(each_file) as each_file_handle:
                for line_index, each_line in enumerate(each_file_handle):
                    if line_index == 0:
                        # the first line is skipped
                        continue
                    each_line_split = each_line.strip().split('\t')
                    if len(each_line_split) == 4:
                        gene_id = each_line_split[0]
                        cog_id = each_line_split[1]
                        if cog_id in interested_fun_set:
                            fun_to_gene_dict.setdefault(cog_id, set()).add(gene_id)

    cmd_list_mafft = []
    cmd_list_trim = []
    cmd_list_tree = []
    with open(cmd_1_mafft_txt, 'w') as cmd_1_mafft_txt_handle, \
            open(cmd_2_trim_txt, 'w') as cmd_2_trim_txt_handle, \
            open(cmd_3_tree_txt, 'w') as cmd_3_tree_txt_handle:

        for each_fun in sorted(fun_to_gene_dict):

            # define file name
            fun_faa = '%s/%s.faa' % (faa_dir, each_fun)
            current_gene_tree_dir = '%s/%s' % (tree_dir, each_fun)
            fun_aln = '%s/%s.aln' % (aln_dir, each_fun)
            fun_aln_trimmed = '%s/%s_trimal.aln' % (trimmed_aln_dir, each_fun)
            if trim_with_bmge is True:
                fun_aln_trimmed = '%s/%s_bmge.aln' % (trimmed_aln_dir, each_fun)

            # extract sequences
            select_seq(combined_faa, fun_to_gene_dict[each_fun], fun_faa)

            os.mkdir(current_gene_tree_dir)

            # prepare commands
            mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (1, fun_faa, fun_aln)
            trim_cmd = 'trimal -in %s -out %s -automated1' % (fun_aln, fun_aln_trimmed)
            if trim_with_bmge is True:
                trim_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % (pwd_bmge_jar, fun_aln, trim_model, entropy_score_cutoff, fun_aln_trimmed)
            infer_tree_cmd = 'iqtree2 -s %s --seqtype AA -m %s -B 1000 --wbtl --bnni --prefix %s/%s -T %s --quiet' % (fun_aln_trimmed, iqtree_model, current_gene_tree_dir, each_fun, num_of_threads)

            # add commands to list
            cmd_list_mafft.append(mafft_cmd)
            cmd_list_trim.append(trim_cmd)
            cmd_list_tree.append(infer_tree_cmd)

            # write out commands
            cmd_1_mafft_txt_handle.write(mafft_cmd + '\n')
            cmd_2_trim_txt_handle.write(trim_cmd + '\n')
            cmd_3_tree_txt_handle.write(infer_tree_cmd + '\n')

    # run mafft commands
    print('Running mafft with %s cores for %s commands' % (num_of_threads, len(cmd_list_mafft)))
    _run_cmds_in_parallel(cmd_list_mafft, num_of_threads)

    # run trim commands
    print('Trimming with %s cores for %s commands' % (num_of_threads, len(cmd_list_trim)))
    _run_cmds_in_parallel(cmd_list_trim, num_of_threads)

    # run iqtree commands one after another (each of them already uses all threads)
    print('Running iqtree with %s cores' % num_of_threads)
    for each_iqtree_cmd in sorted(cmd_list_tree):
        print(each_iqtree_cmd)
        os.system(each_iqtree_cmd)
169
+
170
+
171
if __name__ == '__main__':

    # (option flag, add_argument keyword arguments), listed in help order
    cli_options = [
        ('-i', dict(required=True, help='orthologous gene sequence')),
        ('-fun', dict(required=True, help='interested functions')),
        ('-cog', dict(required=False, default=None, help='COG annotation results')),
        ('-o', dict(required=True, help='output directory')),
        ('-bmge', dict(required=False, action="store_true", help='trim with BMGE, default is trimal')),
        ('-bmge_m', dict(required=False, default='BLOSUM30', help='trim model, default: BLOSUM30')),
        ('-bmge_esc', dict(required=False, default='0.55', help='entropy score cutoff, default: 0.55')),
        ('-iqtree_m', dict(required=False, default='LG+G+I', help='iqtree_model, default: LG+G+I')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
        ('-t', dict(required=False, type=int, default=1, help='num of threads, default: 1')),
    ]

    cli_parser = argparse.ArgumentParser()
    for option_flag, option_kwargs in cli_options:
        cli_parser.add_argument(option_flag, **option_kwargs)

    # hand the parsed options over as a plain dict
    cogTree(vars(cli_parser.parse_args()))
@@ -0,0 +1,30 @@
1
+
2
# check.packages function: install and load multiple R packages.
# Check to see if packages are installed. Install them if they are not, then load them into the R session.
#
# pkg: character vector of package names.
# Returns the result of sapply() over require(), i.e. one value per package
# telling whether it could be loaded.
check.packages <- function(pkg){
    # names in pkg that are not listed by installed.packages()
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
    # install the missing ones together with their dependencies
    if (length(new.pkg))
        install.packages(new.pkg, dependencies = TRUE)
    # character.only is set because the package names are passed as strings
    sapply(pkg, require, character.only = 1)}
9
+
10
# install packages if not installed
packages<-c("optparse", "ape", "vegan")
invisible(suppressMessages(check.packages(packages)))

# command line options: -a/--treeo is the first tree file, -b/--treet the second
option_list = list(
    make_option(c("-a", "--treeo"), type="character", default=NULL, help="the first tree"),
    make_option(c("-b", "--treet"), type="character", default=NULL, help="the second tree"));

opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

# read in the two trees
TREE1 = read.tree(opt$treeo)
TREE2 = read.tree(opt$treet)

# cophenetic distance matrix of each tree, with rows and columns sorted by
# row name
# NOTE(review): presumably both trees must carry the same set of tip labels
# for the two sorted matrices to line up — confirm.
D1 = cophenetic(TREE1)
D1 = D1[order(row.names(D1)),order(row.names(D1))]
D2 = cophenetic(TREE2)
D2 = D2[order(row.names(D2)),order(row.names(D2))]

# Mantel test between the two distance matrices, using 999 permutations
mantel(xdis = D1, ydis = D2, permutations = 999)
30
+