treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,80 @@
1
+ import os
2
+ import argparse
3
+ from ete3 import Tree
4
+
5
+
6
+ iTOL_gene_tree_usage = '''
7
+ ====================== iTOL_gene_tree example commands ======================
8
+
9
+ TreeSAK iTOL_gene_tree -tree genes.tree -i gnm_taxon.txt -o gene_taxon.txt
10
+ TreeSAK iTOL_gene_tree -txt gene_id.txt -i gnm_taxon.txt -o gene_taxon.txt
11
+
12
+ =============================================================================
13
+ '''
14
+
15
+
16
def iTOL_gene_tree(args):
    """Write a gene-to-metadata table by giving each gene its genome's metadata.

    Gene ids are collected from a tree file (the names of the nodes yielded
    by iterating the ete3 tree) and/or from a text file holding one id per
    line. The genome id of a gene is the gene id without its last
    underscore-separated field; it is looked up in the metadata file.

    Args:
        args: dict with the keys
            'tree': tree file providing gene ids, or None
            'txt':  text file with one gene id per line, or None
            'i':    metadata file, two tab-separated columns (genome id, value);
                    lines with another number of columns are ignored
            'o':    output file, one "gene id<TAB>value" line per gene
            'na':   if True, genes without metadata are written with the
                    value 'na'; otherwise they are left out

    The program prints a message and exits when no gene id source is given
    or when a provided input file does not exist.
    """
    input_tree_file = args['tree']
    input_txt_file = args['txt']
    meta_txt = args['i']
    op_txt = args['o']
    include_na = args['na']

    if (input_tree_file is None) and (input_txt_file is None):
        print('Please provide gene id with at least one approach, program exited!')
        exit()

    if os.path.isfile(meta_txt) is False:
        print('Metadata file not found, program exited!')
        exit()

    # genome id -> metadata value
    metadata_dict = dict()
    with open(meta_txt) as meta_txt_handle:
        for each_gnm in meta_txt_handle:
            each_gnm_split = each_gnm.strip().split('\t')
            if len(each_gnm_split) == 2:
                metadata_dict[each_gnm_split[0]] = each_gnm_split[1]

    gene_id_set = set()
    if input_tree_file is not None:
        if os.path.isfile(input_tree_file) is False:
            print('Tree file not found, program exited!')
            exit()
        for leaf in Tree(input_tree_file, format=1):
            gene_id_set.add(leaf.name)

    if input_txt_file is not None:
        if os.path.isfile(input_txt_file) is False:
            print('Txt file not found, program exited!')
            exit()
        with open(input_txt_file) as input_txt_handle:
            for each_id in input_txt_handle:
                gene_id = each_id.strip()
                # blank lines are not gene ids
                if gene_id:
                    gene_id_set.add(gene_id)

    # sorted so that repeated runs produce the same file
    with open(op_txt, 'w') as op_txt_handle:
        for gene_id in sorted(gene_id_set):
            gnm_id = '_'.join(gene_id.split('_')[:-1])
            gnm_meta = metadata_dict.get(gnm_id, 'na')
            if (include_na is True) or (gnm_meta != 'na'):
                op_txt_handle.write('%s\t%s\n' % (gene_id, gnm_meta))

    print('Done!')
69
+
70
+
71
if __name__ == '__main__':

    # command line options as (flag, keyword arguments), in help-display order
    option_specs = [
        ('-i', dict(required=True, help='input metadata')),
        ('-tree', dict(required=False, default=None, help='gene id, in tree file')),
        ('-txt', dict(required=False, default=None, help='gene id, in txt file')),
        ('-o', dict(required=True, help='output metadata')),
        ('-na', dict(required=False, action='store_true', help='include leaves with na values')),
    ]

    cli_parser = argparse.ArgumentParser(usage=iTOL_gene_tree_usage)
    for option_flag, option_settings in option_specs:
        cli_parser.add_argument(option_flag, **option_settings)

    # hand the parsed options over as a plain dict
    iTOL_gene_tree(vars(cli_parser.parse_args()))
@@ -0,0 +1,56 @@
1
+ import os
2
+ import math
3
+ import argparse
4
+ from Bio import SeqIO
5
+
6
+
7
+ iTOL_msa_stats_usage = '''
8
+ ========= iTOL_msa_stats example command =========
9
+
10
+ TreeSAK iTOL_msa_stats -i concatenated.phy.fasta
11
+
12
+ ==================================================
13
+ '''
14
+
15
+
16
def sep_path_basename_ext(file_in):
    """Split a path into (file name, folder, base name, extension).

    The folder is '.' when file_in has no directory part. The extension is
    returned without its leading dot and is '' when there is none.
    """
    folder, file_name = os.path.split(file_in)
    stem, dotted_ext = os.path.splitext(file_name)
    return file_name, folder or '.', stem, dotted_ext[1:]
23
+
24
+
25
def iTOL_msa_stats(args):
    """Write the gap percentage of every aligned sequence and build an iTOL file.

    For each record of the FASTA alignment args['i'], the percentage of '-'
    characters (rounded to two decimals) is written to
    <folder>/<base name>_gap_pct.txt as "sequence id<TAB>percentage". The
    table is then passed to 'TreeSAK iTOL -SimpleBar', which is asked to
    write <folder>/<base name>_gap_pct_iTOL.txt.

    Args:
        args: dict whose 'i' value is the path of the alignment file.
    """
    import shlex

    aln_file = args['i']

    _, aln_path, aln_base, _ = sep_path_basename_ext(aln_file)

    stats_txt = '%s/%s_gap_pct.txt' % (aln_path, aln_base)
    stats_txt_itol = '%s/%s_gap_pct_iTOL.txt' % (aln_path, aln_base)

    with open(stats_txt, 'w') as stats_txt_handle:
        for each_seq in SeqIO.parse(aln_file, 'fasta'):
            seq_seq = str(each_seq.seq)
            # a record without any column has no gaps; avoid dividing by zero
            gap_pct = seq_seq.count('-') * 100 / len(seq_seq) if seq_seq else 0
            gap_pct = float("{0:.2f}".format(gap_pct))
            stats_txt_handle.write('%s\t%s\n' % (each_seq.id, gap_pct))

    # quoting keeps paths with spaces or shell metacharacters in one piece
    gap_pct_itol_cmd = 'TreeSAK iTOL -SimpleBar -lv %s -scale 0-25-50-75-100 -lt Gap_Pecentage -o %s' % (shlex.quote(stats_txt), shlex.quote(stats_txt_itol))
    os.system(gap_pct_itol_cmd)
49
+
50
+
51
if __name__ == '__main__':

    # the alignment file is the only option
    cli_parser = argparse.ArgumentParser(usage=iTOL_msa_stats_usage)
    cli_parser.add_argument('-i', required=True, help='MSA file')

    # hand the parsed options over as a plain dict
    iTOL_msa_stats(vars(cli_parser.parse_args()))
@@ -0,0 +1,37 @@
1
+ import operator
2
+
3
+
4
def keep_highest_rrtc(rrtc_in, rrtc_out):
    """Keep the highest value of every id pair and write the pairs by value.

    Every non-blank line of rrtc_in is parsed as "id1<TAB>id2:value". For
    each (id1, id2) pair only the largest value is kept. The pairs are
    written to rrtc_out in the same "id1<TAB>id2:value" form, from the
    highest to the lowest value.

    Args:
        rrtc_in: path of the input file.
        rrtc_out: path of the output file (overwritten).
    """
    # (id1, id2) -> highest value seen; tuple keys keep ids intact whatever
    # characters they contain
    rrtc_highest_prob_dict = dict()
    with open(rrtc_in) as rrtc_in_handle:
        for each_rrtc in rrtc_in_handle:
            each_rrtc = each_rrtc.strip()
            if not each_rrtc:
                continue
            pair_field, value_field = each_rrtc.split(':')[:2]
            rrtc_r, rrtc_d = pair_field.split('\t')[:2]
            rrtc_v = float(value_field)
            rrtc_key = (rrtc_r, rrtc_d)
            if (rrtc_key not in rrtc_highest_prob_dict) or (rrtc_v > rrtc_highest_prob_dict[rrtc_key]):
                rrtc_highest_prob_dict[rrtc_key] = rrtc_v

    # ascending stable sort, then reversed: highest value first
    sorted_by_value = sorted(rrtc_highest_prob_dict.items(), key=operator.itemgetter(1))
    with open(rrtc_out, 'w') as rrtc_out_handle:
        for (rrtc_r, rrtc_d), rrtc_v in reversed(sorted_by_value):
            rrtc_out_handle.write('%s\t%s:%s\n' % (rrtc_r, rrtc_d, rrtc_v))
26
+
27
+
28
# example input/output locations used when this file is run as a script
rrtc_in = '/Users/songweizhi/Desktop/rrtc.txt'
rrtc_out = '/Users/songweizhi/Desktop/rrtc_out.txt'

# small dictionary used to demonstrate sorting items by value
demo_dict = {'a': 6, 'b': 2, 'c': 2}


# guarded so that importing this module does not touch the paths above
if __name__ == '__main__':

    keep_highest_rrtc(rrtc_in, rrtc_out)

    # print keys and values from the highest to the lowest value
    for each in sorted(demo_dict.items(), key=operator.itemgetter(1))[::-1]:
        print(each[0])
        print(each[1])
37
+
TreeSAK/koTree.py ADDED
@@ -0,0 +1,194 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ import multiprocessing as mp
6
+
7
+
8
+ koTree_usage = '''
9
+ ================================ koTree example commands ================================
10
+
11
+ TreeSAK koTree -i combined.faa -kegg KEGG_wd -o op_dir -bmge -t 12 -f -fun ko_id.txt
12
+ TreeSAK koTree -i combined.faa -kegg KEGG_wd -o op_dir -bmge -t 12 -f -fun K01995
13
+ TreeSAK koTree -i combined.faa -kegg KEGG_wd -o op_dir -bmge -t 12 -f -fun K01995,K01996
14
+
15
+ =========================================================================================
16
+ '''
17
+
18
+
19
def select_seq(seq_file, seq_id_set, output_file):
    """Copy the records of a FASTA file whose id is in seq_id_set.

    Args:
        seq_file: FASTA file to read.
        seq_id_set: collection of record ids to keep.
        output_file: file to write; selected records are written in the
            'fasta-2line' format, in their input order.
    """
    # the with-statement closes the output even if parsing or writing fails
    with open(output_file, 'w') as output_file_handle:
        for seq_record in SeqIO.parse(seq_file, 'fasta'):
            if seq_record.id in seq_id_set:
                SeqIO.write(seq_record, output_file_handle, 'fasta-2line')
26
+
27
+
28
def _run_cmds_in_pool(cmd_list, num_of_threads):
    """Run shell commands in parallel with num_of_threads worker processes."""
    if not cmd_list:
        return
    pool = mp.Pool(processes=num_of_threads)
    try:
        pool.map(os.system, cmd_list)
    finally:
        pool.close()
        pool.join()


def koTree(args):
    """Build one gene tree per function of interest.

    Genes assigned to each function of interest are extracted from the
    combined sequence file, aligned with mafft-einsi, trimmed with trimal
    (or BMGE) and passed to iqtree2. The commands are also written to
    cmd_1_mafft.txt, cmd_2_trim.txt and cmd_3_tree.txt in the output folder.

    Args:
        args: dict with the keys
            'i':        combined FASTA file holding all gene sequences
            'kegg':     folder searched for
                        '*KEGG_wd/*_ko_assignment_ABCD.txt' files, or None
            'fun':      a file with one function id per line (first
                        whitespace-separated field), a single id, or
                        comma-separated ids
            'o':        output folder
            'bmge':     True to trim with BMGE instead of trimal
            'bmge_m':   BMGE model
            'bmge_esc': BMGE entropy score cutoff
            'iqtree_m': iqtree model
            'f':        True to replace an existing output folder
            't':        number of worker processes / iqtree threads

    The program prints a message and exits when the output folder exists
    and 'f' is not set, or when no annotation file is found.
    """
    import shlex
    import shutil

    combined_faa = args['i']
    kegg_annotation_wd = args['kegg']
    interested_fun_txt = args['fun']
    op_dir = args['o']
    trim_with_bmge = args['bmge']
    trim_model = args['bmge_m']
    entropy_score_cutoff = args['bmge_esc']
    iqtree_model = args['iqtree_m']
    force_overwrite = args['f']
    num_of_threads = args['t']

    # BMGE.jar is looked for in the folder holding this file
    pwd_bmge_jar = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'BMGE.jar')

    # functions of interest, from a file or directly from the command line
    interested_fun_set = set()
    if os.path.isfile(interested_fun_txt) is True:
        with open(interested_fun_txt) as interested_fun_handle:
            for each_fun in interested_fun_handle:
                each_fun_split = each_fun.strip().split()
                # blank lines carry no function id
                if each_fun_split:
                    interested_fun_set.add(each_fun_split[0])
    else:
        interested_fun_set = {fun_id for fun_id in interested_fun_txt.split(',') if fun_id}

    ################################################################################

    faa_dir = '%s/dir_1_faa' % op_dir
    aln_dir = '%s/dir_2_msa' % op_dir
    trimmed_aln_dir = '%s/dir_3_trimmed_msa' % op_dir
    tree_dir = '%s/dir_4_tree' % op_dir
    cmd_1_mafft_txt = '%s/cmd_1_mafft.txt' % op_dir
    cmd_2_trim_txt = '%s/cmd_2_trim.txt' % op_dir
    cmd_3_tree_txt = '%s/cmd_3_tree.txt' % op_dir

    ################################################################################

    # create output folder
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            # no shell involved, so any folder name is removed reliably
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()

    for each_dir in (op_dir, faa_dir, aln_dir, trimmed_aln_dir, tree_dir):
        os.mkdir(each_dir)

    ################################################################################

    # function id -> ids of the genes assigned to it
    fun_to_gene_dict = dict()
    if kegg_annotation_wd is not None:

        print('Reading in KEGG annotation results')
        file_re = '%s/*KEGG_wd/*_ko_assignment_ABCD.txt' % (kegg_annotation_wd)
        file_list = glob.glob(file_re)

        if len(file_list) == 0:
            print('KEGG annotation file not detected, program exited!')
            exit()

        for each_file in file_list:
            with open(each_file) as each_file_handle:
                for line_index, each_line in enumerate(each_file_handle):
                    # the first line of every file is skipped
                    if line_index == 0:
                        continue
                    each_line_split = each_line.strip().split('\t')
                    if len(each_line_split) == 9:
                        gene_id = each_line_split[0]
                        # fifth column without its first two characters
                        # (presumably a level prefix - TODO confirm)
                        ko_d_id = each_line_split[4][2:]
                        if ko_d_id in interested_fun_set:
                            fun_to_gene_dict.setdefault(ko_d_id, set()).add(gene_id)

    cmd_list_mafft = []
    cmd_list_trim = []
    cmd_list_tree = []
    for each_fun in sorted(fun_to_gene_dict):

        # define file name
        fun_faa = '%s/%s.faa' % (faa_dir, each_fun)
        current_gene_tree_dir = '%s/%s' % (tree_dir, each_fun)
        fun_aln = '%s/%s.aln' % (aln_dir, each_fun)
        if trim_with_bmge is True:
            fun_aln_trimmed = '%s/%s_bmge.aln' % (trimmed_aln_dir, each_fun)
        else:
            fun_aln_trimmed = '%s/%s_trimal.aln' % (trimmed_aln_dir, each_fun)

        # extract sequences
        select_seq(combined_faa, fun_to_gene_dict[each_fun], fun_faa)

        os.mkdir(current_gene_tree_dir)

        # prepare commands; paths are quoted because the commands run in a shell
        mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (1, shlex.quote(fun_faa), shlex.quote(fun_aln))
        if trim_with_bmge is True:
            trim_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % (shlex.quote(pwd_bmge_jar), shlex.quote(fun_aln), trim_model, entropy_score_cutoff, shlex.quote(fun_aln_trimmed))
        else:
            trim_cmd = 'trimal -in %s -out %s -automated1' % (shlex.quote(fun_aln), shlex.quote(fun_aln_trimmed))
        tree_prefix = '%s/%s' % (current_gene_tree_dir, each_fun)
        infer_tree_cmd = 'iqtree2 -s %s --seqtype AA -m %s -B 1000 --wbtl --bnni --prefix %s -T %s --quiet' % (shlex.quote(fun_aln_trimmed), iqtree_model, shlex.quote(tree_prefix), num_of_threads)

        # add commands to list
        cmd_list_mafft.append(mafft_cmd)
        cmd_list_trim.append(trim_cmd)
        cmd_list_tree.append(infer_tree_cmd)

    # write out commands
    for cmd_txt, cmd_list in ((cmd_1_mafft_txt, cmd_list_mafft),
                              (cmd_2_trim_txt, cmd_list_trim),
                              (cmd_3_tree_txt, cmd_list_tree)):
        with open(cmd_txt, 'w') as cmd_txt_handle:
            cmd_txt_handle.writelines(each_cmd + '\n' for each_cmd in cmd_list)

    # run mafft commands
    print('Running mafft with %s cores for %s commands' % (num_of_threads, len(cmd_list_mafft)))
    _run_cmds_in_pool(cmd_list_mafft, num_of_threads)

    # run trim commands
    print('Trimming with %s cores for %s commands' % (num_of_threads, len(cmd_list_trim)))
    _run_cmds_in_pool(cmd_list_trim, num_of_threads)

    # run iqtree commands one after another, each using num_of_threads threads
    print('Running iqtree with %s cores' % num_of_threads)
    for each_iqtree_cmd in sorted(cmd_list_tree):
        print(each_iqtree_cmd)
        os.system(each_iqtree_cmd)
170
+
171
+
172
if __name__ == '__main__':

    # koTree() reads args['kegg'], so '-kegg' must be defined here; without
    # it every command line run fails with a KeyError. '-cog' is kept so
    # that existing command lines using it are still accepted.
    koTree_parser = argparse.ArgumentParser(usage=koTree_usage)
    koTree_parser.add_argument('-i', required=True, help='orthologous gene sequence')
    koTree_parser.add_argument('-fun', required=True, help='interested functions')
    koTree_parser.add_argument('-kegg', required=False, default=None, help='KEGG annotation results')
    koTree_parser.add_argument('-cog', required=False, default=None, help='COG annotation results')
    koTree_parser.add_argument('-o', required=True, help='output directory')
    koTree_parser.add_argument('-bmge', required=False, action="store_true", help='trim with BMGE, default is trimal')
    koTree_parser.add_argument('-bmge_m', required=False, default='BLOSUM30', help='trim model, default: BLOSUM30')
    koTree_parser.add_argument('-bmge_esc', required=False, default='0.55', help='entropy score cutoff, default: 0.55')
    koTree_parser.add_argument('-iqtree_m', required=False, default='LG+G+I', help='iqtree_model, default: LG+G+I')
    koTree_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    koTree_parser.add_argument('-t', required=False, type=int, default=1, help='num of threads, default: 1')
    args = vars(koTree_parser.parse_args())
    koTree(args)
187
+
188
+
189
+ '''
190
+
191
+ cd /scratch/PI/ocessongwz/Sponge_r220/4_OMA_wd/OMA_wd/Output
192
+ TreeSAK FunTree -i /scratch/PI/ocessongwz/Sponge_r220/3_combined_genomes_50_5_dRep97_291.faa -fun K01995,K01996,K01997,K01998,K01999 -kegg /scratch/PI/ocessongwz/Sponge_r220/3_combined_genomes_50_5_dRep97_291_KEGG_wd -o interested_fun_tree_branched_chain_aa_transport_system -bmge -t 12 -f
193
+
194
+ '''
TreeSAK/label_tree.R ADDED
@@ -0,0 +1,75 @@
1
+ #!/usr/bin/env Rscript
2
+
3
+ ######################################## Usage ########################################
4
+
5
+ # usage
6
+ # Rscript label_tree.R -t input_tree.newick -g grouping_file.txt
7
+
8
+ #######################################################################################
9
+
10
+
11
+ # check.packages function: install and load multiple R packages.
12
+ # Check to see if packages are installed. Install them if they are not, then load them into the R session.
13
check.packages <- function(pkg) {
  # Install the packages in pkg that are not installed yet, then attach all
  # of them. Returns the result of require() for every package, named by
  # package.
  installed_names <- installed.packages()[, "Package"]
  not_installed <- pkg[is.na(match(pkg, installed_names))]
  if (length(not_installed) > 0) {
    install.packages(not_installed, dependencies = TRUE)
  }
  sapply(pkg, function(one_pkg) require(one_pkg, character.only = TRUE))
}
19
+
20
# load the packages used below, installing any that are missing
packages <- c("ape", "tools", "optparse")
invisible(suppressMessages(check.packages(packages)))


option_list = list(
  make_option(c("-t", "--tree"), type="character", help="tree file", metavar="character"),
  make_option(c("-g", "--grouping"), type="character", help="grouping file (group_id,bin_id)", metavar="character"))

opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)
grouping_file = opt$grouping
tree_file_in = opt$tree
tree_file_path = dirname(tree_file_in)

# output trees are named after the grouping file and written next to the input tree
tree_file_in_name_no_extension = file_path_sans_ext(basename(grouping_file))
tree_txt_file_with_group = paste(tree_file_in_name_no_extension, 'with_group.tree', sep = '_')
tree_txt_file_only_group = paste(tree_file_in_name_no_extension, 'only_group.tree', sep = '_')

pwd_tree_txt_file_with_group = paste(tree_file_path, tree_txt_file_with_group, sep = '/')
pwd_tree_txt_file_only_group = paste(tree_file_path, tree_txt_file_only_group, sep = '/')

# read in grouping file (column 1: group id, column 2: tip label)
grouping_df = read.csv(grouping_file, header = FALSE)

# group id of a set of tip labels: the first matching row of the grouping
# file, NA for a tip that is not listed there
get_tip_groups = function(tip_labels) {
  as.character(grouping_df$V1)[match(tip_labels, as.character(grouping_df$V2))]
}


#################### get tree with group ####################

SCG_tree_with_group = read.tree(tree_file_in)
tip_groups = get_tip_groups(SCG_tree_with_group$tip.label)
if (any(is.na(tip_groups))) {
  warning(paste('tips not found in the grouping file keep their original label:',
                paste(SCG_tree_with_group$tip.label[is.na(tip_groups)], collapse = ', ')))
}

# prefix every grouped tip with its group id
SCG_tree_with_group$tip.label = ifelse(is.na(tip_groups),
                                       SCG_tree_with_group$tip.label,
                                       paste(tip_groups, SCG_tree_with_group$tip.label, sep = '_'))

# write out tree
write.tree(SCG_tree_with_group, file=pwd_tree_txt_file_with_group)


#################### get tree with group only ####################

SCG_tree_only_group = read.tree(tree_file_in)
tip_groups = get_tip_groups(SCG_tree_only_group$tip.label)

# replace every grouped tip label by its group id
SCG_tree_only_group$tip.label = ifelse(is.na(tip_groups),
                                       SCG_tree_only_group$tip.label,
                                       tip_groups)

# write out tree
write.tree(SCG_tree_only_group, file=pwd_tree_txt_file_only_group)
75
+
TreeSAK/label_tree.py ADDED
@@ -0,0 +1,121 @@
1
+ import os
2
+ import argparse
3
+ from BioSAK.BioSAK_config import config_dict
4
+
5
+
6
+ label_tree_usage = '''
7
+ ======================== label_tree example commands ========================
8
+
9
+ module load R
10
+
11
+ # label tree with customized grouping file
12
+ BioSAK label_tree -tree NorthSea.tree -label labels.txt
13
+
14
+ # label tree by taxonomic classification at phylum and class levels
15
+ BioSAK label_tree -tree NorthSea.tree -taxon GTDB_output.tsv -rank p
16
+ BioSAK label_tree -tree NorthSea.tree -taxon GTDB_output.tsv -rank c
17
+
18
+ # label file format:
19
+ label_A,tree_leaf_1
20
+ label_B,tree_leaf_2
21
+ label_B,tree_leaf_3
22
+ label_C,tree_leaf_4
23
+
24
+ =============================================================================
25
+ '''
26
+
27
+
28
def sep_path_basename_ext(file_in):
    """Split a path into (folder, base name, extension).

    The folder is '.' when file_in has no directory part; the extension
    keeps its leading dot and is '' when there is none.
    """
    folder = os.path.dirname(file_in) or '.'
    stem, extension = os.path.splitext(os.path.basename(file_in))
    return folder, stem, extension
39
+
40
+
41
def label_tree(args, config_dict):
    """Label the leaves of a tree by running the label_tree R script.

    Two modes are supported:
      * a customized label file ('label') is passed to the R script as is;
      * a classification table ('taxon') plus a rank ('rank') are turned
        into a grouping file "<tree folder>/<tree base name>_<rank>.txt"
        with one "taxon at rank,genome" line per genome, which is then
        passed to the R script.

    Args:
        args: dict with the keys
            'tree':  tree file
            'label': label file (label,leaf), or None
            'taxon': tab-separated classification table (genome id, then a
                     ';'-separated lineage); lines starting with
                     'user_genome' are skipped; or None
            'rank':  one of d, p, c, o, f, g, s, or None
        config_dict: dict whose 'label_tree_R' value is the path of the R
            script.

    The program prints a message and exits when the options do not match
    one of the two modes or when the rank is unknown.
    """
    import shlex

    tree_in = args['tree']
    label_file = args['label']
    leaf_taxon = args['taxon']
    taxon_rank = args['rank']
    label_tree_R = config_dict['label_tree_R']

    if (label_file is not None) and (leaf_taxon is None) and (taxon_rank is None):
        # paths are quoted because the command runs in a shell
        label_tree_cmd = 'Rscript %s -t %s -g %s' % (shlex.quote(label_tree_R), shlex.quote(tree_in), shlex.quote(label_file))
        os.system(label_tree_cmd)

    elif (label_file is None) and (leaf_taxon is not None) and (taxon_rank is not None):

        # placeholder for every rank, in lineage order
        rank_prefix_list = ['d__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__']
        rank_to_position_dict = {'d': 0, 'p': 1, 'c': 2, 'o': 3, 'f': 4, 'g': 5, 's': 6}

        if taxon_rank not in rank_to_position_dict:
            print('Unknown taxonomic rank: %s, please choose from d, p, c, o, f, g and s' % taxon_rank)
            print('Program exited!')
            exit()
        specified_rank_pos = rank_to_position_dict[taxon_rank]

        # define tmp file name
        tree_file_path, tree_file_name = os.path.split(tree_in)
        if tree_file_path == '':
            tree_file_path = '.'
        tree_file_basename = os.path.splitext(tree_file_name)[0]
        taxon_grouping = '%s/%s_%s.txt' % (tree_file_path, tree_file_basename, taxon_rank)

        # read classification into dict: genome id -> seven-rank lineage
        taxon_assignment_dict = {}
        with open(leaf_taxon) as leaf_taxon_handle:
            for each_genome in leaf_taxon_handle:
                if each_genome.startswith('user_genome'):
                    continue
                each_split = each_genome.strip().split('\t')
                bin_name = each_split[0]
                # blank lines describe no genome
                if bin_name == '':
                    continue

                assignment = each_split[1].split(';') if len(each_split) > 1 else []

                # missing trailing ranks get their placeholder; anything
                # beyond the seventh rank is ignored
                assignment_full = (assignment + rank_prefix_list[len(assignment):])[:len(rank_prefix_list)]

                # store in dict
                taxon_assignment_dict[bin_name] = assignment_full

        # write the taxon of every genome at the specified rank
        with open(taxon_grouping, 'w') as taxon_grouping_handle:
            for each_TaxonAssign in taxon_assignment_dict:
                specified_rank_id = taxon_assignment_dict[each_TaxonAssign][specified_rank_pos]
                taxon_grouping_handle.write('%s,%s\n' % (specified_rank_id, each_TaxonAssign))

        # run R script
        label_tree_cmd = 'Rscript %s -t %s -g %s' % (shlex.quote(label_tree_R), shlex.quote(tree_in), shlex.quote(taxon_grouping))
        os.system(label_tree_cmd)

    else:
        print('Please provide either a customized label file or the taxonomy info of tree leaves together with a taxonomic rank')
        print('Program exited!')
        exit()
108
+
109
+
110
if __name__ == '__main__':

    # command line options as (flag, required, help text), in help-display order
    option_specs = (
        ('-tree', True, 'tree file in newick format'),
        ('-label', False, 'label file (label,leaf)'),
        ('-taxon', False, 'taxonomic classification'),
        ('-rank', False, 'taxonomic rank to label'),
    )

    cli_parser = argparse.ArgumentParser(usage=label_tree_usage)
    for option_flag, is_required, help_text in option_specs:
        cli_parser.add_argument(option_flag, required=is_required, default=None, help=help_text)

    # hand the parsed options over as a plain dict
    label_tree(vars(cli_parser.parse_args()), config_dict)