treesak 1.51.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of treesak might be problematic. Click here for more details.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +130 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +290 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +106 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/RootTreeGTDB226.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +19 -0
- TreeSAK/SplitScore1.py +178 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +597 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +158 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +77 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.51.2.data/scripts/TreeSAK +950 -0
- treesak-1.51.2.dist-info/LICENSE +674 -0
- treesak-1.51.2.dist-info/METADATA +27 -0
- treesak-1.51.2.dist-info/RECORD +125 -0
- treesak-1.51.2.dist-info/WHEEL +5 -0
- treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/KEGG_Luo17.py
ADDED
|
@@ -0,0 +1,807 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import glob
|
|
5
|
+
import argparse
|
|
6
|
+
from Bio import SeqIO
|
|
7
|
+
from time import sleep
|
|
8
|
+
import multiprocessing as mp
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from BioSAK.global_functions import time_format
|
|
11
|
+
from BioSAK.global_functions import force_create_folder
|
|
12
|
+
from BioSAK.global_functions import sep_path_basename_ext
|
|
13
|
+
from BioSAK.global_functions import get_gene_list_TotalDepth
|
|
14
|
+
from BioSAK.global_functions import AnnotateNorm
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
KEGG_parser_usage = '''
|
|
18
|
+
======================================== KEGG example commands =======================================
|
|
19
|
+
|
|
20
|
+
# Dependencies
|
|
21
|
+
module load blast+
|
|
22
|
+
module load diamond
|
|
23
|
+
|
|
24
|
+
# annotation with NCBI blastp (default, for small dataset)
|
|
25
|
+
BioSAK KEGG -db_dir path/to/your/KEGG_db_dir -t 6 -seq_in input.faa -depth input.depth
|
|
26
|
+
|
|
27
|
+
# annotation with Diamond blastp (for big dataset)
|
|
28
|
+
BioSAK KEGG -db_dir path/to/your/KEGG_db_dir -t 12 -seq_in faa_folder -x faa -depth depth_files -diamond
|
|
29
|
+
|
|
30
|
+
# get summary for BlastKOALA/GhostKOALA produced results
|
|
31
|
+
BioSAK KEGG -db_dir path/to/your/KEGG_db_dir -t 9 -ko_in user_ko.txt
|
|
32
|
+
BioSAK KEGG -db_dir path/to/your/KEGG_db_dir -t 9 -ko_in user_ko_folder -x txt
|
|
33
|
+
|
|
34
|
+
# Prepare DB files, you need to have the following three files in your KEGG_db_dir:
|
|
35
|
+
1. Sequence file, only needed for "-seq_in" mode, DECOMPRESS and RENAME it to kegg_db_seq.fasta
|
|
36
|
+
e.g. prokaryotes.pep.gz (https://www.kegg.jp/kegg/download/Readme/README.fasta)
|
|
37
|
+
2. seq2ko file, only needed for "-seq_in" mode, DECOMPRESS and RENAME it to kegg_db_seq2ko.txt
|
|
38
|
+
e.g. prokaryotes.dat.gz (https://www.kegg.jp/kegg/download/Readme/README.fasta)
|
|
39
|
+
3. ko00001.keg
|
|
40
|
+
https://www.genome.jp/kegg-bin/download_htext?htext=ko00001&format=htext&filedir=
|
|
41
|
+
|
|
42
|
+
# How it works:
|
|
43
|
+
1. KEGG module uses Blast+/Diamond to get the best hits of query genes in the database with user defined e-value cutoff (default 0.001).
|
|
44
|
+
2. The TotalDepth of a KO is calculated by summing up the depth of all genes assigned to it.
|
|
45
|
+
3. The percentage of GeneNumber/TotalDepth of genes assigned to a KO is calculated by dividing them
|
|
46
|
+
by the total number/depth of genes with KO assignment (default) or by all genes in a genome ("-pct_by_all").
|
|
47
|
+
|
|
48
|
+
# Note!!!
|
|
49
|
+
1. If you run KEGG annotation for multiple files in a batch manner and want to have their depth info incorporated into the results,
|
|
50
|
+
you need to provide a folder containing individual depth files for each of your input sequence file.
|
|
51
|
+
Name of the depth file needs to be exactly the same as its corresponding sequence file, except the extension which is ".depth".
|
|
52
|
+
2. Diamond requires quite a lot of memory for sequence comparison, especially for huge db file (e.g. KEGG db).
|
|
53
|
+
Remember to request sufficient memory (e.g. 90 or 120gb) in your job script and specify a small number (e.g. -t 6)
|
|
54
|
+
of jobs executing in parallel. Otherwise, you may see some of your query genomes with no gene been annotated.
|
|
55
|
+
|
|
56
|
+
# Depth file format (one gene per line, tab separated)
|
|
57
|
+
gene_1 30
|
|
58
|
+
gene_2 10.58
|
|
59
|
+
|
|
60
|
+
# To do:
|
|
61
|
+
1. level C stats: separate stats for Pathway, Brite and the rests
|
|
62
|
+
|
|
63
|
+
======================================================================================================
|
|
64
|
+
'''
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def keep_blast_hit_with_highest_bit_score(file_in, file_out):
    """Keep, for every query, only the hit line with the highest bit score.

    ``file_in`` is read as tab-separated lines whose first column is the query
    id and whose twelfth column is the bit score (the layout produced by the
    ``-outfmt 6`` commands used elsewhere in this file).  Hits belonging to one
    query are expected on consecutive lines: every change of query id flushes
    the best line seen so far, so a query whose hits are split into several
    runs is written once per run.

    Args:
        file_in: path of the tabular hit file to read.
        file_out: path of the file to (over)write with the best hit lines.
            An input without any hit yields an empty output file.
    """
    best_hit_line = ''
    best_hit_query_id = None
    best_hit_score = 0

    # Context managers close both handles even if a malformed line raises.
    with open(file_in) as file_in_handle, open(file_out, 'w') as file_out_handle:
        for blast_hit in file_in_handle:

            # Blank lines carry no hit; skipping them avoids an IndexError.
            if not blast_hit.strip():
                continue

            blast_hit_split = blast_hit.strip().split('\t')
            query_id = blast_hit_split[0]
            bit_score = float(blast_hit_split[11])

            if query_id != best_hit_query_id:
                # A new query starts: flush the previous winner (nothing is
                # pending before the very first hit).
                if best_hit_line:
                    file_out_handle.write(best_hit_line)
                best_hit_query_id = query_id
                best_hit_line = blast_hit
                best_hit_score = bit_score

            elif bit_score > best_hit_score:
                # Strict comparison: on ties the first hit encountered wins.
                best_hit_score = bit_score
                best_hit_line = blast_hit

        # Flush the winner of the last query.
        file_out_handle.write(best_hit_line)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def run_blast_worker(argument_list):
    """Search one input file against the KEGG database and keep the best hits.

    ``argument_list`` holds, in order: the input file path, the ``run_blast``
    flag, the ``run_diamond`` flag, the blastp database path, the diamond
    database path, the output directory, the e-value cutoff and the number of
    threads.

    The working folder ``<op_dir>/<basename>_KEGG_wd`` is (re)created through
    ``force_create_folder`` whatever the flags are.  Only when ``run_blast`` is
    ``True`` is a search executed (blastp when ``run_diamond`` is ``False``,
    diamond otherwise) and its tabular output reduced to one best hit per
    query.
    """
    # Function-scope import so the module's import block stays untouched.
    import shlex

    (pwd_input_file, run_blast, run_diamond, KEGG_DB_seq, KEGG_DB_seq_diamond,
     op_dir, evalue_cutoff, threads_num) = argument_list[:8]

    ################################################### define file name ###################################################

    _, in_file_basename, _ = sep_path_basename_ext(pwd_input_file)

    working_dir            = '%s/%s_KEGG_wd' % (op_dir, in_file_basename)
    blast_results          = '%s/%s_blast.tab' % (working_dir, in_file_basename)
    blast_results_best_hit = '%s/%s_blast_best_hits.tab' % (working_dir, in_file_basename)

    # create output folder
    force_create_folder(working_dir)

    ########################################## blast against KEGG database (Shan) ##########################################

    if run_blast is True:

        # Every interpolated value is shell-quoted: the command goes through
        # os.system, so an unquoted path containing spaces or shell
        # metacharacters would break (or alter) the command line.
        def quoted(value):
            return shlex.quote(str(value))

        if run_diamond is False:
            blastp_cmd = 'blastp -query %s -db %s -out %s -outfmt 6 -evalue %s -num_alignments 10 -num_threads %s' % (
                quoted(pwd_input_file), quoted(KEGG_DB_seq), quoted(blast_results),
                quoted(evalue_cutoff), quoted(threads_num))
            os.system(blastp_cmd)

        else:
            diamond_cmd = 'diamond blastp -q %s --db %s --out %s --outfmt 6 --evalue %s --block-size 1 --threads %s --quiet' % (
                quoted(pwd_input_file), quoted(KEGG_DB_seq_diamond), quoted(blast_results),
                quoted(evalue_cutoff), quoted(threads_num))
            os.system(diamond_cmd)

        # only keep the best hit
        keep_blast_hit_with_highest_bit_score(blast_results, blast_results_best_hit)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def write_out_stats_GeneNumber(identified_ko_list, ko_to_gene_member_dict, ko_description_dict, stats_file_GeneNumber):
    """Write a ``KO / GeneNumber / Description`` table for one KO level.

    Args:
        identified_ko_list: KO keys to report, in output order.  The first two
            characters of every key are dropped before it is written and
            before it is looked up in ``ko_description_dict``.
        ko_to_gene_member_dict: maps each KO key to the collection of genes
            assigned to it; its length is the reported gene number.
        ko_description_dict: maps the shortened KO id to its description.
        stats_file_GeneNumber: path of the tab-separated file to (over)write.

    Raises:
        KeyError: if a KO is missing from either dictionary.
    """
    # The context manager closes the file even when a lookup below raises.
    with open(stats_file_GeneNumber, 'w') as stats_file_GeneNumber_handle:
        stats_file_GeneNumber_handle.write('KO\tGeneNumber\tDescription\n')
        for ko in identified_ko_list:
            ko_id = ko[2:]
            ko_GeneNumber = len(ko_to_gene_member_dict[ko])
            stats_file_GeneNumber_handle.write('%s\t%s\t%s\n' % (ko_id, ko_GeneNumber, ko_description_dict[ko_id]))
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def write_out_stats_TotalDepth(identified_ko_list, ko_to_gene_member_dict, gene_depth_dict, ko_description_dict, stats_file_TotalDepth):
    """Write a ``KO / TotalDepth / Description`` table for one KO level.

    The total depth of a KO is the sum of the depths of all genes assigned to
    it, rounded to two decimals.

    Args:
        identified_ko_list: KO keys to report, in output order.  The first two
            characters of every key are dropped before it is written and
            before it is looked up in ``ko_description_dict``.
        ko_to_gene_member_dict: maps each KO key to the genes assigned to it.
        gene_depth_dict: maps a gene id to its numeric depth.
        ko_description_dict: maps the shortened KO id to its description.
        stats_file_TotalDepth: path of the tab-separated file to (over)write.

    Raises:
        KeyError: if a KO, a gene or a description is missing from its
            dictionary.
    """
    # The context manager closes the file even when a lookup below raises.
    with open(stats_file_TotalDepth, 'w') as stats_file_TotalDepth_handle:
        stats_file_TotalDepth_handle.write('KO\tTotalDepth\tDescription\n')
        for ko in identified_ko_list:
            ko_id = ko[2:]
            ko_gene_total_depth = sum(gene_depth_dict[each_gene] for each_gene in ko_to_gene_member_dict[ko])
            # Format then re-parse so that the value is printed as a float
            # carrying at most two decimals (e.g. 10 -> 10.0, 3.234 -> 3.23).
            ko_TotalDepth = float("{0:.2f}".format(ko_gene_total_depth))
            stats_file_TotalDepth_handle.write('%s\t%s\t%s\n' % (ko_id, ko_TotalDepth, ko_description_dict[ko_id]))
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def parse_blast_op_worker(argument_list):
    """Turn best hits (or a user KO table) into per-level KO assignment and stats files.

    ``argument_list`` holds, in order: the input file path, the ``run_blast``
    flag, the description dictionaries of levels A, B, C and D, the mapping
    from a level-D KO to its list of ``A|B|C|D`` paths, the mapping from a
    database sequence id to its (comma separated) KO ids, the output
    directory, the depth file path (or ``None``) and the ``pct_by_all`` flag.

    Steps, all writing into ``<op_dir>/<basename>_KEGG_wd``:

    1. When ``run_blast`` is ``True`` the best-hit table is translated into a
       ``gene<TAB>KO`` file covering every sequence of the input fasta file
       (genes without a KO get a line holding only their id).  Otherwise the
       input file itself is used as that table.
    2. Each ``gene<TAB>KO`` line is expanded to every ``A|B|C|D`` path of its
       KO.  Lines with a column count other than one or two, and KOs absent
       from the path mapping, produce no output line.
    3. Per level, gene-number tables are written and, when a depth file is
       given, total-depth tables as well.
    4. The tables are handed to ``AnnotateNorm`` together with the number (or
       summed depth) of genes having a KO, and additionally of all genes when
       ``pct_by_all`` is ``True``.
       NOTE(review): presumably AnnotateNorm converts column 2 to a percentage
       of the supplied total -- confirm against BioSAK.global_functions.
    """
    (pwd_input_file, run_blast, As_description_dict, Bs_description_dict,
     Cs_description_dict, Ds_description_dict, D2ABCD_dict, db_seq_to_KO_dict,
     op_dir, depth_file, pct_by_all) = argument_list[:11]

    levels = ('A', 'B', 'C', 'D')
    description_dicts = {'A': As_description_dict, 'B': Bs_description_dict,
                         'C': Cs_description_dict, 'D': Ds_description_dict}

    ################################################### define file name ###################################################

    _, in_file_basename, _ = sep_path_basename_ext(pwd_input_file)

    # Common prefix of every file produced for this input.
    wd_prefix = '%s/%s_KEGG_wd/%s' % (op_dir, in_file_basename, in_file_basename)

    blast_results_best_hit  = '%s_blast_best_hits.tab' % wd_prefix
    KO_assignment_file_D    = '%s_KO_assignment_D.txt' % wd_prefix
    KO_assignment_file_DCBA = '%s_ko_assignment_ABCD.txt' % wd_prefix

    def stats_file(level, suffix):
        # e.g. <prefix>_ko_stats_A_GeneNumber_pct.txt
        return '%s_ko_stats_%s_%s.txt' % (wd_prefix, level, suffix)

    ################################################# parse blast results ##################################################

    if run_blast is True:

        # store blast results in dict
        query_to_db_seq_dict = {}
        with open(blast_results_best_hit) as best_hit_handle:
            for each_query in best_hit_handle:
                if not each_query.strip():
                    continue
                each_query_split = each_query.strip().split('\t')
                query_to_db_seq_dict[each_query_split[0]] = each_query_split[1]

        # get all query sequence id
        query_seq_id_list = [str(each_seq.id) for each_seq in SeqIO.parse(pwd_input_file, 'fasta')]

        # get ko id at level D for all query genes
        with open(KO_assignment_file_D, 'w') as KO_assignment_file_handle:
            for each_query_seq in sorted(query_seq_id_list):
                db_hit_id_ko = None
                if each_query_seq in query_to_db_seq_dict:
                    db_hit_id = query_to_db_seq_dict[each_query_seq]
                    if db_hit_id in db_seq_to_KO_dict:
                        db_hit_id_ko = db_seq_to_KO_dict[db_hit_id]

                if db_hit_id_ko is None:
                    KO_assignment_file_handle.write('%s\n' % each_query_seq)
                else:
                    # One line per KO; split() returns the value unchanged
                    # when it holds a single KO.
                    for each_db_hit_id_ko in db_hit_id_ko.split(','):
                        KO_assignment_file_handle.write('%s\t%s\n' % (each_query_seq, each_db_hit_id_ko))

    else:
        KO_assignment_file_D = pwd_input_file

    # get ko id at all levels for all query genes
    query_seq_id_all = set()
    genes_with_ko = set()
    with open(KO_assignment_file_D) as ko_assign_D_handle, open(KO_assignment_file_DCBA, 'w') as ko_assign_ABCD_handle:
        ko_assign_ABCD_handle.write('Gene_id\tko_A\tko_B\tko_C\tko_D\tDesc_A\tDesc_B\tDesc_C\tDesc_D\n')
        for query_gene in ko_assign_D_handle:

            # A blank line is not a gene: counting it would inflate the
            # "all genes" total and break the depth lookup below.
            if not query_gene.strip():
                continue

            query_gene_split = query_gene.strip().split('\t')
            gene_ID = query_gene_split[0]

            if len(query_gene_split) == 1:
                query_seq_id_all.add(gene_ID)
                ko_assign_ABCD_handle.write('%s\n' % gene_ID)

            elif len(query_gene_split) == 2:
                query_seq_id_all.add(gene_ID)
                genes_with_ko.add(gene_ID)
                KO_ID = query_gene_split[1]

                # One output line per A|B|C|D path of this KO; a KO missing
                # from D2ABCD_dict yields no line.
                for each_ABCD in D2ABCD_dict.get(KO_ID, ()):
                    each_KO_ABCD_list = each_ABCD.split('|')
                    # Path elements look like '<level>_<id>'; keep the id.
                    each_KO_ABCD_list_only_id = [i.split('_')[1] for i in each_KO_ABCD_list]
                    each_desc_A = As_description_dict[each_KO_ABCD_list_only_id[0]]
                    each_desc_B = Bs_description_dict[each_KO_ABCD_list_only_id[1]]
                    each_desc_C = Cs_description_dict[each_KO_ABCD_list_only_id[2]]
                    each_desc_D = Ds_description_dict[each_KO_ABCD_list_only_id[3]]
                    ko_assign_ABCD_handle.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (gene_ID,
                                                                              '\t'.join(each_KO_ABCD_list),
                                                                              each_desc_A, each_desc_B,
                                                                              each_desc_C, each_desc_D))

    ##################################################### Get summary ######################################################

    # read in depth info, then get total depth of all genes / genes with ko
    gene_depth_dict = {}
    total_depth_for_all_query_genes = 0
    genes_with_ko_TotalDepth = 0
    if depth_file is not None:
        with open(depth_file) as depth_file_handle:
            for each_depth in depth_file_handle:
                if not each_depth.strip():
                    continue
                each_depth_split = each_depth.strip().split('\t')
                gene_depth_dict[each_depth_split[0]] = float(each_depth_split[1])

        for gene in query_seq_id_all:
            total_depth_for_all_query_genes += gene_depth_dict[gene]

        genes_with_ko_TotalDepth = get_gene_list_TotalDepth(genes_with_ko, gene_depth_dict)

    # Per level: KOs in first-seen order and their (de-duplicated) genes.
    identified_ko_lists = {level: [] for level in levels}
    ko_to_gene_member_dicts = {level: {} for level in levels}
    seen_assignments = set()  # (level, ko, gene) triples for O(1) de-duplication
    with open(KO_assignment_file_DCBA) as ko_assign_ABCD_handle:
        # The first line is the header written above; skipping it by position
        # keeps genes whose id happens to start with 'Gene_id'.
        next(ko_assign_ABCD_handle, None)
        for each_query in ko_assign_ABCD_handle:
            each_query_split = each_query.strip().split('\t')
            if len(each_query_split) == 1:
                continue  # gene without KO assignment

            query_id = each_query_split[0]
            for column, level in enumerate(levels, start=1):
                query_ko = each_query_split[column]
                gene_member_dict = ko_to_gene_member_dicts[level]

                if query_ko not in gene_member_dict:
                    identified_ko_lists[level].append(query_ko)
                    gene_member_dict[query_ko] = []

                assignment = (level, query_ko, query_id)
                if assignment not in seen_assignments:
                    seen_assignments.add(assignment)
                    gene_member_dict[query_ko].append(query_id)

    #################### write out GeneNumber and TotalDepth stats ####################

    for level in levels:
        write_out_stats_GeneNumber(identified_ko_lists[level], ko_to_gene_member_dicts[level],
                                   description_dicts[level], stats_file(level, 'GeneNumber'))
    if depth_file is not None:
        for level in levels:
            write_out_stats_TotalDepth(identified_ko_lists[level], ko_to_gene_member_dicts[level], gene_depth_dict,
                                       description_dicts[level], stats_file(level, 'TotalDepth'))

    #################### write out GeneNumber and TotalDepth stats (pct) ####################

    for level in levels:
        AnnotateNorm(stats_file(level, 'GeneNumber'), True, 2, len(genes_with_ko),
                     stats_file(level, 'GeneNumber_pct'), 'KO\tGeneNumber_pct\tDescription\n')
    if depth_file is not None:
        for level in levels:
            AnnotateNorm(stats_file(level, 'TotalDepth'), True, 2, genes_with_ko_TotalDepth,
                         stats_file(level, 'TotalDepth_pct'), 'KO\tTotalDepth_pct\tDescription\n')

    #################### write out GeneNumber and TotalDepth stats (pct_by_all) ####################

    if pct_by_all is True:
        for level in levels:
            AnnotateNorm(stats_file(level, 'GeneNumber'), True, 2, len(query_seq_id_all),
                         stats_file(level, 'GeneNumber_pct_by_all'), 'KO\tGeneNumber_pct_by_all\tDescription\n')
        if depth_file is not None:
            for level in levels:
                AnnotateNorm(stats_file(level, 'TotalDepth'), True, 2, total_depth_for_all_query_genes,
                             stats_file(level, 'TotalDepth_pct_by_all'), 'KO\tTotalDepth_pct_by_all\tDescription\n')
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def _read_ko_stats_table(stats_file, value_type):
    """Read one per-sample stats table into a {id: value} dict.

    Lines starting with 'KO\\t' are treated as the header and skipped. The
    first tab-separated column is used as the key, the second column is
    converted with value_type. Blank lines and lines with fewer than two
    columns are ignored instead of raising IndexError.
    """
    ko_to_value = {}
    with open(stats_file) as stats_handle:
        for each_line in stats_handle:
            if each_line.startswith('KO\t'):
                continue
            each_line_split = each_line.strip().split('\t')
            if len(each_line_split) < 2:
                continue
            ko_to_value[each_line_split[0]] = value_type(each_line_split[1])
    return ko_to_value


def get_KEGG_annot_df(annotation_dir, stats_level, annotation_df_absolute_num, annotation_df_pct, annotation_df_pct_by_all, with_depth, pct_by_all):
    """Combine the per-sample KO stats tables found in annotation_dir into data matrices.

    Every sub-folder of annotation_dir matching '*_KEGG_wd' is one sample; its
    name with the '_KEGG_wd' part removed is the row label. For each sample
    the files '<sample>_ko_stats_<stats_level>_<GeneNumber|TotalDepth>.txt',
    '..._pct.txt' and (only if pct_by_all is True) '..._pct_by_all.txt' are
    read. One tab-separated matrix is written per table type: first line is
    the sorted list of all identifiers seen in any sample, followed by one row
    per sample (sorted by folder name). Identifiers missing from a sample's
    table are written as 0.

    Args:
        annotation_dir: folder holding the '*_KEGG_wd' sample folders.
        stats_level: level tag used in the stats file names (the caller in
            this file passes 'A', 'B', 'C' or 'D').
        annotation_df_absolute_num: output path of the absolute value matrix.
        annotation_df_pct: output path of the percentage matrix.
        annotation_df_pct_by_all: output path of the pct_by_all matrix, only
            written when pct_by_all is True.
        with_depth: False reads the GeneNumber tables (values parsed as int),
            anything else reads the TotalDepth tables (values parsed as float).
        pct_by_all: True to also read and export the pct_by_all tables.
    """

    annotation_dir_re = '%s/*_KEGG_wd' % annotation_dir
    annotation_folder_list = sorted(os.path.basename(folder_name) for folder_name in glob.glob(annotation_dir_re))

    # identity checks are kept on purpose so non-bool flags behave as before
    if with_depth is False:
        stats_type = 'GeneNumber'
        value_type = int
    else:
        stats_type = 'TotalDepth'
        value_type = float

    ko_num_dict = {}
    ko_num_pct_dict = {}
    ko_num_pct_by_all_dict = {}
    all_identified_ko = set()
    for annotation_folder in annotation_folder_list:

        annotation_folder_basename = annotation_folder.split('_KEGG_wd')[0]
        stats_file_prefix = '%s/%s/%s_ko_stats_%s_%s' % (annotation_dir, annotation_folder, annotation_folder_basename, stats_level, stats_type)

        current_ko_to_num_dict = _read_ko_stats_table('%s.txt' % stats_file_prefix, value_type)
        current_ko_to_num_pct_dict = _read_ko_stats_table('%s_pct.txt' % stats_file_prefix, float)
        all_identified_ko.update(current_ko_to_num_dict)
        all_identified_ko.update(current_ko_to_num_pct_dict)

        ko_num_dict[annotation_folder_basename] = current_ko_to_num_dict
        ko_num_pct_dict[annotation_folder_basename] = current_ko_to_num_pct_dict

        if pct_by_all is True:
            current_ko_to_num_pct_by_all_dict = _read_ko_stats_table('%s_pct_by_all.txt' % stats_file_prefix, float)
            all_identified_ko.update(current_ko_to_num_pct_by_all_dict)
            ko_num_pct_by_all_dict[annotation_folder_basename] = current_ko_to_num_pct_by_all_dict

    all_identified_ko_list = sorted(all_identified_ko)
    header_line = '\t%s\n' % '\t'.join(all_identified_ko_list)

    output_table_list = [(annotation_df_absolute_num, ko_num_dict), (annotation_df_pct, ko_num_pct_dict)]
    if pct_by_all is True:
        output_table_list.append((annotation_df_pct_by_all, ko_num_pct_by_all_dict))

    # write out, one matrix per table type
    for output_file, sample_to_ko_dict in output_table_list:
        with open(output_file, 'w') as output_handle:
            output_handle.write(header_line)
            for annotation_folder in annotation_folder_list:
                annotation_folder_basename = annotation_folder.split('_KEGG_wd')[0]
                current_ko_dict = sample_to_ko_dict[annotation_folder_basename]

                # each table is looked up on its own, an id missing from one table
                # no longer raises KeyError or hides the value stored in another
                current_value_list = [str(current_ko_dict.get(identified_ko, 0)) for identified_ko in all_identified_ko_list]
                output_handle.write('%s\t%s\n' % (annotation_folder_basename, '\t'.join(current_value_list)))
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def _read_kegg_ko_hierarchy(KEGG_DB_ko):
    """Parse the ko00001.keg hierarchy file.

    Only lines whose first character is A, B, C or D are used. Lines are split
    on single spaces and the id is taken from a fixed position per level
    (A: field 0, B: field 2, C: field 4, D: field 6); the remaining fields,
    re-joined with spaces, are stored as the description.

    Returns:
        (As_description_dict, Bs_description_dict, Cs_description_dict,
        Ds_description_dict, D2ABCD_dict), where D2ABCD_dict maps a D level id
        to the list of distinct 'A_..|B_..|C_..|D_..' paths it was found under.
    """
    As_description_dict = {}
    Bs_description_dict = {}
    Cs_description_dict = {}
    Ds_description_dict = {}
    D2ABCD_dict = {}
    current_A = ''
    current_B = ''
    current_C = ''
    with open(KEGG_DB_ko) as KEGG_DB_ko_handle:
        for each_line in KEGG_DB_ko_handle:
            level = each_line[0]
            if level not in ['A', 'B', 'C', 'D']:
                continue

            each_line_split = each_line.strip().split(' ')

            if level == 'A':
                current_A = each_line_split[0]
                As_description_dict[current_A] = ' '.join(each_line_split[1:])

            elif level == 'B':
                # a line holding nothing but "B" carries no id and is skipped
                if len(each_line_split) > 1:
                    current_B = each_line_split[2]
                    Bs_description_dict[current_B] = ' '.join(each_line_split[3:])

            elif level == 'C':
                current_C = each_line_split[4]
                Cs_description_dict[current_C] = ' '.join(each_line_split[5:])

            else:
                current_D_id = each_line_split[6]
                Ds_description_dict[current_D_id] = ' '.join(each_line_split[7:])
                ABCD_value = 'A_%s|B_%s|C_%s|D_%s' % (current_A, current_B, current_C, current_D_id)
                if current_D_id not in D2ABCD_dict:
                    D2ABCD_dict[current_D_id] = [ABCD_value]
                elif ABCD_value not in D2ABCD_dict[current_D_id]:
                    D2ABCD_dict[current_D_id].append(ABCD_value)

    return As_description_dict, Bs_description_dict, Cs_description_dict, Ds_description_dict, D2ABCD_dict


def _read_kegg_seq2ko(KEGG_DB_seq2ko):
    """Read the tab-separated seq2ko file into a {db_seq: KO} dict.

    Sequences without a KO (empty or missing second column) are left out.
    Only the line ending is removed before splitting: stripping the whole line
    would also remove the trailing tab of a sequence without KO and make the
    second column disappear (IndexError).
    """
    db_seq_to_KO_dict = {}
    with open(KEGG_DB_seq2ko) as KEGG_DB_seq2ko_handle:
        for each_hit in KEGG_DB_seq2ko_handle:
            each_hit_split = each_hit.rstrip('\r\n').split('\t')
            if len(each_hit_split) < 2:
                continue
            db_seq = each_hit_split[0].strip()
            hit_id_KO = each_hit_split[1].strip()
            if hit_id_KO != '':
                db_seq_to_KO_dict[db_seq] = hit_id_KO
    return db_seq_to_KO_dict


def Annotation_KEGG(args):
    """Run the KEGG annotation workflow for one input file or a folder of files.

    Steps: validate the input, make sure the diamond/blast database exists
    (built with 'diamond makedb' / 'makeblastdb' if not), call
    run_blast_worker for every input file, read the KEGG DB files, call
    parse_blast_op_worker for every input file and, for folder input, combine
    the per-file stats into data matrices with get_KEGG_annot_df.

    Args:
        args: dict with the keys
            'seq_in'     sequence file/folder (search is run), or None
            'ko_in'      existing annotation file/folder (search is skipped), or None
            'x'          file extension used to find the files in a folder
            'depth'      depth file (file input) or folder holding one
                         '<basename>.depth' per input file (folder input), or None
            'pct_by_all' True to also produce the pct_by_all tables
            'db_dir'     folder with kegg_db_seq.fasta, kegg_db_seq2ko.txt and ko00001.keg
            'diamond'    True to use diamond, False to use blastp
            't'          number of processes/threads
            'evalue'     evalue cutoff handed to run_blast_worker
            Exactly one of 'seq_in' and 'ko_in' has to be provided.

    The program exits (after printing a message) on invalid input.
    """

    input_file_faa = args['seq_in']
    input_file_user_ko = args['ko_in']
    file_extension = args['x']
    depth_file = args['depth']
    pct_by_all = args['pct_by_all']
    KEGG_DB_folder = args['db_dir']
    run_diamond = args['diamond']
    num_threads = args['t']
    evalue_cutoff = args['evalue']

    run_blast = None
    if (input_file_faa is not None) and (input_file_user_ko is None):
        run_blast = True
    elif (input_file_faa is None) and (input_file_user_ko is not None):
        run_blast = False
    else:
        print(datetime.now().strftime(time_format) + 'Please provide input file with either "-seq_in" or "-ko_in", do not provide both')
        exit()

    if run_blast is True:
        input_file_folder = input_file_faa
    else:
        input_file_folder = input_file_user_ko

    # check whether input file/folder exist
    if (os.path.isfile(input_file_folder) is False) and (os.path.isdir(input_file_folder) is False):
        print(datetime.now().strftime(time_format) + 'input file/folder not found, program exited')
        exit()

    # remove trailing "/" from a folder path, with it the output folder would be
    # created inside the input folder ("<dir>/_KEGG_wd") and the prefix of the
    # data matrix files (last path component) would be empty
    if os.path.isdir(input_file_folder) is True:
        input_file_folder = input_file_folder.rstrip('/') or '/'

    if run_blast is True:
        print(datetime.now().strftime(time_format) + 'Input sequence file detected, will run blastp/diamond first')
        sleep(0.5)
    else:
        print(datetime.now().strftime(time_format) + 'Annotation results provided, blastp/diamond skipped')
        sleep(0.5)

    ################################################# define file name #################################################

    KEGG_DB_seq = '%s/kegg_db_seq.fasta' % KEGG_DB_folder
    KEGG_DB_seq_diamond = '%s/kegg_db_seq.fasta.dmnd' % KEGG_DB_folder
    KEGG_DB_seq2ko = '%s/kegg_db_seq2ko.txt' % KEGG_DB_folder
    KEGG_DB_ko = '%s/ko00001.keg' % KEGG_DB_folder

    ########################################## check whether diamond db exist ##########################################

    if (run_blast is True) and (run_diamond is True):
        if os.path.isfile(KEGG_DB_seq_diamond) is False:
            print(datetime.now().strftime(time_format) + 'DB file not found, making diamond db with %s' % KEGG_DB_seq)

            if os.path.isfile(KEGG_DB_seq) is True:
                diamond_makedb_cmd = 'diamond makedb --in %s --db %s --quiet' % (KEGG_DB_seq, KEGG_DB_seq_diamond)
                os.system(diamond_makedb_cmd)
            else:
                print(datetime.now().strftime(time_format) + '%s not found, program exited' % KEGG_DB_seq)
                exit()

    ########################################### check whether blast+ db exist ##########################################

    if (run_blast is True) and (run_diamond is False):

        unfound_db_index_file = []
        for db_index in ['phr', 'pin', 'pnd', 'pni', 'pog', 'psd', 'psi', 'psq']:
            pwd_db_index = '%s/kegg_db_seq.fasta.%s' % (KEGG_DB_folder, db_index)
            if not os.path.isfile(pwd_db_index):
                unfound_db_index_file.append(db_index)
        if len(unfound_db_index_file) > 0:
            print(datetime.now().strftime(time_format) + 'blast db index not found, runing makeblastdb first')
            makeblastdb_cmd = 'makeblastdb -in %s -dbtype prot -parse_seqids -logfile %s.log' % (KEGG_DB_seq, KEGG_DB_seq)
            os.system(makeblastdb_cmd)
            print(datetime.now().strftime(time_format) + 'makeblastdb finished')

    ######################################### Run blastp with multiprocessing ##########################################

    # the worker is called for "-ko_in" input as well, run_blast is passed on to it
    # NOTE(review): presumably the worker prepares the per-file output folder in that case, confirm in run_blast_worker

    # check whether the input file is a file or folder
    if os.path.isfile(input_file_folder) is True:
        input_file_path, _, _ = sep_path_basename_ext(input_file_folder)
        run_blast_worker([input_file_folder, run_blast, run_diamond, KEGG_DB_seq, KEGG_DB_seq_diamond, input_file_path, evalue_cutoff, num_threads])

    if os.path.isdir(input_file_folder) is True:

        # create output folder
        output_folder = '%s_KEGG_wd' % input_file_folder
        force_create_folder(output_folder)

        # check whether input genome exist
        input_file_re = '%s/*.%s' % (input_file_folder, file_extension)
        input_file_name_list = [os.path.basename(file_name) for file_name in glob.glob(input_file_re)]

        if len(input_file_name_list) == 0:
            print(datetime.now().strftime(time_format) + 'input file not found, program exited')
            exit()

        # run blastp with multiprocessing
        if run_blast is True:
            print(datetime.now().strftime(time_format) + 'Running Blast/Diamond for %s input files with %s cores' % (len(input_file_name_list), num_threads))

        # one process per input file, each job gets a single thread
        list_for_multiple_arguments_blast = []
        for input_file in input_file_name_list:
            pwd_input_file = '%s/%s' % (input_file_folder, input_file)
            list_for_multiple_arguments_blast.append([pwd_input_file, run_blast, run_diamond, KEGG_DB_seq, KEGG_DB_seq_diamond, output_folder, evalue_cutoff, 1])

        pool = mp.Pool(processes=num_threads)
        pool.map(run_blast_worker, list_for_multiple_arguments_blast)
        pool.close()
        pool.join()

    ############################################## Read in KEGG DB files ###############################################

    print(datetime.now().strftime(time_format) + 'Read in KEGG DB files')

    As_description_dict, Bs_description_dict, Cs_description_dict, Ds_description_dict, D2ABCD_dict = _read_kegg_ko_hierarchy(KEGG_DB_ko)

    # get db_seq_to_KO_dict, only needed if the search was run here
    db_seq_to_KO_dict = {}
    if run_blast is True:
        db_seq_to_KO_dict = _read_kegg_seq2ko(KEGG_DB_seq2ko)

    ########################################################################################################################

    # check whether the input file is a file or folder
    if os.path.isfile(input_file_folder) is True:

        # check whether depth file exist
        if depth_file is not None:
            if os.path.isfile(depth_file) is False:
                print(datetime.now().strftime(time_format) + 'specified depth file not found, program exited!')
                exit()

        print(datetime.now().strftime(time_format) + 'Running KEGG annotation for 1 file with %s cores' % (num_threads))
        input_file_path, _, _ = sep_path_basename_ext(input_file_folder)
        parse_blast_op_worker([input_file_folder, run_blast, As_description_dict, Bs_description_dict, Cs_description_dict, Ds_description_dict, D2ABCD_dict, db_seq_to_KO_dict, input_file_path, depth_file, pct_by_all])

    if os.path.isdir(input_file_folder) is True:

        input_file_re = '%s/*.%s' % (input_file_folder, file_extension)
        input_file_name_list = [os.path.basename(file_name) for file_name in glob.glob(input_file_re)]

        # check whether depth file exist
        if depth_file is not None:

            if os.path.isfile(depth_file) is True:
                print(datetime.now().strftime(time_format) + 'please provide the folder containing individual depth files (with extension .depth) for each of your input sequence file.')
                print(datetime.now().strftime(time_format) + 'single depth file (not folder) detected, program exited!')
                exit()

            if os.path.isdir(depth_file) is False:
                print(datetime.now().strftime(time_format) + 'specified depth folder not found, program exited!')
                exit()

            # depth_file is a folder from here on, every input file needs its own <basename>.depth
            undetected_depth_file = []
            for input_seq_file in input_file_name_list:
                input_seq_file_basename = '.'.join(input_seq_file.split('.')[:-1])
                input_seq_file_depth = '%s/%s.depth' % (depth_file, input_seq_file_basename)
                if os.path.isfile(input_seq_file_depth) is False:
                    undetected_depth_file.append(input_seq_file_depth)

            if len(undetected_depth_file) > 0:
                print(datetime.now().strftime(time_format) + 'the following depth files not found, program exited!')
                print(','.join(undetected_depth_file))
                exit()

        # output folder (created above) and prefix of the data matrix files
        output_folder = '%s_KEGG_wd' % input_file_folder
        input_folder_name = input_file_folder
        if '/' in input_file_folder:
            input_folder_name = input_file_folder.split('/')[-1]

        # parse blast results with multiprocessing
        if run_blast is True:
            print(datetime.now().strftime(time_format) + 'Parsing Blast/Diamond results for %s input files with %s cores' % (len(input_file_name_list), num_threads))

        list_for_multiple_arguments_parse_blast_op = []
        for input_file in input_file_name_list:

            input_file_basename = '.'.join(input_file.split('.')[:-1])
            pwd_input_file = '%s/%s' % (input_file_folder, input_file)

            # get path to current depth file
            if depth_file is None:
                input_file_depth = None
            else:
                input_file_depth = '%s/%s.depth' % (depth_file, input_file_basename)

            list_for_multiple_arguments_parse_blast_op.append([pwd_input_file, run_blast, As_description_dict, Bs_description_dict, Cs_description_dict, Ds_description_dict, D2ABCD_dict, db_seq_to_KO_dict, output_folder, input_file_depth, pct_by_all])

        pool = mp.Pool(processes=num_threads)
        pool.map(parse_blast_op_worker, list_for_multiple_arguments_parse_blast_op)
        pool.close()
        pool.join()

        ######################################################### get dataframe #########################################################

        print(datetime.now().strftime(time_format) + 'Data matrix exported to:')

        for ko_level in ['A', 'B', 'C', 'D']:
            annotation_df_GeneNumber = '%s/%s_%s_GeneNumber.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_GeneNumber_pct = '%s/%s_%s_GeneNumber_pct.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_GeneNumber_pct_by_all = '%s/%s_%s_GeneNumber_pct_by_all.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_TotalDepth = '%s/%s_%s_TotalDepth.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_TotalDepth_pct = '%s/%s_%s_TotalDepth_pct.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_TotalDepth_pct_by_all = '%s/%s_%s_TotalDepth_pct_by_all.txt' % (output_folder, input_folder_name, ko_level)

            #################### get GeneNumber df and report ####################

            get_KEGG_annot_df(output_folder, ko_level, annotation_df_GeneNumber, annotation_df_GeneNumber_pct, annotation_df_GeneNumber_pct_by_all, with_depth=False, pct_by_all=pct_by_all)

            print(annotation_df_GeneNumber.split('/')[-1])
            print(annotation_df_GeneNumber_pct.split('/')[-1])
            if pct_by_all is True:
                print(annotation_df_GeneNumber_pct_by_all.split('/')[-1])

            #################### get TotalDepth df and report ####################

            if depth_file is not None:
                get_KEGG_annot_df(output_folder, ko_level, annotation_df_TotalDepth, annotation_df_TotalDepth_pct, annotation_df_TotalDepth_pct_by_all, with_depth=True, pct_by_all=pct_by_all)

                print(annotation_df_TotalDepth.split('/')[-1])
                print(annotation_df_TotalDepth_pct.split('/')[-1])
                if pct_by_all is True:
                    print(annotation_df_TotalDepth_pct_by_all.split('/')[-1])

    ################################################## Final report ####################################################

    print(datetime.now().strftime(time_format) + 'Done!')
|
|
791
|
+
|
|
792
|
+
|
|
793
|
+
if __name__ == "__main__":

    # command line options, the option names (without "-") are the keys read by Annotation_KEGG
    option_settings = [
        ('-seq_in',     {'required': False, 'help': 'faa file'}),
        ('-ko_in',      {'required': False, 'help': 'annotation results from BlastKOALA/GhostKOALA, normally with name user_ko.txt'}),
        ('-x',          {'required': False, 'help': 'file extension'}),
        ('-depth',      {'required': False, 'default': None, 'help': 'gene depth file/folder'}),
        ('-pct_by_all', {'required': False, 'action': 'store_true', 'help': 'normalize by all query genes, rather than those with ko assignment'}),
        ('-db_dir',     {'required': True, 'help': 'folder holds sequence, seq2ko and ko00001.keg files'}),
        ('-diamond',    {'required': False, 'action': 'store_true', 'help': 'run diamond (for big dataset), default is NCBI blastp'}),
        ('-t',          {'required': False, 'default': 1, 'type': int, 'help': 'number of threads, default: 1'}),
        ('-evalue',     {'required': False, 'default': 0.001, 'type': float, 'help': 'evalue cutoff, default: 0.001'}),
    ]

    parser = argparse.ArgumentParser()
    for option_name, option_kwargs in option_settings:
        parser.add_argument(option_name, **option_kwargs)

    # hand the parsed options over as a plain dict
    args = vars(parser.parse_args())
    Annotation_KEGG(args)
|