treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/KEGG_Luo17.py ADDED
@@ -0,0 +1,807 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+ import glob
5
+ import argparse
6
+ from Bio import SeqIO
7
+ from time import sleep
8
+ import multiprocessing as mp
9
+ from datetime import datetime
10
+ from BioSAK.global_functions import time_format
11
+ from BioSAK.global_functions import force_create_folder
12
+ from BioSAK.global_functions import sep_path_basename_ext
13
+ from BioSAK.global_functions import get_gene_list_TotalDepth
14
+ from BioSAK.global_functions import AnnotateNorm
15
+
16
+
17
# Help/usage text for the KEGG annotation module: example commands, the files
# expected in the KEGG db folder, how the statistics are computed and the depth
# file format.
# NOTE(review): the examples invoke `BioSAK KEGG` and this module imports its
# helpers from BioSAK.global_functions -- confirm the command name is still
# correct for the package this file ships in.
KEGG_parser_usage = '''
======================================== KEGG example commands =======================================

# Dependencies
module load blast+
module load diamond

# annotation with NCBI blastp (default, for small dataset)
BioSAK KEGG -db_dir path/to/your/KEGG_db_dir -t 6 -seq_in input.faa -depth input.depth

# annotation with Diamond blastp (for big dataset)
BioSAK KEGG -db_dir path/to/your/KEGG_db_dir -t 12 -seq_in faa_folder -x faa -depth depth_files -diamond

# get summary for BlastKOALA/GhostKOALA produced results
BioSAK KEGG -db_dir path/to/your/KEGG_db_dir -t 9 -ko_in user_ko.txt
BioSAK KEGG -db_dir path/to/your/KEGG_db_dir -t 9 -ko_in user_ko_folder -x txt

# Prepare DB files, you need to have the following three files in your KEGG_db_dir:
1. Sequence file, only needed for "-seq_in" mode, DECOMPRESS and RENAME it to kegg_db_seq.fasta
e.g. prokaryotes.pep.gz (https://www.kegg.jp/kegg/download/Readme/README.fasta)
2. seq2ko file, only needed for "-seq_in" mode, DECOMPRESS and RENAME it to kegg_db_seq2ko.txt
e.g. prokaryotes.dat.gz (https://www.kegg.jp/kegg/download/Readme/README.fasta)
3. ko00001.keg
https://www.genome.jp/kegg-bin/download_htext?htext=ko00001&format=htext&filedir=

# How it works:
1. KEGG module uses Blast+/Diamond to get the best hits of query genes in the database with user defined e-value cutoff (default 0.001).
2. The TotalDepth of a KO is calculated by summing up the depth of all genes assigned to it.
3. The percentage of GeneNumber/TotalDepth of genes assigned to a KO is calculated by dividing them
by the total number/depth of genes with KO assignment (default) or by all genes in a genome ("-pct_by_all").

# Note!!!
1. If you run KEGG annotation for multiple files in a batch manner and want to have their depth info incorporated into the results,
you need to provide a folder containing individual depth files for each of your input sequence file.
Name of the depth file needs to be exactly the same as its corresponding sequence file, except the extension which is ".depth".
2. Diamond requires quite a lot of memory for sequence comparison, especially for huge db file (e.g. KEGG db).
Remember to request sufficient memory (e.g. 90 or 120gb) in your job script and specify a small number (e.g. -t 6)
of jobs executing in parallel. Otherwise, you may see some of your query genomes with no gene been annotated.

# Depth file format (one gene per line, tab separated)
gene_1 30
gene_2 10.58

# To do:
1. level C stats: separate stats for Pathway, Brite and the rests

======================================================================================================
'''
65
+
66
+
67
def keep_blast_hit_with_highest_bit_score(file_in, file_out):
    """Keep, for every query, the hit with the highest bit score.

    ``file_in`` is read as tab-separated text with the query id in the first
    column and the bit score in the twelfth column (index 11). Hits of one
    query are expected on consecutive lines: a change of query id flushes the
    best hit seen so far, so a query whose hits are scattered over the file is
    written once per run of lines. On equal scores the first hit is kept.

    Blank lines are skipped. Both files are closed even if parsing fails.

    Args:
        file_in: path of the tabular blast/diamond result.
        file_out: path the best hits are written to (created/overwritten).

    Raises:
        IndexError: if a non-blank line has fewer than 12 columns.
        ValueError: if the bit score column is not a number.
    """
    best_hit_line = ''
    best_hit_query_id = None  # None until the first hit has been read
    best_hit_score = 0

    # the output is opened first so that it is created even when the input is missing
    with open(file_out, 'w') as file_out_handle, open(file_in) as file_in_handle:
        for blast_hit in file_in_handle:
            stripped_hit = blast_hit.strip()
            if not stripped_hit:
                continue

            blast_hit_split = stripped_hit.split('\t')
            query_id = blast_hit_split[0]
            bit_score = float(blast_hit_split[11])

            if query_id != best_hit_query_id:
                # a new query starts: flush the winner of the previous one
                if best_hit_query_id is not None:
                    file_out_handle.write(best_hit_line)
                best_hit_query_id = query_id
                best_hit_line = blast_hit
                best_hit_score = bit_score

            elif bit_score > best_hit_score:
                best_hit_line = blast_hit
                best_hit_score = bit_score

        # flush the winner of the last query (empty string when there were no hits)
        file_out_handle.write(best_hit_line)
94
+
95
+
96
def run_blast_worker(argument_list):
    """Search one query file against the KEGG database and keep the best hits.

    ``argument_list`` holds, in order: the query file, the ``run_blast`` flag,
    the ``run_diamond`` flag, the blast database, the diamond database, the
    output directory, the e-value cutoff and the number of threads.

    The working folder ``<op_dir>/<basename>_KEGG_wd`` is passed to
    ``force_create_folder`` in every case; the search itself and the best-hit
    filtering only happen when ``run_blast`` is True.
    """
    (query_file, do_search, use_diamond, blast_db,
     diamond_db, out_dir, max_evalue, n_threads) = [argument_list[i] for i in range(8)]

    # only the basename is needed to name the working folder and its files
    _query_path, query_basename, _query_ext = sep_path_basename_ext(query_file)

    work_dir = '%s/%s_KEGG_wd' % (out_dir, query_basename)
    raw_hits = '%s/%s_blast.tab' % (work_dir, query_basename)
    best_hits = '%s/%s_blast_best_hits.tab' % (work_dir, query_basename)

    # create output folder
    force_create_folder(work_dir)

    if do_search is not True:
        return

    if use_diamond is False:
        search_cmd = 'blastp -query %s -db %s -out %s -outfmt 6 -evalue %s -num_alignments 10 -num_threads %s' % (query_file, blast_db, raw_hits, max_evalue, n_threads)
    else:
        search_cmd = 'diamond blastp -q %s --db %s --out %s --outfmt 6 --evalue %s --block-size 1 --threads %s --quiet' % (query_file, diamond_db, raw_hits, max_evalue, n_threads)
    os.system(search_cmd)

    # reduce the raw table to one (best) hit per query
    keep_blast_hit_with_highest_bit_score(raw_hits, best_hits)
131
+
132
+
133
def write_out_stats_GeneNumber(identified_ko_list, ko_to_gene_member_dict, ko_description_dict, stats_file_GeneNumber):
    """Write a tab-separated table with the number of genes assigned to each KO.

    One row is written per entry of ``identified_ko_list``, in list order,
    below the header ``KO / GeneNumber / Description``. The first two
    characters of every entry are dropped before it is written and before it
    is looked up in ``ko_description_dict`` (presumably a level prefix such as
    ``D_`` -- confirm against the caller).

    Args:
        identified_ko_list: KO keys to report.
        ko_to_gene_member_dict: maps each KO key to the collection of its genes.
        ko_description_dict: maps the un-prefixed KO id to its description.
        stats_file_GeneNumber: output path (created/overwritten).

    Raises:
        KeyError: if a KO is missing from either dict; the output file is
            closed before the error propagates.
    """
    with open(stats_file_GeneNumber, 'w') as stats_file_GeneNumber_handle:
        stats_file_GeneNumber_handle.write('KO\tGeneNumber\tDescription\n')
        for ko in identified_ko_list:
            ko_id = ko[2:]
            ko_GeneNumber = len(ko_to_gene_member_dict[ko])
            stats_file_GeneNumber_handle.write('%s\t%s\t%s\n' % (ko_id, ko_GeneNumber, ko_description_dict[ko_id]))
141
+
142
+
143
def write_out_stats_TotalDepth(identified_ko_list, ko_to_gene_member_dict, gene_depth_dict, ko_description_dict, stats_file_TotalDepth):
    """Write a tab-separated table with the summed gene depth of each KO.

    One row is written per entry of ``identified_ko_list``, in list order,
    below the header ``KO / TotalDepth / Description``. The depth of a KO is
    the sum of ``gene_depth_dict`` over its member genes, rounded to two
    decimals. The first two characters of every KO key are dropped before it
    is written and before it is looked up in ``ko_description_dict``
    (presumably a level prefix such as ``D_`` -- confirm against the caller).

    Args:
        identified_ko_list: KO keys to report.
        ko_to_gene_member_dict: maps each KO key to the collection of its genes.
        gene_depth_dict: maps each gene id to its depth.
        ko_description_dict: maps the un-prefixed KO id to its description.
        stats_file_TotalDepth: output path (created/overwritten).

    Raises:
        KeyError: if a KO, gene or description is missing from its dict; the
            output file is closed before the error propagates.
    """
    with open(stats_file_TotalDepth, 'w') as stats_file_TotalDepth_handle:
        stats_file_TotalDepth_handle.write('KO\tTotalDepth\tDescription\n')
        for ko in identified_ko_list:
            ko_id = ko[2:]
            ko_gene_total_depth = sum(gene_depth_dict[each_gene] for each_gene in ko_to_gene_member_dict[ko])
            # round through a 2-decimal string, then back to float for printing
            ko_TotalDepth = float("{0:.2f}".format(ko_gene_total_depth))
            stats_file_TotalDepth_handle.write('%s\t%s\t%s\n' % (ko_id, ko_TotalDepth, ko_description_dict[ko_id]))
155
+
156
+
157
def parse_blast_op_worker(argument_list):
    """Turn best blast hits (or a user supplied KO table) into KO assignments and stats.

    ``argument_list`` holds, in order: the input file, the ``run_blast`` flag,
    the description dicts of level A, B, C and D, ``D2ABCD_dict`` (level D KO
    -> list of ``'|'``-joined A|B|C|D paths), ``db_seq_to_KO_dict`` (database
    sequence -> comma separated KOs), the output directory, the depth file
    (or None) and the ``pct_by_all`` flag.

    All outputs go to ``<op_dir>/<basename>_KEGG_wd/`` and are prefixed with
    ``<basename>``:

      * ``_KO_assignment_D.txt``: gene -> level D KO, only written when
        ``run_blast`` is True (otherwise the input file itself is read as
        this table)
      * ``_ko_assignment_ABCD.txt``: gene -> KO path plus descriptions
      * ``_ko_stats_<level>_GeneNumber.txt`` and, when a depth file is given,
        ``_ko_stats_<level>_TotalDepth.txt``
      * the ``_pct`` / ``_pct_by_all`` variants, whose paths are handed to
        ``AnnotateNorm`` (presumably it normalises the second column by the
        given total -- confirm against BioSAK.global_functions)

    Every file opened here is closed again, also when parsing fails.
    """
    (pwd_input_file, run_blast, As_description_dict, Bs_description_dict,
     Cs_description_dict, Ds_description_dict, D2ABCD_dict, db_seq_to_KO_dict,
     op_dir, depth_file, pct_by_all) = [argument_list[i] for i in range(11)]

    levels = ('A', 'B', 'C', 'D')
    description_dicts = {'A': As_description_dict, 'B': Bs_description_dict,
                         'C': Cs_description_dict, 'D': Ds_description_dict}

    ################################################### define file name ###################################################

    _input_file_path, in_file_basename, _input_file_ext = sep_path_basename_ext(pwd_input_file)
    wd_prefix = '%s/%s_KEGG_wd/%s' % (op_dir, in_file_basename, in_file_basename)

    blast_results_best_hit = '%s_blast_best_hits.tab' % wd_prefix
    KO_assignment_file_D = '%s_KO_assignment_D.txt' % wd_prefix
    KO_assignment_file_DCBA = '%s_ko_assignment_ABCD.txt' % wd_prefix

    def stats_file(level, suffix):
        # e.g. <wd_prefix>_ko_stats_A_GeneNumber_pct.txt
        return '%s_ko_stats_%s_%s.txt' % (wd_prefix, level, suffix)

    ################################################# parse blast results ##################################################

    if run_blast is True:

        # store blast results in dict
        query_to_db_seq_dict = {}
        with open(blast_results_best_hit) as best_hit_handle:
            for each_hit in best_hit_handle:
                each_hit_split = each_hit.strip().split('\t')
                query_id = each_hit_split[0]
                db_seq = each_hit_split[1]
                query_to_db_seq_dict[query_id] = db_seq

        # get all query sequence id
        query_seq_id_list = [str(each_seq.id) for each_seq in SeqIO.parse(pwd_input_file, 'fasta')]

        # get ko id at level D for all query genes
        with open(KO_assignment_file_D, 'w') as KO_assignment_file_handle:
            for each_query_seq in sorted(query_seq_id_list):
                db_hit_id = query_to_db_seq_dict.get(each_query_seq)
                if (each_query_seq in query_to_db_seq_dict) and (db_hit_id in db_seq_to_KO_dict):
                    # a database sequence may carry several comma separated KOs: one row each
                    for each_db_hit_id_ko in db_seq_to_KO_dict[db_hit_id].split(','):
                        KO_assignment_file_handle.write('%s\t%s\n' % (each_query_seq, each_db_hit_id_ko))
                else:
                    # no hit, or the hit has no KO: gene id only
                    KO_assignment_file_handle.write('%s\n' % each_query_seq)

    else:
        KO_assignment_file_D = pwd_input_file

    # get ko id at all levels for all query genes
    query_seq_id_all = set()
    genes_with_ko = set()
    with open(KO_assignment_file_DCBA, 'w') as ko_assign_ABCD_handle, open(KO_assignment_file_D) as ko_assign_D_handle:
        ko_assign_ABCD_handle.write('Gene_id\tko_A\tko_B\tko_C\tko_D\tDesc_A\tDesc_B\tDesc_C\tDesc_D\n')
        for query_gene in ko_assign_D_handle:
            query_gene_split = query_gene.strip().split('\t')
            gene_ID = query_gene_split[0]

            if len(query_gene_split) == 1:
                query_seq_id_all.add(gene_ID)
                ko_assign_ABCD_handle.write('%s\n' % gene_ID)

            elif len(query_gene_split) == 2:
                query_seq_id_all.add(gene_ID)
                genes_with_ko.add(gene_ID)
                KO_ID = query_gene_split[1]

                # a KO unknown to D2ABCD_dict still counts as "gene with KO" but gets no row
                if KO_ID in D2ABCD_dict:
                    for each_ABCD in D2ABCD_dict[KO_ID]:
                        each_KO_ABCD_list = each_ABCD.split('|')
                        each_KO_ABCD_list_only_id = [i.split('_')[1] for i in each_KO_ABCD_list]
                        # levels are addressed from the end of the path: ...|A|B|C|D
                        each_desc_A = As_description_dict[each_KO_ABCD_list_only_id[-4]]
                        each_desc_B = Bs_description_dict[each_KO_ABCD_list_only_id[-3]]
                        each_desc_C = Cs_description_dict[each_KO_ABCD_list_only_id[-2]]
                        each_desc_D = Ds_description_dict[each_KO_ABCD_list_only_id[-1]]
                        ko_assign_ABCD_handle.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (gene_ID,
                                                                                  '\t'.join(each_KO_ABCD_list),
                                                                                  each_desc_A, each_desc_B,
                                                                                  each_desc_C, each_desc_D))

    ##################################################### Get summary ######################################################

    # read in depth info and get total depth of all genes / of genes with KO
    gene_depth_dict = {}
    total_depth_for_all_query_genes = 0
    genes_with_ko_TotalDepth = 0
    if depth_file is not None:
        with open(depth_file) as depth_file_handle:
            for each_depth in depth_file_handle:
                each_depth_split = each_depth.strip().split('\t')
                gene_depth_dict[each_depth_split[0]] = float(each_depth_split[1])

        for gene in query_seq_id_all:
            total_depth_for_all_query_genes += gene_depth_dict[gene]

        genes_with_ko_TotalDepth = get_gene_list_TotalDepth(genes_with_ko, gene_depth_dict)

    # per level: KOs in first-seen order and, per KO, its unique genes in first-seen order
    identified_ko_lists = {level: [] for level in levels}
    ko_to_gene_member_dicts = {level: {} for level in levels}
    seen_ko_gene_pairs = {level: set() for level in levels}  # O(1) duplicate check
    with open(KO_assignment_file_DCBA) as ko_assign_ABCD_handle:
        for each_query in ko_assign_ABCD_handle:
            if each_query.startswith('Gene_id'):
                continue

            each_query_split = each_query.strip().split('\t')
            if len(each_query_split) <= 1:
                continue  # gene without KO assignment

            query_id = each_query_split[0]
            query_kos = (each_query_split[1], each_query_split[2], each_query_split[3], each_query_split[4])
            for level, query_ko in zip(levels, query_kos):
                if (query_ko, query_id) in seen_ko_gene_pairs[level]:
                    continue
                seen_ko_gene_pairs[level].add((query_ko, query_id))

                ko_to_gene_member_dict = ko_to_gene_member_dicts[level]
                if query_ko not in ko_to_gene_member_dict:
                    identified_ko_lists[level].append(query_ko)
                    ko_to_gene_member_dict[query_ko] = [query_id]
                else:
                    ko_to_gene_member_dict[query_ko].append(query_id)

    #################### write out GeneNumber and TotalDepth stats ####################

    for level in levels:
        write_out_stats_GeneNumber(identified_ko_lists[level], ko_to_gene_member_dicts[level],
                                   description_dicts[level], stats_file(level, 'GeneNumber'))
    if depth_file is not None:
        for level in levels:
            write_out_stats_TotalDepth(identified_ko_lists[level], ko_to_gene_member_dicts[level], gene_depth_dict,
                                       description_dicts[level], stats_file(level, 'TotalDepth'))

    #################### write out GeneNumber and TotalDepth stats (pct) ####################

    for level in levels:
        AnnotateNorm(stats_file(level, 'GeneNumber'), True, 2, len(genes_with_ko),
                     stats_file(level, 'GeneNumber_pct'), 'KO\tGeneNumber_pct\tDescription\n')
    if depth_file is not None:
        for level in levels:
            AnnotateNorm(stats_file(level, 'TotalDepth'), True, 2, genes_with_ko_TotalDepth,
                         stats_file(level, 'TotalDepth_pct'), 'KO\tTotalDepth_pct\tDescription\n')

    #################### write out GeneNumber and TotalDepth stats (pct_by_all) ####################

    if pct_by_all is True:
        for level in levels:
            AnnotateNorm(stats_file(level, 'GeneNumber'), True, 2, len(query_seq_id_all),
                         stats_file(level, 'GeneNumber_pct_by_all'), 'KO\tGeneNumber_pct_by_all\tDescription\n')
        if depth_file is not None:
            for level in levels:
                AnnotateNorm(stats_file(level, 'TotalDepth'), True, 2, total_depth_for_all_query_genes,
                             stats_file(level, 'TotalDepth_pct_by_all'), 'KO\tTotalDepth_pct_by_all\tDescription\n')
407
+
408
+
409
def _read_ko_stats_column(stats_file, convert):
    """Return {first column: convert(second column)} for every non-header row of a stats file."""
    ko_to_value = {}
    with open(stats_file) as stats_file_handle:
        for each_line in stats_file_handle:
            if not each_line.startswith('KO\t'):
                each_line_split = each_line.strip().split('\t')
                ko_to_value[each_line_split[0]] = convert(each_line_split[1])
    return ko_to_value


def get_KEGG_annot_df(annotation_dir, stats_level, annotation_df_absolute_num, annotation_df_pct, annotation_df_pct_by_all, with_depth, pct_by_all):
    """Combine the per-sample KO stats found in ``annotation_dir`` into sample x KO tables.

    Every ``<sample>_KEGG_wd`` folder in ``annotation_dir`` contributes one
    row. The sample name is the folder name without its trailing
    ``_KEGG_wd``, so a sample whose own name contains ``_KEGG_wd`` is handled
    correctly. Columns are the sorted union of all KOs found in any sample;
    a KO that is absent from a sample's absolute table is written as 0.

    Args:
        annotation_dir: folder holding the ``*_KEGG_wd`` folders.
        stats_level: KO level in the stats file names (``A``/``B``/``C``/``D``
            in the files written by parse_blast_op_worker).
        annotation_df_absolute_num: output path of the absolute table.
        annotation_df_pct: output path of the percentage table.
        annotation_df_pct_by_all: output path of the "pct_by_all" table, only
            written when ``pct_by_all`` is True.
        with_depth: False reads the ``GeneNumber`` files (integer values),
            anything else reads the ``TotalDepth`` files (float values).
        pct_by_all: True to also read and combine the ``_pct_by_all`` files.

    Raises:
        OSError: if an expected stats file is missing.
        KeyError: if a KO of a sample's absolute table is missing from its
            percentage table(s).
    """
    wd_suffix = '_KEGG_wd'
    annotation_dir_re = '%s/*%s' % (annotation_dir, wd_suffix)
    annotation_folder_list = sorted(os.path.basename(file_name) for file_name in glob.glob(annotation_dir_re))

    if with_depth is False:
        stats_type = 'GeneNumber'
        to_number = int
    else:
        stats_type = 'TotalDepth'
        to_number = float

    # read the stats of every sample
    ko_num_dict = {}
    ko_num_pct_dict = {}
    ko_num_pct_by_all_dict = {}
    all_identified_ko = set()
    for annotation_folder in annotation_folder_list:

        # the glob pattern guarantees the suffix, so only that trailing part is removed
        annotation_folder_basename = annotation_folder[:-len(wd_suffix)]
        stats_file_prefix = '%s/%s/%s_ko_stats_%s_%s' % (annotation_dir, annotation_folder, annotation_folder_basename, stats_level, stats_type)

        current_ko_to_num_dict = _read_ko_stats_column('%s.txt' % stats_file_prefix, to_number)
        current_ko_to_num_pct_dict = _read_ko_stats_column('%s_pct.txt' % stats_file_prefix, float)
        all_identified_ko.update(current_ko_to_num_dict, current_ko_to_num_pct_dict)
        ko_num_dict[annotation_folder_basename] = current_ko_to_num_dict
        ko_num_pct_dict[annotation_folder_basename] = current_ko_to_num_pct_dict

        if pct_by_all is True:
            current_ko_to_num_pct_by_all_dict = _read_ko_stats_column('%s_pct_by_all.txt' % stats_file_prefix, float)
            all_identified_ko.update(current_ko_to_num_pct_by_all_dict)
            ko_num_pct_by_all_dict[annotation_folder_basename] = current_ko_to_num_pct_by_all_dict

    all_identified_ko_list = sorted(all_identified_ko)

    # build the tables in memory: header first, then one row per sample (sorted by folder name)
    header_line = '\t%s\n' % '\t'.join(all_identified_ko_list)
    absolute_num_lines = [header_line]
    percentage_lines = [header_line]
    percentage_by_all_lines = [header_line]
    for annotation_folder in annotation_folder_list:

        annotation_folder_basename = annotation_folder[:-len(wd_suffix)]
        current_ko_num_dict = ko_num_dict[annotation_folder_basename]
        current_ko_num_dict_pct = ko_num_pct_dict[annotation_folder_basename]
        current_ko_num_dict_pct_by_all = ko_num_pct_by_all_dict.get(annotation_folder_basename, {})

        current_ko_num_list = []
        current_ko_num_list_pct = []
        current_ko_num_list_pct_by_all = []
        for identified_ko in all_identified_ko_list:

            # presence in the absolute table decides whether any value is reported
            identified_ko_num = 0
            identified_ko_num_pct = 0
            identified_ko_num_pct_by_all = 0
            if identified_ko in current_ko_num_dict:
                identified_ko_num = current_ko_num_dict[identified_ko]
                identified_ko_num_pct = current_ko_num_dict_pct[identified_ko]
                if pct_by_all is True:
                    identified_ko_num_pct_by_all = current_ko_num_dict_pct_by_all[identified_ko]

            current_ko_num_list.append(str(identified_ko_num))
            current_ko_num_list_pct.append(str(identified_ko_num_pct))
            current_ko_num_list_pct_by_all.append(str(identified_ko_num_pct_by_all))

        absolute_num_lines.append('%s\t%s\n' % (annotation_folder_basename, '\t'.join(current_ko_num_list)))
        percentage_lines.append('%s\t%s\n' % (annotation_folder_basename, '\t'.join(current_ko_num_list_pct)))
        percentage_by_all_lines.append('%s\t%s\n' % (annotation_folder_basename, '\t'.join(current_ko_num_list_pct_by_all)))

    # write out
    with open(annotation_df_absolute_num, 'w') as annotation_df_absolute_num_handle:
        annotation_df_absolute_num_handle.writelines(absolute_num_lines)
    with open(annotation_df_pct, 'w') as annotation_df_percentage_handle:
        annotation_df_percentage_handle.writelines(percentage_lines)
    if pct_by_all is True:
        with open(annotation_df_pct_by_all, 'w') as annotation_df_percentage_by_all_handle:
            annotation_df_percentage_by_all_handle.writelines(percentage_by_all_lines)
509
+
510
+
511
def _read_ko00001_keg(pwd_keg_file):
    """Parse a KEGG ``ko00001.keg`` hierarchy file.

    Only lines whose first character is A, B, C or D are used. Each such line
    is stripped and split on single spaces; the id is taken from field 0 (A),
    2 (B), 4 (C) or 6 (D) and the remaining fields are re-joined as the
    description.

    Returns a tuple ``(As, Bs, Cs, Ds, D2ABCD)`` where the first four map an
    id to its description and ``D2ABCD`` maps a D-level id to the list of
    distinct ``'A_<a>|B_<b>|C_<c>|D_<d>'`` lineages it was seen under.
    """
    As_description_dict = {}
    Bs_description_dict = {}
    Cs_description_dict = {}
    Ds_description_dict = {}
    D2ABCD_dict = {}
    current_A = ''
    current_B = ''
    current_C = ''

    # "with" guarantees the handle is closed, even if a malformed line raises
    with open(pwd_keg_file) as keg_handle:
        for each_line in keg_handle:
            level = each_line[:1]
            if level not in ('A', 'B', 'C', 'D'):
                continue

            each_line_split = each_line.strip().split(' ')

            if level == 'A':
                current_A = each_line_split[0]
                As_description_dict[current_A] = ' '.join(each_line_split[1:])

            elif level == 'B':
                # a bare "B" line is only a separator and carries no id
                if len(each_line_split) > 1:
                    current_B = each_line_split[2]
                    Bs_description_dict[current_B] = ' '.join(each_line_split[3:])

            elif level == 'C':
                current_C = each_line_split[4]
                Cs_description_dict[current_C] = ' '.join(each_line_split[5:])

            else:
                current_D_id = each_line_split[6]
                Ds_description_dict[current_D_id] = ' '.join(each_line_split[7:])
                ABCD_value = 'A_%s|B_%s|C_%s|D_%s' % (current_A, current_B, current_C, current_D_id)
                lineage_list = D2ABCD_dict.setdefault(current_D_id, [])
                if ABCD_value not in lineage_list:
                    lineage_list.append(ABCD_value)

    return As_description_dict, Bs_description_dict, Cs_description_dict, Ds_description_dict, D2ABCD_dict


def _read_seq2ko(pwd_seq2ko_file):
    """Read a tab-separated ``sequence<TAB>KO`` file into a dict.

    Lines without a non-empty second column are skipped. (``strip()`` removes
    a trailing tab, so such lines split into a single field; indexing column 1
    unconditionally would raise IndexError on them.)
    """
    db_seq_to_KO_dict = {}
    with open(pwd_seq2ko_file) as seq2ko_handle:
        for each_hit in seq2ko_handle:
            each_hit_split = each_hit.strip().split('\t')
            if (len(each_hit_split) > 1) and (each_hit_split[1] != ''):
                db_seq_to_KO_dict[each_hit_split[0]] = each_hit_split[1]
    return db_seq_to_KO_dict


def Annotation_KEGG(args):
    """Run the KEGG annotation workflow described by ``args``.

    ``args`` is a dict with the keys:
        seq_in      path to a sequence file or a folder of sequence files
        ko_in       path to existing annotation results (file or folder)
        x           file extension used to select files inside a folder
        depth       depth file (single-file input) or folder of
                    ``<basename>.depth`` files (folder input), or None
        pct_by_all  bool, forwarded to the parsing/summary helpers
        db_dir      folder holding kegg_db_seq.fasta, kegg_db_seq2ko.txt
                    and ko00001.keg
        diamond     bool, use diamond instead of NCBI blastp
        t           number of processes/threads
        evalue      e-value cutoff forwarded to the search worker

    Exactly one of ``seq_in`` / ``ko_in`` must be given. The search and
    parsing themselves are delegated to ``run_blast_worker`` and
    ``parse_blast_op_worker`` (defined elsewhere in this file); for folder
    input the per-level data matrices are written by ``get_KEGG_annot_df``.

    Raises:
        SystemExit: with status 1 when the inputs, database or depth files
            are missing or inconsistent (a message is printed first).
    """

    def _log(message):
        # every progress/error line is prefixed with the current timestamp
        print(datetime.now().strftime(time_format) + message)

    def _abort(message):
        # report the problem and stop with a non-zero status so calling
        # shells/pipelines can detect the failure
        _log(message)
        raise SystemExit(1)

    input_file_faa = args['seq_in']
    input_file_user_ko = args['ko_in']
    file_extension = args['x']
    depth_file = args['depth']
    pct_by_all = args['pct_by_all']
    KEGG_DB_folder = args['db_dir']
    run_diamond = args['diamond']
    num_threads = args['t']
    evalue_cutoff = args['evalue']

    # exactly one of the two input options decides whether a search is needed
    if (input_file_faa is not None) and (input_file_user_ko is None):
        run_blast = True
        input_file_folder = input_file_faa
    elif (input_file_faa is None) and (input_file_user_ko is not None):
        run_blast = False
        input_file_folder = input_file_user_ko
    else:
        _abort('Please provide input file with either "-seq_in" or "-ko_in", do not provide both')

    # check whether input file/folder exist
    if (os.path.isfile(input_file_folder) is False) and (os.path.isdir(input_file_folder) is False):
        _abort('input file/folder not found, program exited')

    if run_blast is True:
        _log('Input sequence file detected, will run blastp/diamond first')
    else:
        _log('Annotation results provided, blastp/diamond skipped')
    sleep(0.5)

    ################################################# define file name #################################################

    KEGG_DB_seq = '%s/kegg_db_seq.fasta' % KEGG_DB_folder
    KEGG_DB_seq_diamond = '%s/kegg_db_seq.fasta.dmnd' % KEGG_DB_folder
    KEGG_DB_seq2ko = '%s/kegg_db_seq2ko.txt' % KEGG_DB_folder
    KEGG_DB_ko = '%s/ko00001.keg' % KEGG_DB_folder

    ########################################## check whether diamond db exist ##########################################

    if (run_blast is True) and (run_diamond is True):
        if os.path.isfile(KEGG_DB_seq_diamond) is False:
            _log('DB file not found, making diamond db with %s' % KEGG_DB_seq)

            if os.path.isfile(KEGG_DB_seq) is True:
                diamond_makedb_cmd = 'diamond makedb --in %s --db %s --quiet' % (KEGG_DB_seq, KEGG_DB_seq_diamond)
                os.system(diamond_makedb_cmd)
            else:
                _abort('%s not found, program exited' % KEGG_DB_seq)

    ########################################### check whether blast+ db exist ##########################################

    if (run_blast is True) and (run_diamond is False):

        unfound_db_index_file = []
        for db_index in ['phr', 'pin', 'pnd', 'pni', 'pog', 'psd', 'psi', 'psq']:
            pwd_db_index = '%s/kegg_db_seq.fasta.%s' % (KEGG_DB_folder, db_index)
            if not os.path.isfile(pwd_db_index):
                unfound_db_index_file.append(db_index)

        if len(unfound_db_index_file) > 0:
            _log('blast db index not found, runing makeblastdb first')
            makeblastdb_cmd = 'makeblastdb -in %s -dbtype prot -parse_seqids -logfile %s.log' % (KEGG_DB_seq, KEGG_DB_seq)
            os.system(makeblastdb_cmd)
            _log('makeblastdb finished')

    ######################################### Run blastp with multiprocessing ##########################################

    # single input file: one worker call, given all requested threads
    if os.path.isfile(input_file_folder) is True:
        input_file_path, input_file_basename, input_file_ext = sep_path_basename_ext(input_file_folder)
        run_blast_worker([input_file_folder, run_blast, run_diamond, KEGG_DB_seq, KEGG_DB_seq_diamond, input_file_path, evalue_cutoff, num_threads])

    # input folder: one single-threaded worker call per file, spread over a pool
    if os.path.isdir(input_file_folder) is True:

        # create output folder
        output_folder = '%s_KEGG_wd' % input_file_folder
        force_create_folder(output_folder)

        # check whether input genome exist
        input_file_re = '%s/*.%s' % (input_file_folder, file_extension)
        input_file_name_list = [os.path.basename(file_name) for file_name in glob.glob(input_file_re)]

        if len(input_file_name_list) == 0:
            _abort('input file not found, program exited')

        if run_blast is True:
            _log('Running Blast/Diamond for %s input files with %s cores' % (len(input_file_name_list), num_threads))

        # NOTE(review): the worker receives run_blast as its second argument
        # and is called for both input modes, as in the single-file branch
        # above; presumably it skips the search itself when run_blast is False.
        list_for_multiple_arguments_blast = []
        for input_file in input_file_name_list:
            pwd_input_file = '%s/%s' % (input_file_folder, input_file)
            list_for_multiple_arguments_blast.append([pwd_input_file, run_blast, run_diamond, KEGG_DB_seq, KEGG_DB_seq_diamond, output_folder, evalue_cutoff, 1])

        # the context manager releases the worker processes even if map() raises
        with mp.Pool(processes=num_threads) as pool:
            pool.map(run_blast_worker, list_for_multiple_arguments_blast)
            pool.close()
            pool.join()

    ############################################## Read in KEGG DB files ###############################################

    _log('Read in KEGG DB files')

    As_description_dict, Bs_description_dict, Cs_description_dict, Ds_description_dict, D2ABCD_dict = _read_ko00001_keg(KEGG_DB_ko)

    # the sequence-to-KO table is only needed when hits against the DB are parsed
    db_seq_to_KO_dict = {}
    if run_blast is True:
        db_seq_to_KO_dict = _read_seq2ko(KEGG_DB_seq2ko)

    ########################################################################################################################

    # single input file
    if os.path.isfile(input_file_folder) is True:

        # check whether depth file exist
        if (depth_file is not None) and (os.path.isfile(depth_file) is False):
            _abort('specified depth file not found, program exited!')

        _log('Running KEGG annotation for 1 file with %s cores' % (num_threads))
        input_file_path, input_file_basename, input_file_ext = sep_path_basename_ext(input_file_folder)
        parse_blast_op_worker([input_file_folder, run_blast, As_description_dict, Bs_description_dict, Cs_description_dict, Ds_description_dict, D2ABCD_dict, db_seq_to_KO_dict, input_file_path, depth_file, pct_by_all])

    # input folder
    if os.path.isdir(input_file_folder) is True:

        input_file_re = '%s/*.%s' % (input_file_folder, file_extension)
        input_file_name_list = [os.path.basename(file_name) for file_name in glob.glob(input_file_re)]

        # for folder input the depth option must be a folder holding one
        # <basename>.depth file per input file
        if depth_file is not None:

            if os.path.isfile(depth_file) is True:
                _log('please provide the folder containing individual depth files (with extension .depth) for each of your input sequence file.')
                _abort('single depth file (not folder) detected, program exited!')

            if os.path.isdir(depth_file) is False:
                _abort('specified depth folder not found, program exited!')

            undetected_depth_file = []
            for input_seq_file in input_file_name_list:
                input_seq_file_basename = '.'.join(input_seq_file.split('.')[:-1])
                input_seq_file_depth = '%s/%s.depth' % (depth_file, input_seq_file_basename)
                if os.path.isfile(input_seq_file_depth) is False:
                    undetected_depth_file.append(input_seq_file_depth)

            if len(undetected_depth_file) > 0:
                _log('the following depth files not found, program exited!')
                print(','.join(undetected_depth_file))
                raise SystemExit(1)

        # output folder and the folder name used as prefix of the matrix files
        output_folder = '%s_KEGG_wd' % input_file_folder
        input_folder_name = input_file_folder.split('/')[-1]

        if run_blast is True:
            _log('Parsing Blast/Diamond results for %s input files with %s cores' % (len(input_file_name_list), num_threads))

        # parsing runs for both input modes (search results or provided KO tables)
        list_for_multiple_arguments_parse_blast_op = []
        for input_file in input_file_name_list:

            input_file_basename = '.'.join(input_file.split('.')[:-1])
            pwd_input_file = '%s/%s' % (input_file_folder, input_file)

            # get path to current depth file
            if depth_file is None:
                input_file_depth = None
            else:
                input_file_depth = '%s/%s.depth' % (depth_file, input_file_basename)

            list_for_multiple_arguments_parse_blast_op.append([pwd_input_file, run_blast, As_description_dict, Bs_description_dict, Cs_description_dict, Ds_description_dict, D2ABCD_dict, db_seq_to_KO_dict, output_folder, input_file_depth, pct_by_all])

        with mp.Pool(processes=num_threads) as pool:
            pool.map(parse_blast_op_worker, list_for_multiple_arguments_parse_blast_op)
            pool.close()
            pool.join()

        ######################################################### get dataframe #########################################################

        # kept inside the folder branch: output_folder and input_folder_name
        # are only defined for folder input
        _log('Data matrix exported to:')

        for ko_level in ['A', 'B', 'C', 'D']:
            annotation_df_GeneNumber = '%s/%s_%s_GeneNumber.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_GeneNumber_pct = '%s/%s_%s_GeneNumber_pct.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_GeneNumber_pct_by_all = '%s/%s_%s_GeneNumber_pct_by_all.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_TotalDepth = '%s/%s_%s_TotalDepth.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_TotalDepth_pct = '%s/%s_%s_TotalDepth_pct.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_TotalDepth_pct_by_all = '%s/%s_%s_TotalDepth_pct_by_all.txt' % (output_folder, input_folder_name, ko_level)

            #################### get GeneNumber df and report ####################

            get_KEGG_annot_df(output_folder, ko_level, annotation_df_GeneNumber, annotation_df_GeneNumber_pct, annotation_df_GeneNumber_pct_by_all, with_depth=False, pct_by_all=pct_by_all)

            print(annotation_df_GeneNumber.split('/')[-1])
            print(annotation_df_GeneNumber_pct.split('/')[-1])
            if pct_by_all is True:
                print(annotation_df_GeneNumber_pct_by_all.split('/')[-1])

            #################### get TotalDepth df and report ####################

            if depth_file is not None:
                get_KEGG_annot_df(output_folder, ko_level, annotation_df_TotalDepth, annotation_df_TotalDepth_pct, annotation_df_TotalDepth_pct_by_all, with_depth=True, pct_by_all=pct_by_all)

                print(annotation_df_TotalDepth.split('/')[-1])
                print(annotation_df_TotalDepth_pct.split('/')[-1])
                if pct_by_all is True:
                    print(annotation_df_TotalDepth_pct_by_all.split('/')[-1])

    ################################################## Final report ####################################################

    _log('Done!')
791
+
792
+
793
if __name__ == "__main__":

    # Command-line entry point: register every option from a single table,
    # then hand the parsed values to Annotation_KEGG as a plain dict.
    cli_option_table = (
        ('-seq_in', {'required': False, 'help': 'faa file'}),
        ('-ko_in', {'required': False, 'help': 'annotation results from BlastKOALA/GhostKOALA, normally with name user_ko.txt'}),
        ('-x', {'required': False, 'help': 'file extension'}),
        ('-depth', {'required': False, 'default': None, 'help': 'gene depth file/folder'}),
        ('-pct_by_all', {'required': False, 'action': 'store_true', 'help': 'normalize by all query genes, rather than those with ko assignment'}),
        ('-db_dir', {'required': True, 'help': 'folder holds sequence, seq2ko and ko00001.keg files'}),
        ('-diamond', {'required': False, 'action': 'store_true', 'help': 'run diamond (for big dataset), default is NCBI blastp'}),
        ('-t', {'required': False, 'default': 1, 'type': int, 'help': 'number of threads, default: 1'}),
        ('-evalue', {'required': False, 'default': 0.001, 'type': float, 'help': 'evalue cutoff, default: 0.001'}),
    )

    kegg_cli_parser = argparse.ArgumentParser()
    for option_flag, option_settings in cli_option_table:
        kegg_cli_parser.add_argument(option_flag, **option_settings)

    parsed_cli_options = kegg_cli_parser.parse_args()
    Annotation_KEGG(vars(parsed_cli_options))