rdrpcatch 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdrpcatch/__init__.py +0 -0
- rdrpcatch/cli/__init__.py +0 -0
- rdrpcatch/cli/args.py +358 -0
- rdrpcatch/rdrpcatch_scripts/__init__.py +0 -0
- rdrpcatch/rdrpcatch_scripts/fetch_dbs.py +302 -0
- rdrpcatch/rdrpcatch_scripts/format_pyhmmer_out.py +589 -0
- rdrpcatch/rdrpcatch_scripts/gui.py +256 -0
- rdrpcatch/rdrpcatch_scripts/mmseqs_tax.py +100 -0
- rdrpcatch/rdrpcatch_scripts/paths.py +162 -0
- rdrpcatch/rdrpcatch_scripts/plot.py +165 -0
- rdrpcatch/rdrpcatch_scripts/run_pyhmmer.py +155 -0
- rdrpcatch/rdrpcatch_scripts/run_seqkit.py +112 -0
- rdrpcatch/rdrpcatch_scripts/utils.py +414 -0
- rdrpcatch/rdrpcatch_wrapper.py +666 -0
- rdrpcatch-0.0.1.dist-info/METADATA +223 -0
- rdrpcatch-0.0.1.dist-info/RECORD +19 -0
- rdrpcatch-0.0.1.dist-info/WHEEL +4 -0
- rdrpcatch-0.0.1.dist-info/entry_points.txt +2 -0
- rdrpcatch-0.0.1.dist-info/licenses/LICENCE +9 -0
|
@@ -0,0 +1,666 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Wrapper for the RdRpCATCH package.
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
import warnings
|
|
9
|
+
warnings.filterwarnings("ignore", category=UserWarning, module="numpy") # see https://moyix.blogspot.com/2022/09/someones-been-messing-with-my-subnormals.html
|
|
10
|
+
|
|
11
|
+
def main():
    """Placeholder console entry point; currently performs no work."""
    pass
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# def run_gui():
|
|
16
|
+
#
|
|
17
|
+
# gui_runner = gui.colabscanner_gui()
|
|
18
|
+
# gui_runner.run()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def bundle_results(output_dir, prefix):
    """
    Bundle the results into a tar.gz file.

    :param output_dir: Path to the output directory.
    :type output_dir: str
    :param prefix: Prefix for the output files.
    :type prefix: str
    :return: Path to the bundled file
    :rtype: str
    """
    import tarfile
    import datetime

    # A timestamp in the archive name keeps repeated bundling runs from
    # overwriting each other.
    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    archive_path = os.path.join(
        output_dir, f"{prefix}_rdrpcatch_results_{stamp}.tar.gz"
    )

    expected_dirs = (
        f"{prefix}_rdrpcatch_fasta",
        f"{prefix}_rdrpcatch_plots",
        f"{prefix}_gff_files",
        "tmp",
    )

    with tarfile.open(archive_path, "w:gz") as archive:
        # Only archive the directories this run actually produced.
        for sub_dir in expected_dirs:
            candidate = os.path.join(output_dir, sub_dir)
            if os.path.exists(candidate):
                archive.add(candidate, arcname=sub_dir)

        # Include the annotated main output table, if present.
        main_table = os.path.join(
            output_dir, f"{prefix}_rdrpcatch_output_annotated.tsv"
        )
        if os.path.exists(main_table):
            archive.add(main_table, arcname=os.path.basename(main_table))

    return archive_path
|
|
55
|
+
|
|
56
|
+
def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e, incdomE, domE, incE, z, cpus, length_thr, gen_code, bundle, keep_tmp):
    """
    Run RdRpCATCH scan.

    :param input_file: Path to the input FASTA file.
    :type input_file: str
    :param output_dir: Path to the output directory.
    :type output_dir: str
    :param db_options: List of databases to search against (lower-cased names, or ['all']).
    :type db_options: list
    :param db_dir: Path to the directory containing RdRpCATCH databases.
    :type db_dir: str
    :param seq_type: Type of sequence (prot or nuc); auto-detected when falsy.
    :type seq_type: str
    :param verbose: Whether to print verbose output.
    :type verbose: bool
    :param e: E-value threshold for HMMsearch.
    :type e: float
    :param incdomE: Inclusion domain E-value threshold for HMMsearch.
    :type incdomE: float
    :param domE: Domain E-value threshold for HMMsearch.
    :type domE: float
    :param incE: Inclusion E-value threshold for HMMsearch.
    :type incE: float
    :param z: Number of sequences to search against.
    :type z: int
    :param cpus: Number of CPUs to use for HMMsearch.
    :type cpus: int
    :param length_thr: Minimum length threshold for seqkit seq.
    :type length_thr: int
    :param gen_code: Genetic code to use for translation.
    :type gen_code: int
    :param bundle: Whether to bundle the results into a tar.gz archive.
    :type bundle: bool
    :param keep_tmp: Whether to keep temporary files.
    :type keep_tmp: bool
    :return: Path to the taxonomically annotated RdRpCATCH output, or None if no hits were found.
    :raises FileExistsError: If the output directory already exists.
    :raises Exception: If the input FASTA is invalid or a database option is unknown.
    """
    from .rdrpcatch_scripts import utils
    from .rdrpcatch_scripts import paths
    from .rdrpcatch_scripts import run_pyhmmer
    from .rdrpcatch_scripts import fetch_dbs
    from .rdrpcatch_scripts import format_pyhmmer_out
    from .rdrpcatch_scripts import run_seqkit
    from .rdrpcatch_scripts import plot
    import polars as pl
    from .rdrpcatch_scripts import mmseqs_tax

    ## Ignore warnings
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

    ## Set output directories
    prefix = Path(input_file).stem
    outputs = paths.rdrpcatch_output(prefix, Path(output_dir))

    ## Set up logger; refuse to clobber a previous run's output directory.
    log_file = outputs.log_file
    if not os.path.exists(outputs.output_dir):
        os.makedirs(outputs.output_dir)
    else:
        raise FileExistsError(f"Output directory already exists: {outputs.output_dir}, please choose a different directory.")
    if not os.path.exists(outputs.log_dir):
        os.makedirs(outputs.log_dir)

    logger = utils.Logger(log_file)

    def log(message):
        # Progress messages go to the console only in verbose mode; either way
        # they are recorded in the log file.
        if verbose:
            logger.loud_log(message)
        else:
            logger.silent_log(message)

    # Record the run configuration in the log file only (never on the console).
    logger.silent_log(f"Input File: {input_file}")
    logger.silent_log(f"Output Directory: {output_dir}")
    logger.silent_log(f"Databases: {db_options}")
    logger.silent_log(f"Database Directory: {db_dir}")
    logger.silent_log(f"Sequence Type: {seq_type}")
    logger.silent_log(f"Verbose Mode: {'ON' if verbose else 'OFF'}")
    logger.silent_log(f"E-value: {e}")
    logger.silent_log(f"Inclusion E-value: {incE}")
    logger.silent_log(f"Domain E-value: {domE}")
    logger.silent_log(f"Inclusion Domain E-value: {incdomE}")
    logger.silent_log(f"Z-value: {z}")
    logger.silent_log(f"CPUs: {cpus}")
    logger.silent_log(f"Length Threshold: {length_thr}")
    logger.silent_log(f"Genetic Code: {gen_code}")
    logger.silent_log(f"Bundle Results: {'ON' if bundle else 'OFF'}")
    logger.silent_log(f"Save Temporary Files: {'ON' if keep_tmp else 'OFF'}")

    ## Start time
    start_time = logger.start_timer()

    ## Check fasta validity
    if not utils.fasta_checker(input_file, logger).check_fasta_validity():
        raise Exception("Invalid fasta file.")
    else:
        log(f"Valid fasta file: {input_file}")

    ## Auto-detect the sequence type when the caller did not specify one.
    if not seq_type:
        seq_type = utils.fasta_checker(input_file, logger).check_seq_type()
        log(f"Sequence type: {seq_type}")

    ## Check sequence length in .fasta files; overly long records break pyHMMER.
    if seq_type == 'nuc':
        utils.fasta_checker(input_file, logger).check_seq_length(300000)
    if seq_type == 'prot':
        utils.fasta_checker(input_file, logger).check_seq_length(100000)

    ## Fetch HMM databases - RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot
    rvmt_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("RVMT")
    log(f"RVMT HMM database fetched from: {rvmt_hmm_db}")
    neordrp_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("NeoRdRp")
    log(f"NeoRdRp HMM database fetched from: {neordrp_hmm_db}")
    neordrp_2_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("NeoRdRp.2.1")
    log(f"NeoRdRp.2.1 HMM database fetched from: {neordrp_2_hmm_db}")
    tsa_olen_fam_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("TSA_Olendraite_fam")
    log(f"TSA_Olendraite_fam HMM database fetched from: {tsa_olen_fam_hmm_db}")
    tsa_olen_gen_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("TSA_Olendraite_gen")
    log(f"TSA_Olendraite HMM database fetched from: {tsa_olen_gen_hmm_db}")
    rdrpscan_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("RDRP-scan")
    log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
    lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Lucaprot")
    log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")

    db_name_list = []
    db_path_list = []

    ## Map user-selected database options onto canonical names + profile paths.
    if db_options == ['all']:
        db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam", "TSA_Olendraite_gen", "RDRP-scan", "Lucaprot"]
        db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db, tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db]

    else:
        for db in db_options:
            if db == "RVMT".lower():
                db_name_list.append("RVMT")
                db_path_list.append(rvmt_hmm_db)
            elif db == "NeoRdRp".lower():
                db_name_list.append("NeoRdRp")
                db_path_list.append(neordrp_hmm_db)
            elif db == "NeoRdRp.2.1".lower():
                # Fixed: previously compared against the mixed-case literal
                # (which a lower-cased option can never equal) and appended the
                # lower-cased name to db_name_list, unlike every other branch.
                db_name_list.append("NeoRdRp.2.1")
                db_path_list.append(neordrp_2_hmm_db)
            elif db == "TSA_Olendraite_fam".lower():
                db_name_list.append("TSA_Olendraite_fam")
                db_path_list.append(tsa_olen_fam_hmm_db)
            elif db == "TSA_Olendraite_gen".lower():
                db_name_list.append("TSA_Olendraite_gen")
                db_path_list.append(tsa_olen_gen_hmm_db)
            elif db == "RDRP-scan".lower():
                db_name_list.append("RDRP-scan")
                db_path_list.append(rdrpscan_hmm_db)
            elif db == "Lucaprot".lower():
                db_name_list.append("Lucaprot")
                db_path_list.append(lucaprot_hmm_db)
            else:
                raise Exception(f"Invalid database option: {db}")

    # Fetch mmseqs database for the downstream taxonomic annotation steps.
    log("Fetching mmseqs databases.")
    mmseqs_db_path = fetch_dbs.db_fetcher(db_dir).fetch_mmseqs_db_path("mmseqs_refseq_riboviria_20250211")
    log(f"mmseqs database fetched from: {mmseqs_db_path}")

    if not os.path.exists(outputs.hmm_output_dir):
        outputs.hmm_output_dir.mkdir(parents=True)

    if not os.path.exists(outputs.formatted_hmm_output_dir):
        outputs.formatted_hmm_output_dir.mkdir(parents=True)

    if not os.path.exists(outputs.tsv_outdir):
        outputs.tsv_outdir.mkdir(parents=True)

    if not os.path.exists(outputs.plot_outdir):
        outputs.plot_outdir.mkdir(parents=True)

    if not os.path.exists(outputs.tmp_dir):
        outputs.tmp_dir.mkdir(parents=True)

    if seq_type == 'nuc':
        log("Nucleotide sequence detected.")

        set_dict = {}
        translated_set_dict = {}
        df_list = []

        ## Filter out short sequences with seqkit (threshold is length_thr).
        log("Filtering out sequences with length less than 400 bp.")

        if not os.path.exists(outputs.seqkit_seq_output_dir):
            outputs.seqkit_seq_output_dir.mkdir(parents=True)

        run_seqkit.seqkit(input_file, outputs.seqkit_seq_output_path, log_file, threads=cpus, logger=logger).run_seqkit_seq(length_thr)
        log(f"Filtered sequence written to: {outputs.seqkit_seq_output_path}")

        ## Translate nucleotide sequences to protein sequences (6 frames) with seqkit.
        log("Translating nucleotide sequences to protein sequences.")

        if not os.path.exists(outputs.seqkit_translate_output_dir):
            outputs.seqkit_translate_output_dir.mkdir(parents=True)

        run_seqkit.seqkit(outputs.seqkit_seq_output_path, outputs.seqkit_translate_output_path, log_file, threads=cpus, logger=logger).run_seqkit_translate(gen_code, 6)
        log(f"Translated sequence written to: {outputs.seqkit_translate_output_path}")

        ## Search every selected profile database against the translated contigs.
        for db_name, db_path in zip(db_name_list, db_path_list):

            log(f"HMM output path: {outputs.hmm_output_path(db_name)}")

            start_hmmsearch_time = logger.start_timer()
            run_pyhmmer.pyhmmsearch(outputs.hmm_output_path(db_name), outputs.seqkit_translate_output_path, db_path, cpus, e, incdomE, domE, incE,
                                    z).run_pyhmmsearch()
            end_hmmsearch_time = logger.stop_timer(start_hmmsearch_time, verbose)
            log(f"{db_name} HMMsearch Runtime: {end_hmmsearch_time}")
            log(f"Pyhmmer output written to: {outputs.hmm_output_path(db_name)}")

            if not os.path.exists(outputs.formatted_hmm_output_dir):
                outputs.formatted_hmm_output_dir.mkdir(parents=True)

            format_pyhmmer_out.hmmsearch_formatter(outputs.hmm_output_path(db_name), outputs.formatted_hmm_output_path(db_name), seq_type)
            log(f"Formatted Pyhmmer output written to: {outputs.formatted_hmm_output_path(db_name)}")

            if not os.path.exists(outputs.best_hit_dir):
                outputs.best_hit_dir.mkdir(parents=True)

            format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name), seq_type, logger).highest_bitscore_hits(
                outputs.best_hit_path(db_name))
            log(f"Highest Bitscore hits written to: {outputs.best_hit_path(db_name)}")

            # Contig-name sets per database: one keyed on nucleotide contig
            # names, one on translated (protein) record names.
            set_dict[db_name] = format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name),
                                                                            seq_type, logger).hmm_to_contig_set()
            translated_set_dict[db_name] = format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name),
                                                                                       'prot', logger).hmm_to_contig_set()

            # Convert to dataframe, add db_name column and append to df_list
            df = pl.read_csv(outputs.best_hit_path(db_name), separator='\t')
            df = df.with_columns([
                pl.lit(db_name).alias('db_name')
            ])
            df_list.append(df)

        if not os.path.exists(outputs.plot_outdir):
            outputs.plot_outdir.mkdir(parents=True)

        if not os.path.exists(outputs.tsv_outdir):
            outputs.tsv_outdir.mkdir(parents=True)

        # Combine all the dataframes and normalise numeric columns to Float64
        # so per-database results concatenate cleanly.
        combined_df = pl.concat(df_list, how='vertical')
        for col in ['E-value', 'score', 'norm_bitscore_profile', 'norm_bitscore_contig',
                    'ID_score', 'profile_coverage', 'contig_coverage']:
            combined_df = combined_df.with_columns([
                pl.col(col).cast(pl.Float64)
            ])

        combined_df.write_csv(outputs.combined_tsv_path, separator="\t")

        # Bail out early when no database produced a hit.
        if combined_df.is_empty():
            logger.loud_log("No hits found by RdRpCATCH. Exiting.")
            return None

        # An upset plot only makes sense with more than one database.
        if len(db_name_list) > 1:
            log("Generating upset plot.")
            plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).upset_plotter(set_dict)

        log(f"Combined dataframe written to: {outputs.combined_tsv_path}")

        # Generate the per-metric distribution plots.
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_evalue(combined_df)
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_score(combined_df)
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_norm_bitscore_profile(combined_df)
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_norm_bitscore_contig(combined_df)
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_ID_score(combined_df)
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_profile_coverage(combined_df)
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_contig_coverage(combined_df)

        # Union of hit contigs across all databases.
        combined_set = set.union(*[value for value in set_dict.values()])
        translated_combined_set = set.union(*[value for value in translated_set_dict.values()])

        # Write fasta files with all hit contigs (nucleotide and translated).
        if not os.path.exists(outputs.fasta_output_dir):
            outputs.fasta_output_dir.mkdir(parents=True)

        # NOTE(review): utils.fasta is constructed without a logger here but
        # with one further down — confirm the logger argument is optional.
        utils.fasta(input_file).write_fasta(utils.fasta(input_file).extract_contigs(combined_set), outputs.fasta_nuc_out_path)

        utils.fasta(outputs.seqkit_translate_output_path).write_fasta(utils.fasta(outputs.seqkit_translate_output_path).extract_contigs(translated_combined_set),
                                                                      outputs.fasta_prot_out_path)

        if not os.path.exists(outputs.gff_output_dir):
            outputs.gff_output_dir.mkdir(parents=True)
        hmm_writer = format_pyhmmer_out.hmmsearch_output_writter(logger)
        hmm_writer.write_hmmsearch_hits(outputs.combined_tsv_path, seq_type, outputs.rdrpcatch_output, outputs.gff_output_path)
        rdrp_coords_list = hmm_writer.get_rdrp_coords(outputs.rdrpcatch_output, seq_type)
        utils.fasta(outputs.seqkit_translate_output_path, logger).write_fasta_coords(rdrp_coords_list, outputs.fasta_trimmed_out_path, seq_type)

        log(f"Contigs written to: {outputs.fasta_nuc_out_path}")
        log(f"Translated contigs written to: {outputs.fasta_prot_out_path}")

        if not os.path.exists(outputs.mmseqs_tax_output_dir):
            outputs.mmseqs_tax_output_dir.mkdir(parents=True)

        log("Running mmseqs easy-taxonomy for taxonomic annotation.")

        mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_tax_output_prefix,
                          outputs.mmseqs_tax_output_dir, 7, cpus, outputs.mmseqs_tax_log_path).run_mmseqs_easy_tax_lca()

        log("Running mmseqs easy-search for taxonomic annotation.")

        if not os.path.exists(outputs.mmseqs_e_search_output_dir):
            outputs.mmseqs_e_search_output_dir.mkdir(parents=True)

        # NOTE(review): positional argument order here (dir before path)
        # differs from the easy-taxonomy call above (prefix before dir) —
        # confirm against mmseqs_tax.mmseqs's signature.
        mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_e_search_output_dir,
                          outputs.mmseqs_e_search_output_path, 7, cpus, outputs.mmseqs_e_search_log_path).run_mmseqs_e_search()

        # Merge taxonomic annotation into the final RdRpCATCH output table.
        utils.mmseqs_parser(outputs.mmseqs_tax_output_lca_path, outputs.mmseqs_e_search_output_path).tax_to_rdrpcatch(
            outputs.rdrpcatch_output, outputs.extended_rdrpcatch_output, seq_type)

    elif seq_type == 'prot':

        log("Protein sequence detected.")

        set_dict = {}
        df_list = []

        ## Search every selected profile database directly against the input proteins.
        for db_name, db_path in zip(db_name_list, db_path_list):

            log(f"HMM output path: {outputs.hmm_output_path(db_name)}")
            start_hmmsearch_time = logger.start_timer()
            hmm_out = run_pyhmmer.pyhmmsearch(outputs.hmm_output_path(db_name), input_file, db_path, cpus, e, incdomE, domE, incE, z).run_pyhmmsearch()
            end_hmmsearch_time = logger.stop_timer(start_hmmsearch_time, verbose)
            log(f"{db_name} HMMsearch Runtime: {end_hmmsearch_time}")
            log(f"Pyhmmer output written to: {hmm_out}")

            if not os.path.exists(outputs.formatted_hmm_output_dir):
                outputs.formatted_hmm_output_dir.mkdir(parents=True)

            format_pyhmmer_out.hmmsearch_formatter(hmm_out, outputs.formatted_hmm_output_path(db_name), seq_type)
            log(f"Formatted Pyhmmer output written to: {outputs.formatted_hmm_output_path(db_name)}")

            # Extract Highest Bitscore hits from the formatted hmm output
            if not os.path.exists(outputs.best_hit_dir):
                outputs.best_hit_dir.mkdir(parents=True)

            format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name), seq_type, logger).highest_bitscore_hits(outputs.best_hit_path(db_name))

            log(f"Highest Bitscore hits written to: {outputs.best_hit_path(db_name)}")

            # "nuc" is passed on purpose here (overriding prot): the helper
            # then keeps the contig name, which is needed to extract contigs.
            set_dict[db_name] = format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name), "nuc", logger).hmm_to_contig_set()

            # Convert to dataframe, add db_name column and append to df_list
            df = pl.read_csv(outputs.best_hit_path(db_name), separator='\t')
            df = df.with_columns([
                pl.lit(db_name).alias('db_name')
            ])
            df_list.append(df)

        if not os.path.exists(outputs.plot_outdir):
            outputs.plot_outdir.mkdir(parents=True)

        if not os.path.exists(outputs.tsv_outdir):
            outputs.tsv_outdir.mkdir(parents=True)

        # Combine all the dataframes and normalise numeric columns to Float64.
        combined_df = pl.concat(df_list, how='vertical')
        for col in ['E-value', 'score', 'norm_bitscore_profile', 'norm_bitscore_contig',
                    'ID_score', 'profile_coverage', 'contig_coverage']:
            combined_df = combined_df.with_columns([
                pl.col(col).cast(pl.Float64)
            ])

        combined_df.write_csv(outputs.combined_tsv_path, separator="\t")

        # Bail out early when no database produced a hit.
        if combined_df.is_empty():
            logger.loud_log("No hits found by RdRpCATCH. Exiting.")
            return None

        if len(db_name_list) > 1:
            log("Generating upset plot.")
            plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).upset_plotter(set_dict)

        log(f"Combined dataframe written to: {outputs.combined_tsv_path}")

        # Generate the per-metric distribution plots.
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_evalue(combined_df)
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_score(combined_df)
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_norm_bitscore_profile(combined_df)
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_norm_bitscore_contig(combined_df)
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_ID_score(combined_df)
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_profile_coverage(combined_df)
        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_contig_coverage(combined_df)

        # Union of hit records across all databases.
        combined_set = set.union(*[value for value in set_dict.values()])

        # Write a fasta file with all the hit records.
        if not os.path.exists(outputs.fasta_output_dir):
            outputs.fasta_output_dir.mkdir(parents=True)

        utils.fasta(input_file).write_fasta(utils.fasta(input_file).extract_contigs(combined_set), outputs.fasta_prot_out_path)

        log(f"Contigs written to: {outputs.fasta_prot_out_path}")

        if not os.path.exists(outputs.gff_output_dir):
            outputs.gff_output_dir.mkdir(parents=True)

        hmm_writer = format_pyhmmer_out.hmmsearch_output_writter(logger)
        hmm_writer.write_hmmsearch_hits(outputs.combined_tsv_path, seq_type, outputs.rdrpcatch_output, outputs.gff_output_path)
        rdrp_coords_list = hmm_writer.get_rdrp_coords(outputs.rdrpcatch_output, seq_type)
        utils.fasta(input_file, logger).write_fasta_coords(rdrp_coords_list, outputs.fasta_trimmed_out_path, seq_type)

        log(f"RdRpCATCH output file written to: {outputs.fasta_prot_out_path}")

        if not os.path.exists(outputs.mmseqs_tax_output_dir):
            outputs.mmseqs_tax_output_dir.mkdir(parents=True)

        log("Running mmseqs easy-taxonomy for taxonomic annotation.")

        mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_tax_output_prefix,
                          outputs.mmseqs_tax_output_dir, 7, cpus, outputs.mmseqs_tax_log_path).run_mmseqs_easy_tax_lca()

        if not os.path.exists(outputs.mmseqs_e_search_output_dir):
            outputs.mmseqs_e_search_output_dir.mkdir(parents=True)

        log("Running mmseqs easy-search for taxonomic annotation.")

        mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_e_search_output_dir,
                          outputs.mmseqs_e_search_output_path, 7, cpus, outputs.mmseqs_e_search_log_path).run_mmseqs_e_search()

        # Merge taxonomic annotation into the final RdRpCATCH output table.
        utils.mmseqs_parser(outputs.mmseqs_tax_output_lca_path, outputs.mmseqs_e_search_output_path).tax_to_rdrpcatch(
            outputs.rdrpcatch_output, outputs.extended_rdrpcatch_output, seq_type)

    end_time = logger.stop_timer(start_time, verbose)
    log(f"Total Runtime: {end_time}")

    if not keep_tmp:
        log("Deleting temporary files.")

        try:
            import shutil
            shutil.rmtree(outputs.tmp_dir)
            logger.silent_log("Temporary files deleted.")
        except FileNotFoundError:
            print(f"Directory '{outputs.tmp_dir}' does not exist.")
        except PermissionError:
            print(f"Permission denied while trying to delete '{outputs.tmp_dir}'.")
        except Exception as err:  # renamed from `e` to avoid shadowing the E-value parameter
            print(f"An error occurred: {err}")

    # Bundle results
    if bundle:
        archive_path = bundle_results(output_dir, prefix)
        log(f"Results bundled into: {archive_path}")

    return outputs.extended_rdrpcatch_output
|
|
664
|
+
|
|
665
|
+
# Allow running this wrapper module directly as a script.
if __name__ == "__main__":
    main()
|