rdrpcatch 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,666 @@
1
+ """
2
+ Wrapper for the RdRpCATCH package.
3
+
4
+ """
5
+ import os
6
+ from pathlib import Path
7
+ from rich.console import Console
8
+ import warnings
9
+ warnings.filterwarnings("ignore", category=UserWarning, module="numpy") # see https://moyix.blogspot.com/2022/09/someones-been-messing-with-my-subnormals.html
10
+
11
def main():
    """Placeholder entry point: performs no work and returns None."""
    return None
13
+
14
+
15
+ # def run_gui():
16
+ #
17
+ # gui_runner = gui.colabscanner_gui()
18
+ # gui_runner.run()
19
+
20
+
21
def bundle_results(output_dir, prefix):
    """
    Pack the result directories and the annotated table into one tar.gz archive.

    Entries that do not exist under ``output_dir`` are skipped. The archive is
    written into ``output_dir`` itself and its name embeds the current local
    time with one-second resolution.

    :param output_dir: Directory holding the results; also receives the archive.
    :type output_dir: str
    :param prefix: Prefix shared by the result file and directory names.
    :type prefix: str
    :return: Path of the archive that was written.
    :rtype: str
    """
    import tarfile
    import datetime

    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    archive_path = os.path.join(output_dir, f"{prefix}_rdrpcatch_results_{stamp}.tar.gz")

    # Directories that go into the archive, stored under their bare names.
    folders = (
        f"{prefix}_rdrpcatch_fasta",
        f"{prefix}_rdrpcatch_plots",
        f"{prefix}_gff_files",
        "tmp",
    )
    table = os.path.join(output_dir, f"{prefix}_rdrpcatch_output_annotated.tsv")

    with tarfile.open(archive_path, "w:gz") as archive:
        for folder in folders:
            source = os.path.join(output_dir, folder)
            if not os.path.exists(source):
                continue
            archive.add(source, arcname=folder)

        # The main output table is added last, under its file name only.
        if os.path.exists(table):
            archive.add(table, arcname=os.path.basename(table))

    return archive_path
55
+
56
# Canonical HMM database names, in the order used when db_options == ['all'].
_HMM_DB_NAMES = (
    "RVMT",
    "NeoRdRp",
    "NeoRdRp.2.1",
    "TSA_Olendraite_fam",
    "TSA_Olendraite_gen",
    "RDRP-scan",
    "Lucaprot",
)

# Columns of the combined best-hit table that are cast to Float64 before writing.
_HIT_FLOAT_COLUMNS = (
    'E-value', 'score', 'norm_bitscore_profile', 'norm_bitscore_contig',
    'ID_score', 'profile_coverage', 'contig_coverage',
)


def _log(logger, verbose, message):
    """Emit *message* via logger.loud_log when verbose is truthy, else via logger.silent_log."""
    if verbose:
        logger.loud_log(message)
    else:
        logger.silent_log(message)


def _ensure_dir(directory):
    """Create *directory* (a pathlib.Path) with its parents unless the path already exists."""
    if not os.path.exists(directory):
        directory.mkdir(parents=True)


def _select_hmm_databases(db_options, hmm_db_paths):
    """
    Resolve the requested database options to canonical names and paths.

    :param db_options: ``['all']`` or a list of database names (matched case-insensitively).
    :param hmm_db_paths: Mapping of canonical database name -> database path,
        in the order to use for ``['all']``.
    :return: Tuple ``(names, paths)`` of two parallel lists.
    :raises Exception: If an option does not name a known database.
    :raises ValueError: If the selection is empty.
    """
    if db_options == ['all']:
        names = list(hmm_db_paths)
    else:
        # Options are compared lowercased, so "neordrp.2.1" and "NeoRdRp.2.1"
        # both resolve to the canonical "NeoRdRp.2.1" label. The label is what
        # ends up in output paths and in the db_name column.
        canonical_by_lowercase = {name.lower(): name for name in hmm_db_paths}
        names = []
        for option in db_options:
            try:
                names.append(canonical_by_lowercase[str(option).lower()])
            except KeyError:
                raise Exception(f"Invalid database option: {option}") from None

    if not names:
        # Without this check the run would only fail later, when an empty list
        # of tables is concatenated.
        raise ValueError("No HMM databases selected.")

    return names, [hmm_db_paths[name] for name in names]


def _search_hmm_databases(db_name_list, db_path_list, query_fasta, outputs, seq_type,
                          search_args, logger, verbose):
    """
    Run pyhmmer hmmsearch against every selected database and collect the best hits.

    :param db_name_list: Canonical database names.
    :param db_path_list: Database paths, parallel to ``db_name_list``.
    :param query_fasta: Protein FASTA file that is searched.
    :param outputs: Output-path object created by ``paths.rdrpcatch_output``.
    :param seq_type: ``'nuc'`` or ``'prot'``.
    :param search_args: ``(cpus, e, incdomE, domE, incE, z)`` forwarded to ``pyhmmsearch``.
    :param logger: Logger created by ``utils.Logger``.
    :param verbose: Selects loud or silent logging.
    :return: ``(df_list, set_dict, translated_set_dict)``: the per-database
        best-hit tables, and the contig sets per database collected in 'nuc'
        and (nucleotide input only) 'prot' mode.
    """
    from .rdrpcatch_scripts import format_pyhmmer_out
    from .rdrpcatch_scripts import run_pyhmmer
    import polars as pl

    set_dict = {}
    translated_set_dict = {}
    df_list = []

    for db_name, db_path in zip(db_name_list, db_path_list):
        raw_hits_path = outputs.hmm_output_path(db_name)
        formatted_path = outputs.formatted_hmm_output_path(db_name)
        best_hit_path = outputs.best_hit_path(db_name)

        _log(logger, verbose, f"HMM output path: {raw_hits_path}")

        start_hmmsearch_time = logger.start_timer()
        returned_path = run_pyhmmer.pyhmmsearch(raw_hits_path, query_fasta, db_path,
                                                *search_args).run_pyhmmsearch()
        end_hmmsearch_time = logger.stop_timer(start_hmmsearch_time, verbose)
        _log(logger, verbose, f"{db_name} HMMsearch Runtime: {end_hmmsearch_time}")

        # The protein workflow consumes the value returned by run_pyhmmsearch(),
        # the nucleotide workflow the configured output path. Both are kept
        # because run_pyhmmsearch() is defined outside this file.
        hmm_out = returned_path if seq_type == 'prot' else raw_hits_path
        _log(logger, verbose, f"Pyhmmer output written to: {hmm_out}")

        _ensure_dir(outputs.formatted_hmm_output_dir)
        format_pyhmmer_out.hmmsearch_formatter(hmm_out, formatted_path, seq_type)
        _log(logger, verbose, f"Formatted Pyhmmer output written to: {formatted_path}")

        # Extract the highest-bitscore hits from the formatted hmm output.
        _ensure_dir(outputs.best_hit_dir)
        format_pyhmmer_out.hmmsearch_format_helpers(formatted_path, seq_type,
                                                    logger).highest_bitscore_hits(best_hit_path)
        _log(logger, verbose, f"Highest Bitscore hits written to: {best_hit_path}")

        # Contig names are always collected in 'nuc' mode, also for protein
        # input, because they are needed to extract the contigs afterwards.
        set_dict[db_name] = format_pyhmmer_out.hmmsearch_format_helpers(
            formatted_path, 'nuc', logger).hmm_to_contig_set()
        if seq_type == 'nuc':
            translated_set_dict[db_name] = format_pyhmmer_out.hmmsearch_format_helpers(
                formatted_path, 'prot', logger).hmm_to_contig_set()

        # Load the best hits, tag them with their database and keep the table.
        df = pl.read_csv(best_hit_path, separator='\t')
        df_list.append(df.with_columns([pl.lit(db_name).alias('db_name')]))

    return df_list, set_dict, translated_set_dict


def _combine_and_plot_hits(df_list, set_dict, db_name_list, outputs, prefix, logger, verbose):
    """
    Merge the per-database best-hit tables, write them to disk and draw the plots.

    :return: The combined polars DataFrame, or ``None`` when it holds no rows
        (the combined TSV is still written in that case).
    """
    from .rdrpcatch_scripts import plot
    import polars as pl

    _ensure_dir(outputs.plot_outdir)
    _ensure_dir(outputs.tsv_outdir)

    combined_df = pl.concat(df_list, how='vertical')
    combined_df = combined_df.with_columns(
        [pl.col(column).cast(pl.Float64) for column in _HIT_FLOAT_COLUMNS]
    )
    combined_df.write_csv(outputs.combined_tsv_path, separator="\t")

    if combined_df.is_empty():
        logger.loud_log("No hits found by RdRpCATCH. Exiting.")
        return None

    def new_plotter():
        # A fresh Plotter per plot, exactly as the original code did.
        return plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix)

    if len(db_name_list) > 1:
        _log(logger, verbose, "Generating upset plot.")
        new_plotter().upset_plotter(set_dict)

    _log(logger, verbose, f"Combined dataframe written to: {outputs.combined_tsv_path}")

    new_plotter().plot_evalue(combined_df)
    new_plotter().plot_score(combined_df)
    new_plotter().plot_norm_bitscore_profile(combined_df)
    new_plotter().plot_norm_bitscore_contig(combined_df)
    new_plotter().plot_ID_score(combined_df)
    new_plotter().plot_profile_coverage(combined_df)
    new_plotter().plot_contig_coverage(combined_df)

    return combined_df


def _write_rdrp_domain_outputs(outputs, seq_type, source_fasta, logger):
    """Write the hit table and GFF, then the coordinate-trimmed FASTA cut from *source_fasta*."""
    from .rdrpcatch_scripts import format_pyhmmer_out
    from .rdrpcatch_scripts import utils

    _ensure_dir(outputs.gff_output_dir)
    hmm_writer = format_pyhmmer_out.hmmsearch_output_writter(logger)
    hmm_writer.write_hmmsearch_hits(outputs.combined_tsv_path, seq_type,
                                    outputs.rdrpcatch_output, outputs.gff_output_path)
    rdrp_coords_list = hmm_writer.get_rdrp_coords(outputs.rdrpcatch_output, seq_type)
    utils.fasta(source_fasta, logger).write_fasta_coords(
        rdrp_coords_list, outputs.fasta_trimmed_out_path, seq_type)


def _annotate_taxonomy(outputs, mmseqs_db_path, cpus, seq_type, logger, verbose):
    """Run mmseqs easy-taxonomy and easy-search on the hit proteins and merge the results."""
    from .rdrpcatch_scripts import mmseqs_tax
    from .rdrpcatch_scripts import utils

    # NOTE(review): the literal 7 is forwarded unchanged to mmseqs_tax.mmseqs;
    # its meaning is defined there (presumably a sensitivity setting, confirm).
    _ensure_dir(outputs.mmseqs_tax_output_dir)
    _log(logger, verbose, "Running mmseqs easy-taxonomy for taxonomic annotation.")
    mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_tax_output_prefix,
                      outputs.mmseqs_tax_output_dir, 7, cpus,
                      outputs.mmseqs_tax_log_path).run_mmseqs_easy_tax_lca()

    _ensure_dir(outputs.mmseqs_e_search_output_dir)
    _log(logger, verbose, "Running mmseqs easy-search for taxonomic annotation.")
    mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_e_search_output_dir,
                      outputs.mmseqs_e_search_output_path, 7, cpus,
                      outputs.mmseqs_e_search_log_path).run_mmseqs_e_search()

    utils.mmseqs_parser(outputs.mmseqs_tax_output_lca_path,
                        outputs.mmseqs_e_search_output_path).tax_to_rdrpcatch(
        outputs.rdrpcatch_output, outputs.extended_rdrpcatch_output, seq_type)


def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e, incdomE, domE, incE, z,
             cpus, length_thr, gen_code, bundle, keep_tmp):
    """
    Run RdRpCATCH scan.

    Validates the input FASTA, searches it (after length filtering and
    six-frame translation for nucleotide input) against the selected HMM
    databases, writes the combined hit table, plots, FASTA and GFF outputs,
    and adds taxonomic annotation with mmseqs. Any ``seq_type`` other than
    ``'nuc'`` or ``'prot'`` skips the search and goes straight to the final
    timing, cleanup and bundling steps.

    :param input_file: Path to the input FASTA file.
    :type input_file: str
    :param output_dir: Path to the output directory.
    :type output_dir: str
    :param db_options: List of databases to search against, or ``['all']``.
        Names are matched case-insensitively.
    :type db_options: list
    :param db_dir: Path to the directory containing RdRpCATCH databases.
    :type db_dir: str
    :param seq_type: Type of sequence (prot or nuc). Detected from the input
        when falsy.
    :type seq_type: str
    :param verbose: Whether to print verbose output.
    :type verbose: bool
    :param e: E-value threshold for HMMsearch.
    :type e: float
    :param incdomE: Inclusion domain E-value threshold for HMMsearch.
    :type incdomE: float
    :param domE: Domain E-value threshold for HMMsearch.
    :type domE: float
    :param incE: Inclusion E-value threshold for HMMsearch.
    :type incE: float
    :param z: Number of sequences to search against.
    :type z: int
    :param cpus: Number of CPUs to use for HMMsearch.
    :type cpus: int
    :param length_thr: Minimum length threshold for seqkit seq.
    :type length_thr: int
    :param gen_code: Genetic code to use for translation.
    :type gen_code: int
    :param bundle: Whether to bundle the results into a tar.gz archive.
    :type bundle: bool
    :param keep_tmp: Whether to keep the temporary directory.
    :type keep_tmp: bool
    :return: ``outputs.extended_rdrpcatch_output``, or ``None`` when no hits
        were found.
    :raises FileExistsError: If the output directory already exists.
    :raises Exception: If the FASTA file is invalid or a database option is unknown.
    :raises ValueError: If no database is selected.
    """
    import shutil

    from .rdrpcatch_scripts import utils
    from .rdrpcatch_scripts import paths
    from .rdrpcatch_scripts import fetch_dbs
    from .rdrpcatch_scripts import run_seqkit

    ## Ignore warnings
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

    ## Set output directories
    prefix = Path(input_file).stem
    outputs = paths.rdrpcatch_output(prefix, Path(output_dir))

    ## Set up logger (an existing output directory is never reused)
    log_file = outputs.log_file
    if not os.path.exists(outputs.output_dir):
        os.makedirs(outputs.output_dir)
    else:
        raise FileExistsError(f"Output directory already exists: {outputs.output_dir}, please choose a different directory.")
    if not os.path.exists(outputs.log_dir):
        os.makedirs(outputs.log_dir)

    logger = utils.Logger(log_file)

    def log(message):
        _log(logger, verbose, message)

    # Record the run configuration in the log.
    for label, value in (
        ("Input File", input_file),
        ("Output Directory", output_dir),
        ("Databases", db_options),
        ("Database Directory", db_dir),
        ("Sequence Type", seq_type),
        ("Verbose Mode", 'ON' if verbose else 'OFF'),
        ("E-value", e),
        ("Inclusion E-value", incE),
        ("Domain E-value", domE),
        ("Inclusion Domain E-value", incdomE),
        ("Z-value", z),
        ("CPUs", cpus),
        ("Length Threshold", length_thr),
        ("Genetic Code", gen_code),
        ("Bundle Results", 'ON' if bundle else 'OFF'),
        ("Save Temporary Files", 'ON' if keep_tmp else 'OFF'),
    ):
        logger.silent_log(f"{label}: {value}")

    ## Start time
    start_time = logger.start_timer()

    ## Check fasta validity
    if not utils.fasta_checker(input_file, logger).check_fasta_validity():
        raise Exception("Invalid fasta file.")
    log(f"Valid fasta file: {input_file}")

    ## Check sequence type (auto-detect only when the caller did not provide one)
    if not seq_type:
        seq_type = utils.fasta_checker(input_file, logger).check_seq_type()
    log(f"Sequence type: {seq_type}")

    ## Check sequence lengths against a per-type ceiling (300000 for nucleotide,
    ## 100000 for protein input). The original note says pyHMMER breaks on
    ## sequences longer than 100000.
    if seq_type == 'nuc':
        utils.fasta_checker(input_file, logger).check_seq_length(300000)
    if seq_type == 'prot':
        utils.fasta_checker(input_file, logger).check_seq_length(100000)

    ## Fetch every HMM database path, then keep the ones that were requested
    hmm_db_paths = {}
    for name in _HMM_DB_NAMES:
        hmm_db_paths[name] = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(name)
        log(f"{name} HMM database fetched from: {hmm_db_paths[name]}")

    db_name_list, db_path_list = _select_hmm_databases(db_options, hmm_db_paths)

    # Fetch mmseqs database
    log("Fetching mmseqs databases.")
    mmseqs_db_path = fetch_dbs.db_fetcher(db_dir).fetch_mmseqs_db_path("mmseqs_refseq_riboviria_20250211")
    log(f"mmseqs database fetched from: {mmseqs_db_path}")

    for directory in (outputs.hmm_output_dir, outputs.formatted_hmm_output_dir,
                      outputs.tsv_outdir, outputs.plot_outdir, outputs.tmp_dir):
        _ensure_dir(directory)

    search_args = (cpus, e, incdomE, domE, incE, z)

    if seq_type == 'nuc':
        log("Nucleotide sequence detected.")

        ## Filter out sequences shorter than the length threshold with seqkit
        log(f"Filtering out sequences with length less than {length_thr} bp.")
        _ensure_dir(outputs.seqkit_seq_output_dir)
        run_seqkit.seqkit(input_file, outputs.seqkit_seq_output_path, log_file,
                          threads=cpus, logger=logger).run_seqkit_seq(length_thr)
        log(f"Filtered sequence written to: {outputs.seqkit_seq_output_path}")

        ## Translate nucleotide sequences to protein sequences with seqkit
        log("Translating nucleotide sequences to protein sequences.")
        _ensure_dir(outputs.seqkit_translate_output_dir)
        run_seqkit.seqkit(outputs.seqkit_seq_output_path, outputs.seqkit_translate_output_path, log_file,
                          threads=cpus, logger=logger).run_seqkit_translate(gen_code, 6)
        log(f"Translated sequence written to: {outputs.seqkit_translate_output_path}")

        df_list, set_dict, translated_set_dict = _search_hmm_databases(
            db_name_list, db_path_list, outputs.seqkit_translate_output_path, outputs,
            seq_type, search_args, logger, verbose)

        if _combine_and_plot_hits(df_list, set_dict, db_name_list, outputs, prefix,
                                  logger, verbose) is None:
            return None

        # Extract all the contigs
        combined_set = set().union(*set_dict.values())
        translated_combined_set = set().union(*translated_set_dict.values())

        # Write fasta files with all the contigs (nucleotide and translated)
        _ensure_dir(outputs.fasta_output_dir)
        utils.fasta(input_file).write_fasta(
            utils.fasta(input_file).extract_contigs(combined_set), outputs.fasta_nuc_out_path)
        utils.fasta(outputs.seqkit_translate_output_path).write_fasta(
            utils.fasta(outputs.seqkit_translate_output_path).extract_contigs(translated_combined_set),
            outputs.fasta_prot_out_path)

        _write_rdrp_domain_outputs(outputs, seq_type, outputs.seqkit_translate_output_path, logger)

        log(f"Contigs written to: {outputs.fasta_nuc_out_path}")
        log(f"Translated contigs written to: {outputs.fasta_prot_out_path}")

        _annotate_taxonomy(outputs, mmseqs_db_path, cpus, seq_type, logger, verbose)

    elif seq_type == 'prot':
        log("Protein sequence detected.")

        df_list, set_dict, _ = _search_hmm_databases(
            db_name_list, db_path_list, input_file, outputs,
            seq_type, search_args, logger, verbose)

        if _combine_and_plot_hits(df_list, set_dict, db_name_list, outputs, prefix,
                                  logger, verbose) is None:
            return None

        # Extract all the contigs
        combined_set = set().union(*set_dict.values())

        # Write a fasta file with all the contigs
        _ensure_dir(outputs.fasta_output_dir)
        utils.fasta(input_file).write_fasta(
            utils.fasta(input_file).extract_contigs(combined_set), outputs.fasta_prot_out_path)
        log(f"Contigs written to: {outputs.fasta_prot_out_path}")

        _write_rdrp_domain_outputs(outputs, seq_type, input_file, logger)
        log(f"RdRpCATCH output file written to: {outputs.fasta_prot_out_path}")

        _annotate_taxonomy(outputs, mmseqs_db_path, cpus, seq_type, logger, verbose)

    end_time = logger.stop_timer(start_time, verbose)
    log(f"Total Runtime: {end_time}")

    if not keep_tmp:
        log("Deleting temporary files.")
        # Best-effort cleanup: a failure is reported but does not abort the run.
        try:
            shutil.rmtree(outputs.tmp_dir)
            logger.silent_log("Temporary files deleted.")
        except FileNotFoundError:
            print(f"Directory '{outputs.tmp_dir}' does not exist.")
        except PermissionError:
            print(f"Permission denied while trying to delete '{outputs.tmp_dir}'.")
        except Exception as err:  # not named `e`: that would unbind the E-value parameter
            print(f"An error occurred: {err}")

    # Bundle results
    if bundle:
        archive_path = bundle_results(output_dir, prefix)
        log(f"Results bundled into: {archive_path}")

    return outputs.extended_rdrpcatch_output
664
+
665
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()