rdrpcatch 0.0.5__py3-none-any.whl → 0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdrpcatch/cli/args.py +35 -28
- rdrpcatch/rdrpcatch_wrapper.py +68 -50
- {rdrpcatch-0.0.5.dist-info → rdrpcatch-0.0.7.dist-info}/METADATA +21 -22
- {rdrpcatch-0.0.5.dist-info → rdrpcatch-0.0.7.dist-info}/RECORD +7 -7
- {rdrpcatch-0.0.5.dist-info → rdrpcatch-0.0.7.dist-info}/WHEEL +0 -0
- {rdrpcatch-0.0.5.dist-info → rdrpcatch-0.0.7.dist-info}/entry_points.txt +0 -0
- {rdrpcatch-0.0.5.dist-info → rdrpcatch-0.0.7.dist-info}/licenses/LICENSE +0 -0
rdrpcatch/cli/args.py
CHANGED
|
@@ -26,7 +26,7 @@ def parse_comma_separated_options(ctx, param, value):
|
|
|
26
26
|
return ['all']
|
|
27
27
|
|
|
28
28
|
allowed_choices = ['RVMT', 'NeoRdRp', 'NeoRdRp.2.1', 'TSA_Olendraite_fam', 'TSA_Olendraite_gen', 'RDRP-scan',
|
|
29
|
-
'
|
|
29
|
+
'Lucaprot_HMM, Zayed_HMM', 'all']
|
|
30
30
|
lower_choices = [choice.lower() for choice in allowed_choices]
|
|
31
31
|
options = value.split(',')
|
|
32
32
|
lower_options = [option.lower() for option in options]
|
|
@@ -73,7 +73,7 @@ def cli():
|
|
|
73
73
|
callback=parse_comma_separated_options,
|
|
74
74
|
default="all",
|
|
75
75
|
help="Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1,"
|
|
76
|
-
" TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,
|
|
76
|
+
" TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,Lucaprot_HMM, Zayed_HMM, all")
|
|
77
77
|
@click.option("--custom-dbs",
|
|
78
78
|
help="Path to directory containing custom MSAs/pHMM files to use as additional databases",
|
|
79
79
|
type=click.Path(exists=True, path_type=Path))
|
|
@@ -115,30 +115,30 @@ def cli():
|
|
|
115
115
|
@click.option('-gen_code', '--gen_code',
|
|
116
116
|
type=click.INT,
|
|
117
117
|
default=1,
|
|
118
|
-
help='Genetic code to use for translation. (default: 1) Possible genetic codes (supported by seqkit translate) :
|
|
119
|
-
'2: The Vertebrate Mitochondrial Code
|
|
120
|
-
'3: The Yeast Mitochondrial Code
|
|
121
|
-
'4: The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code
|
|
122
|
-
'5: The Invertebrate Mitochondrial Code
|
|
123
|
-
'6: The Ciliate, Dasycladacean and Hexamita Nuclear Code
|
|
124
|
-
'9: The Echinoderm and Flatworm Mitochondrial Code
|
|
125
|
-
'10: The Euplotid Nuclear Code
|
|
126
|
-
'11: The Bacterial, Archaeal and Plant Plastid Code
|
|
127
|
-
'12: The Alternative Yeast Nuclear Code
|
|
128
|
-
'13: The Ascidian Mitochondrial Code
|
|
129
|
-
'14: The Alternative Flatworm Mitochondrial Code
|
|
130
|
-
'16: Chlorophycean Mitochondrial Code
|
|
131
|
-
'21: Trematode Mitochondrial Code
|
|
132
|
-
'22: Scenedesmus obliquus Mitochondrial Code
|
|
133
|
-
'23: Thraustochytrium Mitochondrial Code
|
|
134
|
-
'24: Pterobranchia Mitochondrial Code
|
|
135
|
-
'25: Candidate Division SR1 and Gracilibacteria Code
|
|
136
|
-
'26: Pachysolen tannophilus Nuclear Code
|
|
137
|
-
'27: Karyorelict Nuclear
|
|
138
|
-
'28: Condylostoma Nuclear
|
|
139
|
-
'29: Mesodinium Nuclear
|
|
140
|
-
'30: Peritrich Nuclear
|
|
141
|
-
'31: Blastocrithidia Nuclear
|
|
118
|
+
help='Genetic code to use for translation. (default: 1) Possible genetic codes (supported by seqkit translate) : 1: The Standard Code, '
|
|
119
|
+
'2: The Vertebrate Mitochondrial Code, '
|
|
120
|
+
'3: The Yeast Mitochondrial Code, '
|
|
121
|
+
'4: The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code, '
|
|
122
|
+
'5: The Invertebrate Mitochondrial Code, '
|
|
123
|
+
'6: The Ciliate, Dasycladacean and Hexamita Nuclear Code, '
|
|
124
|
+
'9: The Echinoderm and Flatworm Mitochondrial Code, '
|
|
125
|
+
'10: The Euplotid Nuclear Code, '
|
|
126
|
+
'11: The Bacterial, Archaeal and Plant Plastid Code, '
|
|
127
|
+
'12: The Alternative Yeast Nuclear Code, '
|
|
128
|
+
'13: The Ascidian Mitochondrial Code, '
|
|
129
|
+
'14: The Alternative Flatworm Mitochondrial Code, '
|
|
130
|
+
'16: Chlorophycean Mitochondrial Code, '
|
|
131
|
+
'21: Trematode Mitochondrial Code, '
|
|
132
|
+
'22: Scenedesmus obliquus Mitochondrial Code, '
|
|
133
|
+
'23: Thraustochytrium Mitochondrial Code, '
|
|
134
|
+
'24: Pterobranchia Mitochondrial Code, '
|
|
135
|
+
'25: Candidate Division SR1 and Gracilibacteria Code, '
|
|
136
|
+
'26: Pachysolen tannophilus Nuclear Code, '
|
|
137
|
+
'27: Karyorelict Nuclear, '
|
|
138
|
+
'28: Condylostoma Nuclear, '
|
|
139
|
+
'29: Mesodinium Nuclear, '
|
|
140
|
+
'30: Peritrich Nuclear, '
|
|
141
|
+
'31: Blastocrithidia Nuclear, ')
|
|
142
142
|
@click.option('-bundle', '--bundle',
|
|
143
143
|
is_flag=True,
|
|
144
144
|
default=False,
|
|
@@ -147,9 +147,14 @@ def cli():
|
|
|
147
147
|
is_flag=True,
|
|
148
148
|
default=False,
|
|
149
149
|
help="Keep temporary files (Expert users) (default: False)")
|
|
150
|
+
@click.option('-overwrite', '--overwrite',
|
|
151
|
+
is_flag=True,
|
|
152
|
+
default=False,
|
|
153
|
+
help="Force overwrite of existing output directory. (default: False)")
|
|
154
|
+
|
|
150
155
|
@click.pass_context
|
|
151
156
|
def scan(ctx, input, output, db_options, db_dir, custom_dbs, seq_type, verbose, evalue,
|
|
152
|
-
incevalue, domevalue, incdomevalue, zvalue, cpus, length_thr, gen_code, bundle, keep_tmp):
|
|
157
|
+
incevalue, domevalue, incdomevalue, zvalue, cpus, length_thr, gen_code, bundle, keep_tmp, overwrite):
|
|
153
158
|
"""Scan sequences for RdRps."""
|
|
154
159
|
|
|
155
160
|
# Create a rich table for displaying parameters
|
|
@@ -175,6 +180,7 @@ def scan(ctx, input, output, db_options, db_dir, custom_dbs, seq_type, verbose,
|
|
|
175
180
|
table.add_row("Genetic Code", str(gen_code))
|
|
176
181
|
table.add_row("Bundle Output", "ON" if bundle else "OFF")
|
|
177
182
|
table.add_row("Save Temporary Files", "ON" if keep_tmp else "OFF")
|
|
183
|
+
table.add_row("Force Overwrite", "ON" if overwrite else "OFF")
|
|
178
184
|
|
|
179
185
|
console.print(Panel(table, title="Scan Configuration"))
|
|
180
186
|
|
|
@@ -207,7 +213,8 @@ def scan(ctx, input, output, db_options, db_dir, custom_dbs, seq_type, verbose,
|
|
|
207
213
|
length_thr=length_thr,
|
|
208
214
|
gen_code=gen_code,
|
|
209
215
|
bundle=bundle,
|
|
210
|
-
keep_tmp=keep_tmp
|
|
216
|
+
keep_tmp=keep_tmp,
|
|
217
|
+
overwrite=overwrite
|
|
211
218
|
)
|
|
212
219
|
|
|
213
220
|
# @cli.command("download", help="Download RdRpCATCH databases.")
|
rdrpcatch/rdrpcatch_wrapper.py
CHANGED
|
@@ -53,7 +53,7 @@ def bundle_results(output_dir, prefix):
|
|
|
53
53
|
|
|
54
54
|
return archive_path
|
|
55
55
|
|
|
56
|
-
def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,incdomE,domE,incE,z, cpus, length_thr, gen_code, bundle, keep_tmp):
|
|
56
|
+
def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,incdomE,domE,incE,z, cpus, length_thr, gen_code, bundle, keep_tmp, overwrite):
|
|
57
57
|
"""
|
|
58
58
|
Run RdRpCATCH scan.
|
|
59
59
|
|
|
@@ -110,8 +110,16 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
110
110
|
log_file = outputs.log_file
|
|
111
111
|
if not os.path.exists(outputs.output_dir):
|
|
112
112
|
os.makedirs(outputs.output_dir)
|
|
113
|
+
elif os.path.exists(outputs.output_dir) and overwrite:
|
|
114
|
+
# If the output directory already exists and force_overwrite is True, remove the existing directory
|
|
115
|
+
import shutil
|
|
116
|
+
shutil.rmtree(outputs.output_dir)
|
|
117
|
+
os.makedirs(outputs.output_dir)
|
|
118
|
+
outputs = paths.rdrpcatch_output(prefix, Path(output_dir))
|
|
113
119
|
else:
|
|
114
|
-
raise FileExistsError(f"Output directory already exists: {outputs.output_dir}, Please choose a different directory
|
|
120
|
+
raise FileExistsError(f"Output directory already exists: {outputs.output_dir}, Please choose a different directory"
|
|
121
|
+
f" or activate the -overwrite flag to overwrite the contents of the directory.")
|
|
122
|
+
|
|
115
123
|
if not os.path.exists(outputs.log_dir):
|
|
116
124
|
os.makedirs(outputs.log_dir)
|
|
117
125
|
|
|
@@ -160,7 +168,9 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
160
168
|
if seq_type == 'prot':
|
|
161
169
|
utils.fasta_checker(input_file, logger).check_seq_length(100000)
|
|
162
170
|
|
|
163
|
-
|
|
171
|
+
logger.loud_log("Fetching HMM databases...")
|
|
172
|
+
|
|
173
|
+
## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot_HMM,Zayed_HMM
|
|
164
174
|
rvmt_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("RVMT")
|
|
165
175
|
if verbose:
|
|
166
176
|
logger.loud_log(f"RVMT HMM database fetched from: {rvmt_hmm_db}")
|
|
@@ -192,19 +202,24 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
192
202
|
logger.loud_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
|
|
193
203
|
else:
|
|
194
204
|
logger.silent_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
|
|
195
|
-
lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("
|
|
205
|
+
lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Lucaprot_HMM")
|
|
196
206
|
if verbose:
|
|
197
207
|
logger.loud_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
|
|
198
208
|
else:
|
|
199
209
|
logger.silent_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
|
|
210
|
+
zayed_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Zayed_HMM")
|
|
211
|
+
if verbose:
|
|
212
|
+
logger.loud_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
|
|
213
|
+
else:
|
|
214
|
+
logger.silent_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
|
|
200
215
|
|
|
201
216
|
db_name_list = []
|
|
202
217
|
db_path_list = []
|
|
203
218
|
|
|
204
219
|
## Set up HMM databases
|
|
205
220
|
if db_options == ['all']:
|
|
206
|
-
db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "
|
|
207
|
-
db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db]
|
|
221
|
+
db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "Lucaprot_HMM", "Zayed_HMM"]
|
|
222
|
+
db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db, zayed_hmm_db]
|
|
208
223
|
|
|
209
224
|
else:
|
|
210
225
|
for db in db_options:
|
|
@@ -226,18 +241,20 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
226
241
|
elif db == "RDRP-scan".lower():
|
|
227
242
|
db_name_list.append("RDRP-scan")
|
|
228
243
|
db_path_list.append(rdrpscan_hmm_db)
|
|
229
|
-
elif db == "
|
|
230
|
-
db_name_list.append("
|
|
244
|
+
elif db == "Lucaprot_HMM".lower():
|
|
245
|
+
db_name_list.append("Lucaprot_HMM")
|
|
231
246
|
db_path_list.append(lucaprot_hmm_db)
|
|
247
|
+
elif db == "Zayed_HMM".lower():
|
|
248
|
+
db_name_list.append("Zayed_HMM")
|
|
249
|
+
db_path_list.append(zayed_hmm_db)
|
|
232
250
|
else:
|
|
233
251
|
raise Exception(f"Invalid database option: {db}")
|
|
234
252
|
|
|
235
253
|
# Fetch mmseqs database
|
|
236
254
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
logger.silent_log("Fetching mmseqs databases.")
|
|
255
|
+
|
|
256
|
+
logger.loud_log("Fetching Mmseqs2 databases...")
|
|
257
|
+
|
|
241
258
|
mmseqs_db_path = fetch_dbs.db_fetcher(db_dir).fetch_mmseqs_db_path("mmseqs_refseq_riboviria_20250211")
|
|
242
259
|
|
|
243
260
|
if verbose:
|
|
@@ -260,21 +277,17 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
260
277
|
if not os.path.exists(outputs.tmp_dir):
|
|
261
278
|
outputs.tmp_dir.mkdir(parents=True)
|
|
262
279
|
|
|
280
|
+
logger.loud_log("Databases fetched successfully.")
|
|
281
|
+
|
|
263
282
|
if seq_type == 'nuc':
|
|
264
|
-
|
|
265
|
-
logger.loud_log("Nucleotide sequence detected.")
|
|
266
|
-
else:
|
|
267
|
-
logger.silent_log("Nucleotide sequence detected.")
|
|
283
|
+
logger.loud_log("Nucleotide sequence detected.")
|
|
268
284
|
|
|
269
285
|
set_dict = {}
|
|
270
286
|
translated_set_dict = {}
|
|
271
287
|
df_list = []
|
|
272
288
|
|
|
273
289
|
## Filter out sequences with length less than 400 bp with seqkit
|
|
274
|
-
|
|
275
|
-
logger.loud_log("Filtering out sequences with length less than 400 bp.")
|
|
276
|
-
else:
|
|
277
|
-
logger.silent_log("Filtering out sequences with length less than 400 bp.")
|
|
290
|
+
logger.loud_log("Filtering out sequences with length less than 400 bp.")
|
|
278
291
|
|
|
279
292
|
if not os.path.exists(outputs.seqkit_seq_output_dir):
|
|
280
293
|
outputs.seqkit_seq_output_dir.mkdir(parents=True)
|
|
@@ -286,10 +299,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
286
299
|
logger.silent_log(f"Filtered sequence written to: { outputs.seqkit_seq_output_path}")
|
|
287
300
|
|
|
288
301
|
## Translate nucleotide sequences to protein sequences with seqkit
|
|
289
|
-
|
|
290
|
-
logger.loud_log("Translating nucleotide sequences to protein sequences.")
|
|
291
|
-
else:
|
|
292
|
-
logger.silent_log("Translating nucleotide sequences to protein sequences.")
|
|
302
|
+
logger.loud_log("Translating nucleotide sequences to protein sequences.")
|
|
293
303
|
|
|
294
304
|
if not os.path.exists(outputs.seqkit_translate_output_dir):
|
|
295
305
|
outputs.seqkit_translate_output_dir.mkdir(parents=True)
|
|
@@ -302,6 +312,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
302
312
|
logger.silent_log(f"Translated sequence written to: {outputs.seqkit_translate_output_path}")
|
|
303
313
|
|
|
304
314
|
for db_name,db_path in zip(db_name_list, db_path_list):
|
|
315
|
+
logger.loud_log(f"Running HMMsearch for {db_name} database.")
|
|
305
316
|
|
|
306
317
|
if verbose:
|
|
307
318
|
logger.loud_log(f"HMM output path: {outputs.hmm_output_path(db_name)}")
|
|
@@ -353,7 +364,9 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
353
364
|
])
|
|
354
365
|
df_list.append(df)
|
|
355
366
|
|
|
367
|
+
logger.loud_log(f"HMMsearch for {db_name} completed.")
|
|
356
368
|
|
|
369
|
+
logger.loud_log("HMMsearch completed.")
|
|
357
370
|
|
|
358
371
|
if not os.path.exists(outputs.plot_outdir):
|
|
359
372
|
outputs.plot_outdir.mkdir(parents=True)
|
|
@@ -361,6 +374,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
361
374
|
if not os.path.exists(outputs.tsv_outdir):
|
|
362
375
|
outputs.tsv_outdir.mkdir(parents=True)
|
|
363
376
|
|
|
377
|
+
logger.loud_log("Consolidating results.")
|
|
364
378
|
|
|
365
379
|
# Combine all the dataframes in the list
|
|
366
380
|
combined_df = pl.concat(df_list, how='vertical_relaxed')
|
|
@@ -379,6 +393,8 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
379
393
|
logger.loud_log("No hits found by RdRpCATCH. Exiting.")
|
|
380
394
|
return None
|
|
381
395
|
|
|
396
|
+
# Generate upset plot
|
|
397
|
+
logger.loud_log("Generating plots.")
|
|
382
398
|
|
|
383
399
|
if len(db_name_list) > 1:
|
|
384
400
|
if verbose:
|
|
@@ -411,6 +427,8 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
411
427
|
combined_set = set.union(*[value for value in set_dict.values()])
|
|
412
428
|
translated_combined_set = set.union(*[value for value in translated_set_dict.values()])
|
|
413
429
|
|
|
430
|
+
logger.loud_log("Extracting RdRp contigs from the input file.")
|
|
431
|
+
|
|
414
432
|
# Write a fasta file with all the contigs
|
|
415
433
|
if not os.path.exists(outputs.fasta_output_dir):
|
|
416
434
|
outputs.fasta_output_dir.mkdir(parents=True)
|
|
@@ -430,25 +448,21 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
430
448
|
if verbose:
|
|
431
449
|
logger.loud_log(f"Contigs written to: {outputs.fasta_nuc_out_path}")
|
|
432
450
|
logger.loud_log(f"Translated contigs written to: {outputs.fasta_prot_out_path}")
|
|
451
|
+
logger.loud_log(f"Trimmed contigs written to: {outputs.fasta_trimmed_out_path}")
|
|
433
452
|
else:
|
|
434
453
|
logger.silent_log(f"Contigs written to: {outputs.fasta_nuc_out_path}")
|
|
435
454
|
logger.silent_log(f"Translated contigs written to: {outputs.fasta_prot_out_path}")
|
|
455
|
+
logger.silent_log(f"Trimmed contigs written to: {outputs.fasta_trimmed_out_path}")
|
|
436
456
|
|
|
437
457
|
if not os.path.exists(outputs.mmseqs_tax_output_dir):
|
|
438
458
|
outputs.mmseqs_tax_output_dir.mkdir(parents=True)
|
|
439
459
|
|
|
440
|
-
|
|
441
|
-
logger.loud_log("Running mmseqs easy-taxonomy for taxonomic annotation.")
|
|
442
|
-
else:
|
|
443
|
-
logger.silent_log("Running mmseqs easy-taxonomy for taxonomic annotation.")
|
|
460
|
+
logger.loud_log("Running mmseqs easy-taxonomy for taxonomic annotation.")
|
|
444
461
|
|
|
445
462
|
mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_tax_output_prefix,
|
|
446
463
|
outputs.mmseqs_tax_output_dir, 7, cpus, outputs.mmseqs_tax_log_path).run_mmseqs_easy_tax_lca()
|
|
447
464
|
|
|
448
|
-
|
|
449
|
-
logger.loud_log("Running mmseqs easy-search for taxonomic annotation.")
|
|
450
|
-
else:
|
|
451
|
-
logger.silent_log("Running mmseqs easy-search for taxonomic annotation.")
|
|
465
|
+
logger.loud_log("Running mmseqs easy-search for taxonomic annotation.")
|
|
452
466
|
|
|
453
467
|
if not os.path.exists(outputs.mmseqs_e_search_output_dir):
|
|
454
468
|
outputs.mmseqs_e_search_output_dir.mkdir(parents=True)
|
|
@@ -460,18 +474,17 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
460
474
|
utils.mmseqs_parser(outputs.mmseqs_tax_output_lca_path, outputs.mmseqs_e_search_output_path).tax_to_rdrpcatch(
|
|
461
475
|
outputs.rdrpcatch_output_tsv, outputs.extended_rdrpcatch_output, seq_type)
|
|
462
476
|
|
|
477
|
+
logger.loud_log("Taxonomic annotation completed.")
|
|
463
478
|
|
|
464
479
|
elif seq_type == 'prot':
|
|
465
480
|
|
|
466
|
-
|
|
467
|
-
logger.loud_log("Protein sequence detected.")
|
|
468
|
-
else:
|
|
469
|
-
logger.silent_log("Protein sequence detected.")
|
|
481
|
+
logger.loud_log("Protein sequence detected.")
|
|
470
482
|
|
|
471
483
|
set_dict = {}
|
|
472
484
|
df_list = []
|
|
473
485
|
|
|
474
486
|
for db_name,db_path in zip (db_name_list, db_path_list):
|
|
487
|
+
logger.loud_log(f"Running HMMsearch for {db_name} database.")
|
|
475
488
|
|
|
476
489
|
if verbose:
|
|
477
490
|
logger.loud_log(f"HMM output path: {outputs.hmm_output_path(db_name)}")
|
|
@@ -519,12 +532,17 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
519
532
|
])
|
|
520
533
|
df_list.append(df)
|
|
521
534
|
|
|
535
|
+
logger.loud_log(f"HMMsearch for {db_name} completed.")
|
|
536
|
+
|
|
537
|
+
logger.loud_log("HMMsearch completed.")
|
|
538
|
+
|
|
522
539
|
if not os.path.exists(outputs.plot_outdir):
|
|
523
540
|
outputs.plot_outdir.mkdir(parents=True)
|
|
524
541
|
|
|
525
542
|
if not os.path.exists(outputs.tsv_outdir):
|
|
526
543
|
outputs.tsv_outdir.mkdir(parents=True)
|
|
527
544
|
|
|
545
|
+
logger.loud_log("Consolidating results.")
|
|
528
546
|
|
|
529
547
|
# Combine all the dataframes in the list
|
|
530
548
|
combined_df = pl.concat(df_list, how='vertical_relaxed')
|
|
@@ -542,6 +560,9 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
542
560
|
logger.loud_log("No hits found by RdRpCATCH. Exiting.")
|
|
543
561
|
return None
|
|
544
562
|
|
|
563
|
+
# Generate upset plot
|
|
564
|
+
logger.loud_log("Generating plots.")
|
|
565
|
+
|
|
545
566
|
if len(db_name_list) > 1:
|
|
546
567
|
if verbose:
|
|
547
568
|
logger.loud_log("Generating upset plot.")
|
|
@@ -574,6 +595,9 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
574
595
|
# Extract all the contigs
|
|
575
596
|
combined_set = set.union(*[value for value in set_dict.values()])
|
|
576
597
|
# Write a fasta file with all the contigs
|
|
598
|
+
|
|
599
|
+
logger.loud_log("Extracting RdRp contigs from the input file.")
|
|
600
|
+
|
|
577
601
|
if not os.path.exists(outputs.fasta_output_dir):
|
|
578
602
|
outputs.fasta_output_dir.mkdir(parents=True)
|
|
579
603
|
|
|
@@ -600,11 +624,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
600
624
|
if not os.path.exists(outputs.mmseqs_tax_output_dir):
|
|
601
625
|
outputs.mmseqs_tax_output_dir.mkdir(parents=True)
|
|
602
626
|
|
|
603
|
-
|
|
604
|
-
logger.loud_log("Running mmseqs easy-taxonomy for taxonomic annotation.")
|
|
605
|
-
else:
|
|
606
|
-
logger.silent_log("Running mmseqs easy-taxonomy for taxonomic annotation.")
|
|
607
|
-
|
|
627
|
+
logger.loud_log("Running mmseqs easy-taxonomy for taxonomic annotation.")
|
|
608
628
|
|
|
609
629
|
mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_tax_output_prefix,
|
|
610
630
|
outputs.mmseqs_tax_output_dir, 7, cpus, outputs.mmseqs_tax_log_path).run_mmseqs_easy_tax_lca()
|
|
@@ -612,10 +632,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
612
632
|
if not os.path.exists(outputs.mmseqs_e_search_output_dir):
|
|
613
633
|
outputs.mmseqs_e_search_output_dir.mkdir(parents=True)
|
|
614
634
|
|
|
615
|
-
|
|
616
|
-
logger.loud_log("Running mmseqs easy-search for taxonomic annotation.")
|
|
617
|
-
else:
|
|
618
|
-
logger.silent_log("Running mmseqs easy-search for taxonomic annotation.")
|
|
635
|
+
logger.loud_log("Running mmseqs easy-search for taxonomic annotation.")
|
|
619
636
|
|
|
620
637
|
mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_e_search_output_dir,
|
|
621
638
|
outputs.mmseqs_e_search_output_path, 7, cpus, outputs.mmseqs_e_search_log_path).run_mmseqs_e_search()
|
|
@@ -624,11 +641,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
624
641
|
outputs.rdrpcatch_output_tsv, outputs.extended_rdrpcatch_output, seq_type)
|
|
625
642
|
|
|
626
643
|
|
|
627
|
-
|
|
628
|
-
if verbose:
|
|
629
|
-
logger.loud_log(f"Total Runtime: {end_time}")
|
|
630
|
-
else:
|
|
631
|
-
logger.silent_log(f"Total Runtime: {end_time}")
|
|
644
|
+
|
|
632
645
|
|
|
633
646
|
|
|
634
647
|
|
|
@@ -657,6 +670,11 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
657
670
|
else:
|
|
658
671
|
logger.silent_log(f"Results bundled into: {archive_path}")
|
|
659
672
|
|
|
673
|
+
end_time = logger.stop_timer(start_time, verbose)
|
|
674
|
+
|
|
675
|
+
logger.loud_log(f"Total Runtime: {end_time}")
|
|
676
|
+
|
|
677
|
+
logger.loud_log("RdRpCATCH completed successfully.")
|
|
660
678
|
|
|
661
679
|
|
|
662
680
|
return outputs.extended_rdrpcatch_output
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rdrpcatch
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.7
|
|
4
4
|
Dynamic: Summary
|
|
5
5
|
Project-URL: Home, https://github.com/dimitris-karapliafis/RdRpCATCH
|
|
6
6
|
Project-URL: Source, https://github.com/dimitris-karapliafis/RdRpCATCH
|
|
@@ -36,7 +36,7 @@ that were positive for each sequence across all pHMM databases, and taxonomic in
|
|
|
36
36
|
|
|
37
37
|
** The tool has been modified to use [rolypoly](https://code.jgi.doe.gov/UNeri/rolypoly) code/approaches **
|
|
38
38
|
|
|
39
|
-

|
|
40
40
|
|
|
41
41
|
### Supported databases
|
|
42
42
|
- NeoRdRp <sup>1</sup> : 1182 pHMMs
|
|
@@ -133,25 +133,25 @@ Command to download pre-compiled databases from Zenodo. If the databases are alr
|
|
|
133
133
|
### rdrpcatch scan:
|
|
134
134
|
Search a given input using selected RdRp databases.
|
|
135
135
|
|
|
136
|
-
| Argument | Short Flag | Type | Description
|
|
137
|
-
|
|
138
|
-
| `--input` | `-i` | FILE | Path to the input FASTA file. [required]
|
|
139
|
-
| `--output` | `-o` | DIRECTORY | Path to the output directory. [required]
|
|
140
|
-
| `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required]
|
|
141
|
-
| `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,
|
|
142
|
-
| `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases
|
|
143
|
-
| `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown
|
|
144
|
-
| `--verbose` | `-v` | FLAG | Print verbose output.
|
|
145
|
-
| `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5)
|
|
146
|
-
| `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5)
|
|
147
|
-
| `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5)
|
|
148
|
-
| `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5)
|
|
149
|
-
| `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000)
|
|
150
|
-
| `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1)
|
|
151
|
-
| `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400)
|
|
152
|
-
| `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1)
|
|
153
|
-
| `--bundle` | `-bundle` | |
|
|
154
|
-
| `--keep_tmp` | `-keep_tmp` | | Keep the temporary files generated during the analysis. (default: False)
|
|
136
|
+
| Argument | Short Flag | Type | Description |
|
|
137
|
+
|----------|------------|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
138
|
+
| `--input` | `-i` | FILE | Path to the input FASTA file. [required] |
|
|
139
|
+
| `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |
|
|
140
|
+
| `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |
|
|
141
|
+
| `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot_HMM,Zayed_HMM, all |
|
|
142
|
+
| `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |
|
|
143
|
+
| `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |
|
|
144
|
+
| `--verbose` | `-v` | FLAG | Print verbose output. |
|
|
145
|
+
| `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |
|
|
146
|
+
| `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |
|
|
147
|
+
| `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |
|
|
148
|
+
| `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |
|
|
149
|
+
| `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |
|
|
150
|
+
| `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |
|
|
151
|
+
| `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |
|
|
152
|
+
| `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |
|
|
153
|
+
| `--bundle` | `-bundle` | | Bundle the output files into a single archive. (default: False) |
|
|
154
|
+
| `--keep_tmp` | `-keep_tmp` | | Keep the temporary files generated during the analysis. (default: False) |
|
|
155
155
|
|
|
156
156
|
|
|
157
157
|
|
|
@@ -205,7 +205,6 @@ Dimitris Karapliafis (dimitris.karapliafis@wur.nl), potentially via slack/teams
|
|
|
205
205
|
|
|
206
206
|
##TODO:
|
|
207
207
|
- [ ] loud logging is linking to the utils.py file, not the actual line of code causing the error.
|
|
208
|
-
- [ ] Add `overwrite` flag
|
|
209
208
|
- [ ] drop `db_dir` argument and use global/environment/config variable that is set after running the `download` command
|
|
210
209
|
|
|
211
210
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
rdrpcatch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
rdrpcatch/rdrpcatch_wrapper.py,sha256=
|
|
2
|
+
rdrpcatch/rdrpcatch_wrapper.py,sha256=X-U0CKQWHwybLIdWvaFZGEj-v0oTUnBv2PbiLAdu8s4,31573
|
|
3
3
|
rdrpcatch/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
-
rdrpcatch/cli/args.py,sha256=
|
|
4
|
+
rdrpcatch/cli/args.py,sha256=DX7gfESWi4j1CNpALAEG45JV_b5KkU1LAJj2FDb8J5g,15963
|
|
5
5
|
rdrpcatch/rdrpcatch_scripts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
rdrpcatch/rdrpcatch_scripts/fetch_dbs.py,sha256=e9ShColfLgBvWSZpGOvY3zKhEgIg3rw1IIV__KX7N-g,11054
|
|
7
7
|
rdrpcatch/rdrpcatch_scripts/format_pyhmmer_out.py,sha256=2_ERXFQK2lpVReWl0jwQdnKIObv_zq07uFJOzGsTHlo,25025
|
|
@@ -12,8 +12,8 @@ rdrpcatch/rdrpcatch_scripts/plot.py,sha256=Y1mZL7rkKHFKEs2D7T2Qj2kpfiORmFwRLq1LY
|
|
|
12
12
|
rdrpcatch/rdrpcatch_scripts/run_pyhmmer.py,sha256=9zcMzaIwQ4_-NgYzG9kejxOBaDi-gbzaqpvZti8ZXA4,9008
|
|
13
13
|
rdrpcatch/rdrpcatch_scripts/run_seqkit.py,sha256=5y7DtJ6NLa4sRoBQOcjBfczKlqG_LibNrEqNmKLrHu0,4361
|
|
14
14
|
rdrpcatch/rdrpcatch_scripts/utils.py,sha256=jvpyPxchAMn6BeLV7HOFECSY_a3nbkxDBBL8tunmM8A,16938
|
|
15
|
-
rdrpcatch-0.0.
|
|
16
|
-
rdrpcatch-0.0.
|
|
17
|
-
rdrpcatch-0.0.
|
|
18
|
-
rdrpcatch-0.0.
|
|
19
|
-
rdrpcatch-0.0.
|
|
15
|
+
rdrpcatch-0.0.7.dist-info/METADATA,sha256=BU-V7TAZcYQC5L3KuX_N_iH_l7Q77go7ZF9-1jYRrQE,16219
|
|
16
|
+
rdrpcatch-0.0.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
17
|
+
rdrpcatch-0.0.7.dist-info/entry_points.txt,sha256=uiyoPO41jNz_KVOt2JdPak9NbVei-D8WQ6saMeMBFpE,53
|
|
18
|
+
rdrpcatch-0.0.7.dist-info/licenses/LICENSE,sha256=3jm5vKRMIaiETEFfNN34-oyWUShxZtmDmL38PNAwlUI,1120
|
|
19
|
+
rdrpcatch-0.0.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|