rdrpcatch 0.0.5__py3-none-any.whl → 0.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rdrpcatch/cli/args.py CHANGED
@@ -26,7 +26,7 @@ def parse_comma_separated_options(ctx, param, value):
26
26
  return ['all']
27
27
 
28
28
  allowed_choices = ['RVMT', 'NeoRdRp', 'NeoRdRp.2.1', 'TSA_Olendraite_fam', 'TSA_Olendraite_gen', 'RDRP-scan',
29
- 'Lucaprot', 'all']
29
+ 'Lucaprot_HMM, Zayed_HMM', 'all']
30
30
  lower_choices = [choice.lower() for choice in allowed_choices]
31
31
  options = value.split(',')
32
32
  lower_options = [option.lower() for option in options]
@@ -73,7 +73,7 @@ def cli():
73
73
  callback=parse_comma_separated_options,
74
74
  default="all",
75
75
  help="Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1,"
76
- " TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,Lucaprot, all")
76
+ " TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,Lucaprot_HMM, Zayed_HMM, all")
77
77
  @click.option("--custom-dbs",
78
78
  help="Path to directory containing custom MSAs/pHMM files to use as additional databases",
79
79
  type=click.Path(exists=True, path_type=Path))
@@ -115,30 +115,30 @@ def cli():
115
115
  @click.option('-gen_code', '--gen_code',
116
116
  type=click.INT,
117
117
  default=1,
118
- help='Genetic code to use for translation. (default: 1) Possible genetic codes (supported by seqkit translate) : 1: The Standard Code \n'
119
- '2: The Vertebrate Mitochondrial Code \n'
120
- '3: The Yeast Mitochondrial Code \n'
121
- '4: The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code \n'
122
- '5: The Invertebrate Mitochondrial Code \n'
123
- '6: The Ciliate, Dasycladacean and Hexamita Nuclear Code \n'
124
- '9: The Echinoderm and Flatworm Mitochondrial Code \n'
125
- '10: The Euplotid Nuclear Code \n'
126
- '11: The Bacterial, Archaeal and Plant Plastid Code \n'
127
- '12: The Alternative Yeast Nuclear Code \n'
128
- '13: The Ascidian Mitochondrial Code \n'
129
- '14: The Alternative Flatworm Mitochondrial Code \n'
130
- '16: Chlorophycean Mitochondrial Code \n'
131
- '21: Trematode Mitochondrial Code \n'
132
- '22: Scenedesmus obliquus Mitochondrial Code \n'
133
- '23: Thraustochytrium Mitochondrial Code \n'
134
- '24: Pterobranchia Mitochondrial Code \n'
135
- '25: Candidate Division SR1 and Gracilibacteria Code \n'
136
- '26: Pachysolen tannophilus Nuclear Code \n'
137
- '27: Karyorelict Nuclear \n'
138
- '28: Condylostoma Nuclear \n'
139
- '29: Mesodinium Nuclear \n'
140
- '30: Peritrich Nuclear \n'
141
- '31: Blastocrithidia Nuclear \n')
118
+ help='Genetic code to use for translation. (default: 1) Possible genetic codes (supported by seqkit translate) : 1: The Standard Code, '
119
+ '2: The Vertebrate Mitochondrial Code, '
120
+ '3: The Yeast Mitochondrial Code, '
121
+ '4: The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code, '
122
+ '5: The Invertebrate Mitochondrial Code, '
123
+ '6: The Ciliate, Dasycladacean and Hexamita Nuclear Code, '
124
+ '9: The Echinoderm and Flatworm Mitochondrial Code, '
125
+ '10: The Euplotid Nuclear Code, '
126
+ '11: The Bacterial, Archaeal and Plant Plastid Code, '
127
+ '12: The Alternative Yeast Nuclear Code, '
128
+ '13: The Ascidian Mitochondrial Code, '
129
+ '14: The Alternative Flatworm Mitochondrial Code, '
130
+ '16: Chlorophycean Mitochondrial Code, '
131
+ '21: Trematode Mitochondrial Code, '
132
+ '22: Scenedesmus obliquus Mitochondrial Code, '
133
+ '23: Thraustochytrium Mitochondrial Code, '
134
+ '24: Pterobranchia Mitochondrial Code, '
135
+ '25: Candidate Division SR1 and Gracilibacteria Code, '
136
+ '26: Pachysolen tannophilus Nuclear Code, '
137
+ '27: Karyorelict Nuclear, '
138
+ '28: Condylostoma Nuclear, '
139
+ '29: Mesodinium Nuclear, '
140
+ '30: Peritrich Nuclear, '
141
+ '31: Blastocrithidia Nuclear, ')
142
142
  @click.option('-bundle', '--bundle',
143
143
  is_flag=True,
144
144
  default=False,
@@ -147,9 +147,14 @@ def cli():
147
147
  is_flag=True,
148
148
  default=False,
149
149
  help="Keep temporary files (Expert users) (default: False)")
150
+ @click.option('-overwrite', '--overwrite',
151
+ is_flag=True,
152
+ default=False,
153
+ help="Force overwrite of existing output directory. (default: False)")
154
+
150
155
  @click.pass_context
151
156
  def scan(ctx, input, output, db_options, db_dir, custom_dbs, seq_type, verbose, evalue,
152
- incevalue, domevalue, incdomevalue, zvalue, cpus, length_thr, gen_code, bundle, keep_tmp):
157
+ incevalue, domevalue, incdomevalue, zvalue, cpus, length_thr, gen_code, bundle, keep_tmp, overwrite):
153
158
  """Scan sequences for RdRps."""
154
159
 
155
160
  # Create a rich table for displaying parameters
@@ -175,6 +180,7 @@ def scan(ctx, input, output, db_options, db_dir, custom_dbs, seq_type, verbose,
175
180
  table.add_row("Genetic Code", str(gen_code))
176
181
  table.add_row("Bundle Output", "ON" if bundle else "OFF")
177
182
  table.add_row("Save Temporary Files", "ON" if keep_tmp else "OFF")
183
+ table.add_row("Force Overwrite", "ON" if overwrite else "OFF")
178
184
 
179
185
  console.print(Panel(table, title="Scan Configuration"))
180
186
 
@@ -207,7 +213,8 @@ def scan(ctx, input, output, db_options, db_dir, custom_dbs, seq_type, verbose,
207
213
  length_thr=length_thr,
208
214
  gen_code=gen_code,
209
215
  bundle=bundle,
210
- keep_tmp=keep_tmp
216
+ keep_tmp=keep_tmp,
217
+ overwrite=overwrite
211
218
  )
212
219
 
213
220
  # @cli.command("download", help="Download RdRpCATCH databases.")
@@ -53,7 +53,7 @@ def bundle_results(output_dir, prefix):
53
53
 
54
54
  return archive_path
55
55
 
56
- def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,incdomE,domE,incE,z, cpus, length_thr, gen_code, bundle, keep_tmp):
56
+ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,incdomE,domE,incE,z, cpus, length_thr, gen_code, bundle, keep_tmp, overwrite):
57
57
  """
58
58
  Run RdRpCATCH scan.
59
59
 
@@ -110,8 +110,16 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
110
110
  log_file = outputs.log_file
111
111
  if not os.path.exists(outputs.output_dir):
112
112
  os.makedirs(outputs.output_dir)
113
+ elif os.path.exists(outputs.output_dir) and overwrite:
114
+ # If the output directory already exists and the overwrite flag is set, remove the existing directory
115
+ import shutil
116
+ shutil.rmtree(outputs.output_dir)
117
+ os.makedirs(outputs.output_dir)
118
+ outputs = paths.rdrpcatch_output(prefix, Path(output_dir))
113
119
  else:
114
- raise FileExistsError(f"Output directory already exists: {outputs.output_dir}, Please choose a different directory.")
120
+ raise FileExistsError(f"Output directory already exists: {outputs.output_dir}, Please choose a different directory"
121
+ f" or activate the -overwrite flag to overwrite the contents of the directory.")
122
+
115
123
  if not os.path.exists(outputs.log_dir):
116
124
  os.makedirs(outputs.log_dir)
117
125
 
@@ -160,7 +168,9 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
160
168
  if seq_type == 'prot':
161
169
  utils.fasta_checker(input_file, logger).check_seq_length(100000)
162
170
 
163
- ## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot
171
+ logger.loud_log("Fetching HMM databases...")
172
+
173
+ ## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot_HMM,Zayed_HMM
164
174
  rvmt_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("RVMT")
165
175
  if verbose:
166
176
  logger.loud_log(f"RVMT HMM database fetched from: {rvmt_hmm_db}")
@@ -192,19 +202,24 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
192
202
  logger.loud_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
193
203
  else:
194
204
  logger.silent_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
195
- lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Lucaprot")
205
+ lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Lucaprot_HMM")
196
206
  if verbose:
197
207
  logger.loud_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
198
208
  else:
199
209
  logger.silent_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
210
+ zayed_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Zayed_HMM")
211
+ if verbose:
212
+ logger.loud_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
213
+ else:
214
+ logger.silent_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
200
215
 
201
216
  db_name_list = []
202
217
  db_path_list = []
203
218
 
204
219
  ## Set up HMM databases
205
220
  if db_options == ['all']:
206
- db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "Lucaprot"]
207
- db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db]
221
+ db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "Lucaprot_HMM", "Zayed_HMM"]
222
+ db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db, zayed_hmm_db]
208
223
 
209
224
  else:
210
225
  for db in db_options:
@@ -226,18 +241,20 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
226
241
  elif db == "RDRP-scan".lower():
227
242
  db_name_list.append("RDRP-scan")
228
243
  db_path_list.append(rdrpscan_hmm_db)
229
- elif db == "Lucaprot".lower():
230
- db_name_list.append("Lucaprot")
244
+ elif db == "Lucaprot_HMM".lower():
245
+ db_name_list.append("Lucaprot_HMM")
231
246
  db_path_list.append(lucaprot_hmm_db)
247
+ elif db == "Zayed_HMM".lower():
248
+ db_name_list.append("Zayed_HMM")
249
+ db_path_list.append(zayed_hmm_db)
232
250
  else:
233
251
  raise Exception(f"Invalid database option: {db}")
234
252
 
235
253
  # Fetch mmseqs database
236
254
 
237
- if verbose:
238
- logger.loud_log("Fetching mmseqs databases.")
239
- else:
240
- logger.silent_log("Fetching mmseqs databases.")
255
+
256
+ logger.loud_log("Fetching Mmseqs2 databases...")
257
+
241
258
  mmseqs_db_path = fetch_dbs.db_fetcher(db_dir).fetch_mmseqs_db_path("mmseqs_refseq_riboviria_20250211")
242
259
 
243
260
  if verbose:
@@ -260,21 +277,17 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
260
277
  if not os.path.exists(outputs.tmp_dir):
261
278
  outputs.tmp_dir.mkdir(parents=True)
262
279
 
280
+ logger.loud_log("Databases fetched successfully.")
281
+
263
282
  if seq_type == 'nuc':
264
- if verbose:
265
- logger.loud_log("Nucleotide sequence detected.")
266
- else:
267
- logger.silent_log("Nucleotide sequence detected.")
283
+ logger.loud_log("Nucleotide sequence detected.")
268
284
 
269
285
  set_dict = {}
270
286
  translated_set_dict = {}
271
287
  df_list = []
272
288
 
273
289
  ## Filter out sequences with length less than 400 bp with seqkit
274
- if verbose:
275
- logger.loud_log("Filtering out sequences with length less than 400 bp.")
276
- else:
277
- logger.silent_log("Filtering out sequences with length less than 400 bp.")
290
+ logger.loud_log("Filtering out sequences with length less than 400 bp.")
278
291
 
279
292
  if not os.path.exists(outputs.seqkit_seq_output_dir):
280
293
  outputs.seqkit_seq_output_dir.mkdir(parents=True)
@@ -286,10 +299,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
286
299
  logger.silent_log(f"Filtered sequence written to: { outputs.seqkit_seq_output_path}")
287
300
 
288
301
  ## Translate nucleotide sequences to protein sequences with seqkit
289
- if verbose:
290
- logger.loud_log("Translating nucleotide sequences to protein sequences.")
291
- else:
292
- logger.silent_log("Translating nucleotide sequences to protein sequences.")
302
+ logger.loud_log("Translating nucleotide sequences to protein sequences.")
293
303
 
294
304
  if not os.path.exists(outputs.seqkit_translate_output_dir):
295
305
  outputs.seqkit_translate_output_dir.mkdir(parents=True)
@@ -302,6 +312,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
302
312
  logger.silent_log(f"Translated sequence written to: {outputs.seqkit_translate_output_path}")
303
313
 
304
314
  for db_name,db_path in zip(db_name_list, db_path_list):
315
+ logger.loud_log(f"Running HMMsearch for {db_name} database.")
305
316
 
306
317
  if verbose:
307
318
  logger.loud_log(f"HMM output path: {outputs.hmm_output_path(db_name)}")
@@ -353,7 +364,9 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
353
364
  ])
354
365
  df_list.append(df)
355
366
 
367
+ logger.loud_log(f"HMMsearch for {db_name} completed.")
356
368
 
369
+ logger.loud_log("HMMsearch completed.")
357
370
 
358
371
  if not os.path.exists(outputs.plot_outdir):
359
372
  outputs.plot_outdir.mkdir(parents=True)
@@ -361,6 +374,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
361
374
  if not os.path.exists(outputs.tsv_outdir):
362
375
  outputs.tsv_outdir.mkdir(parents=True)
363
376
 
377
+ logger.loud_log("Consolidating results.")
364
378
 
365
379
  # Combine all the dataframes in the list
366
380
  combined_df = pl.concat(df_list, how='vertical_relaxed')
@@ -379,6 +393,8 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
379
393
  logger.loud_log("No hits found by RdRpCATCH. Exiting.")
380
394
  return None
381
395
 
396
+ # Generate upset plot
397
+ logger.loud_log("Generating plots.")
382
398
 
383
399
  if len(db_name_list) > 1:
384
400
  if verbose:
@@ -411,6 +427,8 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
411
427
  combined_set = set.union(*[value for value in set_dict.values()])
412
428
  translated_combined_set = set.union(*[value for value in translated_set_dict.values()])
413
429
 
430
+ logger.loud_log("Extracting RdRp contigs from the input file.")
431
+
414
432
  # Write a fasta file with all the contigs
415
433
  if not os.path.exists(outputs.fasta_output_dir):
416
434
  outputs.fasta_output_dir.mkdir(parents=True)
@@ -430,25 +448,21 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
430
448
  if verbose:
431
449
  logger.loud_log(f"Contigs written to: {outputs.fasta_nuc_out_path}")
432
450
  logger.loud_log(f"Translated contigs written to: {outputs.fasta_prot_out_path}")
451
+ logger.loud_log(f"Trimmed contigs written to: {outputs.fasta_trimmed_out_path}")
433
452
  else:
434
453
  logger.silent_log(f"Contigs written to: {outputs.fasta_nuc_out_path}")
435
454
  logger.silent_log(f"Translated contigs written to: {outputs.fasta_prot_out_path}")
455
+ logger.silent_log(f"Trimmed contigs written to: {outputs.fasta_trimmed_out_path}")
436
456
 
437
457
  if not os.path.exists(outputs.mmseqs_tax_output_dir):
438
458
  outputs.mmseqs_tax_output_dir.mkdir(parents=True)
439
459
 
440
- if verbose:
441
- logger.loud_log("Running mmseqs easy-taxonomy for taxonomic annotation.")
442
- else:
443
- logger.silent_log("Running mmseqs easy-taxonomy for taxonomic annotation.")
460
+ logger.loud_log("Running mmseqs easy-taxonomy for taxonomic annotation.")
444
461
 
445
462
  mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_tax_output_prefix,
446
463
  outputs.mmseqs_tax_output_dir, 7, cpus, outputs.mmseqs_tax_log_path).run_mmseqs_easy_tax_lca()
447
464
 
448
- if verbose:
449
- logger.loud_log("Running mmseqs easy-search for taxonomic annotation.")
450
- else:
451
- logger.silent_log("Running mmseqs easy-search for taxonomic annotation.")
465
+ logger.loud_log("Running mmseqs easy-search for taxonomic annotation.")
452
466
 
453
467
  if not os.path.exists(outputs.mmseqs_e_search_output_dir):
454
468
  outputs.mmseqs_e_search_output_dir.mkdir(parents=True)
@@ -460,18 +474,17 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
460
474
  utils.mmseqs_parser(outputs.mmseqs_tax_output_lca_path, outputs.mmseqs_e_search_output_path).tax_to_rdrpcatch(
461
475
  outputs.rdrpcatch_output_tsv, outputs.extended_rdrpcatch_output, seq_type)
462
476
 
477
+ logger.loud_log("Taxonomic annotation completed.")
463
478
 
464
479
  elif seq_type == 'prot':
465
480
 
466
- if verbose:
467
- logger.loud_log("Protein sequence detected.")
468
- else:
469
- logger.silent_log("Protein sequence detected.")
481
+ logger.loud_log("Protein sequence detected.")
470
482
 
471
483
  set_dict = {}
472
484
  df_list = []
473
485
 
474
486
  for db_name,db_path in zip (db_name_list, db_path_list):
487
+ logger.loud_log(f"Running HMMsearch for {db_name} database.")
475
488
 
476
489
  if verbose:
477
490
  logger.loud_log(f"HMM output path: {outputs.hmm_output_path(db_name)}")
@@ -519,12 +532,17 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
519
532
  ])
520
533
  df_list.append(df)
521
534
 
535
+ logger.loud_log(f"HMMsearch for {db_name} completed.")
536
+
537
+ logger.loud_log("HMMsearch completed.")
538
+
522
539
  if not os.path.exists(outputs.plot_outdir):
523
540
  outputs.plot_outdir.mkdir(parents=True)
524
541
 
525
542
  if not os.path.exists(outputs.tsv_outdir):
526
543
  outputs.tsv_outdir.mkdir(parents=True)
527
544
 
545
+ logger.loud_log("Consolidating results.")
528
546
 
529
547
  # Combine all the dataframes in the list
530
548
  combined_df = pl.concat(df_list, how='vertical_relaxed')
@@ -542,6 +560,9 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
542
560
  logger.loud_log("No hits found by RdRpCATCH. Exiting.")
543
561
  return None
544
562
 
563
+ # Generate upset plot
564
+ logger.loud_log("Generating plots.")
565
+
545
566
  if len(db_name_list) > 1:
546
567
  if verbose:
547
568
  logger.loud_log("Generating upset plot.")
@@ -574,6 +595,9 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
574
595
  # Extract all the contigs
575
596
  combined_set = set.union(*[value for value in set_dict.values()])
576
597
  # Write a fasta file with all the contigs
598
+
599
+ logger.loud_log("Extracting RdRp contigs from the input file.")
600
+
577
601
  if not os.path.exists(outputs.fasta_output_dir):
578
602
  outputs.fasta_output_dir.mkdir(parents=True)
579
603
 
@@ -600,11 +624,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
600
624
  if not os.path.exists(outputs.mmseqs_tax_output_dir):
601
625
  outputs.mmseqs_tax_output_dir.mkdir(parents=True)
602
626
 
603
- if verbose:
604
- logger.loud_log("Running mmseqs easy-taxonomy for taxonomic annotation.")
605
- else:
606
- logger.silent_log("Running mmseqs easy-taxonomy for taxonomic annotation.")
607
-
627
+ logger.loud_log("Running mmseqs easy-taxonomy for taxonomic annotation.")
608
628
 
609
629
  mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_tax_output_prefix,
610
630
  outputs.mmseqs_tax_output_dir, 7, cpus, outputs.mmseqs_tax_log_path).run_mmseqs_easy_tax_lca()
@@ -612,10 +632,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
612
632
  if not os.path.exists(outputs.mmseqs_e_search_output_dir):
613
633
  outputs.mmseqs_e_search_output_dir.mkdir(parents=True)
614
634
 
615
- if verbose:
616
- logger.loud_log("Running mmseqs easy-search for taxonomic annotation.")
617
- else:
618
- logger.silent_log("Running mmseqs easy-search for taxonomic annotation.")
635
+ logger.loud_log("Running mmseqs easy-search for taxonomic annotation.")
619
636
 
620
637
  mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_e_search_output_dir,
621
638
  outputs.mmseqs_e_search_output_path, 7, cpus, outputs.mmseqs_e_search_log_path).run_mmseqs_e_search()
@@ -624,11 +641,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
624
641
  outputs.rdrpcatch_output_tsv, outputs.extended_rdrpcatch_output, seq_type)
625
642
 
626
643
 
627
- end_time = logger.stop_timer(start_time, verbose)
628
- if verbose:
629
- logger.loud_log(f"Total Runtime: {end_time}")
630
- else:
631
- logger.silent_log(f"Total Runtime: {end_time}")
644
+
632
645
 
633
646
 
634
647
 
@@ -657,6 +670,11 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
657
670
  else:
658
671
  logger.silent_log(f"Results bundled into: {archive_path}")
659
672
 
673
+ end_time = logger.stop_timer(start_time, verbose)
674
+
675
+ logger.loud_log(f"Total Runtime: {end_time}")
676
+
677
+ logger.loud_log("RdRpCATCH completed successfully.")
660
678
 
661
679
 
662
680
  return outputs.extended_rdrpcatch_output
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rdrpcatch
3
- Version: 0.0.5
3
+ Version: 0.0.7
4
4
  Dynamic: Summary
5
5
  Project-URL: Home, https://github.com/dimitris-karapliafis/RdRpCATCH
6
6
  Project-URL: Source, https://github.com/dimitris-karapliafis/RdRpCATCH
@@ -36,7 +36,7 @@ that were positive for each sequence across all pHMM databases, and taxonomic in
36
36
 
37
37
  ** The tool has been modified to use [rolypoly](https://code.jgi.doe.gov/UNeri/rolypoly) code/approaches **
38
38
 
39
- ![rdrpcatch_flowchart_v0.png](images%2Frdrpcatch_flowchart_v0.png)
39
+ ![rdrpcatch_illustration.png](images%2Frdrpcatch_illustration.png)
40
40
 
41
41
  ### Supported databases
42
42
  - NeoRdRp <sup>1</sup> : 1182 pHMMs
@@ -133,25 +133,25 @@ Command to download pre-compiled databases from Zenodo. If the databases are alr
133
133
  ### rdrpcatch scan:
134
134
  Search a given input using selected RdRp databases.
135
135
 
136
- | Argument | Short Flag | Type | Description |
137
- |----------|------------|------|-------------|
138
- | `--input` | `-i` | FILE | Path to the input FASTA file. [required] |
139
- | `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |
140
- | `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |
141
- | `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot, all |
142
- | `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |
143
- | `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |
144
- | `--verbose` | `-v` | FLAG | Print verbose output. |
145
- | `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |
146
- | `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |
147
- | `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |
148
- | `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |
149
- | `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |
150
- | `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |
151
- | `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |
152
- | `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |
153
- | `--bundle` | `-bundle` | | Bundle the output files into a single archive. (default: False) |
154
- | `--keep_tmp` | `-keep_tmp` | | Keep the temporary files generated during the analysis. (default: False) |
136
+ | Argument | Short Flag | Type | Description |
137
+ |----------|------------|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
138
+ | `--input` | `-i` | FILE | Path to the input FASTA file. [required] |
139
+ | `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |
140
+ | `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |
141
+ | `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot_HMM, Zayed_HMM, all |
142
+ | `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |
143
+ | `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |
144
+ | `--verbose` | `-v` | FLAG | Print verbose output. |
145
+ | `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |
146
+ | `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |
147
+ | `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |
148
+ | `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |
149
+ | `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |
150
+ | `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |
151
+ | `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |
152
+ | `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |
153
+ | `--bundle` | `-bundle` | | Bundle the output files into a single archive. (default: False) |
154
+ | `--keep_tmp` | `-keep_tmp` | | Keep the temporary files generated during the analysis. (default: False) |
155
155
 
156
156
 
157
157
 
@@ -205,7 +205,6 @@ Dimitris Karapliafis (dimitris.karapliafis@wur.nl), potentially via slack/teams
205
205
 
206
206
  ## TODO:
207
207
  - [ ] loud logging is linking to the utils.py file, not the actual line of code causing the error.
208
- - [ ] Add `overwrite` flag
209
208
  - [ ] drop `db_dir` argument and use global/environment/config variable that is set after running the `download` command
210
209
 
211
210
 
@@ -1,7 +1,7 @@
1
1
  rdrpcatch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- rdrpcatch/rdrpcatch_wrapper.py,sha256=bZ5w4NuTlCSUsCx9baEtJSk7jGiyp-6XthO80IKaMXI,30564
2
+ rdrpcatch/rdrpcatch_wrapper.py,sha256=X-U0CKQWHwybLIdWvaFZGEj-v0oTUnBv2PbiLAdu8s4,31573
3
3
  rdrpcatch/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- rdrpcatch/cli/args.py,sha256=2E2gXY42hNasUP94HmPxpgVCA1glk_oN7D5ftbu6W2c,15805
4
+ rdrpcatch/cli/args.py,sha256=DX7gfESWi4j1CNpALAEG45JV_b5KkU1LAJj2FDb8J5g,15963
5
5
  rdrpcatch/rdrpcatch_scripts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  rdrpcatch/rdrpcatch_scripts/fetch_dbs.py,sha256=e9ShColfLgBvWSZpGOvY3zKhEgIg3rw1IIV__KX7N-g,11054
7
7
  rdrpcatch/rdrpcatch_scripts/format_pyhmmer_out.py,sha256=2_ERXFQK2lpVReWl0jwQdnKIObv_zq07uFJOzGsTHlo,25025
@@ -12,8 +12,8 @@ rdrpcatch/rdrpcatch_scripts/plot.py,sha256=Y1mZL7rkKHFKEs2D7T2Qj2kpfiORmFwRLq1LY
12
12
  rdrpcatch/rdrpcatch_scripts/run_pyhmmer.py,sha256=9zcMzaIwQ4_-NgYzG9kejxOBaDi-gbzaqpvZti8ZXA4,9008
13
13
  rdrpcatch/rdrpcatch_scripts/run_seqkit.py,sha256=5y7DtJ6NLa4sRoBQOcjBfczKlqG_LibNrEqNmKLrHu0,4361
14
14
  rdrpcatch/rdrpcatch_scripts/utils.py,sha256=jvpyPxchAMn6BeLV7HOFECSY_a3nbkxDBBL8tunmM8A,16938
15
- rdrpcatch-0.0.5.dist-info/METADATA,sha256=X3wolDh_nUrk7caPG4jFMvsF7FHZCvYuGzjPLZnC4VA,14004
16
- rdrpcatch-0.0.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
17
- rdrpcatch-0.0.5.dist-info/entry_points.txt,sha256=uiyoPO41jNz_KVOt2JdPak9NbVei-D8WQ6saMeMBFpE,53
18
- rdrpcatch-0.0.5.dist-info/licenses/LICENSE,sha256=3jm5vKRMIaiETEFfNN34-oyWUShxZtmDmL38PNAwlUI,1120
19
- rdrpcatch-0.0.5.dist-info/RECORD,,
15
+ rdrpcatch-0.0.7.dist-info/METADATA,sha256=BU-V7TAZcYQC5L3KuX_N_iH_l7Q77go7ZF9-1jYRrQE,16219
16
+ rdrpcatch-0.0.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
17
+ rdrpcatch-0.0.7.dist-info/entry_points.txt,sha256=uiyoPO41jNz_KVOt2JdPak9NbVei-D8WQ6saMeMBFpE,53
18
+ rdrpcatch-0.0.7.dist-info/licenses/LICENSE,sha256=3jm5vKRMIaiETEFfNN34-oyWUShxZtmDmL38PNAwlUI,1120
19
+ rdrpcatch-0.0.7.dist-info/RECORD,,