rdrpcatch 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdrpcatch/cli/args.py +36 -47
- rdrpcatch/rdrpcatch_scripts/fetch_dbs.py +13 -9
- rdrpcatch/rdrpcatch_wrapper.py +54 -12
- {rdrpcatch-0.0.6.dist-info → rdrpcatch-0.0.8.dist-info}/METADATA +89 -31
- {rdrpcatch-0.0.6.dist-info → rdrpcatch-0.0.8.dist-info}/RECORD +8 -8
- {rdrpcatch-0.0.6.dist-info → rdrpcatch-0.0.8.dist-info}/WHEEL +0 -0
- {rdrpcatch-0.0.6.dist-info → rdrpcatch-0.0.8.dist-info}/entry_points.txt +0 -0
- {rdrpcatch-0.0.6.dist-info → rdrpcatch-0.0.8.dist-info}/licenses/LICENSE +0 -0
rdrpcatch/cli/args.py
CHANGED
|
@@ -26,7 +26,7 @@ def parse_comma_separated_options(ctx, param, value):
|
|
|
26
26
|
return ['all']
|
|
27
27
|
|
|
28
28
|
allowed_choices = ['RVMT', 'NeoRdRp', 'NeoRdRp.2.1', 'TSA_Olendraite_fam', 'TSA_Olendraite_gen', 'RDRP-scan',
|
|
29
|
-
'
|
|
29
|
+
'Lucaprot_HMM', 'Zayed_HMM', 'all', 'none']
|
|
30
30
|
lower_choices = [choice.lower() for choice in allowed_choices]
|
|
31
31
|
options = value.split(',')
|
|
32
32
|
lower_options = [option.lower() for option in options]
|
|
@@ -66,17 +66,16 @@ def cli():
|
|
|
66
66
|
@click.option("-o", "--output",
|
|
67
67
|
help="Path to the output directory.",
|
|
68
68
|
type=click.Path(exists=False, file_okay=False, writable=True, path_type=Path), required=True)
|
|
69
|
-
@click.option("-
|
|
69
|
+
@click.option("-db-dir", "--db-dir",
|
|
70
70
|
help="Path to the directory containing RdRpCATCH databases.",
|
|
71
71
|
type=click.Path(exists=True, dir_okay=True, readable=True, path_type=Path),required=True)
|
|
72
|
-
@click.option("-dbs", "--
|
|
72
|
+
@click.option("-dbs", "--db-options",
|
|
73
73
|
callback=parse_comma_separated_options,
|
|
74
74
|
default="all",
|
|
75
75
|
help="Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1,"
|
|
76
|
-
" TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,
|
|
76
|
+
" TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,Lucaprot_HMM, Zayed_HMM, all, none. ")
|
|
77
77
|
@click.option("--custom-dbs",
|
|
78
|
-
help="Path to directory containing custom MSAs/pHMM files to use as additional databases"
|
|
79
|
-
type=click.Path(exists=True, path_type=Path))
|
|
78
|
+
help="Path to directory containing custom MSAs/pHMM files to use as additional databases")
|
|
80
79
|
@click.option("-seq_type", "--seq_type",
|
|
81
80
|
type=click.STRING,
|
|
82
81
|
default=None,
|
|
@@ -112,7 +111,7 @@ def cli():
|
|
|
112
111
|
type=click.INT,
|
|
113
112
|
default=400,
|
|
114
113
|
help="Minimum length threshold for seqkit seq. (default: 400)")
|
|
115
|
-
@click.option('-
|
|
114
|
+
@click.option('-gen-code', '--gen_code',
|
|
116
115
|
type=click.INT,
|
|
117
116
|
default=1,
|
|
118
117
|
help='Genetic code to use for translation. (default: 1) Possible genetic codes (supported by seqkit translate) : 1: The Standard Code, '
|
|
@@ -143,7 +142,7 @@ def cli():
|
|
|
143
142
|
is_flag=True,
|
|
144
143
|
default=False,
|
|
145
144
|
help="Bundle the output files into a single archive. (default: False)")
|
|
146
|
-
@click.option('-
|
|
145
|
+
@click.option('-keep-tmp', '--keep_tmp',
|
|
147
146
|
is_flag=True,
|
|
148
147
|
default=False,
|
|
149
148
|
help="Keep temporary files (Expert users) (default: False)")
|
|
@@ -164,7 +163,7 @@ def scan(ctx, input, output, db_options, db_dir, custom_dbs, seq_type, verbose,
|
|
|
164
163
|
|
|
165
164
|
table.add_row("Input File", str(input))
|
|
166
165
|
table.add_row("Output Directory", str(output))
|
|
167
|
-
table.add_row("
|
|
166
|
+
table.add_row("Supported databases", ", ".join(db_options))
|
|
168
167
|
table.add_row("Database Directory", str(db_dir))
|
|
169
168
|
if custom_dbs:
|
|
170
169
|
table.add_row("Custom Databases", str(custom_dbs))
|
|
@@ -184,24 +183,13 @@ def scan(ctx, input, output, db_options, db_dir, custom_dbs, seq_type, verbose,
|
|
|
184
183
|
|
|
185
184
|
console.print(Panel(table, title="Scan Configuration"))
|
|
186
185
|
|
|
187
|
-
# Add custom databases if provided
|
|
188
|
-
if custom_dbs:
|
|
189
|
-
db = db_fetcher(db_dir)
|
|
190
|
-
if os.path.isfile(custom_dbs):
|
|
191
|
-
db.add_custom_db(custom_dbs)
|
|
192
|
-
else:
|
|
193
|
-
for item in os.listdir(custom_dbs):
|
|
194
|
-
item_path = os.path.join(custom_dbs, item)
|
|
195
|
-
if os.path.isfile(item_path) and item_path.endswith(('.hmm', '.h3m', '.msa', '.sto', '.fasta', '.fa')):
|
|
196
|
-
db.add_custom_db(item_path)
|
|
197
|
-
elif os.path.isdir(item_path):
|
|
198
|
-
db.add_custom_db(item_path, item)
|
|
199
186
|
|
|
200
187
|
run_scan(
|
|
201
188
|
input_file=input,
|
|
202
189
|
output_dir=output,
|
|
203
190
|
db_options=db_options,
|
|
204
191
|
db_dir=db_dir,
|
|
192
|
+
custom_dbs=custom_dbs,
|
|
205
193
|
seq_type=seq_type,
|
|
206
194
|
verbose=verbose,
|
|
207
195
|
e=evalue,
|
|
@@ -217,29 +205,6 @@ def scan(ctx, input, output, db_options, db_dir, custom_dbs, seq_type, verbose,
|
|
|
217
205
|
overwrite=overwrite
|
|
218
206
|
)
|
|
219
207
|
|
|
220
|
-
# @cli.command("download", help="Download RdRpCATCH databases.")
|
|
221
|
-
# @click.option("--destination_dir", "-dest",
|
|
222
|
-
# help="Path to the directory to download HMM databases.",
|
|
223
|
-
# type=click.Path(exists=False, file_okay=False, writable=True, path_type=Path), required=True)
|
|
224
|
-
# @click.option("--check-updates", "-u",
|
|
225
|
-
# is_flag=True,
|
|
226
|
-
# help="Check for database updates")
|
|
227
|
-
# @click.pass_context
|
|
228
|
-
# def download(ctx, destination_dir, check_updates):
|
|
229
|
-
# """Download RdRpCATCH databases."""
|
|
230
|
-
#
|
|
231
|
-
# # if check_updates:
|
|
232
|
-
# # db = db_fetcher(destination_dir)
|
|
233
|
-
# # version_info = db.check_db_updates()
|
|
234
|
-
# # if version_info:
|
|
235
|
-
# # console.print("Current database versions:")
|
|
236
|
-
# # for db_name, info in version_info.items():
|
|
237
|
-
# # console.print(f"- {db_name}: {info}")
|
|
238
|
-
# # else:
|
|
239
|
-
# # console.print("No version information available")
|
|
240
|
-
# # return
|
|
241
|
-
#
|
|
242
|
-
# run_download(destination_dir)
|
|
243
208
|
#
|
|
244
209
|
# # @cli.command("gui", help="Launch the GUI.")
|
|
245
210
|
# # @click.pass_context
|
|
@@ -251,17 +216,41 @@ def scan(ctx, input, output, db_options, db_dir, custom_dbs, seq_type, verbose,
|
|
|
251
216
|
|
|
252
217
|
|
|
253
218
|
|
|
254
|
-
@cli.command("
|
|
219
|
+
@cli.command("databases", help="Download & update RdRpCATCH databases. If databases are already installed in the "
|
|
255
220
|
"specified directory,"
|
|
256
221
|
" it will check for updates and download the latest version if available.")
|
|
257
|
-
@click.option("--
|
|
222
|
+
@click.option("--destination-dir", "-dest",
|
|
258
223
|
help="Path to directory to download databases",
|
|
259
224
|
type=click.Path(path_type=Path, file_okay=False, writable=True),
|
|
260
225
|
required=True)
|
|
261
226
|
@click.option("--concept-doi", default="10.5281/zenodo.14358348",
|
|
262
227
|
help="Zenodo Concept DOI for database repository")
|
|
263
|
-
|
|
228
|
+
@click.option("--add-custom-db", "-cdb",
|
|
229
|
+
help="Path to a custom, pressed pHMM database directory. This only works"
|
|
230
|
+
"if the supported databases have already been downloaded."
|
|
231
|
+
" Please point to the directory the databases are stored ('rdrpcatch_dbs') using"
|
|
232
|
+
"the '--destination-dir' flag."
|
|
233
|
+
|
|
234
|
+
, type = click.Path(exists=True, dir_okay=True, readable=True, path_type=Path))
|
|
235
|
+
|
|
236
|
+
def databases(destination_dir: Path, concept_doi: str, add_custom_db: Path | None = None):
|
|
264
237
|
"""Handle database download/update workflow"""
|
|
238
|
+
|
|
239
|
+
if add_custom_db:
|
|
240
|
+
if not destination_dir.exists():
|
|
241
|
+
console.print("[red]× Destination directory does not exist![/red]")
|
|
242
|
+
raise click.Abort()
|
|
243
|
+
if not destination_dir.is_dir():
|
|
244
|
+
console.print("[red]× Destination path is not a directory![/red]")
|
|
245
|
+
raise click.Abort()
|
|
246
|
+
|
|
247
|
+
db = db_fetcher(destination_dir)
|
|
248
|
+
db.add_custom_db(add_custom_db)
|
|
249
|
+
|
|
250
|
+
console.print(f"[green]✓ Custom database added successfully to {destination_dir}[/green]")
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
|
|
265
254
|
downloader = ZenodoDownloader(concept_doi, destination_dir)
|
|
266
255
|
|
|
267
256
|
try:
|
|
@@ -49,20 +49,23 @@ class db_fetcher:
|
|
|
49
49
|
|
|
50
50
|
# Copy the database file
|
|
51
51
|
if os.path.isfile(db_path):
|
|
52
|
-
|
|
52
|
+
raise ValueError("Custom database must be a directory, not a file. Name the directory as the database name,"
|
|
53
|
+
"and include a pressed HMMER HMM database inside. For directions, see the README.md file.")
|
|
54
|
+
|
|
53
55
|
elif os.path.isdir(db_path):
|
|
54
56
|
if os.path.exists(target_path):
|
|
55
57
|
shutil.rmtree(target_path)
|
|
56
58
|
shutil.copytree(db_path, target_path)
|
|
57
59
|
|
|
58
|
-
#
|
|
59
|
-
|
|
60
|
-
version_info.
|
|
61
|
-
version_info
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
60
|
+
# For now, we stop saving the version info, cause it messes up with the download module.
|
|
61
|
+
# # Update version info
|
|
62
|
+
# version_info = self._get_db_version()
|
|
63
|
+
# version_info.setdefault('custom_dbs', {})
|
|
64
|
+
# version_info['custom_dbs'][db_name] = {
|
|
65
|
+
# 'added': datetime.datetime.now().isoformat(),
|
|
66
|
+
# 'path': target_path
|
|
67
|
+
# }
|
|
68
|
+
# self._save_db_version(version_info)
|
|
66
69
|
|
|
67
70
|
def _resolve_rdrpcatch_path(self):
|
|
68
71
|
"""Automatically detect correct database path structure"""
|
|
@@ -93,6 +96,7 @@ class db_fetcher:
|
|
|
93
96
|
|
|
94
97
|
# First check custom databases
|
|
95
98
|
if os.path.exists(self.custom_db_dir):
|
|
99
|
+
|
|
96
100
|
custom_path = os.path.join(self.custom_db_dir, db_name)
|
|
97
101
|
if os.path.exists(custom_path):
|
|
98
102
|
if os.path.isfile(custom_path) and custom_path.endswith(('.h3m', '.hmm')):
|
rdrpcatch/rdrpcatch_wrapper.py
CHANGED
|
@@ -53,7 +53,7 @@ def bundle_results(output_dir, prefix):
|
|
|
53
53
|
|
|
54
54
|
return archive_path
|
|
55
55
|
|
|
56
|
-
def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,incdomE,domE,incE,z, cpus, length_thr, gen_code, bundle, keep_tmp, overwrite):
|
|
56
|
+
def run_scan(input_file, output_dir, db_options, db_dir, custom_dbs, seq_type, verbose, e,incdomE,domE,incE,z, cpus, length_thr, gen_code, bundle, keep_tmp, overwrite):
|
|
57
57
|
"""
|
|
58
58
|
Run RdRpCATCH scan.
|
|
59
59
|
|
|
@@ -127,7 +127,8 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
127
127
|
|
|
128
128
|
logger.silent_log(f"Input File: {input_file}")
|
|
129
129
|
logger.silent_log(f"Output Directory: {output_dir}")
|
|
130
|
-
logger.silent_log(f"Databases: {db_options}")
|
|
130
|
+
logger.silent_log(f"Supported Databases: {db_options}")
|
|
131
|
+
logger.silent_log(f"Custom Databases: {custom_dbs}")
|
|
131
132
|
logger.silent_log(f"Database Directory: {db_dir}")
|
|
132
133
|
logger.silent_log(f"Sequence Type: {seq_type}")
|
|
133
134
|
logger.silent_log(f"Verbose Mode: {'ON' if verbose else 'OFF'}")
|
|
@@ -170,7 +171,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
170
171
|
|
|
171
172
|
logger.loud_log("Fetching HMM databases...")
|
|
172
173
|
|
|
173
|
-
## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan,
|
|
174
|
+
## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot_HMM,Zayed_HMM
|
|
174
175
|
rvmt_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("RVMT")
|
|
175
176
|
if verbose:
|
|
176
177
|
logger.loud_log(f"RVMT HMM database fetched from: {rvmt_hmm_db}")
|
|
@@ -202,20 +203,32 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
202
203
|
logger.loud_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
|
|
203
204
|
else:
|
|
204
205
|
logger.silent_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
|
|
205
|
-
lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("
|
|
206
|
+
lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Lucaprot_HMM")
|
|
206
207
|
if verbose:
|
|
207
208
|
logger.loud_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
|
|
208
209
|
else:
|
|
209
210
|
logger.silent_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
|
|
211
|
+
zayed_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Zayed_HMM")
|
|
212
|
+
if verbose:
|
|
213
|
+
logger.loud_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
|
|
214
|
+
else:
|
|
215
|
+
logger.silent_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
|
|
210
216
|
|
|
211
217
|
db_name_list = []
|
|
212
218
|
db_path_list = []
|
|
213
219
|
|
|
214
220
|
## Set up HMM databases
|
|
215
221
|
if db_options == ['all']:
|
|
216
|
-
db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "
|
|
217
|
-
db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db]
|
|
218
|
-
|
|
222
|
+
db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "Lucaprot_HMM", "Zayed_HMM"]
|
|
223
|
+
db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db, zayed_hmm_db]
|
|
224
|
+
elif db_options == ['none'] and not custom_dbs:
|
|
225
|
+
raise Exception("No databases selected. Please select at least one database or provide custom databases.")
|
|
226
|
+
elif db_options == ['none'] and custom_dbs:
|
|
227
|
+
logger.loud_log("No supported databases selected, but custom databases provided. Using only custom databases.")
|
|
228
|
+
if not os.path.exists(os.path.join(db_dir, "custom_dbs")):
|
|
229
|
+
raise Exception(f"Custom databases directory not found: {os.path.join(db_dir, 'custom_dbs')}. Please"
|
|
230
|
+
f" use rdrpcatch databases to create a valid custom database as described in the "
|
|
231
|
+
f"documentation.")
|
|
219
232
|
else:
|
|
220
233
|
for db in db_options:
|
|
221
234
|
if db == "RVMT".lower():
|
|
@@ -236,15 +249,42 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
236
249
|
elif db == "RDRP-scan".lower():
|
|
237
250
|
db_name_list.append("RDRP-scan")
|
|
238
251
|
db_path_list.append(rdrpscan_hmm_db)
|
|
239
|
-
elif db == "
|
|
240
|
-
db_name_list.append("
|
|
252
|
+
elif db == "Lucaprot_HMM".lower():
|
|
253
|
+
db_name_list.append("Lucaprot_HMM")
|
|
241
254
|
db_path_list.append(lucaprot_hmm_db)
|
|
255
|
+
elif db == "Zayed_HMM".lower():
|
|
256
|
+
db_name_list.append("Zayed_HMM")
|
|
257
|
+
db_path_list.append(zayed_hmm_db)
|
|
242
258
|
else:
|
|
243
259
|
raise Exception(f"Invalid database option: {db}")
|
|
244
260
|
|
|
245
|
-
|
|
261
|
+
## Check if custom databases are provided
|
|
262
|
+
if custom_dbs:
|
|
246
263
|
|
|
264
|
+
if not os.path.exists(os.path.join(db_dir, "custom_dbs")):
|
|
265
|
+
raise Exception(f"Custom databases directory not found: {os.path.join(db_dir, 'custom_dbs')}. Please"
|
|
266
|
+
f" use rdrpcatch databases to create a valid custom database as described in the "
|
|
267
|
+
f"documentation.")
|
|
268
|
+
|
|
269
|
+
custom_db_names = custom_dbs.split(',')
|
|
270
|
+
for custom_db in custom_db_names:
|
|
271
|
+
if verbose:
|
|
272
|
+
logger.loud_log(f"Fetching custom database: {custom_db}")
|
|
273
|
+
else:
|
|
274
|
+
logger.silent_log(f"Fetching custom database: {custom_db}")
|
|
247
275
|
|
|
276
|
+
custom_db = custom_db.strip()
|
|
277
|
+
custom_db_path = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(custom_db)
|
|
278
|
+
db_name_list.append(custom_db)
|
|
279
|
+
db_path_list.append(custom_db_path)
|
|
280
|
+
|
|
281
|
+
if verbose:
|
|
282
|
+
logger.loud_log(f"Custom database {custom_db} fetched from: {custom_db_path}")
|
|
283
|
+
else:
|
|
284
|
+
logger.silent_log(f"Custom database {custom_db} fetched from: {custom_db_path}")
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
# Fetch mmseqs database
|
|
248
288
|
logger.loud_log("Fetching Mmseqs2 databases...")
|
|
249
289
|
|
|
250
290
|
mmseqs_db_path = fetch_dbs.db_fetcher(db_dir).fetch_mmseqs_db_path("mmseqs_refseq_riboviria_20250211")
|
|
@@ -382,7 +422,8 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
382
422
|
|
|
383
423
|
# Check if the combined dataframe is empty
|
|
384
424
|
if combined_df.is_empty():
|
|
385
|
-
|
|
425
|
+
db_name_string = ', '.join(db_name_list)
|
|
426
|
+
logger.loud_log(f"No hits found by RdRpCATCH for databases {db_name_string}. Exiting.")
|
|
386
427
|
return None
|
|
387
428
|
|
|
388
429
|
# Generate upset plot
|
|
@@ -549,7 +590,8 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
|
|
|
549
590
|
|
|
550
591
|
# Check if the combined dataframe is empty
|
|
551
592
|
if combined_df.is_empty():
|
|
552
|
-
|
|
593
|
+
db_name_string = ', '.join(db_name_list)
|
|
594
|
+
logger.loud_log(f"No hits found by RdRpCATCH for databases {db_name_string}. Exiting.")
|
|
553
595
|
return None
|
|
554
596
|
|
|
555
597
|
# Generate upset plot
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rdrpcatch
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.8
|
|
4
4
|
Dynamic: Summary
|
|
5
5
|
Project-URL: Home, https://github.com/dimitris-karapliafis/RdRpCATCH
|
|
6
6
|
Project-URL: Source, https://github.com/dimitris-karapliafis/RdRpCATCH
|
|
@@ -34,18 +34,31 @@ library to perform pHMM searches. In addition, the tool scans each sequence (aa
|
|
|
34
34
|
In addition, RdRpCATCH provides information about the number of profiles
|
|
35
35
|
that were positive for each sequence across all pHMM databases, and taxonomic information based on the MMseqs2 easy-taxonomy and search modules against a custom RefSeq Riboviria database.
|
|
36
36
|
|
|
37
|
+
### Version 0.0.7 -> 0.0.8 Changelog
|
|
38
|
+
- Added support for custom pHMM databases. See the [Setting up custom pHMM databases](#setting-up-custom-phmm-databases) section for more information.
|
|
39
|
+
- All specified flags use '-' instead of '_' (e.g. `--db-dir` instead of `--db_dir`).
|
|
40
|
+
- Fixed issue with specifying the Lucaprot_HMM and Zayed_HMM databases in the `--db-options` argument.
|
|
41
|
+
- Command `rdrpcatch download` renamed as `rdrpcatch databases` for clarity, as it now supports adding custom pHMM
|
|
42
|
+
databases to the RdRpCATCH databases. This is facilitated by the `--add-custom-db` argument.
|
|
43
|
+
- Added none option to the `--db-options` argument to search only against custom databases.
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
|
|
37
47
|
** The tool has been modified to use [rolypoly](https://code.jgi.doe.gov/UNeri/rolypoly) code/approaches **
|
|
38
48
|
|
|
39
|
-
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+

|
|
40
52
|
|
|
41
53
|
### Supported databases
|
|
42
54
|
- NeoRdRp <sup>1</sup> : 1182 pHMMs
|
|
43
55
|
- NeoRdRp2 <sup>2</sup>: 19394 pHMMs
|
|
44
56
|
- RVMT <sup>3</sup>: 710 pHMMs
|
|
45
57
|
- RdRp-Scan <sup>4</sup> : 68 pHMMs
|
|
46
|
-
-
|
|
47
|
-
-
|
|
48
|
-
-
|
|
58
|
+
- TSA_Olendraite_fam <sup>5</sup>: 77 pHMMs
|
|
59
|
+
- TSA_Olendraite_gen <sup>6</sup> : 341 pHMMs
|
|
60
|
+
- LucaProt_HMM<sup>7 </sup> : 754 pHMMs
|
|
61
|
+
- Zayed_HMM<sup>8 </sup> : 2489 pHMMs
|
|
49
62
|
|
|
50
63
|
1. Sakaguchi, S. et al. (2022) 'NeoRdRp: A comprehensive dataset for identifying RNA-dependent RNA polymerases of various RNA viruses from metatranscriptomic data', *Microbes and Environments*, 37(3). [doi:10.1264/jsme2.me22001](https://doi.org/10.1264/jsme2.me22001)
|
|
51
64
|
2. Sakaguchi, S., Nakano, T. and Nakagawa, S. (2024) 'Neordrp2 with improved seed data, annotations, and scoring', *Frontiers in Virology*, 4. [doi:10.3389/fviro.2024.1378695](https://doi.org/10.3389/fviro.2024.1378695)
|
|
@@ -53,7 +66,9 @@ that were positive for each sequence across all pHMM databases, and taxonomic in
|
|
|
53
66
|
4. Charon, J. et al. (2022) 'RDRP-Scan: A bioinformatic resource to identify and annotate divergent RNA viruses in metagenomic sequence data', *Virus Evolution*, 8(2). [doi:10.1093/ve/veac082](https://doi.org/10.1093/ve/veac082)
|
|
54
67
|
5. Olendraite, I., Brown, K. and Firth, A.E. (2023) 'Identification of RNA virus–derived rdrp sequences in publicly available transcriptomic data sets', *Molecular Biology and Evolution*, 40(4). [doi:10.1093/molbev/msad060](https://doi.org/10.1093/molbev/msad060)
|
|
55
68
|
6. Olendraite, I. (2021) 'Mining diverse and novel RNA viruses in transcriptomic datasets', Apollo. Available at: [https://www.repository.cam.ac.uk/items/1fabebd2-429b-45c9-b6eb-41d27d0a90c2](https://www.repository.cam.ac.uk/items/1fabebd2-429b-45c9-b6eb-41d27d0a90c2)
|
|
56
|
-
7. Hou, X. et al. (2024) 'Using artificial intelligence to document the hidden RNA virosphere', *Cell*, 187(24). [doi:10.1016/j.cell.2024.09.027](https://doi.org/10.1016/j.cell.2024.09.027)
|
|
69
|
+
7. Hou, X. and He, Y. et al. (2024) 'Using artificial intelligence to document the hidden RNA virosphere', *Cell*, 187(24). [doi:10.1016/j.cell.2024.09.027](https://doi.org/10.1016/j.cell.2024.09.027)
|
|
70
|
+
8. Zayed, A. A., et al. (2022) 'Cryptic and abundant marine viruses at the evolutionary origins of Earth’s RNA virome.' *Science*, 376(6589), 156–162. [doi:10.1126/science.abm5847](https://doi.org/10.1126/science.abm5847)
|
|
71
|
+
|
|
57
72
|
|
|
58
73
|
|
|
59
74
|
## Installation
|
|
@@ -94,13 +109,16 @@ Activate the environment and download the RdRpCATCH databases:
|
|
|
94
109
|
|
|
95
110
|
```bash
|
|
96
111
|
conda activate rdrpcatch
|
|
97
|
-
rdrpcatch
|
|
112
|
+
rdrpcatch databases --destination-dir path/to/store/databases
|
|
98
113
|
```
|
|
99
114
|
|
|
100
115
|
* Note 1: The databases are large files and may take some time to download (~ 3 GB).
|
|
101
116
|
* Note 2: The databases are stored in the specified directory, and the path is required to run RdRpCATCH.
|
|
102
117
|
* Note 3: If you encounter an SSL error while downloading, please try again. The error seems to appear sporadically during testing, and a simple re-initiation of the downloading process seems to fix it.
|
|
103
118
|
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
|
|
104
122
|
## Usage
|
|
105
123
|
RdRpCATCH can be used as a CLI tool as follows:
|
|
106
124
|
|
|
@@ -109,20 +127,58 @@ RdRpCATCH can be used as a CLI tool as follows:
|
|
|
109
127
|
# conda activate rdrpcatch
|
|
110
128
|
|
|
111
129
|
# scan the input fasta file with the selected databases
|
|
112
|
-
rdrpcatch scan -i path/to/input.fasta -o path/to/output_dir -
|
|
130
|
+
rdrpcatch scan -i path/to/input.fasta -o path/to/output_dir -db-dir path/to/database
|
|
113
131
|
```
|
|
114
|
-
|
|
132
|
+
|
|
133
|
+
## Input description
|
|
115
134
|
The input file can be one or more nucleotide or protein sequences in multi-fasta format.
|
|
116
135
|
The output directory is where the results will be stored. We recommend specifying the type of the sequence in the command line.
|
|
117
136
|
An optional argument `--seq_type` (nuc or prot) can be used to specify if the input fasta file sequences are nucleotide or amino acid.
|
|
118
137
|
|
|
138
|
+
|
|
139
|
+
## Setting up custom pHMM databases
|
|
140
|
+
It is possible to use custom pHMM databases with RdRpCATCH. As a prerequisite, you need to install the RdRpCATCH
|
|
141
|
+
databases using the `rdrpcatch databases` command as described above, to a directory of your choice.
|
|
142
|
+
|
|
143
|
+
The custom databases should be formatted as follows:
|
|
144
|
+
|
|
145
|
+
- First create a directory and give it a descriptive name, e.g. `my_custom_rdrp_database`. Important: The name should not contain comma `,` characters.
|
|
146
|
+
- Inside the directory put your custom pHMM HMMER pressed database. You can use the `hmmpress` command of HMMER to create the pressed database from your custom HMM file. This creates a set of files with the same name as the original HMM file, but with different extensions (e.g. `.h3f`, `.h3i`, `.h3m`, `.h3p`). The directory should contain all these files. Please refer to the HMMER manual for more information on how to create a pressed database from an HMM file. (http://eddylab.org/software/hmmer/Userguide.pdf)
|
|
147
|
+
- Next you can add the directory to the custom databases that are readable by RdRpCATCH. This can be done by using the rdrpcatch databases command as follows:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
rdrpcatch databases --add-custom-db path/to/my_custom_rdrp_database --destination-dir path/that/contains/rdrpcatch/databases
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
- This will add the custom database to the list of databases that can be used with RdRpCATCH.
|
|
154
|
+
- The custom database can then be used with the `rdrpcatch scan` command by specifying the `--custom-dbs` argument as follows:
|
|
155
|
+
-
|
|
156
|
+
```bash
|
|
157
|
+
rdrpcatch scan -i path/to/input.fasta -o path/to/output_dir -db-dir path/to/database --custom-dbs custom_database_name
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
- The `custom_database_name` should be the name of the directory that contains the custom pHMM files, without the path.
|
|
161
|
+
- For example, if the custom database is stored in `path/to/my_custom_rdrp_database`, you would use `--custom-dbs my_custom_rdrp_database` in the command line.
|
|
162
|
+
- You can add multiple custom databases by installing them in the same way and specifying them by separating them with commas, e.g. `--custom-dbs my_custom_rdrp_database,another_custom_database`.
|
|
163
|
+
- The custom databases can be used in combination with the pre-compiled databases provided by RdRpCATCH. To do this, you can specify the `--db-options` argument with the names of the pre-compiled databases you want to use, and specify the custom databases with the `--custom-dbs` argument.
|
|
164
|
+
- For example, if you want to use the NeoRdRp and RVMT databases along with your custom database, you would use the following command:
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
rdrpcatch scan -i path/to/input.fasta -o path/to/output_dir -db-dir path/to/database --db-options NeoRdRp,RVMT --custom-dbs my_custom_rdrp_database
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
- Note: By default, RdRpCATCH will search against all pre-compiled databases if no `--db-options` argument is specified. If you want to use only the custom databases, you can specify `--db-options none` to avoid searching against the pre-compiled databases.
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
|
|
119
175
|
## Commands
|
|
120
176
|
The following two commands are available in RdRpCATCH:
|
|
121
177
|
* [`rdrpcatch scan`](#rdrpcatch-scan)
|
|
122
|
-
* [`rdrpcatch
|
|
178
|
+
* [`rdrpcatch databases`](#rdrpcatch-databases)
|
|
123
179
|
|
|
124
|
-
### rdrpcatch
|
|
125
|
-
Command to download pre-compiled databases from Zenodo. If the databases are already downloaded in the specified directory
|
|
180
|
+
### rdrpcatch databases:
|
|
181
|
+
Command to download pre-compiled databases from Zenodo and to set up custom databases. If the databases are already downloaded in the specified directory
|
|
126
182
|
, the command will check for updates and download the latest version if available.
|
|
127
183
|
|
|
128
184
|
| Argument | Short Flag | Type | Description |
|
|
@@ -130,28 +186,30 @@ Command to download pre-compiled databases from Zenodo. If the databases are alr
|
|
|
130
186
|
| `--destination-dir` | `-dest` | PATH | Path to the directory to download HMM databases. [required] |
|
|
131
187
|
| `--concept-doi` | `` | TEXT | Zenodo Concept DOI for database repository |
|
|
132
188
|
| `--help` | `` | | Show help message and exit |
|
|
189
|
+
| `--add-custom-db` | `-cdb` | PATH | Path to the directory containing custom pHMM files to add to the RdRpCATCH databases. |
|
|
190
|
+
|
|
133
191
|
### rdrpcatch scan:
|
|
134
192
|
Search a given input using selected RdRp databases.
|
|
135
193
|
|
|
136
|
-
| Argument
|
|
137
|
-
|
|
138
|
-
| `--input`
|
|
139
|
-
| `--output`
|
|
140
|
-
| `--
|
|
141
|
-
| `--
|
|
142
|
-
| `--custom-dbs`
|
|
143
|
-
| `--
|
|
144
|
-
| `--verbose`
|
|
145
|
-
| `--evalue`
|
|
146
|
-
| `--incevalue`
|
|
147
|
-
| `--domevalue`
|
|
148
|
-
| `--incdomevalue` | `-incdomE`
|
|
149
|
-
| `--zvalue`
|
|
150
|
-
| `--cpus`
|
|
151
|
-
| `--
|
|
152
|
-
| `--
|
|
153
|
-
| `--bundle`
|
|
154
|
-
| `--
|
|
194
|
+
| Argument | Short Flag | Type | Description |
|
|
195
|
+
|------------------|---------------|------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
196
|
+
| `--input` | `-i` | FILE | Path to the input FASTA file. [required] |
|
|
197
|
+
| `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |
|
|
198
|
+
| `--db-dir` | `-db-dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |
|
|
199
|
+
| `--db-options` | `-dbs` | TEXT | Comma-separated list of pre-installed databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot_HMM, Zayed_HMM, all, none |
|
|
200
|
+
| `--custom-dbs` | | TEXT | Comma-separated list of custom databases to search against. Valid options: names of the directories that the custom databases are stored in. |
|
|
201
|
+
| `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |
|
|
202
|
+
| `--verbose` | `-v` | FLAG | Print verbose output. |
|
|
203
|
+
| `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |
|
|
204
|
+
| `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |
|
|
205
|
+
| `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |
|
|
206
|
+
| `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |
|
|
207
|
+
| `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |
|
|
208
|
+
| `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |
|
|
209
|
+
| `--length-thr` | `-length-thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |
|
|
210
|
+
| `--gen_code` | `-gen-code` | INTEGER | Genetic code to use for translation. (default: 1) |
|
|
211
|
+
| `--bundle` | `-bundle` | | Bundle the output files into a single archive. (default: False) |
|
|
212
|
+
| `--keep_tmp` | `-keep-tmp` | | Keep the temporary files generated during the analysis. (default: False) |
|
|
155
213
|
|
|
156
214
|
|
|
157
215
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
rdrpcatch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
rdrpcatch/rdrpcatch_wrapper.py,sha256=
|
|
2
|
+
rdrpcatch/rdrpcatch_wrapper.py,sha256=sV2xvkRcyJvxjk3ybN0FaaSn7uDsbJ2FbQreaEG2hgo,33674
|
|
3
3
|
rdrpcatch/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
-
rdrpcatch/cli/args.py,sha256=
|
|
4
|
+
rdrpcatch/cli/args.py,sha256=e3gAu84rMbEkQeijyy2m2wWBiymvbPRzkrzsdWdTOfk,15389
|
|
5
5
|
rdrpcatch/rdrpcatch_scripts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
rdrpcatch/rdrpcatch_scripts/fetch_dbs.py,sha256=
|
|
6
|
+
rdrpcatch/rdrpcatch_scripts/fetch_dbs.py,sha256=BuiTlwe8d489zP0TIsfd-KuynccQRSae_k5dPAXKyk4,11368
|
|
7
7
|
rdrpcatch/rdrpcatch_scripts/format_pyhmmer_out.py,sha256=2_ERXFQK2lpVReWl0jwQdnKIObv_zq07uFJOzGsTHlo,25025
|
|
8
8
|
rdrpcatch/rdrpcatch_scripts/gui.py,sha256=he8kx_4VJWB7SVv9XSQPk0DmkOjEFIg-uGMAtDp3t-w,10576
|
|
9
9
|
rdrpcatch/rdrpcatch_scripts/mmseqs_tax.py,sha256=bwzuCxu8nHQ5OC0Yr5Lyvhcyk9OWjuamInqe0T0lc38,3809
|
|
@@ -12,8 +12,8 @@ rdrpcatch/rdrpcatch_scripts/plot.py,sha256=Y1mZL7rkKHFKEs2D7T2Qj2kpfiORmFwRLq1LY
|
|
|
12
12
|
rdrpcatch/rdrpcatch_scripts/run_pyhmmer.py,sha256=9zcMzaIwQ4_-NgYzG9kejxOBaDi-gbzaqpvZti8ZXA4,9008
|
|
13
13
|
rdrpcatch/rdrpcatch_scripts/run_seqkit.py,sha256=5y7DtJ6NLa4sRoBQOcjBfczKlqG_LibNrEqNmKLrHu0,4361
|
|
14
14
|
rdrpcatch/rdrpcatch_scripts/utils.py,sha256=jvpyPxchAMn6BeLV7HOFECSY_a3nbkxDBBL8tunmM8A,16938
|
|
15
|
-
rdrpcatch-0.0.
|
|
16
|
-
rdrpcatch-0.0.
|
|
17
|
-
rdrpcatch-0.0.
|
|
18
|
-
rdrpcatch-0.0.
|
|
19
|
-
rdrpcatch-0.0.
|
|
15
|
+
rdrpcatch-0.0.8.dist-info/METADATA,sha256=JpFlynLSxi9M-JeH5Gr2Y4lFEITwtsfrWvcyY-F-TjA,20768
|
|
16
|
+
rdrpcatch-0.0.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
17
|
+
rdrpcatch-0.0.8.dist-info/entry_points.txt,sha256=uiyoPO41jNz_KVOt2JdPak9NbVei-D8WQ6saMeMBFpE,53
|
|
18
|
+
rdrpcatch-0.0.8.dist-info/licenses/LICENSE,sha256=3jm5vKRMIaiETEFfNN34-oyWUShxZtmDmL38PNAwlUI,1120
|
|
19
|
+
rdrpcatch-0.0.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|