rdrpcatch-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdrpcatch/__init__.py +0 -0
- rdrpcatch/cli/__init__.py +0 -0
- rdrpcatch/cli/args.py +358 -0
- rdrpcatch/rdrpcatch_scripts/__init__.py +0 -0
- rdrpcatch/rdrpcatch_scripts/fetch_dbs.py +302 -0
- rdrpcatch/rdrpcatch_scripts/format_pyhmmer_out.py +589 -0
- rdrpcatch/rdrpcatch_scripts/gui.py +256 -0
- rdrpcatch/rdrpcatch_scripts/mmseqs_tax.py +100 -0
- rdrpcatch/rdrpcatch_scripts/paths.py +162 -0
- rdrpcatch/rdrpcatch_scripts/plot.py +165 -0
- rdrpcatch/rdrpcatch_scripts/run_pyhmmer.py +155 -0
- rdrpcatch/rdrpcatch_scripts/run_seqkit.py +112 -0
- rdrpcatch/rdrpcatch_scripts/utils.py +414 -0
- rdrpcatch/rdrpcatch_wrapper.py +666 -0
- rdrpcatch-0.0.1.dist-info/METADATA +223 -0
- rdrpcatch-0.0.1.dist-info/RECORD +19 -0
- rdrpcatch-0.0.1.dist-info/WHEEL +4 -0
- rdrpcatch-0.0.1.dist-info/entry_points.txt +2 -0
- rdrpcatch-0.0.1.dist-info/licenses/LICENCE +9 -0
rdrpcatch/rdrpcatch_scripts/run_pyhmmer.py
@@ -0,0 +1,155 @@
+import os
+import re
+
+import pyhmmer
+
+
+class pyhmmsearch:
+
+    def __init__(self, hmmsearch_out_path, seq_file, hmm_file, cpus, e, incdomE, domE, incE, z):
+        self.hmmsearch_out_path = hmmsearch_out_path
+        self.hmmsearch_out_path_custom = str(self.hmmsearch_out_path.with_suffix('.custom.tsv'))
+        self.seq_file = seq_file
+        self.hmm_file = hmm_file
+        self.cpus = cpus
+        self.e = e
+        self.incdomE = incdomE
+        self.domE = domE
+        self.incE = incE
+        self.z = z
+
+    def run_pyhmmsearch(self):
+        """
+        TODO: 1. Add an option to run hmmsearch on long sequences (longer than 100 kb), as pyhmmer.Pipeline
+        TODO:    cannot handle them. See: https://pyhmmer.readthedocs.io/en/latest/api/plan7.html#pyhmmer.plan7.LongTargetsPipeline
+        TODO: 2. Parameters are currently hardcoded; add an option to change them.
+        """
+        if not os.path.exists(self.hmmsearch_out_path):
+
+            with pyhmmer.plan7.HMMPressedFile(self.hmm_file) as handle:
+                hmms = list(handle)
+
+            with pyhmmer.easel.SequenceFile(self.seq_file, digital=True) as handle:
+                db = list(handle)
+
+            with open(self.hmmsearch_out_path, 'wb') as raw_out, open(self.hmmsearch_out_path_custom, 'wb') as custom_out:
+                title_line = ["t_name", "t_acc", "tlen", "q_name", "q_acc", "qlen", "E-value",
+                              "score", "bias", "dom_num", "dom_total", "dom_c_value", "dom_i_value", "dom_score",
+                              "dom_bias", "hmm_from", "hmm_to", "ali_from", "ali_to", "env_from", "env_to", "acc",
+                              "description of target"]
+                raw_out.write("\t".join(title_line).encode("utf-8") + b"\n")
+                custom_out.write("\t".join(title_line).encode("utf-8") + b"\n")
+
+                for result in pyhmmer.hmmer.hmmsearch(hmms,
+                                                      db,
+                                                      cpus=self.cpus,
+                                                      E=self.e,
+                                                      incdomE=self.incdomE,
+                                                      domE=self.domE,
+                                                      incE=self.incE,
+                                                      Z=self.z):
+
+                    result.write(raw_out, format="domains", header=False)
+                    if len(result) >= 1:
+                        for hit in result:
+                            # headers may contain spaces, so name and accession are kept separate
+                            hit_desc = hit.accession or b""
+                            t_desc = hit.description or b"-"
+
+                            total_domains = len(hit.domains.included)
+                            dom_desc = result.query.description or b""
+
+                            for i, domain in enumerate(hit.domains.included):
+                                domain_num = i + 1
+
+                                # Approximate mean posterior probability of the aligned region on HMMER's
+                                # 0-9 digit scale. Non-digit PP characters ('*' = PP >= 0.95, '.' = gaps)
+                                # are stripped before summing but still count towards the divisor.
+                                aligned_probs = re.sub(r'[^0-9]', '', domain.alignment.posterior_probabilities)
+                                MEA = sum(int(digit) for digit in aligned_probs) / len(domain.alignment.posterior_probabilities)
+
+                                outputline = [
+                                    f"{hit.name.decode()}",             # t_name (protein)
+                                    f"{hit_desc.decode()}",             # t_acc (empty if none)
+                                    f"{hit.length}",                    # tlen (protein length)
+                                    f"{result.query.name.decode()}",    # q_name (HMM name)
+                                    f"{dom_desc.decode()}",             # q_acc (empty if none)
+                                    f"{domain.alignment.hmm_length}",   # qlen (HMM length)
+                                    f"{hit.evalue}",                    # E-value
+                                    f"{hit.score}",                     # score
+                                    f"{hit.bias}",                      # bias
+                                    f"{domain_num}",                    # dom_num (number of this domain)
+                                    f"{total_domains}",                 # dom_total (total number of domains)
+                                    f"{domain.c_evalue}",               # dom_c_value
+                                    f"{domain.i_evalue}",               # dom_i_value
+                                    f"{domain.score}",                  # dom_score
+                                    f"{domain.bias}",                   # dom_bias
+                                    f"{domain.alignment.hmm_from}",     # hmm_from (query from)
+                                    f"{domain.alignment.hmm_to}",       # hmm_to (query to)
+                                    f"{domain.alignment.target_from}",  # ali_from (target from)
+                                    f"{domain.alignment.target_to}",    # ali_to (target to)
+                                    f"{domain.env_from}",               # env_from
+                                    f"{domain.env_to}",                 # env_to
+                                    f"{MEA}",                           # acc
+                                    f"{t_desc.decode()}"                # description of target
+                                ]
+                                custom_out.write(("\t".join(outputline) + "\n").encode())
+
+        return self.hmmsearch_out_path
+
+    def run_pyhmmsearch_long_sequences(self):
+        """
+        Run hmmsearch for sequences longer than 100,000 residues.
+        """
+        if not os.path.exists(self.hmmsearch_out_path):
+            with pyhmmer.plan7.HMMPressedFile(self.hmm_file) as handle:
+                hmms = list(handle)
+
+            with pyhmmer.easel.SequenceFile(self.seq_file, digital=True) as handle:
+                db = list(handle)
+
+            # Create a LongTargetsPipeline instance (windowed search for long targets)
+            alphabet = pyhmmer.easel.Alphabet.amino()
+            pipeline = pyhmmer.plan7.LongTargetsPipeline(alphabet,
+                                                         block_length=262144,  # default block length
+                                                         F1=0.02, F2=0.003, F3=3e-05)
+
+            with open(self.hmmsearch_out_path, 'wb') as handle:
+                title_line = ["#t_name", "t_acc", "tlen", "q_name", "q_acc", "qlen", "E-value",
+                              "score", "bias", "dom_num", "dom_total", "dom_c_value", "dom_i_value", "dom_score",
+                              "dom_bias", "hmm_from", "hmm_to", "ali_from", "ali_to", "env_from", "env_to", "acc",
+                              "description of target"]
+                handle.write("\t".join(title_line).encode("utf-8") + b"\n")
+
+                for hmm in hmms:
+                    # iterate_hmm() takes an HMM query (iterate_seq() expects a sequence query).
+                    # NOTE: this mirrors the original iterative sketch; a single-pass
+                    # pipeline.search_hmm(hmm, db) may be the simpler choice here.
+                    iterator = pipeline.iterate_hmm(hmm, db)
+                    max_iterations = 10  # prevent an infinite loop if the search never converges
+                    for _ in range(max_iterations):
+                        _, hits, _, converged, _ = next(iterator)
+                        if converged:
+                            break
+
+                    # Write one row per hit from its best-scoring domain
+                    # (dom_num/dom_total and acc are placeholders in this mode)
+                    for hit in hits:
+                        best = hit.best_domain
+                        row = [hit.name.decode(), (hit.accession or b"").decode(), str(hit.length),
+                               hmm.name.decode(), (hmm.accession or b"").decode(), str(hmm.M),
+                               str(hit.evalue), str(hit.score), str(hit.bias),
+                               "1", "1", str(best.c_evalue), str(best.i_evalue),
+                               str(best.score), str(best.bias),
+                               str(best.alignment.hmm_from), str(best.alignment.hmm_to),
+                               str(best.alignment.target_from), str(best.alignment.target_to),
+                               str(best.env_from), str(best.env_to), "-",
+                               (hit.description or b"-").decode()]
+                        handle.write(("\t".join(row) + "\n").encode("utf-8"))
+
+        return self.hmmsearch_out_path
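For orientation, a minimal, hypothetical sketch of how this class might be driven (the paths, profile database name, and threshold values below are invented for the example; the keyword arguments mirror hmmsearch's -E, --domE, --incE, --incdomE and -Z options):

from pathlib import Path
from rdrpcatch.rdrpcatch_scripts.run_pyhmmer import pyhmmsearch

# Hypothetical inputs: a pressed profile database (hmmpress output) and a protein FASTA.
search = pyhmmsearch(
    hmmsearch_out_path=Path("results/hmmsearch.tsv"),  # also writes results/hmmsearch.custom.tsv
    seq_file="proteins.faa",
    hmm_file="rdrp_profiles.hmm",
    cpus=4,
    e=1e-5,        # -E: full-sequence reporting threshold
    incdomE=1e-5,  # --incdomE: per-domain inclusion threshold
    domE=1e-5,     # --domE: per-domain reporting threshold
    incE=1e-5,     # --incE: full-sequence inclusion threshold
    z=1000000,     # -Z: effective database size used for E-value calculation
)
out_path = search.run_pyhmmsearch()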
rdrpcatch/rdrpcatch_scripts/run_seqkit.py
@@ -0,0 +1,112 @@
+class seqkit:
+
+    def __init__(self, input_file,
+                 output_file,
+                 log_file,
+                 threads=4,
+                 logger=None):
+
+        self.input_file = input_file
+        self.output_file = output_file
+        self.log_file = log_file
+        self.threads = threads
+        self.logger = logger
+
+    def run_seqkit_seq(self, length_thr=400):
+        import subprocess
+
+        if self.logger:
+            self.logger.silent_log(f"Running seqkit seq on {self.input_file}")
+            self.logger.silent_log(f"Length threshold: {length_thr}")
+
+        # seqkit seq -m drops sequences shorter than length_thr
+        seqkit_cmd = ["seqkit",
+                      "seq",
+                      "--threads",
+                      str(self.threads),
+                      "-m",
+                      str(length_thr),
+                      str(self.input_file),
+                      "-o",
+                      str(self.output_file)]
+
+        if self.logger:
+            self.logger.silent_log(f"Running command: {' '.join(seqkit_cmd)}")
+
+        with open(self.log_file, 'w') as fout:
+            try:
+                subprocess.run(seqkit_cmd, stdout=fout, stderr=fout, shell=False, check=True)
+                if self.logger:
+                    self.logger.silent_log(f"Successfully filtered sequences to {self.output_file}")
+
+            except subprocess.CalledProcessError as e:
+                error_msg = f"Error running seqkit command: {' '.join(seqkit_cmd)}"
+                if self.logger:
+                    self.logger.silent_log(error_msg)
+                raise Exception(error_msg) from e
+
+        return str(self.output_file)
+
+    def run_seqkit_translate(self, gen_code=1, frame=6):
+        import os
+        import subprocess
+        from itertools import islice
+
+        if self.logger:
+            self.logger.silent_log(f"Running seqkit translate on {self.input_file}")
+            self.logger.silent_log(f"Output will be written to {self.output_file}")
+            self.logger.silent_log(f"Using genetic code {gen_code} and frame {frame}")
+
+        # seqkit translate: -T selects the genetic code, -f the frame(s) (6 = all six frames)
+        seqkit_cmd = ["seqkit",
+                      "translate",
+                      "--threads",
+                      str(self.threads),
+                      "--clean",
+                      "--append-frame",
+                      "-f",
+                      f"{frame}",
+                      "-T",
+                      f"{gen_code}",
+                      str(self.input_file),
+                      "-o",
+                      str(self.output_file)]
+
+        if self.logger:
+            self.logger.silent_log(f"Running command: {' '.join(seqkit_cmd)}")
+
+        with open(self.log_file, 'w') as fout:
+            try:
+                subprocess.run(seqkit_cmd, stdout=fout, stderr=fout, shell=False, check=True)
+                # Check that the output file exists and has content
+                if os.path.exists(self.output_file):
+                    with open(self.output_file, 'r') as f:
+                        # islice avoids a StopIteration if the file has fewer than 6 lines
+                        first_few_lines = list(islice(f, 6))
+                    if self.logger:
+                        self.logger.silent_log("First few lines of output:")
+                        for line in first_few_lines:
+                            self.logger.silent_log(f"{line.strip()}")
+                else:
+                    error_msg = f"Output file {self.output_file} was not created!"
+                    if self.logger:
+                        self.logger.silent_log(error_msg)
+                    raise Exception(error_msg)
+
+            except subprocess.CalledProcessError as e:
+                error_msg = f"Error running seqkit command: {' '.join(seqkit_cmd)}"
+                if self.logger:
+                    self.logger.silent_log(error_msg)
+                    self.logger.silent_log(f"Error details: {str(e)}")
+                raise Exception(error_msg) from e
+
+        return str(self.output_file)
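A hypothetical usage sketch for the wrapper above (file names are invented; the seqkit binary must be on PATH):

from rdrpcatch.rdrpcatch_scripts.run_seqkit import seqkit

sk = seqkit(input_file="contigs.fasta",
            output_file="contigs.min400.fasta",
            log_file="seqkit_seq.log",
            threads=4)
filtered = sk.run_seqkit_seq(length_thr=400)  # drop contigs shorter than 400 nt

translator = seqkit(input_file=filtered,
                    output_file="contigs.sixframe.faa",
                    log_file="seqkit_translate.log",
                    threads=4)
proteins = translator.run_seqkit_translate(gen_code=1, frame=6)  # six-frame translation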
rdrpcatch/rdrpcatch_scripts/utils.py
@@ -0,0 +1,414 @@
+import logging
+import os
+import time
+
+import needletail
+import polars as pl
+from rich.console import Console
+
+
+def write_combined_results_to_gff(output_file, combined_data, seq_type):
+    with open(output_file, 'w') as f:
+        f.write("##gff-version 3\n")
+        for row in combined_data.iter_rows(named=True):
+            record = convert_record_to_gff3_record(row, seq_type)
+            f.write(f"{record}\n")
+
+
+def convert_record_to_gff3_record(row, seq_type):  # for dict objects expected to be coerced into a GFF3 record
+    # taken from rolypoly https://code.jgi.doe.gov/UNeri/rolypoly/-/blob/main/src/rolypoly/commands/annotation/annotate_RNA.py
+
+    # try to identify a sequence ID column (query, qseqid, contig_id, contig, id, name)
+    if seq_type == 'nuc':
+        sequence_id_col = "Translated_contig_name (frame)"
+    else:
+        sequence_id_columns = ["sequence_id", 'query', 'qseqid', 'contig_id', 'contig', 'id', 'name', 'Contig_name']
+        sequence_id_col = next((col for col in sequence_id_columns if col in row.keys()), None)
+        if sequence_id_col is None:
+            raise ValueError(f"No sequence ID column found in row. Available columns: {list(row.keys())}")
+
+    # try to identify a score column (score, Score, bitscore, qscore, bit)
+    score_columns = ["score", "Score", "bitscore", "qscore", "bit", "bits"]
+    score_col = next((col for col in score_columns if col in row.keys()), "score")
+
+    # try to identify a source column (source, Source, db, DB)
+    source_columns = ["source", "Source", "db", "DB"]
+    source_col = next((col for col in source_columns if col in row.keys()), "source")
+
+    # try to identify a type column (type, Type, feature, Feature)
+    type_columns = ["type", "Type", "feature", "Feature"]
+    type_col = next((col for col in type_columns if col in row.keys()), "type")
+
+    # try to identify a strand column (strand, Strand, sense, Sense)
+    strand_columns = ["strand", "Strand", "sense", "Sense"]
+    strand_col = next((col for col in strand_columns if col in row.keys()), "strand")
+
+    # try to identify a phase column (phase, Phase)
+    phase_columns = ["phase", "Phase"]
+    phase_col = next((col for col in phase_columns if col in row.keys()), "phase")
+
+    # Build the GFF3 attributes string from all remaining columns
+    attrs = []
+    for key, value in row.items():
+        if key not in [sequence_id_col, source_col, score_col, type_col, strand_col, phase_col]:
+            attrs.append(f"{key}={value}")
+
+    # Get values, using defaults for missing columns
+    sequence_id = row[sequence_id_col]
+    source = row.get(source_col, "rdrpcatch")
+    score = row.get(score_col, "0")
+    feature_type = row.get(type_col, "feature")
+    strand = row.get(strand_col, "+")
+    phase = row.get(phase_col, ".")
+
+    # Format the GFF3 record
+    gff3_fields = [
+        sequence_id,
+        source,
+        feature_type,
+        str(row.get("RdRp_from(AA)", "1")),
+        str(row.get("RdRp_to(AA)", "1")),
+        str(score),
+        strand,
+        phase,
+        ";".join(attrs) if attrs else "."
+    ]
+
+    return "\t".join(gff3_fields)
+
+
+class Logger:
+    def __init__(self, log_file):
+        self.console = Console()
+        self.log_file = log_file
+        self.logger = logging.getLogger('Logger')
+        self.logger.setLevel(logging.INFO)
+        handler = logging.FileHandler(self.log_file)
+        handler.setLevel(logging.INFO)
+        formatter = logging.Formatter('%(asctime)s - %(message)s')
+        handler.setFormatter(formatter)
+        self.logger.addHandler(handler)
+
+    def loud_log(self, message):
+        # log to both the rich console and the log file
+        self.console.log(message)
+        self.logger.info(message)
+
+    def silent_log(self, message):
+        # log to the log file only
+        self.logger.info(message)
+
+    def start_timer(self):
+        self.start_time = time.time()
+        return self.start_time
+
+    def stop_timer(self, start_time, verbose=None):
+        end_time = time.time()
+        raw_execution_time = end_time - start_time
+
+        # Break the elapsed time into hours, minutes, seconds and milliseconds
+        hours = int(raw_execution_time // 3600)
+        minutes = int((raw_execution_time % 3600) // 60)
+        seconds = int(raw_execution_time % 60)
+        milliseconds = int((raw_execution_time % 1) * 1000)
+
+        return f"{hours} Hours {minutes} Minutes {seconds} Seconds {milliseconds} ms"
+
+
+class fasta_checker:
+
+    def __init__(self, fasta_file, logger=None):
+        self.fasta_file = fasta_file
+        self.logger = logger
+
+    def check_fasta_validity(self):
+        reader = needletail.parse_fastx_file(self.fasta_file)
+        try:
+            next(reader)
+            if self.logger:
+                self.logger.silent_log(f"Successfully validated fasta file: {self.fasta_file}")
+            return True
+        except StopIteration:
+            error_msg = f"Invalid or empty fasta file: {self.fasta_file}"
+            if self.logger:
+                self.logger.silent_log(error_msg)
+            raise Exception(error_msg)
+        except Exception as e:
+            error_msg = f"Invalid fasta file: {self.fasta_file}, error: {str(e)}"
+            if self.logger:
+                self.logger.silent_log(error_msg)
+            raise Exception(error_msg)
+
+    def read_fasta(self):
+        fasta_dict = {}
+        reader = needletail.parse_fastx_file(self.fasta_file)
+        for record in reader:
+            header = f">{record.id}"
+            fasta_dict[header] = record.seq
+        if self.logger:
+            self.logger.silent_log(f"Read {len(fasta_dict)} sequences from {self.fasta_file}")
+        return fasta_dict
+
+    def check_seq_type(self):
+        # the sequence type is decided from the first record in the file
+        reader = needletail.parse_fastx_file(self.fasta_file)
+        dna_set = {'A', 'T', 'G', 'C'}
+        dna_set_ambiguous = {'A', 'T', 'G', 'C', 'N'}
+        protein_set = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'X'}
+
+        for record in reader:
+            seq = record.seq.upper()
+            if set(seq).issubset(dna_set):
+                if self.logger:
+                    self.logger.silent_log("Detected nucleotide sequence (strict DNA alphabet)")
+                return 'nuc'
+            elif set(seq).issubset(dna_set_ambiguous):
+                if self.logger:
+                    self.logger.silent_log("Detected nucleotide sequence (ambiguous DNA alphabet)")
+                return 'nuc'
+            elif set(seq).issubset(protein_set):
+                if self.logger:
+                    self.logger.silent_log("Detected protein sequence")
+                return 'prot'
+            else:
+                error_msg = f"Invalid sequence type in fasta file: {self.fasta_file} for sequence: {record.id} with alphabet: {set(seq)}"
+                if self.logger:
+                    self.logger.silent_log(error_msg)
+                raise Exception(error_msg)
+
+    def check_seq_length(self, max_len):
+        if not os.path.isfile(self.fasta_file):
+            error_msg = f"The file '{self.fasta_file}' does not exist."
+            if self.logger:
+                self.logger.silent_log(error_msg)
+            raise FileNotFoundError(error_msg)
+
+        reader = needletail.parse_fastx_file(self.fasta_file)
+        for record in reader:
+            if len(record.seq) > max_len:
+                error_msg = f"Sequence ID: {record.id}, Length: {len(record.seq)}, " \
+                            f"Exceeds maximum allowed length: {max_len:,}. Please check the input file, " \
+                            f"as this will cause issues with the pyHMMER search."
+                if self.logger:
+                    self.logger.silent_log(error_msg)
+                raise ValueError(error_msg)
+        if self.logger:
+            self.logger.silent_log(f"All sequences are within the length limit of {max_len:,}")
+        return True
+
+
+class fasta:
+
+    def __init__(self, fasta_file, logger=None):
+        self.fasta_file = fasta_file
+        self.logger = logger
+
+    def extract_contigs(self, contig_list):
+        """
+        Extract contigs from a fasta file based on a list of contig names.
+
+        :param contig_list: List of contig names to extract.
+        :type contig_list: list
+        :return: Dictionary with contig names as keys and sequences as values.
+        :rtype: dict
+        """
+        contig_dict = {}
+        reader = needletail.parse_fastx_file(self.fasta_file)
+        for record in reader:
+            # pyhmmer uses the first word of the header as the ID, so split on whitespace
+            if record.id.strip().split(" ")[0] in contig_list:
+                contig_dict[record.id] = record.seq
+        return contig_dict
+
+    def write_fasta(self, contig_dict, outfile):
+        """
+        Write a dictionary of contigs to a fasta file.
+
+        :param contig_dict: Dictionary with contig names as keys and sequences as values.
+        :type contig_dict: dict
+        :param outfile: Path to the output file.
+        :type outfile: str
+        :return: None
+        """
+        with open(outfile, 'w') as out_handle:
+            for contig_name, seq in contig_dict.items():
+                out_handle.write(f">{contig_name}\n{seq}\n")
+
+    def write_fasta_coords(self, rdrp_coords_list, outfile, seq_type):
+        """
+        Write a list of RdRp coordinates to a fasta file.
+
+        :param rdrp_coords_list: List of tuples containing contig name and RdRp coordinates.
+        :type rdrp_coords_list: list
+        :param outfile: Path to the output file.
+        :type outfile: str
+        :param seq_type: Type of sequence (prot or nuc).
+        :type seq_type: str
+        :return: None
+        """
+        if self.logger:
+            self.logger.silent_log(f"Processing {len(rdrp_coords_list)} coordinates")
+            self.logger.silent_log(f"First few coordinates: {rdrp_coords_list[:3]}")
+
+        reader = needletail.parse_fastx_file(self.fasta_file)
+        matches_found = 0
+        with open(outfile, 'w') as out_handle:
+            for record in reader:
+                # pyhmmer uses the first word of the header as the ID, so split on whitespace
+                record_id = record.id.strip().split(" ")[0]
+                if self.logger:
+                    self.logger.silent_log(f"Processing record with ID: '{record_id}'")
+                for contig_name, rdrp_from, rdrp_to in rdrp_coords_list:
+                    contig_name = str(contig_name).strip()
+                    if self.logger:
+                        self.logger.silent_log(f"Comparing record '{record_id}' with contig '{contig_name}'")
+                    if record_id == contig_name:
+                        matches_found += 1
+                        # coordinates are treated as 1-based and inclusive, hence the -1 on the start
+                        seq = record.seq[rdrp_from - 1:rdrp_to]
+                        fasta_header = f"{record_id}_RdRp_{rdrp_from}-{rdrp_to}"
+                        out_handle.write(f">{fasta_header}\n{seq}\n")
+                        if self.logger:
+                            self.logger.silent_log(f"Match found! Writing sequence of length {len(seq)}")
+                    else:
+                        if self.logger:
+                            self.logger.silent_log(f"No match - lengths: {len(record_id)}|{len(contig_name)}, "
+                                                   f"record_id bytes: {record_id.encode()}, contig bytes: {contig_name.encode()}")
+
+        if self.logger:
+            self.logger.silent_log(f"Total matches found: {matches_found}")
+
+
+class mmseqs_parser:
+
+    def __init__(self, mmseqs_tax_out_file, mmseqs_s_out_file):
+        self.mmseqs_tax_out_file = mmseqs_tax_out_file
+        self.mmseqs_s_out_file = mmseqs_s_out_file
+
+    def parse_mmseqs_tax_lca(self):
+        """
+        Parse the MMseqs2 taxonomy output file.
+
+        :return: Dictionary with contig names as keys and taxonomy lineages as values.
+        :rtype: dict
+        """
+        with open(self.mmseqs_tax_out_file, 'r') as f:
+            lca_dict = {}
+            for line in f:
+                fields = line.strip().split('\t')
+                contig = fields[0]
+                # the lineage column shifts depending on how many columns were reported
+                if len(fields) < 5:
+                    lca_lineage = fields[3]
+                else:
+                    lca_lineage = fields[4]
+                lca_dict[contig] = lca_lineage
+        return lca_dict
+
+    def parse_mmseqs_e_search_tophit(self):
+        """
+        Parse the MMseqs2 easy-search output file.
+
+        :return: Dictionary with contig names as keys and lists of hit information as values.
+        :rtype: dict
+        """
+        with open(self.mmseqs_s_out_file, 'r') as f:
+            tophit_dict = {}
+            for line in f:
+                fields = line.strip().split('\t')
+                contig = fields[0]
+                # keep only the first (top) hit per contig
+                if contig not in tophit_dict:
+                    target = fields[1]
+                    fident = fields[2]
+                    alnlen = fields[3]
+                    evalue = fields[10]
+                    bits = fields[11]
+                    qcov = fields[12]
+                    lineage = fields[14]
+                    tophit_dict[contig] = [target, fident, alnlen, evalue, bits, qcov, lineage]
+        return tophit_dict
+
+    def tax_to_rdrpcatch(self, rdrpcatch_out, extended_rdrpcatch_out, seq_type):
+        """
+        Add taxonomy information to the RdRpCATCH output file.
+
+        :param rdrpcatch_out: Path to the RdRpCATCH output file.
+        :type rdrpcatch_out: str
+        :param extended_rdrpcatch_out: Path to the extended RdRpCATCH output file.
+        :type extended_rdrpcatch_out: str
+        :param seq_type: Type of sequence (prot or nuc).
+        :type seq_type: str
+        :return: None
+        """
+        lca_dict = self.parse_mmseqs_tax_lca()
+        tophit_dict = self.parse_mmseqs_e_search_tophit()
+
+        df = pl.read_csv(rdrpcatch_out, separator='\t')
+
+        # drop columns that are not needed
+        df = df.drop(["Best_hit_norm_bitscore_profile", "Best_hit_norm_bitscore_contig",
+                      "Best_hit_ID_score"])
+
+        # For translated sequences, look hits up by the frame-specific name
+        lookup_col = 'Translated_contig_name (frame)' if seq_type == 'nuc' else 'Contig_name'
+
+        # Gather per-row taxonomy info once, then expand into one column per field
+        lca_values = [lca_dict.get(row[lookup_col], '') for row in df.iter_rows(named=True)]
+        tophits = [tophit_dict.get(row[lookup_col], [''] * 7) for row in df.iter_rows(named=True)]
+
+        df = df.with_columns([
+            pl.Series(name='MMseqs_Taxonomy_2bLCA', values=lca_values),
+            pl.Series(name='MMseqs_TopHit_accession', values=[h[0] for h in tophits]),
+            pl.Series(name='MMseqs_TopHit_fident', values=[h[1] for h in tophits]),
+            pl.Series(name='MMseqs_TopHit_alnlen', values=[h[2] for h in tophits]),
+            pl.Series(name='MMseqs_TopHit_eval', values=[h[3] for h in tophits]),
+            pl.Series(name='MMseqs_TopHit_bitscore', values=[h[4] for h in tophits]),
+            pl.Series(name='MMseqs_TopHit_qcov', values=[h[5] for h in tophits]),
+            pl.Series(name='MMseqs_TopHit_lineage', values=[h[6] for h in tophits])
+        ])
+
+        # Sort by Best_hit_bitscore
+        sorted_df = df.sort("Best_hit_bitscore", descending=True)
+        sorted_df.write_csv(extended_rdrpcatch_out, separator='\t')
+
+
+class file_handler:
+
+    def __init__(self, file):
+        self.file = file
+
+    def check_file_exists(self):
+        if not os.path.exists(self.file):
+            raise Exception(f"File does not exist: {self.file}")
+        return True
+
+    def delete_file(self):
+        os.remove(self.file)
+        return True
+
+    def check_file_size(self):
+        return os.path.getsize(self.file)
+
+    def check_file_extension(self):
+        return os.path.splitext(self.file)[1]
+
+    def get_file_name(self):
+        return os.path.basename(self.file)
+
+    def get_file_dir(self):
+        return os.path.dirname(self.file)
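To illustrate the column auto-detection in convert_record_to_gff3_record, a hypothetical row (Contig_name, score, and the coordinate keys RdRp_from(AA)/RdRp_to(AA) are names the function actually looks for; Profile_name is an invented extra column that lands in the attributes field):

from rdrpcatch.rdrpcatch_scripts.utils import convert_record_to_gff3_record

row = {
    "Contig_name": "contig_1",    # picked up as the sequence ID column
    "RdRp_from(AA)": 12,          # GFF3 start
    "RdRp_to(AA)": 310,           # GFF3 end
    "score": 215.4,               # picked up as the score column
    "Profile_name": "RVMT_RdRp",  # remaining columns become key=value attributes
}
print(convert_record_to_gff3_record(row, seq_type="prot"))
# tab-separated: contig_1  rdrpcatch  feature  12  310  215.4  +  .
#                RdRp_from(AA)=12;RdRp_to(AA)=310;Profile_name=RVMT_RdRp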