rdrpcatch 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdrpcatch/__init__.py +0 -0
- rdrpcatch/cli/__init__.py +0 -0
- rdrpcatch/cli/args.py +358 -0
- rdrpcatch/rdrpcatch_scripts/__init__.py +0 -0
- rdrpcatch/rdrpcatch_scripts/fetch_dbs.py +302 -0
- rdrpcatch/rdrpcatch_scripts/format_pyhmmer_out.py +589 -0
- rdrpcatch/rdrpcatch_scripts/gui.py +256 -0
- rdrpcatch/rdrpcatch_scripts/mmseqs_tax.py +100 -0
- rdrpcatch/rdrpcatch_scripts/paths.py +162 -0
- rdrpcatch/rdrpcatch_scripts/plot.py +165 -0
- rdrpcatch/rdrpcatch_scripts/run_pyhmmer.py +155 -0
- rdrpcatch/rdrpcatch_scripts/run_seqkit.py +112 -0
- rdrpcatch/rdrpcatch_scripts/utils.py +414 -0
- rdrpcatch/rdrpcatch_wrapper.py +666 -0
- rdrpcatch-0.0.1.dist-info/METADATA +223 -0
- rdrpcatch-0.0.1.dist-info/RECORD +19 -0
- rdrpcatch-0.0.1.dist-info/WHEEL +4 -0
- rdrpcatch-0.0.1.dist-info/entry_points.txt +2 -0
- rdrpcatch-0.0.1.dist-info/licenses/LICENCE +9 -0
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
import tkinter as tk
|
|
2
|
+
from tkinter import filedialog
|
|
3
|
+
from tkinter import ttk
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
from tkinter import messagebox
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class colabscanner_gui:
    """Tkinter front-end that collects run options and launches the scanner script.

    The window gathers an input file, an output location, an HMM directory,
    a database selection and search parameters, then runs the command-line
    script as a child process and streams its output into a text box.
    """

    def __init__(self):
        """Create the main window and every widget in it."""
        self.root = tk.Tk()
        self.root.title("RdRpCATCH")
        self.root.geometry("1000x800")

        style = ttk.Style(self.root)
        # set theme style
        style.theme_use("clam")

        # Input file selection
        self.input_frame = ttk.Frame(self.root)
        self.input_frame.grid(row=0, column=0, columnspan=3, padx=5, pady=5)

        self.input_label = ttk.Label(self.input_frame, text="Input File:")
        self.input_label.pack(side=tk.LEFT)

        self.input_entry = ttk.Entry(self.input_frame, width=50)
        self.input_entry.pack(side=tk.LEFT, padx=5)

        self.input_button = ttk.Button(self.input_frame, text="Browse", command=self.browse_input)
        self.input_button.pack(side=tk.LEFT)

        # Parent directory selection
        self.parent_dir_frame = ttk.Frame(self.root)
        self.parent_dir_frame.grid(row=1, column=0, columnspan=3, padx=5, pady=5)

        self.parent_dir_label = ttk.Label(self.parent_dir_frame, text="Parent Directory:")
        self.parent_dir_label.pack(side=tk.LEFT)

        self.parent_dir_entry = ttk.Entry(self.parent_dir_frame, width=50)
        self.parent_dir_entry.pack(side=tk.LEFT, padx=5)

        self.parent_dir_button = ttk.Button(self.parent_dir_frame, text="Browse", command=self.browse_parent_dir)
        self.parent_dir_button.pack(side=tk.LEFT)

        # Output name entry
        self.output_name_frame = ttk.Frame(self.root)
        self.output_name_frame.grid(row=2, column=0, columnspan=3, padx=5, pady=5)

        self.output_name_label = ttk.Label(self.output_name_frame, text="Output Name:")
        self.output_name_label.pack(side=tk.LEFT)

        self.output_name_entry = ttk.Entry(self.output_name_frame, width=50)
        self.output_name_entry.pack(side=tk.LEFT, padx=5)

        # HMM directory selection
        self.hmm_dir_frame = ttk.Frame(self.root)
        self.hmm_dir_frame.grid(row=3, column=0, columnspan=3, padx=5, pady=5)

        self.hmm_dir_label = ttk.Label(self.hmm_dir_frame, text="HMM Directory:")
        self.hmm_dir_label.pack(side=tk.LEFT)

        self.hmm_dir_entry = ttk.Entry(self.hmm_dir_frame, width=50)
        self.hmm_dir_entry.pack(side=tk.LEFT, padx=5)

        self.hmm_dir_button = ttk.Button(self.hmm_dir_frame, text="Browse", command=self.browse_hmm_dir)
        self.hmm_dir_button.pack(side=tk.LEFT)

        # Database selection: one checkbox per database, laid out three per row
        self.db_frame = ttk.LabelFrame(self.root, text="Select Databases")
        self.db_frame.grid(row=4, column=0, columnspan=3, padx=5, pady=10)

        self.databases = {
            'RVMT': tk.BooleanVar(),
            'NeoRdRp': tk.BooleanVar(),
            'NeoRdRp.2.1': tk.BooleanVar(),
            'TSA_Olendraite_fam': tk.BooleanVar(),
            'TSA_Olendraite_gen': tk.BooleanVar(),
            'RDRP-scan': tk.BooleanVar(),
            'Lucaprot': tk.BooleanVar()
        }

        for i, (db_name, var) in enumerate(self.databases.items()):
            ttk.Checkbutton(self.db_frame, text=db_name, variable=var).grid(row=i//3, column=i%3, padx=5, pady=2)

        # HMMsearch parameters
        self.hmmsearch_frame = ttk.LabelFrame(self.root, text="HMMsearch Parameters")
        self.hmmsearch_frame.grid(row=6, column=1, padx=5, pady=10)

        self.hmmsearch_label = ttk.Label(self.hmmsearch_frame, text="HMMsearch Parameters:")
        self.hmmsearch_label.pack(anchor=tk.W)

        self.evalue_var = tk.StringVar(value='1e-05')
        self.inc_evalue_var = tk.StringVar(value='1e-05')
        self.dom_evalue_var = tk.StringVar(value='1e-05')
        self.incdom_evalue_var = tk.StringVar(value='1e-05')
        self.z_value_var = tk.IntVar(value=1000000)
        self.cpus_var = tk.IntVar(value=1)

        hmmsearch_params = [
            ('E-value threshold', self.evalue_var),
            ('Inclusion E-value threshold', self.inc_evalue_var),
            ('Domain E-value threshold', self.dom_evalue_var),
            ('Inclusion domain E-value threshold', self.incdom_evalue_var),
            ('z-value', self.z_value_var),
            ('Number of CPUs to use', self.cpus_var)
        ]
        self._add_param_rows(self.hmmsearch_frame, hmmsearch_params)

        # seqkit translate parameters
        self.seqkit_frame = ttk.Frame(self.root)
        self.seqkit_frame.grid(row=6, column=2, padx=5, pady=10)
        self.gen_code_var = tk.IntVar(value=1)
        self.frame_var = tk.IntVar(value=6)
        self.seqkit_translate_label = ttk.Label(self.seqkit_frame, text="Seqkit Translate Parameters:")
        self.seqkit_translate_label.pack(anchor=tk.W)
        seqkit_transl_params = [('Genetic code', self.gen_code_var), ('Frame (6: All frames)', self.frame_var)]
        self._add_param_rows(self.seqkit_frame, seqkit_transl_params)

        # Run button
        self.run_button = ttk.Button(self.root, text="Run Script", command=self.run_script)
        self.run_button.grid(row=7, column=1, padx=5, pady=5)

        # Status label (placed below the progress box, which sits on row 8)
        self.status_label = tk.Label(self.root, text="", wraplength=400)
        self.status_label.grid(row=9, column=0, columnspan=3, padx=5, pady=5)

        # Progress text box (wider version)
        self.progress_frame = ttk.Frame(self.root)
        self.progress_frame.grid(row=8, column=0, columnspan=3, padx=5, pady=10)

        self.progress_label = ttk.Label(self.progress_frame, text="Progress:")
        self.progress_label.pack(anchor=tk.W)

        # Read-only text area; it is unlocked only while text is being written.
        self.progress_text = tk.Text(self.progress_frame, height=15, width=120)
        self.progress_text.config(state='disabled')

        self.scrollbar = ttk.Scrollbar(self.progress_frame)
        self.scrollbar_x = ttk.Scrollbar(self.progress_frame, orient=tk.HORIZONTAL)

        # The packer hands out space in packing order, so the scroll bars must
        # claim the right and bottom edges before the text widget takes the
        # rest; packing the text first leaves the vertical bar as a stub.
        self.scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.scrollbar_x.pack(side=tk.BOTTOM, fill=tk.X)
        self.progress_text.pack(fill=tk.BOTH, expand=True)

        self.progress_text.config(xscrollcommand=self.scrollbar_x.set)
        self.scrollbar_x.config(command=self.progress_text.xview)
        self.progress_text.config(yscrollcommand=self.scrollbar.set)
        self.scrollbar.config(command=self.progress_text.yview)

    @staticmethod
    def _add_param_rows(parent, params):
        """Add one "label + entry" row to ``parent`` for each (name, variable) pair."""
        for param_name, param_var in params:
            row = ttk.Frame(parent)
            row.pack(fill=tk.X, padx=5, pady=2)
            ttk.Label(row, text=param_name).pack(side=tk.LEFT)
            ttk.Entry(row, textvariable=param_var, width=15).pack(side=tk.RIGHT)

    @staticmethod
    def _set_entry(entry, value):
        """Replace the content of ``entry`` with ``value``; keep it if ``value`` is empty.

        The file dialogs return an empty value when the user cancels, in
        which case the previously entered path is left untouched.
        """
        if not value:
            return
        entry.delete(0, tk.END)
        entry.insert(0, value)

    def browse_input(self):
        """Let the user pick the input file and show it in the input entry."""
        self._set_entry(self.input_entry, filedialog.askopenfilename())

    def browse_parent_dir(self):
        """Let the user pick the parent directory for the output."""
        self._set_entry(self.parent_dir_entry, filedialog.askdirectory())

    def browse_hmm_dir(self):
        """Let the user pick the directory holding the HMM databases."""
        self._set_entry(self.hmm_dir_entry, filedialog.askdirectory())

    def run_script(self):
        """Validate the form, launch the scanner script and stream its output.

        Validation problems are reported through the status label and abort
        the run. Output of the child process (stdout and stderr merged) is
        appended line by line to the progress box; the status label then
        reports success or failure based on the exit code.
        """
        input_file = self.input_entry.get()
        parent_dir = self.parent_dir_entry.get()
        output_name = self.output_name_entry.get()
        hmm_dir = self.hmm_dir_entry.get()

        selected_dbs = [db for db, var in self.databases.items() if var.get()]

        if not os.path.isfile(input_file):
            self.status_label.config(text="Error: Input file does not exist.", fg="red")
            return

        if not os.path.isdir(parent_dir):
            self.status_label.config(text="Error: Parent directory does not exist.", fg="red")
            return

        output_dir = os.path.join(parent_dir, output_name)

        try:
            os.makedirs(output_dir, exist_ok=True)
        except Exception as e:
            self.status_label.config(text=f"Error creating output directory: {str(e)}", fg="red")
            return

        # No box ticked means "use every database".
        db_options = ",".join(selected_dbs) if selected_dbs else "all"

        try:
            # IntVar.get() raises TclError when the entry does not hold a
            # number, so the integer fields are read inside a guard.
            hmmsearch_args = [
                '-e', self.evalue_var.get(),
                '-incE', self.inc_evalue_var.get(),
                '-domE', self.dom_evalue_var.get(),
                '-incdomE', self.incdom_evalue_var.get(),
                '-z', str(self.z_value_var.get()),
                '-cpus', str(self.cpus_var.get())
            ]
        except tk.TclError as e:
            self.status_label.config(text=f"Error: invalid numeric parameter: {str(e)}", fg="red")
            return

        # NOTE(review): the seqkit translate fields (genetic code, frame) are
        # shown in the form but are not forwarded to the script - confirm
        # whether that is intended.
        # NOTE(review): "colabscanner.py" is resolved relative to the current
        # working directory - confirm this is still the right entry point.
        command = ["python3", "colabscanner.py", "-i", input_file, "-o", output_dir, "-hmm_dir", hmm_dir, "-dbs",
                   db_options] + hmmsearch_args

        try:
            # Clear previous progress
            self.clear_progress()

            # Run the script; the context manager closes the pipe and waits
            # for the child to exit, so returncode is set afterwards.
            # NOTE(review): reading here keeps the Tk event loop busy until
            # the child finishes.
            with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                                  text=True, bufsize=1) as result:
                # Read and display output line by line
                for line in result.stdout:
                    self.update_progress(line.strip())

            if result.returncode == 0:
                self.status_label.config(text="Script executed successfully.", fg="green")
            else:
                self.status_label.config(text="Script execution failed.", fg="red")

        except Exception as e:
            messagebox.showerror("Error", f"An unexpected error occurred:\n{str(e)}")
            print(f"Unexpected error: {e}")

    def clear_progress(self):
        """Empty the progress box."""
        self.progress_text.config(state='normal')
        self.progress_text.delete('1.0', tk.END)
        self.progress_text.config(state='disabled')

    def update_progress(self, message):
        """Append ``message`` as a new line to the progress box and scroll to it."""
        self.progress_text.config(state='normal')
        self.progress_text.insert(tk.END, message + "\n")
        self.progress_text.yview(tk.END)
        self.progress_text.config(state='disabled')
        # Redraw now: the caller is still inside a callback.
        self.root.update_idletasks()

    def run(self):
        """Enter the Tk main loop; returns when the window is closed."""
        self.root.mainloop()
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
if __name__ == "__main__":
    # Script entry point: build the window, then block in the Tk main loop.
    colabscanner_gui().run()
|
|
255
|
+
|
|
256
|
+
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class mmseqs:
    """Runs MMseqs2 command-line workflows for one query FASTA file.

    Each ``run_*`` method builds an ``mmseqs`` argument list from the values
    given to the constructor and executes it, writing the tool's stdout and
    stderr to ``log_file`` (the file is overwritten on every run).
    """

    def __init__(self, fasta_fn, mmseqs_db, mmseqs_out_prefix, outdir_path, sens, cpus, log_file):
        """Store the run settings.

        :param fasta_fn: query FASTA file
        :param mmseqs_db: target MMseqs2 database
        :param mmseqs_out_prefix: output prefix handed to mmseqs
        :param outdir_path: output directory
        :param sens: search sensitivity value
        :param cpus: number of threads
        :param log_file: file receiving stdout and stderr of mmseqs
        """
        self.fasta_fn = fasta_fn
        self.mmseqs_db = mmseqs_db
        self.out_prefix = mmseqs_out_prefix
        self.outdir_path = outdir_path
        self.sens = sens
        self.cpus = cpus
        self.log_file = log_file

    def _run_logged(self, cmd, label):
        """Execute ``cmd`` with its output redirected to ``self.log_file``.

        Every argument is converted to ``str`` first, because ``subprocess``
        rejects non-string items such as ints or floats.

        :param cmd: argument list, starting with the executable name
        :param label: workflow name used in the error message
        :raises Exception: if mmseqs exits with a non-zero status; the
            original ``CalledProcessError`` is attached as the cause
        """
        cmd = [str(arg) for arg in cmd]
        try:
            with open(self.log_file, 'w') as fout:
                subprocess.run(cmd, stdout=fout, stderr=fout, shell=False, check=True)
        except subprocess.CalledProcessError as e:
            cmd_str = ' '.join(cmd)
            raise Exception(f"Error running mmseqs {label} command: {cmd_str}") from e

    def run_mmseqs_easy_tax_lca(self):
        """Run mmseqs easy-taxonomy with lineage output and alignment mode 3."""
        mmseqs_easy_tax_cmd = ["mmseqs",
                               "easy-taxonomy",
                               self.fasta_fn,
                               self.mmseqs_db,
                               self.out_prefix,
                               f"{str(self.outdir_path)}/tmp",
                               "--tax-lineage",
                               "1",
                               "--alignment-mode",
                               "3",
                               "-s",
                               self.sens,
                               "--threads",
                               self.cpus,
                               ]
        self._run_logged(mmseqs_easy_tax_cmd, "easy-tax")

    def run_mmseqs_easy_tax_tophit(self):
        """Run mmseqs easy-taxonomy with ``--lca-mode 4``.

        The sensitivity is passed through ``-s`` like in
        :meth:`run_mmseqs_easy_tax_lca`; a bare value would be taken as an
        extra positional argument.
        """
        # NOTE(review): the temporary directory is the relative path "tmp",
        # unlike run_mmseqs_easy_tax_lca which uses <outdir_path>/tmp -
        # confirm which one is intended.
        mmseqs_easy_tax_cmd = ["mmseqs",
                               "easy-taxonomy",
                               self.fasta_fn,
                               self.mmseqs_db,
                               self.out_prefix,
                               "tmp",
                               "--tax-lineage",
                               "1",
                               "-s",
                               self.sens,
                               "--threads",
                               self.cpus,
                               "--lca-mode",
                               "4",
                               ]
        self._run_logged(mmseqs_easy_tax_cmd, "easy-tax")

    def run_mmseqs_e_search(self):
        """Run mmseqs easy-search with a tabular output that includes the lineage."""
        # NOTE(review): here outdir_path is the third positional argument and
        # <out_prefix>/tmp the fourth, the reverse of the roles these two
        # attributes have in run_mmseqs_easy_tax_lca - verify against the caller.
        mmseqs_e_search_cmd = ["mmseqs",
                               "easy-search",
                               self.fasta_fn,
                               self.mmseqs_db,
                               self.outdir_path,
                               f"{str(self.out_prefix)}/tmp",
                               "--start-sens",
                               self.sens,
                               "--threads",
                               self.cpus,
                               "--format-output",
                               "query,target,fident,alnlen,mismatch,gapopen,"
                               "qstart,qend,tstart,tend,evalue,bits,qcov,tcov,taxlineage",
                               "--sort-results",
                               "1",
                               ]
        self._run_logged(mmseqs_e_search_cmd, "easy-search")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class classproperty(property):
    """Property whose getter is called with the owning class.

    Reading the attribute, on the class or on an instance, calls the
    wrapped function with the class as its only argument.
    """

    def __get__(self, cls, owner):
        # Bind the stored getter to ``owner`` as a classmethod, then call it.
        bound_getter = classmethod(self.fget).__get__(None, owner)
        return bound_getter()
|
|
8
|
+
|
|
9
|
+
@dataclass
class rdrpcatch_input:
    """Default input locations, all derived from ``source_dir``."""

    # TODO: Change this line for final version
    # Directory three levels above this file.
    source_dir: Path = Path(__file__).parents[2]

    @classproperty
    def db_dir(cls):
        """``DBs`` directory under ``source_dir``."""
        root = cls.source_dir
        return root / 'DBs'

    @classproperty
    def hmm_dbs_dir(cls):
        """``DBs/hmm_dbs`` directory under ``source_dir``."""
        root = cls.source_dir
        return root / 'DBs' / 'hmm_dbs'

    @classproperty
    def test_dir(cls):
        """``test`` directory under ``source_dir``."""
        root = cls.source_dir
        return root / 'test'

    @classproperty
    def input_fasta(cls):
        """``input.fasta`` directly under ``source_dir``."""
        root = cls.source_dir
        return root / "input.fasta"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class rdrpcatch_output:
    """Output file layout of one run.

    Every location is derived from ``output_dir`` and file names start with
    ``prefix``. Nothing is created on disk; the members only build paths.
    """

    prefix: str
    output_dir: Path

    # ----- intermediate files (below <output_dir>/tmp) -----

    @property
    def tmp_dir(self):
        """Directory holding all intermediate files."""
        return self.output_dir / "tmp"

    @property
    def log_dir(self):
        """Directory holding the log files."""
        dirname = f"{self.prefix}_logs"
        return self.tmp_dir / dirname

    @property
    def log_file(self):
        """Main log file of the run."""
        filename = f"{self.prefix}_rdrpcatch.log"
        return self.log_dir / filename

    @property
    def hmm_output_dir(self):
        """Directory for raw hmmsearch output."""
        return self.tmp_dir / "hmm_output"

    def hmm_output_path(self, db_name):
        """Raw hmmsearch output file for database ``db_name``."""
        filename = f"{self.prefix}_{db_name}_hmmsearch_output.txt"
        return self.hmm_output_dir / filename

    @property
    def formatted_hmm_output_dir(self):
        """Directory for formatted hmmsearch output."""
        return self.tmp_dir / "formatted_hmm_output"

    def formatted_hmm_output_path(self, db_name):
        """Formatted hmmsearch output file for database ``db_name``."""
        filename = f"{self.prefix}_{db_name}_hmm_output_formatted.txt"
        return self.formatted_hmm_output_dir / filename

    @property
    def best_hit_dir(self):
        """Directory for best-hit tables."""
        return self.tmp_dir / "best_hit_hmm_output"

    def best_hit_path(self, db_name):
        """Best-hit table for database ``db_name``."""
        filename = f"{self.prefix}_{db_name}_hmm_output_best_hit.txt"
        return self.best_hit_dir / filename

    @property
    def seqkit_seq_output_dir(self):
        """Directory for ``seqkit seq`` output."""
        return self.tmp_dir / "seqkit_seq_output"

    @property
    def seqkit_seq_output_path(self):
        """FASTA file written by ``seqkit seq``."""
        filename = f"{self.prefix}_seqkit_seq_output.fasta"
        return self.seqkit_seq_output_dir / filename

    @property
    def seqkit_translate_output_dir(self):
        """Directory for ``seqkit translate`` output."""
        return self.tmp_dir / "seqkit_translate_output"

    @property
    def seqkit_translate_output_path(self):
        """FASTA file written by ``seqkit translate``."""
        filename = f"{self.prefix}_seqkit_translate_output.fasta"
        return self.seqkit_translate_output_dir / filename

    @property
    def tsv_outdir(self):
        """Directory for intermediate TSV tables."""
        return self.tmp_dir / "tsv_files"

    @property
    def combined_tsv_path(self):
        """Combined TSV table."""
        filename = f"{self.prefix}_combined.tsv"
        return self.tsv_outdir / filename

    @property
    def rdrpcatch_output(self):
        """Result table stored with the intermediate TSV files."""
        filename = f"{self.prefix}_rdrpcatch_output.tsv"
        return self.tsv_outdir / filename

    @property
    def mmseqs_tax_output_dir(self):
        """Directory for mmseqs taxonomy output."""
        return self.tmp_dir / "mmseqs_tax_output"

    @property
    def mmseqs_tax_output_prefix(self):
        """Output prefix for the mmseqs taxonomy run."""
        stem = f"{self.prefix}_mmseqs_tax"
        return self.mmseqs_tax_output_dir / stem

    @property
    def mmseqs_tax_log_path(self):
        """Log file of the mmseqs taxonomy run."""
        filename = f"{self.prefix}_mmseqs_tax.log"
        return self.log_dir / filename

    @property
    def mmseqs_tax_output_lca_path(self):
        """LCA table of the mmseqs taxonomy run."""
        filename = f"{self.prefix}_mmseqs_tax_lca.tsv"
        return self.mmseqs_tax_output_dir / filename

    @property
    def mmseqs_e_search_output_dir(self):
        """Directory for mmseqs easy-search output."""
        return self.tmp_dir / "mmseqs_e_search_output"

    @property
    def mmseqs_e_search_log_path(self):
        """Log file of the mmseqs easy-search run."""
        filename = f"{self.prefix}_mmseqs_e_search.log"
        return self.log_dir / filename

    @property
    def mmseqs_e_search_output_prefix(self):
        """Output prefix for the mmseqs easy-search run."""
        stem = f"{self.prefix}_mmseqs_e_search"
        return self.mmseqs_e_search_output_dir / stem

    @property
    def mmseqs_e_search_output_path(self):
        """TSV table of the mmseqs easy-search run."""
        filename = f"{self.prefix}_mmseqs_e_search.tsv"
        return self.mmseqs_e_search_output_dir / filename

    # ----- final results (directly below <output_dir>) -----

    @property
    def plot_outdir(self):
        """Directory for plots."""
        dirname = f"{self.prefix}_rdrpcatch_plots"
        return self.output_dir / dirname

    @property
    def fasta_output_dir(self):
        """Directory for FASTA results."""
        dirname = f"{self.prefix}_rdrpcatch_fasta"
        return self.output_dir / dirname

    @property
    def fasta_nuc_out_path(self):
        """FASTA file with the full nucleotide contigs."""
        filename = f"{self.prefix}_full_nucleotide_contigs.fasta"
        return self.fasta_output_dir / filename

    @property
    def fasta_trimmed_out_path(self):
        """FASTA file with the trimmed amino-acid contigs."""
        filename = f"{self.prefix}_trimmed_aminoacid_contigs.fasta"
        return self.fasta_output_dir / filename

    @property
    def fasta_prot_out_path(self):
        """FASTA file with the full amino-acid contigs."""
        filename = f"{self.prefix}_full_aminoacid_contigs.fasta"
        return self.fasta_output_dir / filename

    @property
    def extended_rdrpcatch_output(self):
        """Annotated result table."""
        filename = f"{self.prefix}_rdrpcatch_output_annotated.tsv"
        return self.output_dir / filename

    @property
    def gff_output_dir(self):
        """Directory for GFF files."""
        dirname = f"{self.prefix}_gff_files"
        return self.output_dir / dirname

    @property
    def gff_output_path(self):
        """GFF3 file for the full amino-acid sequences."""
        filename = f"{self.prefix}_full_aminoacid_rdrpcatch.gff3"
        return self.gff_output_dir / filename
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
# Filter numpy warnings before any imports that might trigger them
|
|
3
|
+
warnings.filterwarnings("ignore", category=UserWarning, module="numpy")
|
|
4
|
+
warnings.filterwarnings("ignore", category=RuntimeWarning, module="numpy")
|
|
5
|
+
warnings.filterwarnings("ignore", message=".*subnormal.*")
|
|
6
|
+
|
|
7
|
+
class Plotter:
    """Writes summary plots for a result table.

    Plot files go to ``upset_outdir``, the UpSet data table goes to
    ``tsv_outdir``; every file name starts with ``prefix``. Plotting
    libraries are imported inside the methods that use them.
    """

    def __init__(self, upset_outdir, tsv_outdir, prefix):
        self.upset_outdir = upset_outdir
        self.tsv_outdir = tsv_outdir
        self.prefix = prefix

    def upset_plotter(self, analysis_dict):
        """Create an UpSet plot and export the data behind it.

        Writes ``<prefix>_upset_data.tsv`` to ``tsv_outdir`` and
        ``<prefix>_upset_plot.png`` to ``upset_outdir``.

        :param analysis_dict: contents passed to ``upsetplot.from_contents``
            (presumably category name -> collection of member ids; verify
            against the caller)
        :return: None
        """
        from matplotlib import pyplot as plt
        import upsetplot
        import os

        tsv_path = os.path.join(self.tsv_outdir, f"{self.prefix}_upset_data.tsv")
        png_path = os.path.join(self.upset_outdir, f"{self.prefix}_upset_plot.png")

        upset_data = upsetplot.from_contents(analysis_dict)
        # write upset data to a tsv file
        upset_data.to_csv(tsv_path, sep="\t")

        figure = upsetplot.UpSet(upset_data, subset_size="count", show_counts=True, sort_by='cardinality')
        figure.plot()
        plt.savefig(png_path, bbox_inches='tight', dpi=300)
        plt.close()

    def _save_boxplot(self, data, column, axis_title, chart_title, file_suffix):
        """Save a per-database box plot of ``column`` as an HTML file.

        :param data: table with a ``db_name`` column and ``column``
        :param column: name of the quantitative column to plot
        :param axis_title: title of the y axis
        :param chart_title: title of the chart
        :param file_suffix: text appended to ``prefix`` to form the file name
        """
        import altair as alt
        import os

        boxplot = (
            alt.Chart(data)
            .mark_boxplot()
            .encode(
                x=alt.X('db_name:N', title='Database'),
                y=alt.Y(f'{column}:Q', title=axis_title),
                color='db_name:N',
            )
            .properties(title=chart_title, width=600, height=400)
        )
        boxplot.save(os.path.join(self.upset_outdir, f"{self.prefix}{file_suffix}"))

    def plot_evalue(self, combined_df):
        """Box plot of log10(E-value) per database; rows with E-value <= 0 are dropped."""
        import polars as pl

        # Ensure the E-value column contains only positive numbers and convert to log scale
        positive_rows = combined_df.filter(pl.col('E-value') > 0)
        df = positive_rows.with_columns([
            pl.col('E-value').log10().alias('log10_evalue')
        ])

        self._save_boxplot(df, 'log10_evalue', 'log10(E-value)',
                           'E-value Distribution', "_evalue_plot.html")

    def plot_score(self, combined_df):
        """Box plot of the ``score`` column per database."""
        self._save_boxplot(combined_df, 'score', 'Score',
                           'Score Distribution', "_score_plot.html")

    def plot_norm_bitscore_profile(self, combined_df):
        """Box plot of the ``norm_bitscore_profile`` column per database."""
        self._save_boxplot(combined_df, 'norm_bitscore_profile', 'Normalized Bitscore (Profile)',
                           'Normalized Bitscore Distribution (Profile)',
                           "_norm_bitscore_plot_profile.html")

    def plot_norm_bitscore_contig(self, combined_df):
        """Box plot of the ``norm_bitscore_contig`` column per database."""
        self._save_boxplot(combined_df, 'norm_bitscore_contig', 'Normalized Bitscore (Contig)',
                           'Normalized Bitscore Distribution (Contig)',
                           "_norm_bitscore_contig_plot.html")

    def plot_ID_score(self, combined_df):
        """Box plot of the ``ID_score`` column per database."""
        self._save_boxplot(combined_df, 'ID_score', 'Identity Score',
                           'Identity Score Distribution', "_ID_score_plot.html")

    def plot_profile_coverage(self, combined_df):
        """Box plot of the ``profile_coverage`` column per database."""
        self._save_boxplot(combined_df, 'profile_coverage', 'Profile Coverage',
                           'Profile Coverage Distribution', "_profile_coverage_plot.html")

    def plot_contig_coverage(self, combined_df):
        """Box plot of the ``contig_coverage`` column per database."""
        self._save_boxplot(combined_df, 'contig_coverage', 'Contig Coverage',
                           'Contig Coverage Distribution', "_contig_coverage_plot.html")
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
|