rdrpcatch 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,256 @@
1
+ import tkinter as tk
2
+ from tkinter import filedialog
3
+ from tkinter import ttk
4
+ import os
5
+ import subprocess
6
+ from tkinter import messagebox
7
+ import polars as pl
8
+
9
+
10
class colabscanner_gui:
    """Tkinter window that gathers RdRpCATCH options and runs ``colabscanner.py``.

    The form collects an input file, a parent directory plus output name, an
    HMM directory, a database selection and hmmsearch / seqkit parameters.
    Pressing "Run Script" launches ``python3 colabscanner.py`` and copies its
    combined stdout/stderr, line by line, into a read-only text box.
    """

    def __init__(self):
        """Create the root window and build every widget."""
        self.root = tk.Tk()
        self.root.title("RdRpCATCH")
        self.root.geometry("1000x800")

        style = ttk.Style(self.root)
        # set theme style
        style.theme_use("clam")

        # Input file selection
        (self.input_frame, self.input_label,
         self.input_entry, self.input_button) = self._add_path_row(
            0, "Input File:", self.browse_input)

        # Parent directory selection
        (self.parent_dir_frame, self.parent_dir_label,
         self.parent_dir_entry, self.parent_dir_button) = self._add_path_row(
            1, "Parent Directory:", self.browse_parent_dir)

        # Output name entry (free text, so the row has no Browse button)
        (self.output_name_frame, self.output_name_label,
         self.output_name_entry, _) = self._add_path_row(2, "Output Name:")

        # HMM directory selection
        (self.hmm_dir_frame, self.hmm_dir_label,
         self.hmm_dir_entry, self.hmm_dir_button) = self._add_path_row(
            3, "HMM Directory:", self.browse_hmm_dir)

        # Database selection
        self.db_frame = ttk.LabelFrame(self.root, text="Select Databases")
        self.db_frame.grid(row=4, column=0, columnspan=3, padx=5, pady=10)

        self.databases = {
            'RVMT': tk.BooleanVar(),
            'NeoRdRp': tk.BooleanVar(),
            'NeoRdRp.2.1': tk.BooleanVar(),
            'TSA_Olendraite_fam': tk.BooleanVar(),
            'TSA_Olendraite_gen': tk.BooleanVar(),
            'RDRP-scan': tk.BooleanVar(),
            'Lucaprot': tk.BooleanVar(),
        }

        # Lay the check boxes out three per row.
        for i, (db_name, var) in enumerate(self.databases.items()):
            ttk.Checkbutton(self.db_frame, text=db_name, variable=var).grid(
                row=i // 3, column=i % 3, padx=5, pady=2)

        # HMMsearch parameters
        self.hmmsearch_frame = ttk.LabelFrame(self.root, text="HMMsearch Parameters")
        self.hmmsearch_frame.grid(row=6, column=1, padx=5, pady=10)

        self.hmmsearch_label = ttk.Label(self.hmmsearch_frame, text="HMMsearch Parameters:")
        self.hmmsearch_label.pack(anchor=tk.W)

        # E-values are kept as strings so the text typed by the user
        # (e.g. "1e-05") is forwarded unchanged.
        self.evalue_var = tk.StringVar(value='1e-05')
        self.inc_evalue_var = tk.StringVar(value='1e-05')
        self.dom_evalue_var = tk.StringVar(value='1e-05')
        self.incdom_evalue_var = tk.StringVar(value='1e-05')
        self.z_value_var = tk.IntVar(value=1000000)
        self.cpus_var = tk.IntVar(value=1)

        self._add_param_rows(self.hmmsearch_frame, [
            ('E-value threshold', self.evalue_var),
            ('Inclusion E-value threshold', self.inc_evalue_var),
            ('Domain E-value threshold', self.dom_evalue_var),
            ('Inclusion domain E-value threshold', self.incdom_evalue_var),
            ('z-value', self.z_value_var),
            ('Number of CPUs to use', self.cpus_var),
        ])

        # seqkit translate parameters
        # NOTE(review): these two values are shown in the form but run_script
        # never passes them to the launched script -- confirm whether they
        # should be forwarded.
        self.seqkit_frame = ttk.Frame(self.root)
        self.seqkit_frame.grid(row=6, column=2, padx=5, pady=10)
        self.gen_code_var = tk.IntVar(value=1)
        self.frame_var = tk.IntVar(value=6)
        self.seqkit_translate_label = ttk.Label(self.seqkit_frame, text="Seqkit Translate Parameters:")
        self.seqkit_translate_label.pack(anchor=tk.W)
        self._add_param_rows(self.seqkit_frame, [
            ('Genetic code', self.gen_code_var),
            ('Frame (6: All frames)', self.frame_var),
        ])

        # Run button
        self.run_button = ttk.Button(self.root, text="Run Script", command=self.run_script)
        self.run_button.grid(row=7, column=1, padx=5, pady=5)

        # Status label (grid row 9, i.e. below the progress box in row 8)
        self.status_label = tk.Label(self.root, text="", wraplength=400)
        self.status_label.grid(row=9, column=0, columnspan=3, padx=5, pady=5)

        # Progress text box (wider version)
        self.progress_frame = ttk.Frame(self.root)
        self.progress_frame.grid(row=8, column=0, columnspan=3, padx=5, pady=10)

        self.progress_label = ttk.Label(self.progress_frame, text="Progress:")
        self.progress_label.pack(anchor=tk.W)

        # Kept 'disabled' so the user cannot type into it; the progress
        # helpers re-enable it only while they write.
        self.progress_text = tk.Text(self.progress_frame, height=15, width=120)
        self.progress_text.pack(fill=tk.BOTH, expand=True)
        self.progress_text.config(state='disabled')

        # Horizontal scrollbar
        self.scrollbar_x = ttk.Scrollbar(self.progress_frame, orient=tk.HORIZONTAL)
        self.scrollbar_x.pack(side=tk.BOTTOM, fill=tk.X)
        self.progress_text.config(xscrollcommand=self.scrollbar_x.set)
        self.scrollbar_x.config(command=self.progress_text.xview)

        # Vertical scrollbar
        self.scrollbar = ttk.Scrollbar(self.progress_frame)
        self.scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.progress_text.config(yscrollcommand=self.scrollbar.set)
        self.scrollbar.config(command=self.progress_text.yview)

    def _add_path_row(self, row, label_text, browse_command=None):
        """Build one "label + entry (+ Browse button)" row on the root grid.

        :param row: grid row of the new frame.
        :param label_text: caption shown left of the entry.
        :param browse_command: callback for the Browse button; when ``None``
            no button is created.
        :return: ``(frame, label, entry, button)``; ``button`` is ``None``
            when no callback was given.
        """
        frame = ttk.Frame(self.root)
        frame.grid(row=row, column=0, columnspan=3, padx=5, pady=5)

        label = ttk.Label(frame, text=label_text)
        label.pack(side=tk.LEFT)

        entry = ttk.Entry(frame, width=50)
        entry.pack(side=tk.LEFT, padx=5)

        button = None
        if browse_command is not None:
            button = ttk.Button(frame, text="Browse", command=browse_command)
            button.pack(side=tk.LEFT)
        return frame, label, entry, button

    @staticmethod
    def _add_param_rows(parent, params):
        """Pack one "name: entry" line into *parent* per ``(name, variable)`` pair."""
        for param_name, param_var in params:
            frame = ttk.Frame(parent)
            frame.pack(fill=tk.X, padx=5, pady=2)
            ttk.Label(frame, text=param_name).pack(side=tk.LEFT)
            ttk.Entry(frame, textvariable=param_var, width=15).pack(side=tk.RIGHT)

    @staticmethod
    def _set_entry(entry, value):
        """Replace the text of *entry* with *value*, unless *value* is empty.

        The file dialogs return an empty value when cancelled; in that case
        the current text is left alone instead of being wiped.
        """
        if not value:
            return
        entry.delete(0, tk.END)
        entry.insert(0, value)

    def browse_input(self):
        """Ask for the input file and put the chosen path into its entry."""
        self._set_entry(self.input_entry, filedialog.askopenfilename())

    def browse_parent_dir(self):
        """Ask for the parent directory and put the chosen path into its entry."""
        self._set_entry(self.parent_dir_entry, filedialog.askdirectory())

    def browse_hmm_dir(self):
        """Ask for the HMM directory and put the chosen path into its entry."""
        self._set_entry(self.hmm_dir_entry, filedialog.askdirectory())

    def _collect_hmmsearch_args(self):
        """Return the hmmsearch command-line arguments taken from the form.

        :return: flat list of alternating flag / value strings.
        :raises ValueError: an E-value field does not parse as a number.
        :raises tk.TclError: the z-value or CPU field is not an integer.
        """
        args = []
        evalue_fields = (
            ('-e', self.evalue_var),
            ('-incE', self.inc_evalue_var),
            ('-domE', self.dom_evalue_var),
            ('-incdomE', self.incdom_evalue_var),
        )
        for flag, var in evalue_fields:
            value = var.get().strip()
            float(value)  # validation only; the typed text itself is forwarded
            args += [flag, value]
        args += ['-z', str(self.z_value_var.get())]
        args += ['-cpus', str(self.cpus_var.get())]
        return args

    def run_script(self):
        """Validate the form, run ``colabscanner.py`` and stream its output.

        Validation failures are reported through the status label and abort
        the run. Any exception raised while launching or reading the child
        process is shown in an error dialog.

        NOTE: the output is read synchronously inside the Tk callback, so the
        window only repaints (``update_idletasks``) and does not process user
        input until the script finishes.
        """
        input_file = self.input_entry.get()
        parent_dir = self.parent_dir_entry.get()
        output_name = self.output_name_entry.get()
        hmm_dir = self.hmm_dir_entry.get()

        selected_dbs = [db for db, var in self.databases.items() if var.get()]

        if not os.path.isfile(input_file):
            self.status_label.config(text="Error: Input file does not exist.", fg="red")
            return

        if not os.path.isdir(parent_dir):
            self.status_label.config(text="Error: Parent directory does not exist.", fg="red")
            return

        # Read the numeric fields before touching the file system, so a typo
        # gives a readable message instead of an unhandled callback error.
        try:
            hmmsearch_args = self._collect_hmmsearch_args()
        except (tk.TclError, ValueError) as e:
            self.status_label.config(text=f"Error: Invalid parameter value: {e}", fg="red")
            return

        output_dir = os.path.join(parent_dir, output_name)

        try:
            os.makedirs(output_dir, exist_ok=True)
        except (OSError, ValueError) as e:
            self.status_label.config(text=f"Error creating output directory: {str(e)}", fg="red")
            return

        # No box ticked means "all".
        db_options = ",".join(selected_dbs) if selected_dbs else "all"

        command = [
            "python3", "colabscanner.py",
            "-i", input_file,
            "-o", output_dir,
            "-hmm_dir", hmm_dir,
            "-dbs", db_options,
        ] + hmmsearch_args

        try:
            # Clear previous progress
            self.clear_progress()

            # Run the script; the context manager closes the pipe and waits
            # for the child, so returncode is set once the block is left.
            with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                                  text=True, bufsize=1) as process:
                # Read and display output line by line
                for line in process.stdout:
                    self.update_progress(line.strip())

            if process.returncode == 0:
                self.status_label.config(text="Script executed successfully.", fg="green")
            else:
                self.status_label.config(text="Script execution failed.", fg="red")

        except Exception as e:
            # Top-level UI boundary: surface anything unexpected to the user.
            messagebox.showerror("Error", f"An unexpected error occurred:\n{str(e)}")
            print(f"Unexpected error: {e}")

    def clear_progress(self):
        """Empty the progress text box."""
        self.progress_text.config(state='normal')
        self.progress_text.delete('1.0', tk.END)
        self.progress_text.config(state='disabled')

    def update_progress(self, message):
        """Append *message* as a new line, scroll to the end and repaint."""
        self.progress_text.config(state='normal')
        self.progress_text.insert(tk.END, message + "\n")
        self.progress_text.yview(tk.END)
        self.progress_text.config(state='disabled')
        self.root.update_idletasks()

    def run(self):
        """Enter the Tk main loop; returns when the window is closed."""
        self.root.mainloop()
250
+
251
+
252
if __name__ == "__main__":
    # Executed as a script: build the window and block in its event loop.
    app = colabscanner_gui()
    app.run()
255
+
256
+
@@ -0,0 +1,100 @@
1
+ import subprocess
2
+ import os
3
+
4
+
5
+
6
class mmseqs:
    """Run MMseqs2 ``easy-taxonomy`` / ``easy-search`` as subprocesses.

    Every run redirects the tool's stdout and stderr into ``log_file``
    (truncating it first) and raises ``Exception`` when the tool exits with
    a non-zero status.
    """

    def __init__(self,fasta_fn, mmseqs_db, mmseqs_out_prefix,outdir_path, sens, cpus, log_file):
        """Store the run configuration.

        :param fasta_fn: query FASTA, first positional argument of each command.
        :param mmseqs_db: target database, second positional argument.
        :param mmseqs_out_prefix: stored as ``out_prefix``; its position in the
            command differs per method (see the command builders).
        :param outdir_path: see the command builders for how it is used.
        :param sens: sensitivity, passed after ``-s`` / ``--start-sens``.
        :param cpus: value passed after ``--threads``.
        :param log_file: file that receives the tool's stdout and stderr.
        """
        self.fasta_fn = fasta_fn
        self.mmseqs_db = mmseqs_db
        self.out_prefix = mmseqs_out_prefix
        self.outdir_path = outdir_path
        self.sens = sens
        self.cpus = cpus
        self.log_file = log_file

    def _run_logged(self, cmd, label):
        """Run *cmd* with output sent to ``self.log_file``.

        :param cmd: argument list handed to ``subprocess.run``.
        :param label: command name used in the error message.
        :raises Exception: the command exited with a non-zero status; the
            original ``CalledProcessError`` is attached as ``__cause__``.
        """
        try:
            with open(self.log_file, 'w') as fout:
                subprocess.run(cmd, stdout=fout, stderr=fout, shell=False, check=True)
        except subprocess.CalledProcessError as e:
            cmd_str = ' '.join(cmd)
            raise Exception(f"Error running mmseqs {label} command: {cmd_str}") from e

    def _easy_tax_lca_cmd(self):
        """Build the ``easy-taxonomy`` argument list used by the LCA run."""
        return ["mmseqs",
                "easy-taxonomy",
                str(self.fasta_fn),
                str(self.mmseqs_db),
                str(self.out_prefix),
                f"{str(self.outdir_path)}/tmp",
                "--tax-lineage",
                "1",
                "--alignment-mode",
                "3",
                "-s",
                str(self.sens),
                "--threads",
                str(self.cpus)
                ]

    def _easy_tax_tophit_cmd(self):
        """Build the ``easy-taxonomy`` argument list used by the top-hit run.

        Every element is converted to ``str`` because ``subprocess`` rejects
        non-string arguments, and the sensitivity is preceded by ``-s`` as in
        the LCA command. The temporary directory stays the relative ``"tmp"``.
        """
        return ["mmseqs",
                "easy-taxonomy",
                str(self.fasta_fn),
                str(self.mmseqs_db),
                str(self.out_prefix),
                "tmp",
                "--tax-lineage",
                "1",
                "-s",
                str(self.sens),
                "--threads",
                str(self.cpus),
                "--lca-mode",
                "4"]

    def _e_search_cmd(self):
        """Build the ``easy-search`` argument list.

        NOTE(review): here ``outdir_path`` is passed in the result position
        and the temporary directory is derived from ``out_prefix`` -- the
        reverse of the easy-taxonomy commands. Confirm against the caller
        before renaming anything.
        """
        return ["mmseqs",
                "easy-search",
                str(self.fasta_fn),
                str(self.mmseqs_db),
                str(self.outdir_path),
                f"{str(self.out_prefix)}/tmp",
                "--start-sens",
                str(self.sens),
                "--threads",
                str(self.cpus),
                "--format-output",
                "query,target,fident,alnlen,mismatch,gapopen,"
                "qstart,qend,tstart,tend,evalue,bits,qcov,tcov,taxlineage",
                "--sort-results",
                "1"]

    def run_mmseqs_easy_tax_lca(self):
        """Run mmseqs easy-tax command."""
        self._run_logged(self._easy_tax_lca_cmd(), "easy-tax")

    def run_mmseqs_easy_tax_tophit(self):
        """Run mmseqs easy-tax command with ``--lca-mode 4``."""
        self._run_logged(self._easy_tax_tophit_cmd(), "easy-tax")

    def run_mmseqs_e_search(self):
        """Run mmseqs easy-search command."""
        self._run_logged(self._e_search_cmd(), "easy-search")
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
@@ -0,0 +1,162 @@
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+
4
+
5
class classproperty(property):
    """Read-only property whose getter receives the owning class.

    Works for access through the class itself as well as through an
    instance; in both cases the getter is called with the class.
    """

    def __get__(self, cls, owner):
        # The first argument (the instance, or None on class access) is
        # deliberately ignored: only the owning class is handed to the getter.
        getter = self.fget
        return getter(owner)
8
+
9
@dataclass
class rdrpcatch_input:
    """Input locations, all derived from the class-level ``source_dir``.

    The derived paths are class properties, so they are computed from the
    class attribute ``source_dir`` and can be read without an instance.
    """

    #TODO: Change this line for final version

    # Directory three levels above this file.
    source_dir : Path = Path(__file__).parents[2]

    @classproperty
    def db_dir(cls):
        """``<source_dir>/DBs``."""
        return cls.source_dir.joinpath('DBs')

    @classproperty
    def hmm_dbs_dir(cls):
        """``<source_dir>/DBs/hmm_dbs``."""
        return cls.source_dir.joinpath('DBs', 'hmm_dbs')

    @classproperty
    def test_dir(cls):
        """``<source_dir>/test``."""
        return cls.source_dir.joinpath('test')

    @classproperty
    def input_fasta(cls):
        """``<source_dir>/input.fasta``."""
        return cls.source_dir.joinpath("input.fasta")
34
+
35
+
36
@dataclass
class rdrpcatch_output:
    """Output path layout for one run.

    Every path is derived from ``output_dir`` and, for file names and some
    directory names, from ``prefix``. Nothing here touches the file system;
    the members only build ``Path`` objects.
    """

    prefix: str
    output_dir: Path

    def _prefixed(self, suffix):
        """Return ``prefix`` immediately followed by *suffix*."""
        return f"{self.prefix}{suffix}"

    # --- intermediate files, all below <output_dir>/tmp -------------------

    @property
    def tmp_dir(self):
        """``<output_dir>/tmp``."""
        return self.output_dir.joinpath("tmp")

    @property
    def hmm_output_dir (self):
        """``<tmp_dir>/hmm_output``."""
        return self.tmp_dir.joinpath("hmm_output")

    def hmm_output_path(self, db_name):
        """hmmsearch output file for *db_name* inside ``hmm_output_dir``."""
        return self.hmm_output_dir.joinpath(self._prefixed(f"_{db_name}_hmmsearch_output.txt"))

    @property
    def formatted_hmm_output_dir(self):
        """``<tmp_dir>/formatted_hmm_output``."""
        return self.tmp_dir.joinpath("formatted_hmm_output")

    def formatted_hmm_output_path(self, db_name):
        """Formatted hmm output file for *db_name*."""
        return self.formatted_hmm_output_dir.joinpath(self._prefixed(f"_{db_name}_hmm_output_formatted.txt"))

    @property
    def best_hit_dir(self):
        """``<tmp_dir>/best_hit_hmm_output``."""
        return self.tmp_dir.joinpath("best_hit_hmm_output")

    def best_hit_path(self, db_name):
        """Best-hit hmm output file for *db_name*."""
        return self.best_hit_dir.joinpath(self._prefixed(f"_{db_name}_hmm_output_best_hit.txt"))

    @property
    def seqkit_seq_output_dir(self):
        """``<tmp_dir>/seqkit_seq_output``."""
        return self.tmp_dir.joinpath("seqkit_seq_output")

    @property
    def seqkit_seq_output_path(self):
        """FASTA file inside ``seqkit_seq_output_dir``."""
        return self.seqkit_seq_output_dir.joinpath(self._prefixed("_seqkit_seq_output.fasta"))

    @property
    def seqkit_translate_output_dir(self):
        """``<tmp_dir>/seqkit_translate_output``."""
        return self.tmp_dir.joinpath("seqkit_translate_output")

    @property
    def seqkit_translate_output_path(self):
        """FASTA file inside ``seqkit_translate_output_dir``."""
        return self.seqkit_translate_output_dir.joinpath(self._prefixed("_seqkit_translate_output.fasta"))

    @property
    def tsv_outdir(self):
        """``<tmp_dir>/tsv_files``."""
        return self.tmp_dir.joinpath("tsv_files")

    @property
    def combined_tsv_path(self):
        """Combined TSV inside ``tsv_outdir``."""
        return self.tsv_outdir.joinpath(self._prefixed("_combined.tsv"))

    @property
    def mmseqs_tax_output_dir(self ):
        """``<tmp_dir>/mmseqs_tax_output``."""
        return self.tmp_dir.joinpath("mmseqs_tax_output")

    @property
    def mmseqs_tax_output_prefix(self):
        """Extension-less prefix inside ``mmseqs_tax_output_dir``."""
        return self.mmseqs_tax_output_dir.joinpath(self._prefixed("_mmseqs_tax"))

    @property
    def mmseqs_tax_log_path(self):
        """mmseqs taxonomy log inside ``log_dir``."""
        return self.log_dir.joinpath(self._prefixed("_mmseqs_tax.log"))

    @property
    def mmseqs_tax_output_lca_path(self):
        """``*_mmseqs_tax_lca.tsv`` inside ``mmseqs_tax_output_dir``."""
        return self.mmseqs_tax_output_dir.joinpath(self._prefixed("_mmseqs_tax_lca.tsv"))

    @property
    def mmseqs_e_search_output_dir(self):
        """``<tmp_dir>/mmseqs_e_search_output``."""
        return self.tmp_dir.joinpath("mmseqs_e_search_output")

    @property
    def mmseqs_e_search_log_path(self):
        """mmseqs easy-search log inside ``log_dir``."""
        return self.log_dir.joinpath(self._prefixed("_mmseqs_e_search.log"))

    @property
    def mmseqs_e_search_output_prefix(self):
        """Extension-less prefix inside ``mmseqs_e_search_output_dir``."""
        return self.mmseqs_e_search_output_dir.joinpath(self._prefixed("_mmseqs_e_search"))

    @property
    def mmseqs_e_search_output_path(self):
        """``*_mmseqs_e_search.tsv`` inside ``mmseqs_e_search_output_dir``."""
        return self.mmseqs_e_search_output_dir.joinpath(self._prefixed("_mmseqs_e_search.tsv"))

    # --- final results, directly below <output_dir> unless noted ----------

    @property
    def plot_outdir(self):
        """``<output_dir>/<prefix>_rdrpcatch_plots``."""
        return self.output_dir.joinpath(self._prefixed("_rdrpcatch_plots"))

    @property
    def fasta_output_dir(self):
        """``<output_dir>/<prefix>_rdrpcatch_fasta``."""
        return self.output_dir.joinpath(self._prefixed("_rdrpcatch_fasta"))

    @property
    def fasta_nuc_out_path(self):
        """Full nucleotide contigs FASTA inside ``fasta_output_dir``."""
        return self.fasta_output_dir.joinpath(self._prefixed("_full_nucleotide_contigs.fasta"))

    @property
    def fasta_trimmed_out_path(self):
        """Trimmed amino-acid contigs FASTA inside ``fasta_output_dir``."""
        return self.fasta_output_dir.joinpath(self._prefixed("_trimmed_aminoacid_contigs.fasta"))

    @property
    def fasta_prot_out_path(self):
        """Full amino-acid contigs FASTA inside ``fasta_output_dir``."""
        return self.fasta_output_dir.joinpath(self._prefixed("_full_aminoacid_contigs.fasta"))

    @property
    def rdrpcatch_output(self):
        """Result TSV; note that it lives in ``tsv_outdir`` (below ``tmp_dir``)."""
        return self.tsv_outdir.joinpath(self._prefixed("_rdrpcatch_output.tsv"))

    @property
    def extended_rdrpcatch_output(self):
        """Annotated result TSV directly inside ``output_dir``."""
        return self.output_dir.joinpath(self._prefixed("_rdrpcatch_output_annotated.tsv"))

    @property
    def log_dir(self):
        """``<tmp_dir>/<prefix>_logs``."""
        return self.tmp_dir.joinpath(self._prefixed("_logs"))

    @property
    def log_file(self):
        """Main log file inside ``log_dir``."""
        return self.log_dir.joinpath(self._prefixed("_rdrpcatch.log"))

    @property
    def gff_output_dir(self):
        """``<output_dir>/<prefix>_gff_files``."""
        return self.output_dir.joinpath(self._prefixed("_gff_files"))

    @property
    def gff_output_path(self):
        """GFF3 file inside ``gff_output_dir``."""
        return self.gff_output_dir.joinpath(self._prefixed("_full_aminoacid_rdrpcatch.gff3"))
160
+
161
+
162
+
@@ -0,0 +1,165 @@
1
+ import warnings
2
# Install the numpy warning filters up front, before any later import can
# trigger the warnings.
for _numpy_category in (UserWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_numpy_category, module="numpy")
del _numpy_category
# Messages mentioning "subnormal" are dropped whatever module emits them.
warnings.filterwarnings("ignore", message=".*subnormal.*")
6
+
7
class Plotter:
    """Write summary plots for a run.

    All plots are saved into ``upset_outdir``; the table behind the upset
    plot is additionally written into ``tsv_outdir``. Every file name starts
    with ``prefix``. Plotting libraries are imported inside the methods.
    """

    def __init__(self, upset_outdir, tsv_outdir, prefix):
        """Remember the output directories and the file-name prefix.

        :param upset_outdir: directory that receives every plot file.
        :param tsv_outdir: directory that receives the upset data table.
        :param prefix: string prepended to every output file name.
        """
        self.upset_outdir = upset_outdir
        self.tsv_outdir = tsv_outdir
        self.prefix = prefix

    def upset_plotter(self, analysis_dict):
        """Create an upset plot of the analysis results.

        Writes ``<prefix>_upset_data.tsv`` (tab separated) into
        ``tsv_outdir`` and ``<prefix>_upset_plot.png`` into ``upset_outdir``.

        :param analysis_dict: contents handed to ``upsetplot.from_contents``.
        :return: None
        """
        from matplotlib import pyplot as plt
        import upsetplot
        import os

        upset_data = upsetplot.from_contents(analysis_dict)
        # write upset data to a tsv file
        upset_data.to_csv(os.path.join(self.tsv_outdir, f"{self.prefix}_upset_data.tsv"), sep="\t")
        upsetplot.UpSet(upset_data, subset_size="count", show_counts=True, sort_by='cardinality').plot()
        plt.savefig(os.path.join(self.upset_outdir, f"{self.prefix}_upset_plot.png"), bbox_inches='tight', dpi=300)
        plt.close()

    def _save_boxplot(self, data, column, y_title, chart_title, file_suffix):
        """Save a per-database boxplot of *column* as an HTML file.

        :param data: table given to ``alt.Chart``; the chart reads its
            ``db_name`` column and *column*.
        :param column: name of the quantitative column on the y axis.
        :param y_title: y-axis title.
        :param chart_title: chart title.
        :param file_suffix: text appended to ``prefix`` to form the file name
            inside ``upset_outdir``.
        """
        import altair as alt
        import os

        chart = alt.Chart(data).mark_boxplot().encode(
            x=alt.X('db_name:N', title='Database'),
            y=alt.Y(f'{column}:Q', title=y_title),
            color='db_name:N'
        ).properties(
            title=chart_title,
            width=600,
            height=400
        )

        chart.save(os.path.join(self.upset_outdir, f"{self.prefix}{file_suffix}"))

    def plot_evalue(self, combined_df):
        """Boxplot of log10(E-value) per database.

        Rows whose ``E-value`` is not greater than zero are dropped first,
        because their logarithm is undefined.

        :param combined_df: polars frame with ``E-value`` and ``db_name`` columns.
        """
        import polars as pl

        # Ensure the E-value column contains only positive numbers and convert to log scale
        df = combined_df.filter(pl.col('E-value') > 0).with_columns([
            pl.col('E-value').log10().alias('log10_evalue')
        ])

        self._save_boxplot(df, 'log10_evalue', 'log10(E-value)',
                           'E-value Distribution', '_evalue_plot.html')

    def plot_score(self, combined_df):
        """Boxplot of the ``score`` column per database."""
        self._save_boxplot(combined_df, 'score', 'Score',
                           'Score Distribution', '_score_plot.html')

    def plot_norm_bitscore_profile(self, combined_df):
        """Boxplot of the ``norm_bitscore_profile`` column per database."""
        self._save_boxplot(combined_df, 'norm_bitscore_profile', 'Normalized Bitscore (Profile)',
                           'Normalized Bitscore Distribution (Profile)',
                           '_norm_bitscore_plot_profile.html')

    def plot_norm_bitscore_contig(self, combined_df):
        """Boxplot of the ``norm_bitscore_contig`` column per database."""
        self._save_boxplot(combined_df, 'norm_bitscore_contig', 'Normalized Bitscore (Contig)',
                           'Normalized Bitscore Distribution (Contig)',
                           '_norm_bitscore_contig_plot.html')

    def plot_ID_score(self, combined_df):
        """Boxplot of the ``ID_score`` column per database."""
        self._save_boxplot(combined_df, 'ID_score', 'Identity Score',
                           'Identity Score Distribution', '_ID_score_plot.html')

    def plot_profile_coverage(self, combined_df):
        """Boxplot of the ``profile_coverage`` column per database."""
        self._save_boxplot(combined_df, 'profile_coverage', 'Profile Coverage',
                           'Profile Coverage Distribution', '_profile_coverage_plot.html')

    def plot_contig_coverage(self, combined_df):
        """Boxplot of the ``contig_coverage`` column per database."""
        self._save_boxplot(combined_df, 'contig_coverage', 'Contig Coverage',
                           'Contig Coverage Distribution', '_contig_coverage_plot.html')
160
+
161
+
162
+
163
+
164
+
165
+