rdrpcatch 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,155 @@
+ import os
+ import re
+ 
+ import pyhmmer
+ 
+ 
+ class pyhmmsearch:
+ 
+     def __init__(self, hmmsearch_out_path, seq_file, hmm_file, cpus, e, incdomE, domE, incE, z):
+         self.hmmsearch_out_path = hmmsearch_out_path
+         self.hmmsearch_out_path_custom = str(self.hmmsearch_out_path.with_suffix('.custom.tsv'))
+         self.seq_file = seq_file
+         self.hmm_file = hmm_file
+         self.cpus = cpus
+         self.e = e
+         self.incdomE = incdomE
+         self.domE = domE
+         self.incE = incE
+         self.z = z
+ 
+     def run_pyhmmsearch(self):
+         """
+         Run pyhmmer.hmmer.hmmsearch against the sequence database and write both
+         the raw domain table and a custom per-domain TSV.
+ 
+         TODO: 1. Add an option to run hmmsearch on long sequences (longer than 100 kb),
+         TODO:    as pyhmmer.plan7.Pipeline cannot handle them. See:
+         TODO:    https://pyhmmer.readthedocs.io/en/latest/api/plan7.html#pyhmmer.plan7.LongTargetsPipeline
+         TODO: 2. Some parameters are currently hardcoded; add an option to change them.
+         """
+         if not os.path.exists(self.hmmsearch_out_path):
+ 
+             with pyhmmer.plan7.HMMPressedFile(self.hmm_file) as handle:
+                 hmms = list(handle)
+ 
+             with pyhmmer.easel.SequenceFile(self.seq_file, digital=True) as handle:
+                 db = list(handle)
+ 
+             with open(self.hmmsearch_out_path, 'wb') as raw_out, open(self.hmmsearch_out_path_custom, 'wb') as custom_out:
+                 title_line = ["t_name", "t_acc", "tlen", "q_name", "q_acc", "qlen", "E-value",
+                               "score", "bias", "dom_num", "dom_total", "dom_c_value", "dom_i_value", "dom_score",
+                               "dom_bias", "hmm_from", "hmm_to", "ali_from", "ali_to", "env_from", "env_to", "acc",
+                               "description of target"]
+                 raw_out.write("\t".join(title_line).encode("utf-8") + b"\n")
+                 custom_out.write("\t".join(title_line).encode("utf-8") + b"\n")
+ 
+                 for result in pyhmmer.hmmer.hmmsearch(hmms,
+                                                       db,
+                                                       cpus=self.cpus,
+                                                       E=self.e,
+                                                       incdomE=self.incdomE,
+                                                       domE=self.domE,
+                                                       incE=self.incE,
+                                                       Z=self.z):
+ 
+                     result.write(raw_out, format="domains", header=False)
+                     if len(result) >= 1:
+                         for hit in result:
+                             t_acc = hit.accession or b""
+                             t_desc = hit.description or b"-"
+ 
+                             total_domains = len(hit.domains.included)
+                             q_acc = result.query.description or b""
+ 
+                             for i, domain in enumerate(hit.domains.included):
+                                 domain_num = i + 1
+ 
+                                 # Approximate the mean posterior probability of the aligned
+                                 # residues: strip the non-digit characters ('*' and '.') from
+                                 # the posterior-probability string, then average the digits.
+                                 # (The previous version divided the filtered digit sum by the
+                                 # unfiltered string length, mixing the two counts.)
+                                 aligned_probs = re.sub(r'[^0-9]', '', domain.alignment.posterior_probabilities)
+                                 if aligned_probs:
+                                     mean_aligned_prob = sum(int(digit) for digit in aligned_probs) / len(aligned_probs)
+                                 else:
+                                     mean_aligned_prob = 0.0
+ 
+                                 outputline = [
+                                     f"{hit.name.decode()}",                # t_name (protein)
+                                     f"{t_acc.decode()}",                   # t_acc (empty if none)
+                                     f"{hit.length}",                       # tlen (protein length)
+                                     f"{result.query.name.decode()}",       # q_name (HMM name)
+                                     f"{q_acc.decode()}",                   # q_acc (empty if none)
+                                     f"{domain.alignment.hmm_length}",      # qlen (HMM length)
+                                     f"{hit.evalue}",                       # E-value
+                                     f"{hit.score}",                        # score
+                                     f"{hit.bias}",                         # bias
+                                     f"{domain_num}",                       # dom_num (number of this domain)
+                                     f"{total_domains}",                    # dom_total (total number of domains)
+                                     f"{domain.c_evalue}",                  # dom_c_value
+                                     f"{domain.i_evalue}",                  # dom_i_value
+                                     f"{domain.score}",                     # dom_score
+                                     f"{domain.bias}",                      # dom_bias
+                                     f"{domain.alignment.hmm_from}",        # hmm_from (query from)
+                                     f"{domain.alignment.hmm_to}",          # hmm_to (query to)
+                                     f"{domain.alignment.target_from}",     # ali_from (target from)
+                                     f"{domain.alignment.target_to}",       # ali_to (target to)
+                                     f"{domain.env_from}",                  # env_from
+                                     f"{domain.env_to}",                    # env_to
+                                     f"{mean_aligned_prob}",                # acc (mean aligned posterior probability)
+                                     f"{t_desc.decode()}"                   # description of target
+                                 ]
+                                 custom_out.write(("\t".join(outputline) + "\n").encode())
+ 
+         return self.hmmsearch_out_path
+ 
+     def run_pyhmmsearch_long_sequences(self):
+         """
+         Run hmmsearch for sequences longer than 100,000 residues, using
+         pyhmmer.plan7.LongTargetsPipeline instead of the default pipeline.
+ 
+         Note: LongTargetsPipeline is documented for nucleotide (nhmmer-style)
+         targets; whether it accepts an amino-acid alphabet should be verified
+         against the pyhmmer documentation.
+         """
+         if not os.path.exists(self.hmmsearch_out_path):
+             with pyhmmer.plan7.HMMPressedFile(self.hmm_file) as handle:
+                 hmms = list(handle)
+ 
+             # read_block() returns a DigitalSequenceBlock, which is what
+             # Pipeline.search_hmm() expects as a target database.
+             with pyhmmer.easel.SequenceFile(self.seq_file, digital=True) as handle:
+                 db = handle.read_block()
+ 
+             # Create a LongTargetsPipeline instance
+             alphabet = pyhmmer.easel.Alphabet.amino()
+             pipeline = pyhmmer.plan7.LongTargetsPipeline(alphabet,
+                                                          block_length=262144,  # default block length
+                                                          F1=0.02, F2=0.003, F3=3e-05)
+ 
+             with open(self.hmmsearch_out_path, 'wb') as handle:
+                 title_line = ["#t_name", "t_acc", "tlen", "q_name", "q_acc", "qlen", "E-value",
+                               "score", "bias", "dom_num", "dom_total", "dom_c_value", "dom_i_value", "dom_score",
+                               "dom_bias", "hmm_from", "hmm_to", "ali_from", "ali_to", "env_from", "env_to", "acc",
+                               "description of target"]
+                 handle.write("\t".join(title_line).encode("utf-8") + b"\n")
+ 
+                 # Search each HMM against the database and append the hits in
+                 # pyhmmer's domain-table format. (An earlier draft called
+                 # pipeline.iterate_seq(), which expects a query *sequence*
+                 # rather than an HMM, and formatted attributes that do not
+                 # exist on pyhmmer Hit objects; search_hmm() + TopHits.write()
+                 # is the supported combination.)
+                 for hmm in hmms:
+                     hits = pipeline.search_hmm(hmm, db)
+                     hits.write(handle, format="domains", header=False)
+ 
+         return self.hmmsearch_out_path
+ 
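+ 
+ # --- Hedged usage sketch (illustration only; not part of the original wheel) ---
+ # Drives the class above end to end. All paths and thresholds are hypothetical
+ # placeholders; hmmsearch_out_path must be a pathlib.Path, because __init__
+ # calls .with_suffix() on it.
+ if __name__ == "__main__":
+     from pathlib import Path
+ 
+     search = pyhmmsearch(
+         hmmsearch_out_path=Path("results/hmmsearch.domtbl.tsv"),
+         seq_file="proteins.faa",        # target sequences, read in digital mode
+         hmm_file="rdrp_profiles.h3m",   # pressed HMM database (output of hmmpress)
+         cpus=4,
+         e=1e-5, incdomE=1e-5, domE=1e-5, incE=1e-5,
+         z=None,                         # let pyhmmer estimate the database size
+     )
+     print(search.run_pyhmmsearch())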
@@ -0,0 +1,112 @@
+ import os
+ import subprocess
+ from itertools import islice
+ 
+ 
+ class seqkit:
+ 
+     def __init__(self, input_file,
+                  output_file,
+                  log_file,
+                  threads=4,
+                  logger=None):
+ 
+         self.input_file = input_file
+         self.output_file = output_file
+         self.log_file = log_file
+         self.threads = threads
+         self.logger = logger
+ 
+     def run_seqkit_seq(self, length_thr=400):
+         if self.logger:
+             self.logger.silent_log(f"Running seqkit seq on {self.input_file}")
+             self.logger.silent_log(f"Length threshold: {length_thr}")
+ 
+         seqkit_cmd = ["seqkit",
+                       "seq",
+                       "--threads",
+                       str(self.threads),
+                       "-m",
+                       str(length_thr),
+                       str(self.input_file),
+                       "-o",
+                       str(self.output_file)]
+ 
+         if self.logger:
+             self.logger.silent_log(f"Running command: {' '.join(seqkit_cmd)}")
+ 
+         with open(self.log_file, 'w') as fout:
+             try:
+                 subprocess.run(seqkit_cmd, stdout=fout, stderr=fout, shell=False, check=True)
+                 if self.logger:
+                     self.logger.silent_log(f"Successfully filtered sequences to {self.output_file}")
+ 
+             except subprocess.CalledProcessError as e:
+                 cmd_str = ' '.join(seqkit_cmd)
+                 error_msg = f"Error running seqkit command: {cmd_str} (exit code {e.returncode}, see {self.log_file})"
+                 if self.logger:
+                     self.logger.silent_log(error_msg)
+                 raise Exception(error_msg) from e
+ 
+         return str(self.output_file)
+ 
+ 
+     def run_seqkit_translate(self, gen_code=1, frame=6):
+         if self.logger:
+             self.logger.silent_log(f"Running seqkit translate on {self.input_file}")
+             self.logger.silent_log(f"Output will be written to {self.output_file}")
+             self.logger.silent_log(f"Using genetic code {gen_code} and frame {frame}")
+ 
+         seqkit_cmd = ["seqkit",
+                       "translate",
+                       "--threads",
+                       str(self.threads),
+                       "--clean",
+                       "--append-frame",
+                       "-f",
+                       f"{frame}",
+                       "-T",
+                       f"{gen_code}",
+                       str(self.input_file),
+                       "-o",
+                       str(self.output_file)]
+ 
+         if self.logger:
+             self.logger.silent_log(f"Running command: {' '.join(seqkit_cmd)}")
+ 
+         with open(self.log_file, 'w') as fout:
+             try:
+                 subprocess.run(seqkit_cmd, stdout=fout, stderr=fout, shell=False, check=True)
+                 # Check that the output file exists and has content
+                 if os.path.exists(self.output_file):
+                     with open(self.output_file, 'r') as f:
+                         # islice stops at EOF, unlike repeated next(f), which
+                         # raised StopIteration on files shorter than six lines
+                         first_few_lines = list(islice(f, 6))
+                     if self.logger:
+                         self.logger.silent_log("First few lines of output:")
+                         for line in first_few_lines:
+                             self.logger.silent_log(f"{line.strip()}")
+                 else:
+                     error_msg = f"Output file {self.output_file} was not created!"
+                     if self.logger:
+                         self.logger.silent_log(error_msg)
+                     raise Exception(error_msg)
+ 
+             except subprocess.CalledProcessError as e:
+                 cmd_str = ' '.join(seqkit_cmd)
+                 error_msg = f"Error running seqkit command: {cmd_str}"
+                 error_details = f"Error details: {str(e)}"
+                 if self.logger:
+                     self.logger.silent_log(error_msg)
+                     self.logger.silent_log(error_details)
+                 raise Exception(error_msg) from e
+ 
+         return str(self.output_file)
+ 
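+ 
+ # --- Hedged usage sketch (illustration only; not part of the original wheel) ---
+ # Chains the two wrappers: length-filter contigs, then six-frame translate the
+ # survivors. File names are hypothetical placeholders; seqkit must be on PATH.
+ if __name__ == "__main__":
+     filt = seqkit("contigs.fasta", "contigs.filtered.fasta", "seqkit_seq.log", threads=4)
+     filtered = filt.run_seqkit_seq(length_thr=400)
+ 
+     trans = seqkit(filtered, "contigs.translated.faa", "seqkit_translate.log", threads=4)
+     trans.run_seqkit_translate(gen_code=1, frame=6)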
@@ -0,0 +1,414 @@
+ import logging
+ import os
+ import time
+ 
+ import needletail
+ import polars as pl
+ from rich.console import Console
+ 
+ 
+ def write_combined_results_to_gff(output_file, combined_data, seq_type):
+     with open(output_file, 'w') as f:
+         f.write("##gff-version 3\n")
+         for row in combined_data.iter_rows(named=True):
+             record = convert_record_to_gff3_record(row, seq_type)
+             f.write(f"{record}\n")
+ 
+ def convert_record_to_gff3_record(row, seq_type):
+     # Coerce a dict-like row into a GFF3 record.
+     # Adapted from rolypoly: https://code.jgi.doe.gov/UNeri/rolypoly/-/blob/main/src/rolypoly/commands/annotation/annotate_RNA.py
+ 
+     # Try to identify a sequence ID column (query, qseqid, contig_id, contig, id, name)
+     if seq_type == 'nuc':
+         sequence_id_col = "Translated_contig_name (frame)"
+     else:
+         sequence_id_columns = ["sequence_id", 'query', 'qseqid', 'contig_id', 'contig', 'id', 'name', 'Contig_name']
+         sequence_id_col = next((col for col in sequence_id_columns if col in row.keys()), None)
+         if sequence_id_col is None:
+             raise ValueError(f"No sequence ID column found in row. Available columns: {list(row.keys())}")
+ 
+     # Try to identify a score column (score, Score, bitscore, qscore, bit, bits)
+     score_columns = ["score", "Score", "bitscore", "qscore", "bit", "bits"]
+     score_col = next((col for col in score_columns if col in row.keys()), "score")
+ 
+     # Try to identify a source column (source, Source, db, DB)
+     source_columns = ["source", "Source", "db", "DB"]
+     source_col = next((col for col in source_columns if col in row.keys()), "source")
+ 
+     # Try to identify a type column (type, Type, feature, Feature)
+     type_columns = ["type", "Type", "feature", "Feature"]
+     type_col = next((col for col in type_columns if col in row.keys()), "type")
+ 
+     # Try to identify a strand column (strand, Strand, sense, Sense)
+     strand_columns = ["strand", "Strand", "sense", "Sense"]
+     strand_col = next((col for col in strand_columns if col in row.keys()), "strand")
+ 
+     # Try to identify a phase column (phase, Phase)
+     phase_columns = ["phase", "Phase"]
+     phase_col = next((col for col in phase_columns if col in row.keys()), "phase")
+ 
+     # Build the GFF3 attributes string from the remaining columns
+     attrs = []
+     for key, value in row.items():
+         if key not in [sequence_id_col, source_col, score_col, type_col, strand_col, phase_col]:
+             attrs.append(f"{key}={value}")
+ 
+     # Get values, using defaults for missing columns
+     sequence_id = row[sequence_id_col]
+     source = row.get(source_col, "rdrpcatch")
+     score = row.get(score_col, "0")
+     feature_type = row.get(type_col, "feature")
+     strand = row.get(strand_col, "+")
+     phase = row.get(phase_col, ".")
+ 
+     # Format the GFF3 record
+     gff3_fields = [
+         sequence_id,
+         source,
+         feature_type,
+         str(row.get("RdRp_from(AA)", "1")),
+         str(row.get("RdRp_to(AA)", "1")),
+         str(score),
+         strand,
+         phase,
+         ";".join(attrs) if attrs else "."
+     ]
+ 
+     return "\t".join(gff3_fields)
+ 
+ 
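+ # Hedged example (illustration only): a minimal hit row and the GFF3 line it
+ # would produce. The column names follow the detection lists above; the values
+ # are hypothetical.
+ #
+ #   row = {"Contig_name": "contig_1", "score": 250.3,
+ #          "RdRp_from(AA)": 12, "RdRp_to(AA)": 310, "E-value": 1e-30}
+ #   convert_record_to_gff3_record(row, seq_type="prot")
+ #   # -> "contig_1\trdrpcatch\tfeature\t12\t310\t250.3\t+\t.\t"
+ #   #    "RdRp_from(AA)=12;RdRp_to(AA)=310;E-value=1e-30"
+ 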
+ class Logger:
+     def __init__(self, log_file):
+         self.console = Console()
+         self.log_file = log_file
+         # One named logger per log file, guarded against re-adding handlers,
+         # so that repeated instantiation does not duplicate every log line.
+         self.logger = logging.getLogger(f'Logger.{log_file}')
+         self.logger.setLevel(logging.INFO)
+         if not self.logger.handlers:
+             handler = logging.FileHandler(self.log_file)
+             handler.setLevel(logging.INFO)
+             formatter = logging.Formatter('%(asctime)s - %(message)s')
+             handler.setFormatter(formatter)
+             self.logger.addHandler(handler)
+ 
+     def loud_log(self, message):
+         """Log to the rich console as well as the log file."""
+         self.console.log(message)
+         self.logger.info(message)
+ 
+     def silent_log(self, message):
+         """Log to the log file only."""
+         self.logger.info(message)
+ 
+     def start_timer(self):
+         self.start_time = time.time()
+         return self.start_time
+ 
+     def stop_timer(self, start_time, verbose=None):
+         end_time = time.time()
+         raw_execution_time = end_time - start_time
+ 
+         # Break the elapsed time into hours, minutes, seconds and milliseconds
+         hours = int(raw_execution_time // 3600)
+         minutes = int((raw_execution_time % 3600) // 60)
+         seconds = int(raw_execution_time % 60)
+         milliseconds = int((raw_execution_time % 1) * 1000)
+ 
+         execution_time = f"{hours} Hours {minutes} Minutes {seconds} Seconds {milliseconds} ms"
+         return execution_time
+ 
+ 
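+ # Hedged usage sketch (illustration only; the log file name is a placeholder):
+ #   log = Logger("rdrpcatch.log")
+ #   t0 = log.start_timer()
+ #   log.loud_log("Starting RdRp search...")    # console + file
+ #   log.silent_log("Per-record debug detail")  # file only
+ #   log.loud_log(f"Done in {log.stop_timer(t0)}")
+ 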
+ class fasta_checker:
+ 
+     def __init__(self, fasta_file, logger=None):
+         self.fasta_file = fasta_file
+         self.logger = logger
+ 
+     def check_fasta_validity(self):
+         reader = needletail.parse_fastx_file(self.fasta_file)
+         try:
+             next(reader)  # one record is enough to prove the file parses
+             if self.logger:
+                 self.logger.silent_log(f"Successfully validated fasta file: {self.fasta_file}")
+             return True
+         except StopIteration:
+             error_msg = f"Invalid or empty fasta file: {self.fasta_file}"
+             if self.logger:
+                 self.logger.silent_log(error_msg)
+             raise Exception(error_msg)
+         except Exception as e:
+             error_msg = f"Invalid fasta file: {self.fasta_file}, error: {str(e)}"
+             if self.logger:
+                 self.logger.silent_log(error_msg)
+             raise Exception(error_msg)
+ 
+     def read_fasta(self):
+         fasta_dict = {}
+         reader = needletail.parse_fastx_file(self.fasta_file)
+         for record in reader:
+             header = f">{record.id}"
+             fasta_dict[header] = record.seq
+         if self.logger:
+             self.logger.silent_log(f"Read {len(fasta_dict)} sequences from {self.fasta_file}")
+         return fasta_dict
+ 
+     def check_seq_type(self):
+         # Classify based on the first record only: every branch below either
+         # returns or raises on the first iteration.
+         reader = needletail.parse_fastx_file(self.fasta_file)
+         dna_set = {'A', 'T', 'G', 'C'}
+         dna_set_ambiguous = {'A', 'T', 'G', 'C', 'N'}
+         protein_set = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'X'}
+ 
+         for record in reader:
+             seq = record.seq.upper()
+             if set(seq).issubset(dna_set):
+                 if self.logger:
+                     self.logger.silent_log("Detected nucleotide sequence (strict DNA alphabet)")
+                 return 'nuc'
+             elif set(seq).issubset(dna_set_ambiguous):
+                 if self.logger:
+                     self.logger.silent_log("Detected nucleotide sequence (ambiguous DNA alphabet)")
+                 return 'nuc'
+             elif set(seq).issubset(protein_set):
+                 if self.logger:
+                     self.logger.silent_log("Detected protein sequence")
+                 return 'prot'
+             else:
+                 error_msg = f"Invalid sequence type in fasta file: {self.fasta_file} for sequence: {record.id} with characters: {set(seq)}"
+                 if self.logger:
+                     self.logger.silent_log(error_msg)
+                 raise Exception(error_msg)
+ 
+     def check_seq_length(self, max_len):
+         if not os.path.isfile(self.fasta_file):
+             error_msg = f"The file '{self.fasta_file}' does not exist."
+             if self.logger:
+                 self.logger.silent_log(error_msg)
+             raise FileNotFoundError(error_msg)
+ 
+         reader = needletail.parse_fastx_file(self.fasta_file)
+         for record in reader:
+             if len(record.seq) > max_len:
+                 error_msg = f"Sequence ID: {record.id}, Length: {len(record.seq)}, " \
+                             f"Exceeds maximum allowed length: {max_len:,}. Please check the input file, " \
+                             f"as this will cause issues with the pyHMMER search."
+                 if self.logger:
+                     self.logger.silent_log(error_msg)
+                 raise ValueError(error_msg)
+         if self.logger:
+             self.logger.silent_log(f"All sequences are within the length limit of {max_len:,}")
+         return True
+ 
+ 
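+ # Hedged usage sketch (illustration only; the file name is a placeholder):
+ #   checker = fasta_checker("contigs.fasta")
+ #   checker.check_fasta_validity()
+ #   seq_type = checker.check_seq_type()      # 'nuc' or 'prot'
+ #   checker.check_seq_length(max_len=50792)
+ 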
+ class fasta:
+ 
+     def __init__(self, fasta_file, logger=None):
+         self.fasta_file = fasta_file
+         self.logger = logger
+ 
+     def extract_contigs(self, contig_list):
+         """
+         Extract contigs from a fasta file based on a list of contig names.
+ 
+         :param contig_list: List of contig names to extract.
+         :type contig_list: list
+         :return: Dictionary with contig names as keys and sequences as values.
+         :rtype: dict
+         """
+         contig_dict = {}
+         reader = needletail.parse_fastx_file(self.fasta_file)
+         for record in reader:
+             # pyhmmer uses the first word of the header as the ID, so split on whitespace
+             if record.id.strip().split(" ")[0] in contig_list:
+                 contig_dict[record.id] = record.seq
+         return contig_dict
+ 
+     def write_fasta(self, contig_dict, outfile):
+         """
+         Write a dictionary of contigs to a fasta file.
+ 
+         :param contig_dict: Dictionary with contig names as keys and sequences as values.
+         :type contig_dict: dict
+         :param outfile: Path to the output file.
+         :type outfile: str
+         :return: None
+         """
+         with open(outfile, 'w') as out_handle:
+             for contig_name, seq in contig_dict.items():
+                 out_handle.write(f">{contig_name}\n{seq}\n")
+ 
+     def write_fasta_coords(self, rdrp_coords_list, outfile, seq_type):
+         """
+         Write the RdRp regions given by a list of coordinates to a fasta file.
+ 
+         :param rdrp_coords_list: List of (contig name, RdRp from, RdRp to) tuples.
+         :type rdrp_coords_list: list
+         :param outfile: Path to the output file.
+         :type outfile: str
+         :param seq_type: Type of sequence (prot or nuc); currently unused.
+         :type seq_type: str
+         :return: None
+         """
+         if self.logger:
+             self.logger.silent_log(f"Processing {len(rdrp_coords_list)} coordinates")
+             self.logger.silent_log(f"First few coordinates: {rdrp_coords_list[:3]}")
+ 
+         reader = needletail.parse_fastx_file(self.fasta_file)
+         matches_found = 0
+         with open(outfile, 'w') as out_handle:
+             for record in reader:
+                 # pyhmmer uses the first word of the header as the ID, so split on whitespace
+                 record_id = record.id.strip().split(" ")[0]
+                 if self.logger:
+                     self.logger.silent_log(f"Processing record with ID: '{record_id}'")
+                 for contig_name, rdrp_from, rdrp_to in rdrp_coords_list:
+                     contig_name = str(contig_name).strip()
+                     if self.logger:
+                         self.logger.silent_log(f"Comparing record '{record_id}' with contig '{contig_name}'")
+                     if record_id == contig_name:
+                         matches_found += 1
+                         # Coordinates are 1-based and inclusive, hence the -1 on the start
+                         seq = record.seq[rdrp_from - 1:rdrp_to]
+                         fasta_header = f"{record_id}_RdRp_{rdrp_from}-{rdrp_to}"
+                         out_handle.write(f">{fasta_header}\n{seq}\n")
+                         if self.logger:
+                             self.logger.silent_log(f"Match found! Writing sequence of length {len(seq)}")
+                     else:
+                         if self.logger:
+                             self.logger.silent_log(f"No match - lengths: {len(record_id)}|{len(contig_name)}, "
+                                                    f"record_id bytes: {record_id.encode()}, contig bytes: {contig_name.encode()}")
+ 
+         if self.logger:
+             self.logger.silent_log(f"Total matches found: {matches_found}")
+ 
+ 
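+ # Hedged usage sketch (illustration only; names and coordinates are placeholders):
+ #   fa = fasta("proteins.faa")
+ #   fa.write_fasta_coords([("contig_1", 10, 300)], "rdrp_regions.faa", seq_type="prot")
+ #   # writes ">contig_1_RdRp_10-300" containing residues 10..300 of contig_1
+ 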
+ class mmseqs_parser:
+ 
+     def __init__(self, mmseqs_tax_out_file, mmseqs_s_out_file):
+         self.mmseqs_tax_out_file = mmseqs_tax_out_file
+         self.mmseqs_s_out_file = mmseqs_s_out_file
+ 
+     def parse_mmseqs_tax_lca(self):
+         """
+         Parse the MMseqs2 taxonomy output file.
+ 
+         :return: Dictionary with contig names as keys and taxonomy lineages as values.
+         :rtype: dict
+         """
+         with open(self.mmseqs_tax_out_file, 'r') as f:
+             lca_dict = {}
+             for line in f:
+                 line = line.strip().split('\t')
+                 contig = line[0]
+                 # Use the lineage column (index 4) when present; otherwise fall
+                 # back to the taxon name (index 3)
+                 if len(line) < 5:
+                     lca_lineage = line[3]
+                 else:
+                     lca_lineage = line[4]
+                 lca_dict[contig] = lca_lineage
+         return lca_dict
+ 
+     def parse_mmseqs_e_search_tophit(self):
+         """
+         Parse the MMseqs2 easy-search output file, keeping only the first
+         (top) hit per contig.
+ 
+         :return: Dictionary with contig names as keys and lists of hit information as values.
+         :rtype: dict
+         """
+         with open(self.mmseqs_s_out_file, 'r') as f:
+             tophit_dict = {}
+             for line in f:
+                 line = line.strip().split('\t')
+                 contig = line[0]
+ 
+                 # Only the first line per contig is kept; easy-search output is
+                 # sorted, so that line is the top hit
+                 if contig not in tophit_dict:
+                     target = line[1]
+                     fident = line[2]
+                     alnlen = line[3]
+                     evalue = line[10]  # renamed from `eval`, which shadows the builtin
+                     bits = line[11]
+                     qcov = line[12]
+                     lineage = line[14]
+                     tophit_dict[contig] = [target, fident, alnlen, evalue, bits, qcov, lineage]
+ 
+         return tophit_dict
+ 
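+     # Note on the expected input (an assumption inferred from the indices
+     # above): the easy-search results file must have been produced with a
+     # custom --format-output that places the e-value in column 11, bit score
+     # in column 12, query coverage in column 13 and taxonomy lineage in
+     # column 15 (1-based), e.g. something like:
+     #   mmseqs easy-search query.faa db result.tsv tmp \
+     #     --format-output "query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qcov,taxid,taxlineage"
+ 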
+     def tax_to_rdrpcatch(self, rdrpcatch_out, extended_rdrpcatch_out, seq_type):
+         """
+         Add taxonomy information to the RdRpCATCH output file.
+ 
+         :param rdrpcatch_out: Path to the RdRpCATCH output file.
+         :type rdrpcatch_out: str
+         :param extended_rdrpcatch_out: Path to the extended RdRpCATCH output file.
+         :type extended_rdrpcatch_out: str
+         :param seq_type: Type of sequence (prot or nuc).
+         :type seq_type: str
+         :return: None
+         """
+         lca_dict = self.parse_mmseqs_tax_lca()
+         tophit_dict = self.parse_mmseqs_e_search_tophit()
+ 
+         df = pl.read_csv(rdrpcatch_out, separator='\t')
+ 
+         # Drop columns that are not needed
+         df = df.drop(["Best_hit_norm_bitscore_profile", "Best_hit_norm_bitscore_contig",
+                       "Best_hit_ID_score"])
+ 
+         # For translated sequences, look hits up by the frame-specific name
+         lookup_col = 'Translated_contig_name (frame)' if seq_type == 'nuc' else 'Contig_name'
+ 
+         # Resolve the top hit once per row instead of once per new column
+         empty_hit = ['', '', '', '', '', '', '']
+         keys = [row[lookup_col] for row in df.iter_rows(named=True)]
+         tophits = [tophit_dict.get(key, empty_hit) for key in keys]
+ 
+         # Create new columns for taxonomy information
+         df = df.with_columns([
+             pl.Series(name='MMseqs_Taxonomy_2bLCA', values=[lca_dict.get(key, '') for key in keys]),
+             pl.Series(name='MMseqs_TopHit_accession', values=[hit[0] for hit in tophits]),
+             pl.Series(name='MMseqs_TopHit_fident', values=[hit[1] for hit in tophits]),
+             pl.Series(name='MMseqs_TopHit_alnlen', values=[hit[2] for hit in tophits]),
+             pl.Series(name='MMseqs_TopHit_eval', values=[hit[3] for hit in tophits]),
+             pl.Series(name='MMseqs_TopHit_bitscore', values=[hit[4] for hit in tophits]),
+             pl.Series(name='MMseqs_TopHit_qcov', values=[hit[5] for hit in tophits]),
+             pl.Series(name='MMseqs_TopHit_lineage', values=[hit[6] for hit in tophits])
+         ])
+ 
+         # Sort by Best_hit_bitscore
+         sorted_df = df.sort("Best_hit_bitscore", descending=True)
+ 
+         sorted_df.write_csv(extended_rdrpcatch_out, separator='\t')
+ 
+ 
+ class file_handler:
+ 
+     def __init__(self, file):
+         self.file = file
+ 
+     def check_file_exists(self):
+         if not os.path.exists(self.file):
+             raise Exception(f"File does not exist: {self.file}")
+         return True
+ 
+     def delete_file(self):
+         os.remove(self.file)
+         return True
+ 
+     def check_file_size(self):
+         return os.path.getsize(self.file)
+ 
+     def check_file_extension(self):
+         return os.path.splitext(self.file)[1]
+ 
+     def get_file_name(self):
+         return os.path.basename(self.file)
+ 
+     def get_file_dir(self):
+         return os.path.dirname(self.file)